novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -2
- novel_downloader/cli/__init__.py +0 -1
- novel_downloader/cli/clean.py +2 -10
- novel_downloader/cli/download.py +16 -22
- novel_downloader/cli/interactive.py +0 -1
- novel_downloader/cli/main.py +1 -3
- novel_downloader/cli/settings.py +8 -8
- novel_downloader/config/__init__.py +0 -1
- novel_downloader/config/adapter.py +32 -27
- novel_downloader/config/loader.py +116 -108
- novel_downloader/config/models.py +35 -29
- novel_downloader/config/site_rules.py +2 -4
- novel_downloader/core/__init__.py +0 -1
- novel_downloader/core/downloaders/__init__.py +4 -4
- novel_downloader/core/downloaders/base/__init__.py +14 -0
- novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
- novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
- novel_downloader/core/downloaders/biquge/__init__.py +12 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
- novel_downloader/core/downloaders/common/__init__.py +14 -0
- novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
- novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +33 -21
- novel_downloader/core/downloaders/qidian/__init__.py +10 -0
- novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +79 -62
- novel_downloader/core/factory/__init__.py +4 -5
- novel_downloader/core/factory/{downloader_factory.py → downloader.py} +25 -26
- novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
- novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
- novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
- novel_downloader/core/interfaces/__init__.py +8 -9
- novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
- novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +23 -12
- novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
- novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
- novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
- novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +31 -17
- novel_downloader/core/parsers/__init__.py +5 -4
- novel_downloader/core/parsers/{base_parser.py → base.py} +18 -9
- novel_downloader/core/parsers/biquge/__init__.py +10 -0
- novel_downloader/core/parsers/biquge/main_parser.py +126 -0
- novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
- novel_downloader/core/parsers/{common_parser → common}/helper.py +13 -13
- novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +40 -48
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +14 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +36 -44
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +14 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/book_info_parser.py +5 -6
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +7 -8
- novel_downloader/core/requesters/__init__.py +9 -5
- novel_downloader/core/requesters/base/__init__.py +16 -0
- novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +177 -73
- novel_downloader/core/requesters/base/browser.py +340 -0
- novel_downloader/core/requesters/base/session.py +364 -0
- novel_downloader/core/requesters/biquge/__init__.py +12 -0
- novel_downloader/core/requesters/biquge/session.py +90 -0
- novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
- novel_downloader/core/requesters/common/async_session.py +96 -0
- novel_downloader/core/requesters/common/session.py +113 -0
- novel_downloader/core/requesters/qidian/__init__.py +21 -0
- novel_downloader/core/requesters/qidian/broswer.py +307 -0
- novel_downloader/core/requesters/qidian/session.py +287 -0
- novel_downloader/core/savers/__init__.py +5 -3
- novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
- novel_downloader/core/savers/biquge.py +25 -0
- novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
- novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +23 -51
- novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
- novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
- novel_downloader/core/savers/epub_utils/__init__.py +0 -1
- novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
- novel_downloader/core/savers/epub_utils/initializer.py +4 -5
- novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
- novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
- novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
- novel_downloader/locales/en.json +8 -4
- novel_downloader/locales/zh.json +5 -1
- novel_downloader/resources/config/settings.toml +88 -0
- novel_downloader/utils/cache.py +2 -2
- novel_downloader/utils/chapter_storage.py +340 -0
- novel_downloader/utils/constants.py +6 -4
- novel_downloader/utils/crypto_utils.py +3 -3
- novel_downloader/utils/file_utils/__init__.py +0 -1
- novel_downloader/utils/file_utils/io.py +12 -17
- novel_downloader/utils/file_utils/normalize.py +1 -3
- novel_downloader/utils/file_utils/sanitize.py +2 -9
- novel_downloader/utils/fontocr/__init__.py +0 -1
- novel_downloader/utils/fontocr/ocr_v1.py +19 -22
- novel_downloader/utils/fontocr/ocr_v2.py +147 -60
- novel_downloader/utils/hash_store.py +19 -20
- novel_downloader/utils/hash_utils.py +0 -1
- novel_downloader/utils/i18n.py +3 -4
- novel_downloader/utils/logger.py +5 -6
- novel_downloader/utils/model_loader.py +5 -8
- novel_downloader/utils/network.py +9 -10
- novel_downloader/utils/state.py +6 -7
- novel_downloader/utils/text_utils/__init__.py +0 -1
- novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
- novel_downloader/utils/text_utils/diff_display.py +0 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -4
- novel_downloader/utils/text_utils/text_cleaning.py +0 -1
- novel_downloader/utils/time_utils/__init__.py +0 -1
- novel_downloader/utils/time_utils/datetime_utils.py +8 -10
- novel_downloader/utils/time_utils/sleep_utils.py +1 -3
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/METADATA +14 -17
- novel_downloader-1.3.1.dist-info/RECORD +127 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/WHEEL +1 -1
- novel_downloader/core/requesters/base_browser.py +0 -214
- novel_downloader/core/requesters/base_session.py +0 -246
- novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
- novel_downloader/core/requesters/common_requester/common_session.py +0 -126
- novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -396
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
- novel_downloader/resources/config/settings.yaml +0 -76
- novel_downloader-1.2.2.dist-info/RECORD +0 -115
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,26 +1,25 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
3
|
+
novel_downloader.core.interfaces.async_downloader
|
5
4
|
----------------------------------------------------------
|
6
5
|
|
7
6
|
This module defines the AsyncDownloaderProtocol, a structural interface
|
8
7
|
that outlines the expected behavior of any downloader class.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import
|
10
|
+
from typing import Protocol
|
12
11
|
|
13
12
|
|
14
13
|
class AsyncDownloaderProtocol(Protocol):
|
15
14
|
"""
|
16
|
-
Protocol for fully
|
15
|
+
Protocol for fully-asynchronous downloader classes.
|
17
16
|
|
18
17
|
Defines the expected interface for any downloader implementation,
|
19
18
|
including both batch and single book downloads,
|
20
19
|
as well as optional pre-download hooks.
|
21
20
|
"""
|
22
21
|
|
23
|
-
async def download(self, book_ids:
|
22
|
+
async def download(self, book_ids: list[str]) -> None:
|
24
23
|
"""
|
25
24
|
Batch download entry point.
|
26
25
|
|
@@ -1,15 +1,14 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
3
|
+
novel_downloader.core.interfaces.async_requester
|
5
4
|
--------------------------------------------------------
|
6
5
|
|
7
6
|
Defines the AsyncRequesterProtocol interface for fetching raw HTML or JSON
|
8
7
|
for book info pages, individual chapters, managing request lifecycle,
|
9
|
-
and optionally retrieving a user's authenticated bookcase
|
8
|
+
and optionally retrieving a user's authenticated bookcase.
|
10
9
|
"""
|
11
10
|
|
12
|
-
from typing import
|
11
|
+
from typing import Any, Literal, Protocol, runtime_checkable
|
13
12
|
|
14
13
|
|
15
14
|
@runtime_checkable
|
@@ -24,7 +23,13 @@ class AsyncRequesterProtocol(Protocol):
|
|
24
23
|
def is_async(self) -> Literal[True]:
|
25
24
|
...
|
26
25
|
|
27
|
-
async def login(
|
26
|
+
async def login(
|
27
|
+
self,
|
28
|
+
username: str = "",
|
29
|
+
password: str = "",
|
30
|
+
manual_login: bool = False,
|
31
|
+
**kwargs: Any,
|
32
|
+
) -> bool:
|
28
33
|
"""
|
29
34
|
Attempt to log in asynchronously.
|
30
35
|
:returns: True if login succeeded.
|
@@ -32,41 +37,47 @@ class AsyncRequesterProtocol(Protocol):
|
|
32
37
|
...
|
33
38
|
|
34
39
|
async def get_book_info(
|
35
|
-
self,
|
40
|
+
self,
|
41
|
+
book_id: str,
|
42
|
+
**kwargs: Any,
|
36
43
|
) -> str:
|
37
44
|
"""
|
38
45
|
Fetch the raw HTML (or JSON) of the book info page asynchronously.
|
39
46
|
|
40
47
|
:param book_id: The book identifier.
|
41
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
42
48
|
:return: The page content as a string.
|
43
49
|
"""
|
44
50
|
...
|
45
51
|
|
46
52
|
async def get_book_chapter(
|
47
|
-
self,
|
53
|
+
self,
|
54
|
+
book_id: str,
|
55
|
+
chapter_id: str,
|
56
|
+
**kwargs: Any,
|
48
57
|
) -> str:
|
49
58
|
"""
|
50
59
|
Fetch the raw HTML (or JSON) of a single chapter asynchronously.
|
51
60
|
|
52
61
|
:param book_id: The book identifier.
|
53
62
|
:param chapter_id: The chapter identifier.
|
54
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
55
63
|
:return: The chapter content as a string.
|
56
64
|
"""
|
57
65
|
...
|
58
66
|
|
59
|
-
async def get_bookcase(
|
67
|
+
async def get_bookcase(
|
68
|
+
self,
|
69
|
+
page: int = 1,
|
70
|
+
**kwargs: Any,
|
71
|
+
) -> str:
|
60
72
|
"""
|
61
73
|
Optional: Retrieve the HTML content of the authenticated
|
62
74
|
user's bookcase page asynchronously.
|
63
75
|
|
64
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
65
76
|
:return: The HTML markup of the bookcase page.
|
66
77
|
"""
|
67
78
|
...
|
68
79
|
|
69
|
-
async def
|
80
|
+
async def close(self) -> None:
|
70
81
|
"""
|
71
82
|
Shutdown and clean up any resources (e.g., close aiohttp session).
|
72
83
|
"""
|
@@ -1,14 +1,15 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.parser
|
4
|
+
---------------------------------------
|
6
5
|
|
7
6
|
Defines the ParserProtocol interface for extracting book metadata,
|
8
7
|
parsing individual chapter content, and setting parser context via book_id.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import Any,
|
10
|
+
from typing import Any, Protocol, runtime_checkable
|
11
|
+
|
12
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
12
13
|
|
13
14
|
|
14
15
|
@runtime_checkable
|
@@ -20,7 +21,7 @@ class ParserProtocol(Protocol):
|
|
20
21
|
- accept a book_id context for multi-step workflows.
|
21
22
|
"""
|
22
23
|
|
23
|
-
def parse_book_info(self, html_str: str) ->
|
24
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
24
25
|
"""
|
25
26
|
Parse and return a dictionary of book information from the raw HTML.
|
26
27
|
|
@@ -29,7 +30,11 @@ class ParserProtocol(Protocol):
|
|
29
30
|
"""
|
30
31
|
...
|
31
32
|
|
32
|
-
def parse_chapter(
|
33
|
+
def parse_chapter(
|
34
|
+
self,
|
35
|
+
html_str: str,
|
36
|
+
chapter_id: str,
|
37
|
+
) -> ChapterDict | None:
|
33
38
|
"""
|
34
39
|
Parse and return the text content of one chapter.
|
35
40
|
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.saver
|
4
|
+
--------------------------------------
|
6
5
|
|
7
6
|
Defines the SaverProtocol interface for persisting completed books in
|
8
7
|
TXT, EPUB, Markdown, and PDF formats.
|
@@ -1,17 +1,16 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.sync_downloader
|
4
|
+
------------------------------------------------
|
6
5
|
|
7
|
-
This module defines the
|
6
|
+
This module defines the SyncDownloaderProtocol, a structural interface
|
8
7
|
that outlines the expected behavior of any downloader class.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import
|
10
|
+
from typing import Protocol
|
12
11
|
|
13
12
|
|
14
|
-
class
|
13
|
+
class SyncDownloaderProtocol(Protocol):
|
15
14
|
"""
|
16
15
|
Protocol for synchronous downloader classes.
|
17
16
|
|
@@ -20,7 +19,7 @@ class DownloaderProtocol(Protocol):
|
|
20
19
|
as well as optional pre-download hooks.
|
21
20
|
"""
|
22
21
|
|
23
|
-
def download(self, book_ids:
|
22
|
+
def download(self, book_ids: list[str]) -> None:
|
24
23
|
"""
|
25
24
|
Batch download entry point.
|
26
25
|
|
@@ -1,19 +1,18 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.sync_requester
|
4
|
+
-----------------------------------------------
|
6
5
|
|
7
6
|
Defines the SyncRequesterProtocol interface for fetching raw HTML or JSON
|
8
7
|
for book info pages, individual chapters, managing request lifecycle,
|
9
8
|
and optionally retrieving a user's authenticated bookcase.
|
10
9
|
"""
|
11
10
|
|
12
|
-
from typing import
|
11
|
+
from typing import Any, Literal, Protocol, runtime_checkable
|
13
12
|
|
14
13
|
|
15
14
|
@runtime_checkable
|
16
|
-
class
|
15
|
+
class SyncRequesterProtocol(Protocol):
|
17
16
|
"""
|
18
17
|
A requester must be able to fetch raw HTML/data for:
|
19
18
|
- a book's info page,
|
@@ -23,46 +22,61 @@ class RequesterProtocol(Protocol):
|
|
23
22
|
def is_async(self) -> Literal[False]:
|
24
23
|
...
|
25
24
|
|
26
|
-
def login(
|
25
|
+
def login(
|
26
|
+
self,
|
27
|
+
username: str = "",
|
28
|
+
password: str = "",
|
29
|
+
manual_login: bool = False,
|
30
|
+
**kwargs: Any,
|
31
|
+
) -> bool:
|
27
32
|
"""
|
28
33
|
Attempt to log in
|
29
34
|
"""
|
30
35
|
...
|
31
36
|
|
32
|
-
def get_book_info(
|
37
|
+
def get_book_info(
|
38
|
+
self,
|
39
|
+
book_id: str,
|
40
|
+
**kwargs: Any,
|
41
|
+
) -> str:
|
33
42
|
"""
|
34
43
|
Fetch the raw HTML (or JSON) of the book info page.
|
35
44
|
|
36
45
|
:param book_id: The book identifier.
|
37
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
38
46
|
:return: The page content as a string.
|
39
47
|
"""
|
40
48
|
...
|
41
49
|
|
42
50
|
def get_book_chapter(
|
43
|
-
self,
|
51
|
+
self,
|
52
|
+
book_id: str,
|
53
|
+
chapter_id: str,
|
54
|
+
**kwargs: Any,
|
44
55
|
) -> str:
|
45
56
|
"""
|
46
57
|
Fetch the raw HTML (or JSON) of a single chapter.
|
47
58
|
|
48
59
|
:param book_id: The book identifier.
|
49
60
|
:param chapter_id: The chapter identifier.
|
50
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
51
61
|
:return: The chapter content as a string.
|
52
62
|
"""
|
53
63
|
...
|
54
64
|
|
55
|
-
def
|
65
|
+
def get_bookcase(
|
66
|
+
self,
|
67
|
+
page: int = 1,
|
68
|
+
**kwargs: Any,
|
69
|
+
) -> str:
|
56
70
|
"""
|
57
|
-
|
71
|
+
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
72
|
+
|
73
|
+
:param page: Index of the bookcase page to retrieve.
|
74
|
+
:return: The HTML markup of the bookcase page.
|
58
75
|
"""
|
59
76
|
...
|
60
77
|
|
61
|
-
def
|
78
|
+
def close(self) -> None:
|
62
79
|
"""
|
63
|
-
|
64
|
-
|
65
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
66
|
-
:return: The HTML markup of the bookcase page.
|
80
|
+
Shut down and clean up resources.
|
67
81
|
"""
|
68
82
|
...
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
3
|
novel_downloader.core.parsers
|
5
4
|
-----------------------------
|
@@ -8,20 +7,22 @@ This package defines all site-specific parsing modules
|
|
8
7
|
for the novel_downloader framework.
|
9
8
|
|
10
9
|
Currently supported:
|
11
|
-
- Qidian (起点中文网)
|
10
|
+
- Qidian (起点中文网)
|
12
11
|
|
13
12
|
Modules:
|
14
13
|
- qidian
|
15
14
|
- common
|
16
15
|
"""
|
17
16
|
|
18
|
-
from .
|
19
|
-
from .
|
17
|
+
from .biquge import BiqugeParser
|
18
|
+
from .common import CommonParser
|
19
|
+
from .qidian import (
|
20
20
|
QidianBrowserParser,
|
21
21
|
QidianSessionParser,
|
22
22
|
)
|
23
23
|
|
24
24
|
__all__ = [
|
25
|
+
"BiqugeParser",
|
25
26
|
"CommonParser",
|
26
27
|
"QidianBrowserParser",
|
27
28
|
"QidianSessionParser",
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.base
|
4
|
+
----------------------------------
|
6
5
|
|
7
6
|
This module defines the BaseParser abstract class, which implements the
|
8
7
|
ParserProtocol interface and provides a structured foundation for
|
@@ -16,10 +15,11 @@ a standard parsing interface for:
|
|
16
15
|
|
17
16
|
import abc
|
18
17
|
from pathlib import Path
|
19
|
-
from typing import Any
|
18
|
+
from typing import Any
|
20
19
|
|
21
20
|
from novel_downloader.config import ParserConfig
|
22
21
|
from novel_downloader.core.interfaces import ParserProtocol
|
22
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
23
23
|
|
24
24
|
|
25
25
|
class BaseParser(ParserProtocol, abc.ABC):
|
@@ -33,19 +33,23 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
33
33
|
Subclasses must implement actual parsing logic for specific sites.
|
34
34
|
"""
|
35
35
|
|
36
|
-
def __init__(
|
36
|
+
def __init__(
|
37
|
+
self,
|
38
|
+
config: ParserConfig,
|
39
|
+
):
|
37
40
|
"""
|
38
41
|
Initialize the parser with a configuration object.
|
39
42
|
|
40
43
|
:param config: ParserConfig object controlling parsing behavior.
|
41
44
|
"""
|
42
45
|
self._config = config
|
43
|
-
self._book_id:
|
46
|
+
self._book_id: str | None = None
|
44
47
|
|
45
48
|
self._base_cache_dir = Path(config.cache_dir)
|
49
|
+
self._cache_dir = self._base_cache_dir
|
46
50
|
|
47
51
|
@abc.abstractmethod
|
48
|
-
def parse_book_info(self, html_str: str) ->
|
52
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
49
53
|
"""
|
50
54
|
Parse a book info page and extract metadata and chapter structure.
|
51
55
|
|
@@ -58,7 +62,11 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
58
62
|
...
|
59
63
|
|
60
64
|
@abc.abstractmethod
|
61
|
-
def parse_chapter(
|
65
|
+
def parse_chapter(
|
66
|
+
self,
|
67
|
+
html_str: str,
|
68
|
+
chapter_id: str,
|
69
|
+
) -> ChapterDict | None:
|
62
70
|
"""
|
63
71
|
Parse a single chapter page and extract clean text or simplified HTML.
|
64
72
|
|
@@ -69,7 +77,7 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
69
77
|
...
|
70
78
|
|
71
79
|
@property
|
72
|
-
def book_id(self) ->
|
80
|
+
def book_id(self) -> str | None:
|
73
81
|
"""
|
74
82
|
Current book ID in context.
|
75
83
|
|
@@ -85,6 +93,7 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
85
93
|
:param value: Book identifier.
|
86
94
|
"""
|
87
95
|
self._book_id = value
|
96
|
+
self._cache_dir = self._base_cache_dir / value
|
88
97
|
self._on_book_id_set()
|
89
98
|
|
90
99
|
def _on_book_id_set(self) -> None:
|
@@ -0,0 +1,126 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.biquge.main_parser
|
4
|
+
------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
from lxml import etree
|
12
|
+
from lxml.etree import _Element
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
16
|
+
|
17
|
+
|
18
|
+
class BiqugeParser(BaseParser):
|
19
|
+
""" """
|
20
|
+
|
21
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
22
|
+
"""
|
23
|
+
Parse a book info page and extract metadata and chapter structure.
|
24
|
+
|
25
|
+
:param html_str: Raw HTML of the book info page.
|
26
|
+
:return: Parsed metadata and chapter structure as a dictionary.
|
27
|
+
"""
|
28
|
+
tree = etree.HTML(html_str, parser=None)
|
29
|
+
result: dict[str, Any] = {}
|
30
|
+
|
31
|
+
def extract_text(elem: _Element | None) -> str:
|
32
|
+
if elem is None:
|
33
|
+
return ""
|
34
|
+
return "".join(elem.itertext(tag=None)).strip()
|
35
|
+
|
36
|
+
# 书名
|
37
|
+
book_name_elem = tree.xpath('//div[@id="info"]/h1')
|
38
|
+
result["book_name"] = extract_text(book_name_elem[0]) if book_name_elem else ""
|
39
|
+
|
40
|
+
# 作者
|
41
|
+
author_elem = tree.xpath('//div[@id="info"]/p[1]')
|
42
|
+
if author_elem:
|
43
|
+
author_text = extract_text(author_elem[0]).replace("\u00a0", "")
|
44
|
+
match = re.search(r"作\s*者[::]?\s*(\S+)", author_text)
|
45
|
+
result["author"] = match.group(1).strip() if match else ""
|
46
|
+
else:
|
47
|
+
result["author"] = ""
|
48
|
+
|
49
|
+
# 封面
|
50
|
+
cover_elem = tree.xpath('//div[@id="fmimg"]/img/@src')
|
51
|
+
result["cover_url"] = cover_elem[0].strip() if cover_elem else ""
|
52
|
+
|
53
|
+
# 最后更新时间
|
54
|
+
update_elem = tree.xpath('//div[@id="info"]/p[3]')
|
55
|
+
if update_elem:
|
56
|
+
update_text = extract_text(update_elem[0])
|
57
|
+
match = re.search(r"最后更新[::]\s*(\S+)", update_text)
|
58
|
+
result["update_time"] = match.group(1).strip() if match else ""
|
59
|
+
else:
|
60
|
+
result["update_time"] = ""
|
61
|
+
|
62
|
+
# 简介
|
63
|
+
intro_elem = tree.xpath('//div[@id="intro"]')
|
64
|
+
result["summary"] = extract_text(intro_elem[0]) if intro_elem else ""
|
65
|
+
|
66
|
+
# 卷和章节
|
67
|
+
chapters = []
|
68
|
+
in_main_volume = False
|
69
|
+
|
70
|
+
list_dl = tree.xpath('//div[@id="list"]/dl')[0]
|
71
|
+
for elem in list_dl:
|
72
|
+
if elem.tag == "dt":
|
73
|
+
text = "".join(elem.itertext()).strip()
|
74
|
+
in_main_volume = "正文" in text
|
75
|
+
elif in_main_volume and elem.tag == "dd":
|
76
|
+
a: list[_Element] = elem.xpath("./a")
|
77
|
+
if a:
|
78
|
+
title = "".join(a[0].itertext(tag=None)).strip()
|
79
|
+
url = a[0].get("href", "").strip()
|
80
|
+
href_cleaned = url.replace(".html", "")
|
81
|
+
chapter_id_match = re.search(r"/(\d+)$", href_cleaned)
|
82
|
+
chapter_id = chapter_id_match.group(1) if chapter_id_match else ""
|
83
|
+
chapters.append(
|
84
|
+
{"title": title, "url": url, "chapterId": chapter_id}
|
85
|
+
)
|
86
|
+
|
87
|
+
result["volumes"] = [{"volume_name": "正文", "chapters": chapters}]
|
88
|
+
|
89
|
+
return result
|
90
|
+
|
91
|
+
def parse_chapter(
|
92
|
+
self,
|
93
|
+
html_str: str,
|
94
|
+
chapter_id: str,
|
95
|
+
) -> ChapterDict | None:
|
96
|
+
"""
|
97
|
+
Parse a single chapter page and extract clean text or simplified HTML.
|
98
|
+
|
99
|
+
:param html_str: Raw HTML of the chapter page.
|
100
|
+
:param chapter_id: Identifier of the chapter being parsed.
|
101
|
+
:return: A ChapterDict with "id", "title", "content" and "extra", or None if no paragraph text was found.
|
102
|
+
"""
|
103
|
+
tree = etree.HTML(html_str, parser=None)
|
104
|
+
|
105
|
+
# 提取标题
|
106
|
+
title_elem = tree.xpath('//div[@class="bookname"]/h1')
|
107
|
+
title = "".join(title_elem[0].itertext()).strip() if title_elem else ""
|
108
|
+
if not title:
|
109
|
+
title = f"第 {chapter_id} 章"
|
110
|
+
|
111
|
+
# 提取内容
|
112
|
+
content_elem = tree.xpath('//div[@id="content"]')
|
113
|
+
paragraphs = content_elem[0].xpath(".//p") if content_elem else []
|
114
|
+
paragraph_texts = [
|
115
|
+
"".join(p.itertext()).strip() for p in paragraphs if p is not None
|
116
|
+
]
|
117
|
+
content = "\n\n".join([p for p in paragraph_texts if p])
|
118
|
+
if not content.strip():
|
119
|
+
return None
|
120
|
+
|
121
|
+
return {
|
122
|
+
"id": chapter_id,
|
123
|
+
"title": title,
|
124
|
+
"content": content,
|
125
|
+
"extra": {"site": "biquge"},
|
126
|
+
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.common
|
4
|
+
------------------------------------
|
6
5
|
|
7
6
|
This module provides a CommonParser class that implements
|
8
7
|
general-purpose parsing logic for extracting novel metadata
|
@@ -1,15 +1,15 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.common.helper
|
4
|
+
-------------------------------------------
|
6
5
|
|
7
6
|
Shared utility functions for parsing Common pages.
|
8
7
|
"""
|
9
8
|
|
10
9
|
import logging
|
11
10
|
import re
|
12
|
-
from
|
11
|
+
from collections.abc import Iterable, Iterator
|
12
|
+
from typing import Any, cast
|
13
13
|
|
14
14
|
from bs4 import BeautifulSoup, Tag
|
15
15
|
|
@@ -47,7 +47,7 @@ class HTMLExtractor:
|
|
47
47
|
self._html = html
|
48
48
|
self._soup = html_to_soup(html)
|
49
49
|
|
50
|
-
def extract_book_info(self, rules: BookInfoRules) ->
|
50
|
+
def extract_book_info(self, rules: BookInfoRules) -> dict[str, Any]:
|
51
51
|
"""
|
52
52
|
Extract structured book information from HTML according to the given rules.
|
53
53
|
|
@@ -56,7 +56,7 @@ class HTMLExtractor:
|
|
56
56
|
:param rules: Extraction configuration specifying how to extract.
|
57
57
|
:return: A dictionary containing extracted book information.
|
58
58
|
"""
|
59
|
-
book_info:
|
59
|
+
book_info: dict[str, Any] = {}
|
60
60
|
|
61
61
|
for field_name, field_rules in rules.items():
|
62
62
|
if field_rules is None:
|
@@ -72,7 +72,7 @@ class HTMLExtractor:
|
|
72
72
|
|
73
73
|
return book_info
|
74
74
|
|
75
|
-
def extract_field(self, steps:
|
75
|
+
def extract_field(self, steps: list[RuleStep]) -> str:
|
76
76
|
"""
|
77
77
|
Execute a list of extraction steps on the given HTML.
|
78
78
|
|
@@ -209,7 +209,7 @@ class HTMLExtractor:
|
|
209
209
|
return str(current.get_text().strip())
|
210
210
|
return str(current or "").strip()
|
211
211
|
|
212
|
-
def extract_mixed_volumes(self, volume_rule: VolumesRules) ->
|
212
|
+
def extract_mixed_volumes(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
|
213
213
|
"""
|
214
214
|
Special mode: mixed <volume> and <chapter> under same parent.
|
215
215
|
(e.g., dt / dd pattern in BiQuGe)
|
@@ -228,8 +228,8 @@ class HTMLExtractor:
|
|
228
228
|
"chapter_selector 和 volume_name_steps"
|
229
229
|
)
|
230
230
|
|
231
|
-
volumes:
|
232
|
-
current_volume:
|
231
|
+
volumes: list[dict[str, Any]] = []
|
232
|
+
current_volume: dict[str, Any] | None = None
|
233
233
|
if not chapter_steps_list:
|
234
234
|
chapter_steps_list = []
|
235
235
|
chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
|
@@ -258,7 +258,7 @@ class HTMLExtractor:
|
|
258
258
|
|
259
259
|
return volumes
|
260
260
|
|
261
|
-
def extract_volume_blocks(self, volume_rule: VolumesRules) ->
|
261
|
+
def extract_volume_blocks(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
|
262
262
|
volume_selector = volume_rule.get("volume_selector")
|
263
263
|
volume_name_steps = volume_rule.get("volume_name_steps")
|
264
264
|
chapter_selector = volume_rule["chapter_selector"]
|
@@ -285,7 +285,7 @@ class HTMLExtractor:
|
|
285
285
|
|
286
286
|
return volumes
|
287
287
|
|
288
|
-
def extract_flat_chapters(self, volume_rule: VolumesRules) ->
|
288
|
+
def extract_flat_chapters(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
|
289
289
|
chapter_selector = volume_rule["chapter_selector"]
|
290
290
|
chapter_steps_list = volume_rule["chapter_steps"]
|
291
291
|
volume_selector = volume_rule.get("volume_selector")
|
@@ -312,7 +312,7 @@ class HTMLExtractor:
|
|
312
312
|
|
313
313
|
def extract_volumes_structure(
|
314
314
|
self, volume_rule: VolumesRules
|
315
|
-
) ->
|
315
|
+
) -> list[dict[str, Any]]:
|
316
316
|
volume_mode = volume_rule.get("volume_mode", "normal")
|
317
317
|
if volume_mode == "mixed":
|
318
318
|
return self.extract_mixed_volumes(volume_rule)
|