novel-downloader 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -2
- novel_downloader/cli/__init__.py +0 -1
- novel_downloader/cli/clean.py +2 -10
- novel_downloader/cli/download.py +18 -22
- novel_downloader/cli/interactive.py +0 -1
- novel_downloader/cli/main.py +1 -3
- novel_downloader/cli/settings.py +8 -8
- novel_downloader/config/__init__.py +0 -1
- novel_downloader/config/adapter.py +48 -18
- novel_downloader/config/loader.py +116 -108
- novel_downloader/config/models.py +41 -32
- novel_downloader/config/site_rules.py +2 -4
- novel_downloader/core/__init__.py +0 -1
- novel_downloader/core/downloaders/__init__.py +4 -4
- novel_downloader/core/downloaders/base/__init__.py +14 -0
- novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
- novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
- novel_downloader/core/downloaders/biquge/__init__.py +12 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
- novel_downloader/core/downloaders/common/__init__.py +14 -0
- novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
- novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +34 -23
- novel_downloader/core/downloaders/qidian/__init__.py +10 -0
- novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +80 -64
- novel_downloader/core/factory/__init__.py +4 -5
- novel_downloader/core/factory/{downloader_factory.py → downloader.py} +36 -35
- novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
- novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
- novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
- novel_downloader/core/interfaces/__init__.py +8 -9
- novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
- novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +26 -12
- novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
- novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
- novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
- novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +34 -17
- novel_downloader/core/parsers/__init__.py +5 -4
- novel_downloader/core/parsers/{base_parser.py → base.py} +20 -11
- novel_downloader/core/parsers/biquge/__init__.py +10 -0
- novel_downloader/core/parsers/biquge/main_parser.py +126 -0
- novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
- novel_downloader/core/parsers/{common_parser → common}/helper.py +20 -18
- novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +41 -49
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +16 -12
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +37 -45
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +16 -12
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
- novel_downloader/core/parsers/qidian/shared/book_info_parser.py +150 -0
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +9 -10
- novel_downloader/core/requesters/__init__.py +9 -5
- novel_downloader/core/requesters/base/__init__.py +16 -0
- novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +180 -73
- novel_downloader/core/requesters/base/browser.py +340 -0
- novel_downloader/core/requesters/base/session.py +364 -0
- novel_downloader/core/requesters/biquge/__init__.py +12 -0
- novel_downloader/core/requesters/biquge/session.py +90 -0
- novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
- novel_downloader/core/requesters/common/async_session.py +96 -0
- novel_downloader/core/requesters/common/session.py +113 -0
- novel_downloader/core/requesters/qidian/__init__.py +21 -0
- novel_downloader/core/requesters/qidian/broswer.py +306 -0
- novel_downloader/core/requesters/qidian/session.py +287 -0
- novel_downloader/core/savers/__init__.py +5 -3
- novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
- novel_downloader/core/savers/biquge.py +25 -0
- novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
- novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +24 -52
- novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
- novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
- novel_downloader/core/savers/epub_utils/__init__.py +0 -1
- novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
- novel_downloader/core/savers/epub_utils/initializer.py +4 -5
- novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
- novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
- novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +9 -1
- novel_downloader/resources/config/settings.toml +88 -0
- novel_downloader/utils/cache.py +2 -2
- novel_downloader/utils/chapter_storage.py +340 -0
- novel_downloader/utils/constants.py +8 -5
- novel_downloader/utils/crypto_utils.py +3 -3
- novel_downloader/utils/file_utils/__init__.py +0 -1
- novel_downloader/utils/file_utils/io.py +12 -17
- novel_downloader/utils/file_utils/normalize.py +1 -3
- novel_downloader/utils/file_utils/sanitize.py +2 -9
- novel_downloader/utils/fontocr/__init__.py +0 -1
- novel_downloader/utils/fontocr/ocr_v1.py +19 -22
- novel_downloader/utils/fontocr/ocr_v2.py +147 -60
- novel_downloader/utils/hash_store.py +19 -20
- novel_downloader/utils/hash_utils.py +0 -1
- novel_downloader/utils/i18n.py +3 -4
- novel_downloader/utils/logger.py +5 -6
- novel_downloader/utils/model_loader.py +5 -8
- novel_downloader/utils/network.py +9 -10
- novel_downloader/utils/state.py +6 -7
- novel_downloader/utils/text_utils/__init__.py +0 -1
- novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
- novel_downloader/utils/text_utils/diff_display.py +0 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -4
- novel_downloader/utils/text_utils/text_cleaning.py +0 -1
- novel_downloader/utils/time_utils/__init__.py +0 -1
- novel_downloader/utils/time_utils/datetime_utils.py +9 -11
- novel_downloader/utils/time_utils/sleep_utils.py +27 -13
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/METADATA +14 -17
- novel_downloader-1.3.0.dist-info/RECORD +127 -0
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/WHEEL +1 -1
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +0 -95
- novel_downloader/core/requesters/base_browser.py +0 -210
- novel_downloader/core/requesters/base_session.py +0 -243
- novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
- novel_downloader/core/requesters/common_requester/common_session.py +0 -126
- novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -377
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
- novel_downloader/resources/config/settings.yaml +0 -76
- novel_downloader-1.2.1.dist-info/RECORD +0 -115
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
3
|
novel_downloader.core.interfaces
|
5
4
|
--------------------------------
|
@@ -15,18 +14,18 @@ Included protocols:
|
|
15
14
|
- SaverProtocol
|
16
15
|
"""
|
17
16
|
|
18
|
-
from .
|
19
|
-
from .
|
20
|
-
from .
|
21
|
-
from .
|
22
|
-
from .
|
23
|
-
from .
|
17
|
+
from .async_downloader import AsyncDownloaderProtocol
|
18
|
+
from .async_requester import AsyncRequesterProtocol
|
19
|
+
from .parser import ParserProtocol
|
20
|
+
from .saver import SaverProtocol
|
21
|
+
from .sync_downloader import SyncDownloaderProtocol
|
22
|
+
from .sync_requester import SyncRequesterProtocol
|
24
23
|
|
25
24
|
__all__ = [
|
26
25
|
"AsyncDownloaderProtocol",
|
27
26
|
"AsyncRequesterProtocol",
|
28
|
-
"DownloaderProtocol",
|
29
27
|
"ParserProtocol",
|
30
|
-
"RequesterProtocol",
|
31
28
|
"SaverProtocol",
|
29
|
+
"SyncDownloaderProtocol",
|
30
|
+
"SyncRequesterProtocol",
|
32
31
|
]
|
@@ -1,26 +1,25 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
3
|
+
novel_downloader.core.interfaces.async_downloader
|
5
4
|
----------------------------------------------------------
|
6
5
|
|
7
6
|
This module defines the AsyncDownloaderProtocol, a structural interface
|
8
7
|
that outlines the expected behavior of any downloader class.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import
|
10
|
+
from typing import Protocol
|
12
11
|
|
13
12
|
|
14
13
|
class AsyncDownloaderProtocol(Protocol):
|
15
14
|
"""
|
16
|
-
Protocol for fully
|
15
|
+
Protocol for fully-asynchronous downloader classes.
|
17
16
|
|
18
17
|
Defines the expected interface for any downloader implementation,
|
19
18
|
including both batch and single book downloads,
|
20
19
|
as well as optional pre-download hooks.
|
21
20
|
"""
|
22
21
|
|
23
|
-
async def download(self, book_ids:
|
22
|
+
async def download(self, book_ids: list[str]) -> None:
|
24
23
|
"""
|
25
24
|
Batch download entry point.
|
26
25
|
|
@@ -1,15 +1,14 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
3
|
+
novel_downloader.core.interfaces.async_requester
|
5
4
|
--------------------------------------------------------
|
6
5
|
|
7
6
|
Defines the AsyncRequesterProtocol interface for fetching raw HTML or JSON
|
8
7
|
for book info pages, individual chapters, managing request lifecycle,
|
9
|
-
and optionally retrieving a user's authenticated bookcase
|
8
|
+
and optionally retrieving a user's authenticated bookcase.
|
10
9
|
"""
|
11
10
|
|
12
|
-
from typing import
|
11
|
+
from typing import Any, Literal, Protocol, runtime_checkable
|
13
12
|
|
14
13
|
|
15
14
|
@runtime_checkable
|
@@ -21,7 +20,16 @@ class AsyncRequesterProtocol(Protocol):
|
|
21
20
|
and manage login/shutdown asynchronously.
|
22
21
|
"""
|
23
22
|
|
24
|
-
|
23
|
+
def is_async(self) -> Literal[True]:
|
24
|
+
...
|
25
|
+
|
26
|
+
async def login(
|
27
|
+
self,
|
28
|
+
username: str = "",
|
29
|
+
password: str = "",
|
30
|
+
manual_login: bool = False,
|
31
|
+
**kwargs: Any,
|
32
|
+
) -> bool:
|
25
33
|
"""
|
26
34
|
Attempt to log in asynchronously.
|
27
35
|
:returns: True if login succeeded.
|
@@ -29,41 +37,47 @@ class AsyncRequesterProtocol(Protocol):
|
|
29
37
|
...
|
30
38
|
|
31
39
|
async def get_book_info(
|
32
|
-
self,
|
40
|
+
self,
|
41
|
+
book_id: str,
|
42
|
+
**kwargs: Any,
|
33
43
|
) -> str:
|
34
44
|
"""
|
35
45
|
Fetch the raw HTML (or JSON) of the book info page asynchronously.
|
36
46
|
|
37
47
|
:param book_id: The book identifier.
|
38
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
39
48
|
:return: The page content as a string.
|
40
49
|
"""
|
41
50
|
...
|
42
51
|
|
43
52
|
async def get_book_chapter(
|
44
|
-
self,
|
53
|
+
self,
|
54
|
+
book_id: str,
|
55
|
+
chapter_id: str,
|
56
|
+
**kwargs: Any,
|
45
57
|
) -> str:
|
46
58
|
"""
|
47
59
|
Fetch the raw HTML (or JSON) of a single chapter asynchronously.
|
48
60
|
|
49
61
|
:param book_id: The book identifier.
|
50
62
|
:param chapter_id: The chapter identifier.
|
51
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
52
63
|
:return: The chapter content as a string.
|
53
64
|
"""
|
54
65
|
...
|
55
66
|
|
56
|
-
async def get_bookcase(
|
67
|
+
async def get_bookcase(
|
68
|
+
self,
|
69
|
+
page: int = 1,
|
70
|
+
**kwargs: Any,
|
71
|
+
) -> str:
|
57
72
|
"""
|
58
73
|
Optional: Retrieve the HTML content of the authenticated
|
59
74
|
user's bookcase page asynchronously.
|
60
75
|
|
61
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
62
76
|
:return: The HTML markup of the bookcase page.
|
63
77
|
"""
|
64
78
|
...
|
65
79
|
|
66
|
-
async def
|
80
|
+
async def close(self) -> None:
|
67
81
|
"""
|
68
82
|
Shutdown and clean up any resources (e.g., close aiohttp session).
|
69
83
|
"""
|
@@ -1,14 +1,15 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.parser
|
4
|
+
---------------------------------------
|
6
5
|
|
7
6
|
Defines the ParserProtocol interface for extracting book metadata,
|
8
7
|
parsing individual chapter content, and setting parser context via book_id.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import Any,
|
10
|
+
from typing import Any, Protocol, runtime_checkable
|
11
|
+
|
12
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
12
13
|
|
13
14
|
|
14
15
|
@runtime_checkable
|
@@ -20,7 +21,7 @@ class ParserProtocol(Protocol):
|
|
20
21
|
- accept a book_id context for multi-step workflows.
|
21
22
|
"""
|
22
23
|
|
23
|
-
def parse_book_info(self, html_str: str) ->
|
24
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
24
25
|
"""
|
25
26
|
Parse and return a dictionary of book information from the raw HTML.
|
26
27
|
|
@@ -29,7 +30,11 @@ class ParserProtocol(Protocol):
|
|
29
30
|
"""
|
30
31
|
...
|
31
32
|
|
32
|
-
def parse_chapter(
|
33
|
+
def parse_chapter(
|
34
|
+
self,
|
35
|
+
html_str: str,
|
36
|
+
chapter_id: str,
|
37
|
+
) -> ChapterDict | None:
|
33
38
|
"""
|
34
39
|
Parse and return the text content of one chapter.
|
35
40
|
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.saver
|
4
|
+
--------------------------------------
|
6
5
|
|
7
6
|
Defines the SaverProtocol interface for persisting completed books in
|
8
7
|
TXT, EPUB, Markdown, and PDF formats.
|
@@ -1,17 +1,16 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.sync_downloader
|
4
|
+
------------------------------------------------
|
6
5
|
|
7
|
-
This module defines the
|
6
|
+
This module defines the SyncDownloaderProtocol, a structural interface
|
8
7
|
that outlines the expected behavior of any downloader class.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import
|
10
|
+
from typing import Protocol
|
12
11
|
|
13
12
|
|
14
|
-
class
|
13
|
+
class SyncDownloaderProtocol(Protocol):
|
15
14
|
"""
|
16
15
|
Protocol for downloader classes.
|
17
16
|
|
@@ -20,7 +19,7 @@ class DownloaderProtocol(Protocol):
|
|
20
19
|
as well as optional pre-download hooks.
|
21
20
|
"""
|
22
21
|
|
23
|
-
def download(self, book_ids:
|
22
|
+
def download(self, book_ids: list[str]) -> None:
|
24
23
|
"""
|
25
24
|
Batch download entry point.
|
26
25
|
|
@@ -1,65 +1,82 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.interfaces.
|
5
|
-
|
3
|
+
novel_downloader.core.interfaces.sync_requester
|
4
|
+
-----------------------------------------------
|
6
5
|
|
7
6
|
Defines the SyncRequesterProtocol interface for fetching raw HTML or JSON
|
8
7
|
for book info pages, individual chapters, managing request lifecycle,
|
9
8
|
and optionally retrieving a user's authenticated bookcase.
|
10
9
|
"""
|
11
10
|
|
12
|
-
from typing import
|
11
|
+
from typing import Any, Literal, Protocol, runtime_checkable
|
13
12
|
|
14
13
|
|
15
14
|
@runtime_checkable
|
16
|
-
class
|
15
|
+
class SyncRequesterProtocol(Protocol):
|
17
16
|
"""
|
18
17
|
A requester must be able to fetch raw HTML/data for:
|
19
18
|
- a book's info page,
|
20
19
|
- a specific chapter page.
|
21
20
|
"""
|
22
21
|
|
23
|
-
def
|
22
|
+
def is_async(self) -> Literal[False]:
|
23
|
+
...
|
24
|
+
|
25
|
+
def login(
|
26
|
+
self,
|
27
|
+
username: str = "",
|
28
|
+
password: str = "",
|
29
|
+
manual_login: bool = False,
|
30
|
+
**kwargs: Any,
|
31
|
+
) -> bool:
|
24
32
|
"""
|
25
33
|
Attempt to log in
|
26
34
|
"""
|
27
35
|
...
|
28
36
|
|
29
|
-
def get_book_info(
|
37
|
+
def get_book_info(
|
38
|
+
self,
|
39
|
+
book_id: str,
|
40
|
+
**kwargs: Any,
|
41
|
+
) -> str:
|
30
42
|
"""
|
31
43
|
Fetch the raw HTML (or JSON) of the book info page.
|
32
44
|
|
33
45
|
:param book_id: The book identifier.
|
34
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
35
46
|
:return: The page content as a string.
|
36
47
|
"""
|
37
48
|
...
|
38
49
|
|
39
50
|
def get_book_chapter(
|
40
|
-
self,
|
51
|
+
self,
|
52
|
+
book_id: str,
|
53
|
+
chapter_id: str,
|
54
|
+
**kwargs: Any,
|
41
55
|
) -> str:
|
42
56
|
"""
|
43
57
|
Fetch the raw HTML (or JSON) of a single chapter.
|
44
58
|
|
45
59
|
:param book_id: The book identifier.
|
46
60
|
:param chapter_id: The chapter identifier.
|
47
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
48
61
|
:return: The chapter content as a string.
|
49
62
|
"""
|
50
63
|
...
|
51
64
|
|
52
|
-
def
|
65
|
+
def get_bookcase(
|
66
|
+
self,
|
67
|
+
page: int = 1,
|
68
|
+
**kwargs: Any,
|
69
|
+
) -> str:
|
53
70
|
"""
|
54
|
-
|
71
|
+
Optional: Retrieve the HTML content of the authenticated user's bookcase page.
|
72
|
+
|
73
|
+
:param page: Index of the bookcase page to fetch.
|
74
|
+
:return: The HTML markup of the bookcase page.
|
55
75
|
"""
|
56
76
|
...
|
57
77
|
|
58
|
-
def
|
78
|
+
def close(self) -> None:
|
59
79
|
"""
|
60
|
-
|
61
|
-
|
62
|
-
:param wait_time: Base number of seconds to wait before returning content.
|
63
|
-
:return: The HTML markup of the bookcase page.
|
80
|
+
Shut down and clean up resources.
|
64
81
|
"""
|
65
82
|
...
|
@@ -1,5 +1,4 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
3
|
novel_downloader.core.parsers
|
5
4
|
-----------------------------
|
@@ -8,20 +7,22 @@ This package defines all site-specific parsing modules
|
|
8
7
|
for the novel_downloader framework.
|
9
8
|
|
10
9
|
Currently supported:
|
11
|
-
- Qidian (起点中文网)
|
10
|
+
- Qidian (起点中文网)
|
12
11
|
|
13
12
|
Modules:
|
14
13
|
- qidian
|
15
14
|
- common
|
16
15
|
"""
|
17
16
|
|
18
|
-
from .
|
19
|
-
from .
|
17
|
+
from .biquge import BiqugeParser
|
18
|
+
from .common import CommonParser
|
19
|
+
from .qidian import (
|
20
20
|
QidianBrowserParser,
|
21
21
|
QidianSessionParser,
|
22
22
|
)
|
23
23
|
|
24
24
|
__all__ = [
|
25
|
+
"BiqugeParser",
|
25
26
|
"CommonParser",
|
26
27
|
"QidianBrowserParser",
|
27
28
|
"QidianSessionParser",
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.base
|
4
|
+
----------------------------------
|
6
5
|
|
7
6
|
This module defines the BaseParser abstract class, which implements the
|
8
7
|
ParserProtocol interface and provides a structured foundation for
|
@@ -16,10 +15,11 @@ a standard parsing interface for:
|
|
16
15
|
|
17
16
|
import abc
|
18
17
|
from pathlib import Path
|
19
|
-
from typing import Any
|
18
|
+
from typing import Any
|
20
19
|
|
21
20
|
from novel_downloader.config import ParserConfig
|
22
21
|
from novel_downloader.core.interfaces import ParserProtocol
|
22
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
23
23
|
|
24
24
|
|
25
25
|
class BaseParser(ParserProtocol, abc.ABC):
|
@@ -33,43 +33,51 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
33
33
|
Subclasses must implement actual parsing logic for specific sites.
|
34
34
|
"""
|
35
35
|
|
36
|
-
def __init__(
|
36
|
+
def __init__(
|
37
|
+
self,
|
38
|
+
config: ParserConfig,
|
39
|
+
):
|
37
40
|
"""
|
38
41
|
Initialize the parser with a configuration object.
|
39
42
|
|
40
43
|
:param config: ParserConfig object controlling parsing behavior.
|
41
44
|
"""
|
42
45
|
self._config = config
|
43
|
-
self._book_id:
|
46
|
+
self._book_id: str | None = None
|
44
47
|
|
45
48
|
self._base_cache_dir = Path(config.cache_dir)
|
49
|
+
self._cache_dir = self._base_cache_dir
|
46
50
|
|
47
51
|
@abc.abstractmethod
|
48
|
-
def parse_book_info(self,
|
52
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
49
53
|
"""
|
50
54
|
Parse a book info page and extract metadata and chapter structure.
|
51
55
|
|
52
56
|
Depending on the site structure, the return dict may include a
|
53
57
|
flat `chapters` list or nested `volumes` with chapter groups.
|
54
58
|
|
55
|
-
:param
|
59
|
+
:param html_str: Raw HTML of the book info page.
|
56
60
|
:return: Parsed metadata and chapter structure as a dictionary.
|
57
61
|
"""
|
58
62
|
...
|
59
63
|
|
60
64
|
@abc.abstractmethod
|
61
|
-
def parse_chapter(
|
65
|
+
def parse_chapter(
|
66
|
+
self,
|
67
|
+
html_str: str,
|
68
|
+
chapter_id: str,
|
69
|
+
) -> ChapterDict | None:
|
62
70
|
"""
|
63
71
|
Parse a single chapter page and extract clean text or simplified HTML.
|
64
72
|
|
65
|
-
:param
|
73
|
+
:param html_str: Raw HTML of the chapter page.
|
66
74
|
:param chapter_id: Identifier of the chapter being parsed.
|
67
75
|
:return: Parsed chapter data as a ChapterDict, or None when no chapter is produced.
|
68
76
|
"""
|
69
77
|
...
|
70
78
|
|
71
79
|
@property
|
72
|
-
def book_id(self) ->
|
80
|
+
def book_id(self) -> str | None:
|
73
81
|
"""
|
74
82
|
Current book ID in context.
|
75
83
|
|
@@ -85,6 +93,7 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
85
93
|
:param value: Book identifier.
|
86
94
|
"""
|
87
95
|
self._book_id = value
|
96
|
+
self._cache_dir = self._base_cache_dir / value
|
88
97
|
self._on_book_id_set()
|
89
98
|
|
90
99
|
def _on_book_id_set(self) -> None:
|
@@ -0,0 +1,126 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.biquge.main_parser
|
4
|
+
------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
import re
|
9
|
+
from typing import Any
|
10
|
+
|
11
|
+
from lxml import etree
|
12
|
+
from lxml.etree import _Element
|
13
|
+
|
14
|
+
from novel_downloader.core.parsers.base import BaseParser
|
15
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
16
|
+
|
17
|
+
|
18
|
+
class BiqugeParser(BaseParser):
|
19
|
+
"""Parser for Biquge book info pages and chapter pages."""
|
20
|
+
|
21
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
22
|
+
"""
|
23
|
+
Parse a book info page and extract metadata and chapter structure.
|
24
|
+
|
25
|
+
:param html_str: Raw HTML of the book info page.
|
26
|
+
:return: Parsed metadata and chapter structure as a dictionary.
|
27
|
+
"""
|
28
|
+
tree = etree.HTML(html_str, parser=None)
|
29
|
+
result: dict[str, Any] = {}
|
30
|
+
|
31
|
+
def extract_text(elem: _Element | None) -> str:
|
32
|
+
if elem is None:
|
33
|
+
return ""
|
34
|
+
return "".join(elem.itertext(tag=None)).strip()
|
35
|
+
|
36
|
+
# 书名
|
37
|
+
book_name_elem = tree.xpath('//div[@id="info"]/h1')
|
38
|
+
result["book_name"] = extract_text(book_name_elem[0]) if book_name_elem else ""
|
39
|
+
|
40
|
+
# 作者
|
41
|
+
author_elem = tree.xpath('//div[@id="info"]/p[1]')
|
42
|
+
if author_elem:
|
43
|
+
author_text = extract_text(author_elem[0]).replace("\u00a0", "")
|
44
|
+
match = re.search(r"作\s*者[::]?\s*(\S+)", author_text)
|
45
|
+
result["author"] = match.group(1).strip() if match else ""
|
46
|
+
else:
|
47
|
+
result["author"] = ""
|
48
|
+
|
49
|
+
# 封面
|
50
|
+
cover_elem = tree.xpath('//div[@id="fmimg"]/img/@src')
|
51
|
+
result["cover_url"] = cover_elem[0].strip() if cover_elem else ""
|
52
|
+
|
53
|
+
# 最后更新时间
|
54
|
+
update_elem = tree.xpath('//div[@id="info"]/p[3]')
|
55
|
+
if update_elem:
|
56
|
+
update_text = extract_text(update_elem[0])
|
57
|
+
match = re.search(r"最后更新[::]\s*(\S+)", update_text)
|
58
|
+
result["update_time"] = match.group(1).strip() if match else ""
|
59
|
+
else:
|
60
|
+
result["update_time"] = ""
|
61
|
+
|
62
|
+
# 简介
|
63
|
+
intro_elem = tree.xpath('//div[@id="intro"]')
|
64
|
+
result["summary"] = extract_text(intro_elem[0]) if intro_elem else ""
|
65
|
+
|
66
|
+
# 卷和章节
|
67
|
+
chapters = []
|
68
|
+
in_main_volume = False
|
69
|
+
|
70
|
+
list_dl = tree.xpath('//div[@id="list"]/dl')[0]
|
71
|
+
for elem in list_dl:
|
72
|
+
if elem.tag == "dt":
|
73
|
+
text = "".join(elem.itertext()).strip()
|
74
|
+
in_main_volume = "正文" in text
|
75
|
+
elif in_main_volume and elem.tag == "dd":
|
76
|
+
a: list[_Element] = elem.xpath("./a")
|
77
|
+
if a:
|
78
|
+
title = "".join(a[0].itertext(tag=None)).strip()
|
79
|
+
url = a[0].get("href", "").strip()
|
80
|
+
href_cleaned = url.replace(".html", "")
|
81
|
+
chapter_id_match = re.search(r"/(\d+)$", href_cleaned)
|
82
|
+
chapter_id = chapter_id_match.group(1) if chapter_id_match else ""
|
83
|
+
chapters.append(
|
84
|
+
{"title": title, "url": url, "chapterId": chapter_id}
|
85
|
+
)
|
86
|
+
|
87
|
+
result["volumes"] = [{"volume_name": "正文", "chapters": chapters}]
|
88
|
+
|
89
|
+
return result
|
90
|
+
|
91
|
+
def parse_chapter(
|
92
|
+
self,
|
93
|
+
html_str: str,
|
94
|
+
chapter_id: str,
|
95
|
+
) -> ChapterDict | None:
|
96
|
+
"""
|
97
|
+
Parse a single chapter page and extract clean text or simplified HTML.
|
98
|
+
|
99
|
+
:param html_str: Raw HTML of the chapter page.
|
100
|
+
:param chapter_id: Identifier of the chapter being parsed.
|
101
|
+
:return: Chapter dict with id, title, content and extra, or None if no content was found.
|
102
|
+
"""
|
103
|
+
tree = etree.HTML(html_str, parser=None)
|
104
|
+
|
105
|
+
# 提取标题
|
106
|
+
title_elem = tree.xpath('//div[@class="bookname"]/h1')
|
107
|
+
title = "".join(title_elem[0].itertext()).strip() if title_elem else ""
|
108
|
+
if not title:
|
109
|
+
title = f"第 {chapter_id} 章"
|
110
|
+
|
111
|
+
# 提取内容
|
112
|
+
content_elem = tree.xpath('//div[@id="content"]')
|
113
|
+
paragraphs = content_elem[0].xpath(".//p") if content_elem else []
|
114
|
+
paragraph_texts = [
|
115
|
+
"".join(p.itertext()).strip() for p in paragraphs if p is not None
|
116
|
+
]
|
117
|
+
content = "\n\n".join([p for p in paragraph_texts if p])
|
118
|
+
if not content.strip():
|
119
|
+
return None
|
120
|
+
|
121
|
+
return {
|
122
|
+
"id": chapter_id,
|
123
|
+
"title": title,
|
124
|
+
"content": content,
|
125
|
+
"extra": {"site": "biquge"},
|
126
|
+
}
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.common
|
4
|
+
------------------------------------
|
6
5
|
|
7
6
|
This module provides a CommonParser class that implements
|
8
7
|
general-purpose parsing logic for extracting novel metadata
|