novel-downloader 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -2
- novel_downloader/cli/__init__.py +0 -1
- novel_downloader/cli/clean.py +2 -10
- novel_downloader/cli/download.py +18 -22
- novel_downloader/cli/interactive.py +0 -1
- novel_downloader/cli/main.py +1 -3
- novel_downloader/cli/settings.py +8 -8
- novel_downloader/config/__init__.py +0 -1
- novel_downloader/config/adapter.py +48 -18
- novel_downloader/config/loader.py +116 -108
- novel_downloader/config/models.py +41 -32
- novel_downloader/config/site_rules.py +2 -4
- novel_downloader/core/__init__.py +0 -1
- novel_downloader/core/downloaders/__init__.py +4 -4
- novel_downloader/core/downloaders/base/__init__.py +14 -0
- novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
- novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
- novel_downloader/core/downloaders/biquge/__init__.py +12 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
- novel_downloader/core/downloaders/common/__init__.py +14 -0
- novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
- novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +34 -23
- novel_downloader/core/downloaders/qidian/__init__.py +10 -0
- novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +80 -64
- novel_downloader/core/factory/__init__.py +4 -5
- novel_downloader/core/factory/{downloader_factory.py → downloader.py} +36 -35
- novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
- novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
- novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
- novel_downloader/core/interfaces/__init__.py +8 -9
- novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
- novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +26 -12
- novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
- novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
- novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
- novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +34 -17
- novel_downloader/core/parsers/__init__.py +5 -4
- novel_downloader/core/parsers/{base_parser.py → base.py} +20 -11
- novel_downloader/core/parsers/biquge/__init__.py +10 -0
- novel_downloader/core/parsers/biquge/main_parser.py +126 -0
- novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
- novel_downloader/core/parsers/{common_parser → common}/helper.py +20 -18
- novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +41 -49
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +16 -12
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +37 -45
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +16 -12
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
- novel_downloader/core/parsers/qidian/shared/book_info_parser.py +150 -0
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +9 -10
- novel_downloader/core/requesters/__init__.py +9 -5
- novel_downloader/core/requesters/base/__init__.py +16 -0
- novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +180 -73
- novel_downloader/core/requesters/base/browser.py +340 -0
- novel_downloader/core/requesters/base/session.py +364 -0
- novel_downloader/core/requesters/biquge/__init__.py +12 -0
- novel_downloader/core/requesters/biquge/session.py +90 -0
- novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
- novel_downloader/core/requesters/common/async_session.py +96 -0
- novel_downloader/core/requesters/common/session.py +113 -0
- novel_downloader/core/requesters/qidian/__init__.py +21 -0
- novel_downloader/core/requesters/qidian/broswer.py +306 -0
- novel_downloader/core/requesters/qidian/session.py +287 -0
- novel_downloader/core/savers/__init__.py +5 -3
- novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
- novel_downloader/core/savers/biquge.py +25 -0
- novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
- novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +24 -52
- novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
- novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
- novel_downloader/core/savers/epub_utils/__init__.py +0 -1
- novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
- novel_downloader/core/savers/epub_utils/initializer.py +4 -5
- novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
- novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
- novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +9 -1
- novel_downloader/resources/config/settings.toml +88 -0
- novel_downloader/utils/cache.py +2 -2
- novel_downloader/utils/chapter_storage.py +340 -0
- novel_downloader/utils/constants.py +8 -5
- novel_downloader/utils/crypto_utils.py +3 -3
- novel_downloader/utils/file_utils/__init__.py +0 -1
- novel_downloader/utils/file_utils/io.py +12 -17
- novel_downloader/utils/file_utils/normalize.py +1 -3
- novel_downloader/utils/file_utils/sanitize.py +2 -9
- novel_downloader/utils/fontocr/__init__.py +0 -1
- novel_downloader/utils/fontocr/ocr_v1.py +19 -22
- novel_downloader/utils/fontocr/ocr_v2.py +147 -60
- novel_downloader/utils/hash_store.py +19 -20
- novel_downloader/utils/hash_utils.py +0 -1
- novel_downloader/utils/i18n.py +3 -4
- novel_downloader/utils/logger.py +5 -6
- novel_downloader/utils/model_loader.py +5 -8
- novel_downloader/utils/network.py +9 -10
- novel_downloader/utils/state.py +6 -7
- novel_downloader/utils/text_utils/__init__.py +0 -1
- novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
- novel_downloader/utils/text_utils/diff_display.py +0 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -4
- novel_downloader/utils/text_utils/text_cleaning.py +0 -1
- novel_downloader/utils/time_utils/__init__.py +0 -1
- novel_downloader/utils/time_utils/datetime_utils.py +9 -11
- novel_downloader/utils/time_utils/sleep_utils.py +27 -13
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/METADATA +14 -17
- novel_downloader-1.3.0.dist-info/RECORD +127 -0
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/WHEEL +1 -1
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +0 -95
- novel_downloader/core/requesters/base_browser.py +0 -210
- novel_downloader/core/requesters/base_session.py +0 -243
- novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
- novel_downloader/core/requesters/common_requester/common_session.py +0 -126
- novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -377
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
- novel_downloader/resources/config/settings.yaml +0 -76
- novel_downloader-1.2.1.dist-info/RECORD +0 -115
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.1.dist-info → novel_downloader-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,15 +1,15 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.common.helpers
|
4
|
+
--------------------------------------------
|
6
5
|
|
7
6
|
Shared utility functions for parsing Common pages.
|
8
7
|
"""
|
9
8
|
|
10
9
|
import logging
|
11
10
|
import re
|
12
|
-
from
|
11
|
+
from collections.abc import Iterable, Iterator
|
12
|
+
from typing import Any, cast
|
13
13
|
|
14
14
|
from bs4 import BeautifulSoup, Tag
|
15
15
|
|
@@ -47,7 +47,7 @@ class HTMLExtractor:
|
|
47
47
|
self._html = html
|
48
48
|
self._soup = html_to_soup(html)
|
49
49
|
|
50
|
-
def extract_book_info(self, rules: BookInfoRules) ->
|
50
|
+
def extract_book_info(self, rules: BookInfoRules) -> dict[str, Any]:
|
51
51
|
"""
|
52
52
|
Extract structured book information from HTML according to the given rules.
|
53
53
|
|
@@ -56,7 +56,7 @@ class HTMLExtractor:
|
|
56
56
|
:param rules: Extraction configuration specifying how to extract.
|
57
57
|
:return: A dictionary containing extracted book information.
|
58
58
|
"""
|
59
|
-
book_info:
|
59
|
+
book_info: dict[str, Any] = {}
|
60
60
|
|
61
61
|
for field_name, field_rules in rules.items():
|
62
62
|
if field_rules is None:
|
@@ -72,7 +72,7 @@ class HTMLExtractor:
|
|
72
72
|
|
73
73
|
return book_info
|
74
74
|
|
75
|
-
def extract_field(self, steps:
|
75
|
+
def extract_field(self, steps: list[RuleStep]) -> str:
|
76
76
|
"""
|
77
77
|
Execute a list of extraction steps on the given HTML.
|
78
78
|
|
@@ -188,7 +188,7 @@ class HTMLExtractor:
|
|
188
188
|
current = sep.join(current)
|
189
189
|
|
190
190
|
elif t == "attr":
|
191
|
-
name = step.get("attr")
|
191
|
+
name = step.get("attr") or ""
|
192
192
|
if isinstance(current, list):
|
193
193
|
current = [elem.get(name, "") for elem in current]
|
194
194
|
elif isinstance(current, Tag):
|
@@ -209,16 +209,16 @@ class HTMLExtractor:
|
|
209
209
|
return str(current.get_text().strip())
|
210
210
|
return str(current or "").strip()
|
211
211
|
|
212
|
-
def extract_mixed_volumes(self, volume_rule: VolumesRules) ->
|
212
|
+
def extract_mixed_volumes(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
|
213
213
|
"""
|
214
214
|
Special mode: mixed <volume> and <chapter> under same parent.
|
215
215
|
(e.g., dt / dd pattern in BiQuGe)
|
216
216
|
"""
|
217
217
|
list_selector = volume_rule.get("list_selector")
|
218
218
|
volume_selector = volume_rule.get("volume_selector")
|
219
|
-
chapter_selector = volume_rule.get("chapter_selector")
|
220
219
|
volume_name_steps = volume_rule.get("volume_name_steps")
|
221
|
-
|
220
|
+
chapter_selector = volume_rule["chapter_selector"]
|
221
|
+
chapter_steps_list = volume_rule["chapter_steps"]
|
222
222
|
|
223
223
|
if not (
|
224
224
|
list_selector and volume_selector and chapter_selector and volume_name_steps
|
@@ -228,8 +228,8 @@ class HTMLExtractor:
|
|
228
228
|
"chapter_selector 和 volume_name_steps"
|
229
229
|
)
|
230
230
|
|
231
|
-
volumes:
|
232
|
-
current_volume:
|
231
|
+
volumes: list[dict[str, Any]] = []
|
232
|
+
current_volume: dict[str, Any] | None = None
|
233
233
|
if not chapter_steps_list:
|
234
234
|
chapter_steps_list = []
|
235
235
|
chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
|
@@ -241,6 +241,8 @@ class HTMLExtractor:
|
|
241
241
|
for elem in list_area.find_all(
|
242
242
|
[volume_selector, chapter_selector], recursive=True
|
243
243
|
):
|
244
|
+
if not isinstance(elem, Tag):
|
245
|
+
continue
|
244
246
|
if elem.name == volume_selector:
|
245
247
|
extractor = HTMLExtractor(str(elem))
|
246
248
|
volume_name = extractor.extract_field(volume_name_steps)
|
@@ -256,10 +258,10 @@ class HTMLExtractor:
|
|
256
258
|
|
257
259
|
return volumes
|
258
260
|
|
259
|
-
def extract_volume_blocks(self, volume_rule: VolumesRules) ->
|
260
|
-
volume_selector = volume_rule
|
261
|
+
def extract_volume_blocks(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
|
262
|
+
volume_selector = volume_rule.get("volume_selector")
|
263
|
+
volume_name_steps = volume_rule.get("volume_name_steps")
|
261
264
|
chapter_selector = volume_rule["chapter_selector"]
|
262
|
-
volume_name_steps = volume_rule["volume_name_steps"]
|
263
265
|
chapter_steps_list = volume_rule["chapter_steps"]
|
264
266
|
if not (volume_selector and volume_name_steps):
|
265
267
|
raise ValueError(
|
@@ -283,7 +285,7 @@ class HTMLExtractor:
|
|
283
285
|
|
284
286
|
return volumes
|
285
287
|
|
286
|
-
def extract_flat_chapters(self, volume_rule: VolumesRules) ->
|
288
|
+
def extract_flat_chapters(self, volume_rule: VolumesRules) -> list[dict[str, Any]]:
|
287
289
|
chapter_selector = volume_rule["chapter_selector"]
|
288
290
|
chapter_steps_list = volume_rule["chapter_steps"]
|
289
291
|
volume_selector = volume_rule.get("volume_selector")
|
@@ -310,7 +312,7 @@ class HTMLExtractor:
|
|
310
312
|
|
311
313
|
def extract_volumes_structure(
|
312
314
|
self, volume_rule: VolumesRules
|
313
|
-
) ->
|
315
|
+
) -> list[dict[str, Any]]:
|
314
316
|
volume_mode = volume_rule.get("volume_mode", "normal")
|
315
317
|
if volume_mode == "mixed":
|
316
318
|
return self.extract_mixed_volumes(volume_rule)
|
@@ -1,18 +1,18 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.common.main_parser
|
4
|
+
------------------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing components for handling
|
8
7
|
Common pages.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import Any
|
10
|
+
from typing import Any
|
12
11
|
|
13
12
|
from novel_downloader.config import ParserConfig, SiteRules
|
13
|
+
from novel_downloader.core.parsers.base import BaseParser
|
14
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
14
15
|
|
15
|
-
from ..base_parser import BaseParser
|
16
16
|
from .helper import HTMLExtractor
|
17
17
|
|
18
18
|
|
@@ -35,7 +35,7 @@ class CommonParser(BaseParser):
|
|
35
35
|
self._site = site
|
36
36
|
self._site_rule = site_rule
|
37
37
|
|
38
|
-
def parse_book_info(self, html_str: str) ->
|
38
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
39
39
|
"""
|
40
40
|
Parse a book info page and extract metadata and chapter structure.
|
41
41
|
|
@@ -46,7 +46,11 @@ class CommonParser(BaseParser):
|
|
46
46
|
rules = self._site_rule["book_info"]
|
47
47
|
return extractor.extract_book_info(rules)
|
48
48
|
|
49
|
-
def parse_chapter(
|
49
|
+
def parse_chapter(
|
50
|
+
self,
|
51
|
+
html_str: str,
|
52
|
+
chapter_id: str,
|
53
|
+
) -> ChapterDict | None:
|
50
54
|
"""
|
51
55
|
Parse a single chapter page and extract clean text or simplified HTML.
|
52
56
|
|
@@ -66,13 +70,15 @@ class CommonParser(BaseParser):
|
|
66
70
|
title = extractor.extract_field(title_steps["steps"]) if title_steps else ""
|
67
71
|
content = extractor.extract_field(content_steps["steps"])
|
68
72
|
if not content:
|
69
|
-
return
|
73
|
+
return None
|
70
74
|
|
71
75
|
return {
|
72
76
|
"id": chapter_id,
|
73
77
|
"title": title or "Untitled",
|
74
78
|
"content": content,
|
75
|
-
"
|
79
|
+
"extra": {
|
80
|
+
"site": self._site,
|
81
|
+
},
|
76
82
|
}
|
77
83
|
|
78
84
|
@property
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian
|
4
|
+
------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing implementations for the Qidian platform.
|
8
7
|
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser
|
4
|
+
--------------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing components for handling Qidian
|
8
7
|
pages that have been rendered by a browser engine.
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.chapter_encrypted
|
4
|
+
--------------------------------------------------------------
|
6
5
|
|
7
6
|
Support for parsing encrypted chapters from Qidian using font OCR mapping,
|
8
7
|
CSS rules, and custom rendering logic.
|
@@ -19,11 +18,12 @@ from __future__ import annotations
|
|
19
18
|
import json
|
20
19
|
import logging
|
21
20
|
from pathlib import Path
|
22
|
-
from typing import TYPE_CHECKING, Any
|
21
|
+
from typing import TYPE_CHECKING, Any
|
23
22
|
|
24
23
|
import tinycss2
|
25
24
|
from bs4 import BeautifulSoup, Tag
|
26
25
|
|
26
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
27
27
|
from novel_downloader.utils.network import download_font_file
|
28
28
|
from novel_downloader.utils.text_utils import apply_font_mapping
|
29
29
|
|
@@ -43,7 +43,7 @@ def parse_encrypted_chapter(
|
|
43
43
|
parser: QidianBrowserParser,
|
44
44
|
soup: BeautifulSoup,
|
45
45
|
chapter_id: str,
|
46
|
-
) ->
|
46
|
+
) -> ChapterDict | None:
|
47
47
|
"""
|
48
48
|
Extract and return the formatted textual content of an encrypted chapter.
|
49
49
|
|
@@ -61,15 +61,15 @@ def parse_encrypted_chapter(
|
|
61
61
|
"""
|
62
62
|
try:
|
63
63
|
if not (parser._decode_font and parser._font_ocr):
|
64
|
-
return
|
64
|
+
return None
|
65
65
|
ssr_data = find_ssr_page_context(soup)
|
66
66
|
chapter_info = extract_chapter_info(ssr_data)
|
67
67
|
if not chapter_info:
|
68
68
|
logger.warning(
|
69
69
|
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
70
70
|
)
|
71
|
-
return
|
72
|
-
debug_base_dir:
|
71
|
+
return None
|
72
|
+
debug_base_dir: Path | None = None
|
73
73
|
if parser._font_debug_dir:
|
74
74
|
debug_base_dir = parser._font_debug_dir / chapter_id
|
75
75
|
debug_base_dir.mkdir(parents=True, exist_ok=True)
|
@@ -85,10 +85,7 @@ def parse_encrypted_chapter(
|
|
85
85
|
update_timestamp = chapter_info.get("updateTimestamp", 0)
|
86
86
|
modify_time = chapter_info.get("modifyTime", 0)
|
87
87
|
word_count = chapter_info.get("wordsCount", 0)
|
88
|
-
vip = bool(chapter_info.get("vipStatus", 0))
|
89
|
-
is_buy = bool(chapter_info.get("isBuy", 0))
|
90
88
|
seq = chapter_info.get("seq", None)
|
91
|
-
order = chapter_info.get("chapterOrder", None)
|
92
89
|
volume = chapter_info.get("extra", {}).get("volumeName", "")
|
93
90
|
|
94
91
|
# extract + save font
|
@@ -133,7 +130,7 @@ def parse_encrypted_chapter(
|
|
133
130
|
logger.warning(
|
134
131
|
f"[Parser] No end_number found after parsing chapter '{chapter_id}'"
|
135
132
|
)
|
136
|
-
return
|
133
|
+
return None
|
137
134
|
|
138
135
|
paragraphs_str, refl_list = render_paragraphs(
|
139
136
|
main_paragraphs, paragraphs_rules, end_number
|
@@ -143,7 +140,7 @@ def parse_encrypted_chapter(
|
|
143
140
|
paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
|
144
141
|
|
145
142
|
# Run OCR + fallback mapping
|
146
|
-
char_set =
|
143
|
+
char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
|
147
144
|
refl_set = set(refl_list)
|
148
145
|
char_set = char_set - refl_set
|
149
146
|
if debug_base_dir:
|
@@ -174,33 +171,31 @@ def parse_encrypted_chapter(
|
|
174
171
|
final_paragraphs_str = "\n\n".join(
|
175
172
|
line.strip() for line in original_text.splitlines() if line.strip()
|
176
173
|
)
|
177
|
-
|
174
|
+
return {
|
178
175
|
"id": str(chapter_id),
|
179
176
|
"title": title,
|
180
177
|
"content": final_paragraphs_str,
|
181
|
-
"
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
"volume": volume,
|
178
|
+
"extra": {
|
179
|
+
"author_say": author_say.strip() if author_say else "",
|
180
|
+
"updated_at": update_time,
|
181
|
+
"update_timestamp": update_timestamp,
|
182
|
+
"modify_time": modify_time,
|
183
|
+
"word_count": word_count,
|
184
|
+
"seq": seq,
|
185
|
+
"volume": volume,
|
186
|
+
},
|
191
187
|
}
|
192
|
-
return chapter_info
|
193
188
|
|
194
189
|
except Exception as e:
|
195
190
|
logger.warning(
|
196
191
|
"[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
|
197
192
|
)
|
198
|
-
return
|
193
|
+
return None
|
199
194
|
|
200
195
|
|
201
196
|
def extract_paragraphs_recursively(
|
202
197
|
soup: BeautifulSoup, chapter_id: str = ""
|
203
|
-
) ->
|
198
|
+
) -> list[dict[str, Any]]:
|
204
199
|
"""
|
205
200
|
Extracts paragraph elements under <main id="c-{chapter_id}"> from HTML
|
206
201
|
and converts them to a nested data structure for further processing.
|
@@ -211,7 +206,7 @@ def extract_paragraphs_recursively(
|
|
211
206
|
:return list: List of parsed <p> paragraph data.
|
212
207
|
"""
|
213
208
|
|
214
|
-
def parse_element(elem: Any) ->
|
209
|
+
def parse_element(elem: Any) -> dict[str, Any] | None:
|
215
210
|
if not isinstance(elem, Tag):
|
216
211
|
return None
|
217
212
|
result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
|
@@ -229,7 +224,7 @@ def extract_paragraphs_recursively(
|
|
229
224
|
if chapter_id:
|
230
225
|
main_id = f"c-{chapter_id}"
|
231
226
|
main_tag = soup.find("main", id=main_id)
|
232
|
-
if not main_tag:
|
227
|
+
if not isinstance(main_tag, Tag):
|
233
228
|
return []
|
234
229
|
else:
|
235
230
|
main_tag = soup
|
@@ -243,7 +238,7 @@ def extract_paragraphs_recursively(
|
|
243
238
|
return result
|
244
239
|
|
245
240
|
|
246
|
-
def parse_rule(css_str: str) ->
|
241
|
+
def parse_rule(css_str: str) -> dict[str, Any]:
|
247
242
|
"""
|
248
243
|
Parse a CSS string and extract style rules for rendering.
|
249
244
|
|
@@ -258,7 +253,7 @@ def parse_rule(css_str: str) -> Dict[str, Any]:
|
|
258
253
|
:return: Dict with "rules" and "orders" for rendering.
|
259
254
|
"""
|
260
255
|
|
261
|
-
rules:
|
256
|
+
rules: dict[str, Any] = {}
|
262
257
|
orders = []
|
263
258
|
|
264
259
|
stylesheet = tinycss2.parse_stylesheet(
|
@@ -322,7 +317,7 @@ def parse_rule(css_str: str) -> Dict[str, Any]:
|
|
322
317
|
return {"rules": rules, "orders": orders}
|
323
318
|
|
324
319
|
|
325
|
-
def parse_paragraph_names(rules:
|
320
|
+
def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
|
326
321
|
"""
|
327
322
|
Extract all paragraph selector names from parsed rules, excluding "sy".
|
328
323
|
"""
|
@@ -335,16 +330,16 @@ def parse_paragraph_names(rules: Dict[str, Any]) -> Set[str]:
|
|
335
330
|
|
336
331
|
|
337
332
|
def parse_end_number(
|
338
|
-
main_paragraphs:
|
339
|
-
) ->
|
333
|
+
main_paragraphs: list[dict[str, Any]], paragraph_names: set[str]
|
334
|
+
) -> int | None:
|
340
335
|
"""
|
341
336
|
Find the most frequent numeric suffix from tag names
|
342
337
|
matched by given paragraph prefixes.
|
343
338
|
"""
|
344
|
-
end_numbers:
|
339
|
+
end_numbers: dict[int, int] = {}
|
345
340
|
sorted_names = sorted(paragraph_names, key=len, reverse=True)
|
346
341
|
|
347
|
-
def rec_parse(item:
|
342
|
+
def rec_parse(item: list[Any] | dict[str, Any]) -> None:
|
348
343
|
if isinstance(item, list):
|
349
344
|
for element in item:
|
350
345
|
rec_parse(element)
|
@@ -359,7 +354,7 @@ def parse_end_number(
|
|
359
354
|
end_numbers[num] = end_numbers.get(num, 0) + 1
|
360
355
|
break
|
361
356
|
for val in item.values():
|
362
|
-
if isinstance(val, (list
|
357
|
+
if isinstance(val, (list | dict)):
|
363
358
|
rec_parse(val)
|
364
359
|
|
365
360
|
rec_parse(main_paragraphs)
|
@@ -381,10 +376,10 @@ def parse_end_number(
|
|
381
376
|
|
382
377
|
|
383
378
|
def render_paragraphs(
|
384
|
-
main_paragraphs:
|
385
|
-
rules:
|
379
|
+
main_paragraphs: list[dict[str, Any]],
|
380
|
+
rules: dict[str, Any],
|
386
381
|
end_number: int,
|
387
|
-
) ->
|
382
|
+
) -> tuple[str, list[str]]:
|
388
383
|
"""
|
389
384
|
Applies the parsed CSS rules to the paragraph structure and
|
390
385
|
reconstructs the visible text.
|
@@ -403,11 +398,11 @@ def render_paragraphs(
|
|
403
398
|
- A reconstructed paragraph string with line breaks.
|
404
399
|
- A list of mirrored (reflected) characters for later OCR processing.
|
405
400
|
"""
|
406
|
-
orders:
|
401
|
+
orders: list[tuple[str, str]] = rules.get("orders", [])
|
407
402
|
rules = rules.get("rules", {})
|
408
|
-
refl_list:
|
403
|
+
refl_list: list[str] = []
|
409
404
|
|
410
|
-
def apply_rule(data:
|
405
|
+
def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
|
411
406
|
if rule.get("delete-all", False):
|
412
407
|
return ""
|
413
408
|
|
@@ -418,10 +413,7 @@ def render_paragraphs(
|
|
418
413
|
curr_str += first_data
|
419
414
|
|
420
415
|
if rule.get("delete-first", False):
|
421
|
-
if len(curr_str) <= 1:
|
422
|
-
curr_str = ""
|
423
|
-
else:
|
424
|
-
curr_str = curr_str[1:]
|
416
|
+
curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
|
425
417
|
|
426
418
|
curr_str += rule.get("append-end-char", "")
|
427
419
|
|
@@ -480,7 +472,7 @@ def render_paragraphs(
|
|
480
472
|
logger.debug(f"[parser] not find p_class_str: {class_list}")
|
481
473
|
continue
|
482
474
|
# 普通标签处理,根据 orders 顺序匹配
|
483
|
-
for ord_selector,
|
475
|
+
for ord_selector, _ in orders:
|
484
476
|
tag_name = f"{ord_selector}{end_number}"
|
485
477
|
if data.get("tag") != tag_name:
|
486
478
|
continue
|
@@ -489,7 +481,7 @@ def render_paragraphs(
|
|
489
481
|
ordered_cache[ord_selector] = apply_rule(data, curr_rule)
|
490
482
|
break
|
491
483
|
# 最后按 orders 顺序拼接
|
492
|
-
for ord_selector,
|
484
|
+
for ord_selector, _ in orders:
|
493
485
|
if ord_selector in ordered_cache:
|
494
486
|
paragraphs_str += ordered_cache[ord_selector]
|
495
487
|
|
@@ -1,18 +1,18 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.chapter_normal
|
4
|
+
-----------------------------------------------------------
|
6
5
|
|
7
6
|
Parser logic for extracting readable text from Qidian chapters
|
8
7
|
that use plain (non-encrypted) browser-rendered HTML.
|
9
8
|
"""
|
10
9
|
|
11
10
|
import logging
|
12
|
-
from typing import Any, Dict
|
13
11
|
|
14
12
|
from bs4 import BeautifulSoup
|
15
13
|
|
14
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
15
|
+
|
16
16
|
from ..shared import (
|
17
17
|
extract_chapter_info,
|
18
18
|
find_ssr_page_context,
|
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
|
|
24
24
|
def parse_normal_chapter(
|
25
25
|
soup: BeautifulSoup,
|
26
26
|
chapter_id: str,
|
27
|
-
) ->
|
27
|
+
) -> ChapterDict | None:
|
28
28
|
"""
|
29
29
|
Extract and format the chapter text from a normal Qidian page.
|
30
30
|
Returns empty string if VIP/encrypted.
|
@@ -44,7 +44,7 @@ def parse_normal_chapter(
|
|
44
44
|
main = soup.select_one("div#app div#reader-content main")
|
45
45
|
if not main:
|
46
46
|
logger.warning("[Parser] Main content not found for chapter")
|
47
|
-
return
|
47
|
+
return None
|
48
48
|
|
49
49
|
ssr_data = find_ssr_page_context(soup)
|
50
50
|
chapter_info = extract_chapter_info(ssr_data)
|
@@ -52,7 +52,7 @@ def parse_normal_chapter(
|
|
52
52
|
logger.warning(
|
53
53
|
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
54
54
|
)
|
55
|
-
return
|
55
|
+
return None
|
56
56
|
|
57
57
|
title = chapter_info.get("chapterName", "Untitled")
|
58
58
|
chapter_id = chapter_info.get("chapterId", "")
|
@@ -61,10 +61,7 @@ def parse_normal_chapter(
|
|
61
61
|
update_timestamp = chapter_info.get("updateTimestamp", 0)
|
62
62
|
modify_time = chapter_info.get("modifyTime", 0)
|
63
63
|
word_count = chapter_info.get("wordsCount", 0)
|
64
|
-
vip = bool(chapter_info.get("vipStatus", 0))
|
65
|
-
is_buy = bool(chapter_info.get("isBuy", 0))
|
66
64
|
seq = chapter_info.get("seq", None)
|
67
|
-
order = chapter_info.get("chapterOrder", None)
|
68
65
|
volume = chapter_info.get("extra", {}).get("volumeName", "")
|
69
66
|
|
70
67
|
# remove review spans
|
@@ -78,20 +75,19 @@ def parse_normal_chapter(
|
|
78
75
|
"id": str(chapter_id),
|
79
76
|
"title": title,
|
80
77
|
"content": chapter_text,
|
81
|
-
"
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
"volume": volume,
|
78
|
+
"extra": {
|
79
|
+
"author_say": author_say.strip() if author_say else "",
|
80
|
+
"updated_at": update_time,
|
81
|
+
"update_timestamp": update_timestamp,
|
82
|
+
"modify_time": modify_time,
|
83
|
+
"word_count": word_count,
|
84
|
+
"seq": seq,
|
85
|
+
"volume": volume,
|
86
|
+
},
|
91
87
|
}
|
92
88
|
|
93
89
|
except Exception as e:
|
94
90
|
logger.warning(
|
95
91
|
"[Parser] parse error for normal chapter '%s': %s", chapter_id, e
|
96
92
|
)
|
97
|
-
|
93
|
+
return None
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.chapter_router
|
4
|
+
-----------------------------------------------------------
|
6
5
|
|
7
6
|
Routing logic for selecting the correct chapter parser for Qidian browser pages.
|
8
7
|
|
@@ -13,7 +12,9 @@ routes the parsing task to either the encrypted or normal chapter parser.
|
|
13
12
|
from __future__ import annotations
|
14
13
|
|
15
14
|
import logging
|
16
|
-
from typing import TYPE_CHECKING
|
15
|
+
from typing import TYPE_CHECKING
|
16
|
+
|
17
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
17
18
|
|
18
19
|
from ..shared import (
|
19
20
|
can_view_chapter,
|
@@ -32,7 +33,7 @@ def parse_chapter(
|
|
32
33
|
parser: QidianBrowserParser,
|
33
34
|
html_str: str,
|
34
35
|
chapter_id: str,
|
35
|
-
) ->
|
36
|
+
) -> ChapterDict | None:
|
36
37
|
"""
|
37
38
|
Extract and return the formatted textual content of chapter.
|
38
39
|
|
@@ -48,11 +49,11 @@ def parse_chapter(
|
|
48
49
|
logger.warning(
|
49
50
|
"[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
|
50
51
|
)
|
51
|
-
return
|
52
|
+
return None
|
52
53
|
|
53
54
|
if is_encrypted(soup):
|
54
55
|
if not parser._decode_font:
|
55
|
-
return
|
56
|
+
return None
|
56
57
|
try:
|
57
58
|
from .chapter_encrypted import parse_encrypted_chapter
|
58
59
|
|
@@ -62,9 +63,9 @@ def parse_chapter(
|
|
62
63
|
"[Parser] Encrypted chapter '%s' requires extra dependencies.",
|
63
64
|
chapter_id,
|
64
65
|
)
|
65
|
-
return
|
66
|
+
return None
|
66
67
|
|
67
68
|
return parse_normal_chapter(soup, chapter_id)
|
68
69
|
except Exception as e:
|
69
70
|
logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
|
70
|
-
|
71
|
+
return None
|