novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -2
- novel_downloader/cli/__init__.py +0 -1
- novel_downloader/cli/clean.py +2 -10
- novel_downloader/cli/download.py +16 -22
- novel_downloader/cli/interactive.py +0 -1
- novel_downloader/cli/main.py +1 -3
- novel_downloader/cli/settings.py +8 -8
- novel_downloader/config/__init__.py +0 -1
- novel_downloader/config/adapter.py +32 -27
- novel_downloader/config/loader.py +116 -108
- novel_downloader/config/models.py +35 -29
- novel_downloader/config/site_rules.py +2 -4
- novel_downloader/core/__init__.py +0 -1
- novel_downloader/core/downloaders/__init__.py +4 -4
- novel_downloader/core/downloaders/base/__init__.py +14 -0
- novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
- novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
- novel_downloader/core/downloaders/biquge/__init__.py +12 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
- novel_downloader/core/downloaders/common/__init__.py +14 -0
- novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
- novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +33 -21
- novel_downloader/core/downloaders/qidian/__init__.py +10 -0
- novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +79 -62
- novel_downloader/core/factory/__init__.py +4 -5
- novel_downloader/core/factory/{downloader_factory.py → downloader.py} +25 -26
- novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
- novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
- novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
- novel_downloader/core/interfaces/__init__.py +8 -9
- novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
- novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +23 -12
- novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
- novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
- novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
- novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +31 -17
- novel_downloader/core/parsers/__init__.py +5 -4
- novel_downloader/core/parsers/{base_parser.py → base.py} +18 -9
- novel_downloader/core/parsers/biquge/__init__.py +10 -0
- novel_downloader/core/parsers/biquge/main_parser.py +126 -0
- novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
- novel_downloader/core/parsers/{common_parser → common}/helper.py +13 -13
- novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +40 -48
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +14 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +36 -44
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +14 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/book_info_parser.py +5 -6
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +7 -8
- novel_downloader/core/requesters/__init__.py +9 -5
- novel_downloader/core/requesters/base/__init__.py +16 -0
- novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +177 -73
- novel_downloader/core/requesters/base/browser.py +340 -0
- novel_downloader/core/requesters/base/session.py +364 -0
- novel_downloader/core/requesters/biquge/__init__.py +12 -0
- novel_downloader/core/requesters/biquge/session.py +90 -0
- novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
- novel_downloader/core/requesters/common/async_session.py +96 -0
- novel_downloader/core/requesters/common/session.py +113 -0
- novel_downloader/core/requesters/qidian/__init__.py +21 -0
- novel_downloader/core/requesters/qidian/broswer.py +307 -0
- novel_downloader/core/requesters/qidian/session.py +287 -0
- novel_downloader/core/savers/__init__.py +5 -3
- novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
- novel_downloader/core/savers/biquge.py +25 -0
- novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
- novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +23 -51
- novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
- novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
- novel_downloader/core/savers/epub_utils/__init__.py +0 -1
- novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
- novel_downloader/core/savers/epub_utils/initializer.py +4 -5
- novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
- novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
- novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
- novel_downloader/locales/en.json +8 -4
- novel_downloader/locales/zh.json +5 -1
- novel_downloader/resources/config/settings.toml +88 -0
- novel_downloader/utils/cache.py +2 -2
- novel_downloader/utils/chapter_storage.py +340 -0
- novel_downloader/utils/constants.py +6 -4
- novel_downloader/utils/crypto_utils.py +3 -3
- novel_downloader/utils/file_utils/__init__.py +0 -1
- novel_downloader/utils/file_utils/io.py +12 -17
- novel_downloader/utils/file_utils/normalize.py +1 -3
- novel_downloader/utils/file_utils/sanitize.py +2 -9
- novel_downloader/utils/fontocr/__init__.py +0 -1
- novel_downloader/utils/fontocr/ocr_v1.py +19 -22
- novel_downloader/utils/fontocr/ocr_v2.py +147 -60
- novel_downloader/utils/hash_store.py +19 -20
- novel_downloader/utils/hash_utils.py +0 -1
- novel_downloader/utils/i18n.py +3 -4
- novel_downloader/utils/logger.py +5 -6
- novel_downloader/utils/model_loader.py +5 -8
- novel_downloader/utils/network.py +9 -10
- novel_downloader/utils/state.py +6 -7
- novel_downloader/utils/text_utils/__init__.py +0 -1
- novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
- novel_downloader/utils/text_utils/diff_display.py +0 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -4
- novel_downloader/utils/text_utils/text_cleaning.py +0 -1
- novel_downloader/utils/time_utils/__init__.py +0 -1
- novel_downloader/utils/time_utils/datetime_utils.py +8 -10
- novel_downloader/utils/time_utils/sleep_utils.py +1 -3
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/METADATA +14 -17
- novel_downloader-1.3.1.dist-info/RECORD +127 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/WHEEL +1 -1
- novel_downloader/core/requesters/base_browser.py +0 -214
- novel_downloader/core/requesters/base_session.py +0 -246
- novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
- novel_downloader/core/requesters/common_requester/common_session.py +0 -126
- novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -396
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
- novel_downloader/resources/config/settings.yaml +0 -76
- novel_downloader-1.2.2.dist-info/RECORD +0 -115
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,18 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.common.main_parser
|
4
|
+
------------------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing components for handling
|
8
7
|
Common pages.
|
9
8
|
"""
|
10
9
|
|
11
|
-
from typing import Any
|
10
|
+
from typing import Any
|
12
11
|
|
13
12
|
from novel_downloader.config import ParserConfig, SiteRules
|
13
|
+
from novel_downloader.core.parsers.base import BaseParser
|
14
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
14
15
|
|
15
|
-
from ..base_parser import BaseParser
|
16
16
|
from .helper import HTMLExtractor
|
17
17
|
|
18
18
|
|
@@ -35,7 +35,7 @@ class CommonParser(BaseParser):
|
|
35
35
|
self._site = site
|
36
36
|
self._site_rule = site_rule
|
37
37
|
|
38
|
-
def parse_book_info(self, html_str: str) ->
|
38
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
39
39
|
"""
|
40
40
|
Parse a book info page and extract metadata and chapter structure.
|
41
41
|
|
@@ -46,7 +46,11 @@ class CommonParser(BaseParser):
|
|
46
46
|
rules = self._site_rule["book_info"]
|
47
47
|
return extractor.extract_book_info(rules)
|
48
48
|
|
49
|
-
def parse_chapter(
|
49
|
+
def parse_chapter(
|
50
|
+
self,
|
51
|
+
html_str: str,
|
52
|
+
chapter_id: str,
|
53
|
+
) -> ChapterDict | None:
|
50
54
|
"""
|
51
55
|
Parse a single chapter page and extract clean text or simplified HTML.
|
52
56
|
|
@@ -66,13 +70,15 @@ class CommonParser(BaseParser):
|
|
66
70
|
title = extractor.extract_field(title_steps["steps"]) if title_steps else ""
|
67
71
|
content = extractor.extract_field(content_steps["steps"])
|
68
72
|
if not content:
|
69
|
-
return
|
73
|
+
return None
|
70
74
|
|
71
75
|
return {
|
72
76
|
"id": chapter_id,
|
73
77
|
"title": title or "Untitled",
|
74
78
|
"content": content,
|
75
|
-
"
|
79
|
+
"extra": {
|
80
|
+
"site": self._site,
|
81
|
+
},
|
76
82
|
}
|
77
83
|
|
78
84
|
@property
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian
|
4
|
+
------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing implementations for the Qidian platform.
|
8
7
|
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser
|
4
|
+
--------------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing components for handling Qidian
|
8
7
|
pages that have been rendered by a browser engine.
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.chapter_encrypted
|
4
|
+
--------------------------------------------------------------
|
6
5
|
|
7
6
|
Support for parsing encrypted chapters from Qidian using font OCR mapping,
|
8
7
|
CSS rules, and custom rendering logic.
|
@@ -19,11 +18,12 @@ from __future__ import annotations
|
|
19
18
|
import json
|
20
19
|
import logging
|
21
20
|
from pathlib import Path
|
22
|
-
from typing import TYPE_CHECKING, Any
|
21
|
+
from typing import TYPE_CHECKING, Any
|
23
22
|
|
24
23
|
import tinycss2
|
25
24
|
from bs4 import BeautifulSoup, Tag
|
26
25
|
|
26
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
27
27
|
from novel_downloader.utils.network import download_font_file
|
28
28
|
from novel_downloader.utils.text_utils import apply_font_mapping
|
29
29
|
|
@@ -43,7 +43,7 @@ def parse_encrypted_chapter(
|
|
43
43
|
parser: QidianBrowserParser,
|
44
44
|
soup: BeautifulSoup,
|
45
45
|
chapter_id: str,
|
46
|
-
) ->
|
46
|
+
) -> ChapterDict | None:
|
47
47
|
"""
|
48
48
|
Extract and return the formatted textual content of an encrypted chapter.
|
49
49
|
|
@@ -61,15 +61,15 @@ def parse_encrypted_chapter(
|
|
61
61
|
"""
|
62
62
|
try:
|
63
63
|
if not (parser._decode_font and parser._font_ocr):
|
64
|
-
return
|
64
|
+
return None
|
65
65
|
ssr_data = find_ssr_page_context(soup)
|
66
66
|
chapter_info = extract_chapter_info(ssr_data)
|
67
67
|
if not chapter_info:
|
68
68
|
logger.warning(
|
69
69
|
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
70
70
|
)
|
71
|
-
return
|
72
|
-
debug_base_dir:
|
71
|
+
return None
|
72
|
+
debug_base_dir: Path | None = None
|
73
73
|
if parser._font_debug_dir:
|
74
74
|
debug_base_dir = parser._font_debug_dir / chapter_id
|
75
75
|
debug_base_dir.mkdir(parents=True, exist_ok=True)
|
@@ -85,10 +85,7 @@ def parse_encrypted_chapter(
|
|
85
85
|
update_timestamp = chapter_info.get("updateTimestamp", 0)
|
86
86
|
modify_time = chapter_info.get("modifyTime", 0)
|
87
87
|
word_count = chapter_info.get("wordsCount", 0)
|
88
|
-
vip = bool(chapter_info.get("vipStatus", 0))
|
89
|
-
is_buy = bool(chapter_info.get("isBuy", 0))
|
90
88
|
seq = chapter_info.get("seq", None)
|
91
|
-
order = chapter_info.get("chapterOrder", None)
|
92
89
|
volume = chapter_info.get("extra", {}).get("volumeName", "")
|
93
90
|
|
94
91
|
# extract + save font
|
@@ -133,7 +130,7 @@ def parse_encrypted_chapter(
|
|
133
130
|
logger.warning(
|
134
131
|
f"[Parser] No end_number found after parsing chapter '{chapter_id}'"
|
135
132
|
)
|
136
|
-
return
|
133
|
+
return None
|
137
134
|
|
138
135
|
paragraphs_str, refl_list = render_paragraphs(
|
139
136
|
main_paragraphs, paragraphs_rules, end_number
|
@@ -143,7 +140,7 @@ def parse_encrypted_chapter(
|
|
143
140
|
paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
|
144
141
|
|
145
142
|
# Run OCR + fallback mapping
|
146
|
-
char_set =
|
143
|
+
char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
|
147
144
|
refl_set = set(refl_list)
|
148
145
|
char_set = char_set - refl_set
|
149
146
|
if debug_base_dir:
|
@@ -174,33 +171,31 @@ def parse_encrypted_chapter(
|
|
174
171
|
final_paragraphs_str = "\n\n".join(
|
175
172
|
line.strip() for line in original_text.splitlines() if line.strip()
|
176
173
|
)
|
177
|
-
|
174
|
+
return {
|
178
175
|
"id": str(chapter_id),
|
179
176
|
"title": title,
|
180
177
|
"content": final_paragraphs_str,
|
181
|
-
"
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
"volume": volume,
|
178
|
+
"extra": {
|
179
|
+
"author_say": author_say.strip() if author_say else "",
|
180
|
+
"updated_at": update_time,
|
181
|
+
"update_timestamp": update_timestamp,
|
182
|
+
"modify_time": modify_time,
|
183
|
+
"word_count": word_count,
|
184
|
+
"seq": seq,
|
185
|
+
"volume": volume,
|
186
|
+
},
|
191
187
|
}
|
192
|
-
return chapter_info
|
193
188
|
|
194
189
|
except Exception as e:
|
195
190
|
logger.warning(
|
196
191
|
"[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
|
197
192
|
)
|
198
|
-
return
|
193
|
+
return None
|
199
194
|
|
200
195
|
|
201
196
|
def extract_paragraphs_recursively(
|
202
197
|
soup: BeautifulSoup, chapter_id: str = ""
|
203
|
-
) ->
|
198
|
+
) -> list[dict[str, Any]]:
|
204
199
|
"""
|
205
200
|
Extracts paragraph elements under <main id="c-{chapter_id}"> from HTML
|
206
201
|
and converts them to a nested data structure for further processing.
|
@@ -211,7 +206,7 @@ def extract_paragraphs_recursively(
|
|
211
206
|
:return list: List of parsed <p> paragraph data.
|
212
207
|
"""
|
213
208
|
|
214
|
-
def parse_element(elem: Any) ->
|
209
|
+
def parse_element(elem: Any) -> dict[str, Any] | None:
|
215
210
|
if not isinstance(elem, Tag):
|
216
211
|
return None
|
217
212
|
result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
|
@@ -243,7 +238,7 @@ def extract_paragraphs_recursively(
|
|
243
238
|
return result
|
244
239
|
|
245
240
|
|
246
|
-
def parse_rule(css_str: str) ->
|
241
|
+
def parse_rule(css_str: str) -> dict[str, Any]:
|
247
242
|
"""
|
248
243
|
Parse a CSS string and extract style rules for rendering.
|
249
244
|
|
@@ -258,7 +253,7 @@ def parse_rule(css_str: str) -> Dict[str, Any]:
|
|
258
253
|
:return: Dict with "rules" and "orders" for rendering.
|
259
254
|
"""
|
260
255
|
|
261
|
-
rules:
|
256
|
+
rules: dict[str, Any] = {}
|
262
257
|
orders = []
|
263
258
|
|
264
259
|
stylesheet = tinycss2.parse_stylesheet(
|
@@ -322,7 +317,7 @@ def parse_rule(css_str: str) -> Dict[str, Any]:
|
|
322
317
|
return {"rules": rules, "orders": orders}
|
323
318
|
|
324
319
|
|
325
|
-
def parse_paragraph_names(rules:
|
320
|
+
def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
|
326
321
|
"""
|
327
322
|
Extract all paragraph selector names from parsed rules, excluding "sy".
|
328
323
|
"""
|
@@ -335,16 +330,16 @@ def parse_paragraph_names(rules: Dict[str, Any]) -> Set[str]:
|
|
335
330
|
|
336
331
|
|
337
332
|
def parse_end_number(
|
338
|
-
main_paragraphs:
|
339
|
-
) ->
|
333
|
+
main_paragraphs: list[dict[str, Any]], paragraph_names: set[str]
|
334
|
+
) -> int | None:
|
340
335
|
"""
|
341
336
|
Find the most frequent numeric suffix from tag names
|
342
337
|
matched by given paragraph prefixes.
|
343
338
|
"""
|
344
|
-
end_numbers:
|
339
|
+
end_numbers: dict[int, int] = {}
|
345
340
|
sorted_names = sorted(paragraph_names, key=len, reverse=True)
|
346
341
|
|
347
|
-
def rec_parse(item:
|
342
|
+
def rec_parse(item: list[Any] | dict[str, Any]) -> None:
|
348
343
|
if isinstance(item, list):
|
349
344
|
for element in item:
|
350
345
|
rec_parse(element)
|
@@ -359,7 +354,7 @@ def parse_end_number(
|
|
359
354
|
end_numbers[num] = end_numbers.get(num, 0) + 1
|
360
355
|
break
|
361
356
|
for val in item.values():
|
362
|
-
if isinstance(val, (list
|
357
|
+
if isinstance(val, (list | dict)):
|
363
358
|
rec_parse(val)
|
364
359
|
|
365
360
|
rec_parse(main_paragraphs)
|
@@ -381,10 +376,10 @@ def parse_end_number(
|
|
381
376
|
|
382
377
|
|
383
378
|
def render_paragraphs(
|
384
|
-
main_paragraphs:
|
385
|
-
rules:
|
379
|
+
main_paragraphs: list[dict[str, Any]],
|
380
|
+
rules: dict[str, Any],
|
386
381
|
end_number: int,
|
387
|
-
) ->
|
382
|
+
) -> tuple[str, list[str]]:
|
388
383
|
"""
|
389
384
|
Applies the parsed CSS rules to the paragraph structure and
|
390
385
|
reconstructs the visible text.
|
@@ -403,11 +398,11 @@ def render_paragraphs(
|
|
403
398
|
- A reconstructed paragraph string with line breaks.
|
404
399
|
- A list of mirrored (reflected) characters for later OCR processing.
|
405
400
|
"""
|
406
|
-
orders:
|
401
|
+
orders: list[tuple[str, str]] = rules.get("orders", [])
|
407
402
|
rules = rules.get("rules", {})
|
408
|
-
refl_list:
|
403
|
+
refl_list: list[str] = []
|
409
404
|
|
410
|
-
def apply_rule(data:
|
405
|
+
def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
|
411
406
|
if rule.get("delete-all", False):
|
412
407
|
return ""
|
413
408
|
|
@@ -418,10 +413,7 @@ def render_paragraphs(
|
|
418
413
|
curr_str += first_data
|
419
414
|
|
420
415
|
if rule.get("delete-first", False):
|
421
|
-
if len(curr_str) <= 1:
|
422
|
-
curr_str = ""
|
423
|
-
else:
|
424
|
-
curr_str = curr_str[1:]
|
416
|
+
curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
|
425
417
|
|
426
418
|
curr_str += rule.get("append-end-char", "")
|
427
419
|
|
@@ -480,7 +472,7 @@ def render_paragraphs(
|
|
480
472
|
logger.debug(f"[parser] not find p_class_str: {class_list}")
|
481
473
|
continue
|
482
474
|
# 普通标签处理,根据 orders 顺序匹配
|
483
|
-
for ord_selector,
|
475
|
+
for ord_selector, _ in orders:
|
484
476
|
tag_name = f"{ord_selector}{end_number}"
|
485
477
|
if data.get("tag") != tag_name:
|
486
478
|
continue
|
@@ -489,7 +481,7 @@ def render_paragraphs(
|
|
489
481
|
ordered_cache[ord_selector] = apply_rule(data, curr_rule)
|
490
482
|
break
|
491
483
|
# 最后按 orders 顺序拼接
|
492
|
-
for ord_selector,
|
484
|
+
for ord_selector, _ in orders:
|
493
485
|
if ord_selector in ordered_cache:
|
494
486
|
paragraphs_str += ordered_cache[ord_selector]
|
495
487
|
|
@@ -1,18 +1,18 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.chapter_normal
|
4
|
+
-----------------------------------------------------------
|
6
5
|
|
7
6
|
Parser logic for extracting readable text from Qidian chapters
|
8
7
|
that use plain (non-encrypted) browser-rendered HTML.
|
9
8
|
"""
|
10
9
|
|
11
10
|
import logging
|
12
|
-
from typing import Any, Dict
|
13
11
|
|
14
12
|
from bs4 import BeautifulSoup
|
15
13
|
|
14
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
15
|
+
|
16
16
|
from ..shared import (
|
17
17
|
extract_chapter_info,
|
18
18
|
find_ssr_page_context,
|
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
|
|
24
24
|
def parse_normal_chapter(
|
25
25
|
soup: BeautifulSoup,
|
26
26
|
chapter_id: str,
|
27
|
-
) ->
|
27
|
+
) -> ChapterDict | None:
|
28
28
|
"""
|
29
29
|
Extract and format the chapter text from a normal Qidian page.
|
30
30
|
Returns empty string if VIP/encrypted.
|
@@ -44,7 +44,7 @@ def parse_normal_chapter(
|
|
44
44
|
main = soup.select_one("div#app div#reader-content main")
|
45
45
|
if not main:
|
46
46
|
logger.warning("[Parser] Main content not found for chapter")
|
47
|
-
return
|
47
|
+
return None
|
48
48
|
|
49
49
|
ssr_data = find_ssr_page_context(soup)
|
50
50
|
chapter_info = extract_chapter_info(ssr_data)
|
@@ -52,7 +52,7 @@ def parse_normal_chapter(
|
|
52
52
|
logger.warning(
|
53
53
|
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
54
54
|
)
|
55
|
-
return
|
55
|
+
return None
|
56
56
|
|
57
57
|
title = chapter_info.get("chapterName", "Untitled")
|
58
58
|
chapter_id = chapter_info.get("chapterId", "")
|
@@ -61,10 +61,7 @@ def parse_normal_chapter(
|
|
61
61
|
update_timestamp = chapter_info.get("updateTimestamp", 0)
|
62
62
|
modify_time = chapter_info.get("modifyTime", 0)
|
63
63
|
word_count = chapter_info.get("wordsCount", 0)
|
64
|
-
vip = bool(chapter_info.get("vipStatus", 0))
|
65
|
-
is_buy = bool(chapter_info.get("isBuy", 0))
|
66
64
|
seq = chapter_info.get("seq", None)
|
67
|
-
order = chapter_info.get("chapterOrder", None)
|
68
65
|
volume = chapter_info.get("extra", {}).get("volumeName", "")
|
69
66
|
|
70
67
|
# remove review spans
|
@@ -78,20 +75,19 @@ def parse_normal_chapter(
|
|
78
75
|
"id": str(chapter_id),
|
79
76
|
"title": title,
|
80
77
|
"content": chapter_text,
|
81
|
-
"
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
"volume": volume,
|
78
|
+
"extra": {
|
79
|
+
"author_say": author_say.strip() if author_say else "",
|
80
|
+
"updated_at": update_time,
|
81
|
+
"update_timestamp": update_timestamp,
|
82
|
+
"modify_time": modify_time,
|
83
|
+
"word_count": word_count,
|
84
|
+
"seq": seq,
|
85
|
+
"volume": volume,
|
86
|
+
},
|
91
87
|
}
|
92
88
|
|
93
89
|
except Exception as e:
|
94
90
|
logger.warning(
|
95
91
|
"[Parser] parse error for normal chapter '%s': %s", chapter_id, e
|
96
92
|
)
|
97
|
-
|
93
|
+
return None
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.chapter_router
|
4
|
+
-----------------------------------------------------------
|
6
5
|
|
7
6
|
Routing logic for selecting the correct chapter parser for Qidian browser pages.
|
8
7
|
|
@@ -13,7 +12,9 @@ routes the parsing task to either the encrypted or normal chapter parser.
|
|
13
12
|
from __future__ import annotations
|
14
13
|
|
15
14
|
import logging
|
16
|
-
from typing import TYPE_CHECKING
|
15
|
+
from typing import TYPE_CHECKING
|
16
|
+
|
17
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
17
18
|
|
18
19
|
from ..shared import (
|
19
20
|
can_view_chapter,
|
@@ -32,7 +33,7 @@ def parse_chapter(
|
|
32
33
|
parser: QidianBrowserParser,
|
33
34
|
html_str: str,
|
34
35
|
chapter_id: str,
|
35
|
-
) ->
|
36
|
+
) -> ChapterDict | None:
|
36
37
|
"""
|
37
38
|
Extract and return the formatted textual content of chapter.
|
38
39
|
|
@@ -48,11 +49,11 @@ def parse_chapter(
|
|
48
49
|
logger.warning(
|
49
50
|
"[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
|
50
51
|
)
|
51
|
-
return
|
52
|
+
return None
|
52
53
|
|
53
54
|
if is_encrypted(soup):
|
54
55
|
if not parser._decode_font:
|
55
|
-
return
|
56
|
+
return None
|
56
57
|
try:
|
57
58
|
from .chapter_encrypted import parse_encrypted_chapter
|
58
59
|
|
@@ -62,9 +63,9 @@ def parse_chapter(
|
|
62
63
|
"[Parser] Encrypted chapter '%s' requires extra dependencies.",
|
63
64
|
chapter_id,
|
64
65
|
)
|
65
|
-
return
|
66
|
+
return None
|
66
67
|
|
67
68
|
return parse_normal_chapter(soup, chapter_id)
|
68
69
|
except Exception as e:
|
69
70
|
logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
|
70
|
-
|
71
|
+
return None
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.browser.main_parser
|
4
|
+
--------------------------------------------------------
|
6
5
|
|
7
6
|
Main parser class for handling Qidian chapters rendered via a browser environment.
|
8
7
|
|
@@ -13,10 +12,11 @@ content extracted from dynamically rendered Qidian HTML pages.
|
|
13
12
|
from __future__ import annotations
|
14
13
|
|
15
14
|
from pathlib import Path
|
16
|
-
from typing import TYPE_CHECKING, Any
|
15
|
+
from typing import TYPE_CHECKING, Any
|
17
16
|
|
18
17
|
from novel_downloader.config.models import ParserConfig
|
19
|
-
from novel_downloader.core.parsers.
|
18
|
+
from novel_downloader.core.parsers.base import BaseParser
|
19
|
+
from novel_downloader.utils.chapter_storage import ChapterDict
|
20
20
|
|
21
21
|
from ..shared import (
|
22
22
|
is_encrypted,
|
@@ -47,9 +47,9 @@ class QidianBrowserParser(BaseParser):
|
|
47
47
|
|
48
48
|
self._fixed_font_dir: Path = self._base_cache_dir / "fixed_fonts"
|
49
49
|
self._fixed_font_dir.mkdir(parents=True, exist_ok=True)
|
50
|
-
self._font_debug_dir:
|
50
|
+
self._font_debug_dir: Path | None = None
|
51
51
|
|
52
|
-
self._font_ocr:
|
52
|
+
self._font_ocr: FontOCR | None = None
|
53
53
|
if self._decode_font:
|
54
54
|
from novel_downloader.utils.fontocr import FontOCR
|
55
55
|
|
@@ -66,10 +66,10 @@ class QidianBrowserParser(BaseParser):
|
|
66
66
|
vec_weight=config.vec_weight,
|
67
67
|
font_debug=config.save_font_debug,
|
68
68
|
)
|
69
|
-
self._font_debug_dir = self._base_cache_dir / "font_debug"
|
69
|
+
self._font_debug_dir = self._base_cache_dir / "qidian" / "font_debug"
|
70
70
|
self._font_debug_dir.mkdir(parents=True, exist_ok=True)
|
71
71
|
|
72
|
-
def parse_book_info(self, html_str: str) ->
|
72
|
+
def parse_book_info(self, html_str: str) -> dict[str, Any]:
|
73
73
|
"""
|
74
74
|
Parse a book info page and extract metadata and chapter structure.
|
75
75
|
|
@@ -78,7 +78,11 @@ class QidianBrowserParser(BaseParser):
|
|
78
78
|
"""
|
79
79
|
return parse_book_info(html_str)
|
80
80
|
|
81
|
-
def parse_chapter(
|
81
|
+
def parse_chapter(
|
82
|
+
self,
|
83
|
+
html_str: str,
|
84
|
+
chapter_id: str,
|
85
|
+
) -> ChapterDict | None:
|
82
86
|
"""
|
83
87
|
:param html: Raw HTML of the chapter page.
|
84
88
|
:param chapter_id: Identifier of the chapter being parsed.
|
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.parsers.
|
5
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.session
|
4
|
+
--------------------------------------------
|
6
5
|
|
7
6
|
This package provides parsing components for handling Qidian
|
8
7
|
pages that have been rendered by a session.
|