novel-downloader 1.3.2__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/clean.py +97 -78
- novel_downloader/cli/config.py +177 -0
- novel_downloader/cli/download.py +132 -87
- novel_downloader/cli/export.py +77 -0
- novel_downloader/cli/main.py +21 -28
- novel_downloader/config/__init__.py +1 -25
- novel_downloader/config/adapter.py +32 -31
- novel_downloader/config/loader.py +3 -3
- novel_downloader/config/site_rules.py +1 -2
- novel_downloader/core/__init__.py +3 -6
- novel_downloader/core/downloaders/__init__.py +10 -13
- novel_downloader/core/downloaders/base.py +233 -0
- novel_downloader/core/downloaders/biquge.py +27 -0
- novel_downloader/core/downloaders/common.py +414 -0
- novel_downloader/core/downloaders/esjzone.py +27 -0
- novel_downloader/core/downloaders/linovelib.py +27 -0
- novel_downloader/core/downloaders/qianbi.py +27 -0
- novel_downloader/core/downloaders/qidian.py +352 -0
- novel_downloader/core/downloaders/sfacg.py +27 -0
- novel_downloader/core/downloaders/yamibo.py +27 -0
- novel_downloader/core/exporters/__init__.py +37 -0
- novel_downloader/core/{savers → exporters}/base.py +73 -44
- novel_downloader/core/exporters/biquge.py +25 -0
- novel_downloader/core/exporters/common/__init__.py +12 -0
- novel_downloader/core/{savers → exporters}/common/epub.py +40 -52
- novel_downloader/core/{savers/common/main_saver.py → exporters/common/main_exporter.py} +36 -39
- novel_downloader/core/{savers → exporters}/common/txt.py +20 -24
- novel_downloader/core/exporters/epub_utils/__init__.py +40 -0
- novel_downloader/core/{savers → exporters}/epub_utils/css_builder.py +2 -1
- novel_downloader/core/exporters/epub_utils/image_loader.py +131 -0
- novel_downloader/core/{savers → exporters}/epub_utils/initializer.py +6 -3
- novel_downloader/core/{savers → exporters}/epub_utils/text_to_html.py +49 -2
- novel_downloader/core/{savers → exporters}/epub_utils/volume_intro.py +2 -1
- novel_downloader/core/exporters/esjzone.py +25 -0
- novel_downloader/core/exporters/linovelib/__init__.py +10 -0
- novel_downloader/core/exporters/linovelib/epub.py +449 -0
- novel_downloader/core/exporters/linovelib/main_exporter.py +127 -0
- novel_downloader/core/exporters/linovelib/txt.py +129 -0
- novel_downloader/core/exporters/qianbi.py +25 -0
- novel_downloader/core/{savers → exporters}/qidian.py +8 -8
- novel_downloader/core/exporters/sfacg.py +25 -0
- novel_downloader/core/exporters/yamibo.py +25 -0
- novel_downloader/core/factory/__init__.py +5 -17
- novel_downloader/core/factory/downloader.py +24 -126
- novel_downloader/core/factory/exporter.py +58 -0
- novel_downloader/core/factory/fetcher.py +96 -0
- novel_downloader/core/factory/parser.py +17 -12
- novel_downloader/core/{requesters → fetchers}/__init__.py +22 -15
- novel_downloader/core/{requesters → fetchers}/base/__init__.py +2 -4
- novel_downloader/core/fetchers/base/browser.py +383 -0
- novel_downloader/core/fetchers/base/rate_limiter.py +86 -0
- novel_downloader/core/fetchers/base/session.py +419 -0
- novel_downloader/core/fetchers/biquge/__init__.py +14 -0
- novel_downloader/core/{requesters/biquge/async_session.py → fetchers/biquge/browser.py} +18 -6
- novel_downloader/core/{requesters → fetchers}/biquge/session.py +23 -30
- novel_downloader/core/fetchers/common/__init__.py +14 -0
- novel_downloader/core/fetchers/common/browser.py +79 -0
- novel_downloader/core/{requesters/common/async_session.py → fetchers/common/session.py} +8 -25
- novel_downloader/core/fetchers/esjzone/__init__.py +14 -0
- novel_downloader/core/fetchers/esjzone/browser.py +202 -0
- novel_downloader/core/{requesters/esjzone/async_session.py → fetchers/esjzone/session.py} +62 -42
- novel_downloader/core/fetchers/linovelib/__init__.py +14 -0
- novel_downloader/core/fetchers/linovelib/browser.py +178 -0
- novel_downloader/core/fetchers/linovelib/session.py +178 -0
- novel_downloader/core/fetchers/qianbi/__init__.py +14 -0
- novel_downloader/core/{requesters/qianbi/session.py → fetchers/qianbi/browser.py} +30 -48
- novel_downloader/core/{requesters/qianbi/async_session.py → fetchers/qianbi/session.py} +18 -6
- novel_downloader/core/fetchers/qidian/__init__.py +14 -0
- novel_downloader/core/fetchers/qidian/browser.py +266 -0
- novel_downloader/core/fetchers/qidian/session.py +326 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +14 -0
- novel_downloader/core/fetchers/sfacg/browser.py +189 -0
- novel_downloader/core/{requesters/sfacg/async_session.py → fetchers/sfacg/session.py} +43 -73
- novel_downloader/core/fetchers/yamibo/__init__.py +14 -0
- novel_downloader/core/fetchers/yamibo/browser.py +229 -0
- novel_downloader/core/{requesters/yamibo/async_session.py → fetchers/yamibo/session.py} +62 -44
- novel_downloader/core/interfaces/__init__.py +8 -12
- novel_downloader/core/interfaces/downloader.py +54 -0
- novel_downloader/core/interfaces/{saver.py → exporter.py} +12 -12
- novel_downloader/core/interfaces/fetcher.py +162 -0
- novel_downloader/core/interfaces/parser.py +6 -7
- novel_downloader/core/parsers/__init__.py +5 -6
- novel_downloader/core/parsers/base.py +9 -13
- novel_downloader/core/parsers/biquge/main_parser.py +12 -13
- novel_downloader/core/parsers/common/helper.py +3 -3
- novel_downloader/core/parsers/common/main_parser.py +39 -34
- novel_downloader/core/parsers/esjzone/main_parser.py +24 -17
- novel_downloader/core/parsers/linovelib/__init__.py +10 -0
- novel_downloader/core/parsers/linovelib/main_parser.py +210 -0
- novel_downloader/core/parsers/qianbi/main_parser.py +21 -15
- novel_downloader/core/parsers/qidian/__init__.py +2 -11
- novel_downloader/core/parsers/qidian/book_info_parser.py +113 -0
- novel_downloader/core/parsers/qidian/{browser/chapter_encrypted.py → chapter_encrypted.py} +162 -135
- novel_downloader/core/parsers/qidian/chapter_normal.py +150 -0
- novel_downloader/core/parsers/qidian/{session/chapter_router.py → chapter_router.py} +15 -15
- novel_downloader/core/parsers/qidian/{browser/main_parser.py → main_parser.py} +49 -40
- novel_downloader/core/parsers/qidian/utils/__init__.py +27 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +145 -0
- novel_downloader/core/parsers/qidian/{shared → utils}/helpers.py +41 -68
- novel_downloader/core/parsers/qidian/{session → utils}/node_decryptor.py +64 -50
- novel_downloader/core/parsers/sfacg/main_parser.py +12 -12
- novel_downloader/core/parsers/yamibo/main_parser.py +10 -10
- novel_downloader/locales/en.json +18 -2
- novel_downloader/locales/zh.json +18 -2
- novel_downloader/models/__init__.py +64 -0
- novel_downloader/models/browser.py +21 -0
- novel_downloader/models/chapter.py +25 -0
- novel_downloader/models/config.py +100 -0
- novel_downloader/models/login.py +20 -0
- novel_downloader/models/site_rules.py +99 -0
- novel_downloader/models/tasks.py +33 -0
- novel_downloader/models/types.py +15 -0
- novel_downloader/resources/config/settings.toml +31 -25
- novel_downloader/resources/json/linovelib_font_map.json +3573 -0
- novel_downloader/tui/__init__.py +7 -0
- novel_downloader/tui/app.py +32 -0
- novel_downloader/tui/main.py +17 -0
- novel_downloader/tui/screens/__init__.py +14 -0
- novel_downloader/tui/screens/home.py +191 -0
- novel_downloader/tui/screens/login.py +74 -0
- novel_downloader/tui/styles/home_layout.tcss +79 -0
- novel_downloader/tui/widgets/richlog_handler.py +24 -0
- novel_downloader/utils/__init__.py +6 -0
- novel_downloader/utils/chapter_storage.py +25 -38
- novel_downloader/utils/constants.py +15 -5
- novel_downloader/utils/cookies.py +66 -0
- novel_downloader/utils/crypto_utils.py +1 -74
- novel_downloader/utils/file_utils/io.py +1 -1
- novel_downloader/utils/fontocr/ocr_v1.py +2 -1
- novel_downloader/utils/fontocr/ocr_v2.py +2 -2
- novel_downloader/utils/hash_store.py +10 -18
- novel_downloader/utils/hash_utils.py +3 -2
- novel_downloader/utils/logger.py +2 -3
- novel_downloader/utils/network.py +53 -39
- novel_downloader/utils/text_utils/chapter_formatting.py +6 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -1
- novel_downloader/utils/text_utils/text_cleaning.py +1 -1
- novel_downloader/utils/time_utils/datetime_utils.py +3 -3
- novel_downloader/utils/time_utils/sleep_utils.py +3 -3
- {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/METADATA +72 -38
- novel_downloader-1.4.0.dist-info/RECORD +170 -0
- {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/WHEEL +1 -1
- {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/entry_points.txt +1 -0
- novel_downloader/cli/interactive.py +0 -66
- novel_downloader/cli/settings.py +0 -177
- novel_downloader/config/models.py +0 -187
- novel_downloader/core/downloaders/base/__init__.py +0 -14
- novel_downloader/core/downloaders/base/base_async.py +0 -153
- novel_downloader/core/downloaders/base/base_sync.py +0 -208
- novel_downloader/core/downloaders/biquge/__init__.py +0 -14
- novel_downloader/core/downloaders/biquge/biquge_async.py +0 -27
- novel_downloader/core/downloaders/biquge/biquge_sync.py +0 -27
- novel_downloader/core/downloaders/common/__init__.py +0 -14
- novel_downloader/core/downloaders/common/common_async.py +0 -218
- novel_downloader/core/downloaders/common/common_sync.py +0 -210
- novel_downloader/core/downloaders/esjzone/__init__.py +0 -14
- novel_downloader/core/downloaders/esjzone/esjzone_async.py +0 -27
- novel_downloader/core/downloaders/esjzone/esjzone_sync.py +0 -27
- novel_downloader/core/downloaders/qianbi/__init__.py +0 -14
- novel_downloader/core/downloaders/qianbi/qianbi_async.py +0 -27
- novel_downloader/core/downloaders/qianbi/qianbi_sync.py +0 -27
- novel_downloader/core/downloaders/qidian/__init__.py +0 -10
- novel_downloader/core/downloaders/qidian/qidian_sync.py +0 -227
- novel_downloader/core/downloaders/sfacg/__init__.py +0 -14
- novel_downloader/core/downloaders/sfacg/sfacg_async.py +0 -27
- novel_downloader/core/downloaders/sfacg/sfacg_sync.py +0 -27
- novel_downloader/core/downloaders/yamibo/__init__.py +0 -14
- novel_downloader/core/downloaders/yamibo/yamibo_async.py +0 -27
- novel_downloader/core/downloaders/yamibo/yamibo_sync.py +0 -27
- novel_downloader/core/factory/requester.py +0 -144
- novel_downloader/core/factory/saver.py +0 -56
- novel_downloader/core/interfaces/async_downloader.py +0 -36
- novel_downloader/core/interfaces/async_requester.py +0 -84
- novel_downloader/core/interfaces/sync_downloader.py +0 -36
- novel_downloader/core/interfaces/sync_requester.py +0 -82
- novel_downloader/core/parsers/qidian/browser/__init__.py +0 -12
- novel_downloader/core/parsers/qidian/browser/chapter_normal.py +0 -93
- novel_downloader/core/parsers/qidian/browser/chapter_router.py +0 -71
- novel_downloader/core/parsers/qidian/session/__init__.py +0 -12
- novel_downloader/core/parsers/qidian/session/chapter_encrypted.py +0 -443
- novel_downloader/core/parsers/qidian/session/chapter_normal.py +0 -115
- novel_downloader/core/parsers/qidian/session/main_parser.py +0 -128
- novel_downloader/core/parsers/qidian/shared/__init__.py +0 -37
- novel_downloader/core/parsers/qidian/shared/book_info_parser.py +0 -150
- novel_downloader/core/requesters/base/async_session.py +0 -410
- novel_downloader/core/requesters/base/browser.py +0 -337
- novel_downloader/core/requesters/base/session.py +0 -378
- novel_downloader/core/requesters/biquge/__init__.py +0 -14
- novel_downloader/core/requesters/common/__init__.py +0 -17
- novel_downloader/core/requesters/common/session.py +0 -113
- novel_downloader/core/requesters/esjzone/__init__.py +0 -13
- novel_downloader/core/requesters/esjzone/session.py +0 -235
- novel_downloader/core/requesters/qianbi/__init__.py +0 -13
- novel_downloader/core/requesters/qidian/__init__.py +0 -21
- novel_downloader/core/requesters/qidian/broswer.py +0 -307
- novel_downloader/core/requesters/qidian/session.py +0 -290
- novel_downloader/core/requesters/sfacg/__init__.py +0 -13
- novel_downloader/core/requesters/sfacg/session.py +0 -242
- novel_downloader/core/requesters/yamibo/__init__.py +0 -13
- novel_downloader/core/requesters/yamibo/session.py +0 -237
- novel_downloader/core/savers/__init__.py +0 -34
- novel_downloader/core/savers/biquge.py +0 -25
- novel_downloader/core/savers/common/__init__.py +0 -12
- novel_downloader/core/savers/epub_utils/__init__.py +0 -26
- novel_downloader/core/savers/esjzone.py +0 -25
- novel_downloader/core/savers/qianbi.py +0 -25
- novel_downloader/core/savers/sfacg.py +0 -25
- novel_downloader/core/savers/yamibo.py +0 -25
- novel_downloader/resources/config/rules.toml +0 -196
- novel_downloader-1.3.2.dist-info/RECORD +0 -165
- {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.3.2.dist-info → novel_downloader-1.4.0.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,10 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.qidian.
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.chapter_encrypted
|
4
|
+
------------------------------------------------------
|
5
5
|
|
6
6
|
Support for parsing encrypted chapters from Qidian using font OCR mapping,
|
7
7
|
CSS rules, and custom rendering logic.
|
8
|
-
|
9
|
-
Includes:
|
10
|
-
- Font downloading and caching
|
11
|
-
- Encrypted paragraph extraction
|
12
|
-
- Custom CSS parsing and layout restoration
|
13
|
-
- Font-based OCR decryption and mapping
|
14
8
|
"""
|
15
9
|
|
16
10
|
from __future__ import annotations
|
@@ -21,27 +15,30 @@ from pathlib import Path
|
|
21
15
|
from typing import TYPE_CHECKING, Any
|
22
16
|
|
23
17
|
import tinycss2
|
24
|
-
from
|
18
|
+
from lxml import html
|
25
19
|
|
26
|
-
from novel_downloader.
|
20
|
+
from novel_downloader.models import ChapterDict
|
27
21
|
from novel_downloader.utils.network import download_font_file
|
28
22
|
from novel_downloader.utils.text_utils import apply_font_mapping
|
29
23
|
|
30
|
-
from
|
24
|
+
from .utils import (
|
31
25
|
extract_chapter_info,
|
32
26
|
find_ssr_page_context,
|
27
|
+
get_decryptor,
|
28
|
+
vip_status,
|
33
29
|
)
|
34
30
|
|
35
31
|
if TYPE_CHECKING:
|
36
|
-
from .main_parser import
|
32
|
+
from .main_parser import QidianParser
|
37
33
|
|
38
34
|
logger = logging.getLogger(__name__)
|
39
35
|
IGNORED_CLASS_LISTS = {"title", "review"}
|
36
|
+
NON_CONTENT_KEYWORDS = {"旧版", "反馈", "扫码"}
|
40
37
|
|
41
38
|
|
42
39
|
def parse_encrypted_chapter(
|
43
|
-
parser:
|
44
|
-
|
40
|
+
parser: QidianParser,
|
41
|
+
html_str: str,
|
45
42
|
chapter_id: str,
|
46
43
|
) -> ChapterDict | None:
|
47
44
|
"""
|
@@ -52,9 +49,8 @@ def parse_encrypted_chapter(
|
|
52
49
|
3. Decode and save randomFont bytes; download fixedFont via download_font().
|
53
50
|
4. Extract paragraph structures and save debug JSON.
|
54
51
|
5. Parse CSS rules and save debug JSON.
|
55
|
-
6.
|
56
|
-
7.
|
57
|
-
8. Extracts paragraph texts and formats them.
|
52
|
+
6. Render encrypted paragraphs, then run OCR font-mapping.
|
53
|
+
7. Extracts paragraph texts and formats them.
|
58
54
|
|
59
55
|
:param html_str: Raw HTML content of the chapter page.
|
60
56
|
:return: Formatted chapter text or empty string if not parsable.
|
@@ -62,13 +58,14 @@ def parse_encrypted_chapter(
|
|
62
58
|
try:
|
63
59
|
if not (parser._decode_font and parser._font_ocr):
|
64
60
|
return None
|
65
|
-
ssr_data = find_ssr_page_context(
|
61
|
+
ssr_data = find_ssr_page_context(html_str)
|
66
62
|
chapter_info = extract_chapter_info(ssr_data)
|
67
63
|
if not chapter_info:
|
68
64
|
logger.warning(
|
69
65
|
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
70
66
|
)
|
71
67
|
return None
|
68
|
+
|
72
69
|
debug_base_dir: Path | None = None
|
73
70
|
if parser._font_debug_dir:
|
74
71
|
debug_base_dir = parser._font_debug_dir / chapter_id
|
@@ -79,7 +76,9 @@ def parse_encrypted_chapter(
|
|
79
76
|
fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
|
80
77
|
|
81
78
|
title = chapter_info.get("chapterName", "Untitled")
|
82
|
-
|
79
|
+
raw_html = chapter_info.get("content", "")
|
80
|
+
chapter_id = chapter_info.get("chapterId", chapter_id)
|
81
|
+
fkp = chapter_info.get("fkp", "")
|
83
82
|
author_say = chapter_info.get("authorSay", "")
|
84
83
|
update_time = chapter_info.get("updateTime", "")
|
85
84
|
update_timestamp = chapter_info.get("updateTimestamp", 0)
|
@@ -101,7 +100,26 @@ def parse_encrypted_chapter(
|
|
101
100
|
raise ValueError("fixed_path is None: failed to download font")
|
102
101
|
|
103
102
|
# Extract and render paragraphs from HTML with CSS rules
|
104
|
-
main_paragraphs = extract_paragraphs_recursively(
|
103
|
+
main_paragraphs = extract_paragraphs_recursively(html_str, chapter_id)
|
104
|
+
if not main_paragraphs or contains_keywords(
|
105
|
+
main_paragraphs, NON_CONTENT_KEYWORDS
|
106
|
+
):
|
107
|
+
if vip_status(ssr_data):
|
108
|
+
try:
|
109
|
+
decryptor = get_decryptor()
|
110
|
+
raw_html = decryptor.decrypt(
|
111
|
+
raw_html,
|
112
|
+
chapter_id,
|
113
|
+
fkp,
|
114
|
+
parser._fuid,
|
115
|
+
)
|
116
|
+
except Exception as e:
|
117
|
+
logger.error(
|
118
|
+
"[Parser] decryption failed for '%s': %s", chapter_id, e
|
119
|
+
)
|
120
|
+
return None
|
121
|
+
main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
|
122
|
+
|
105
123
|
if debug_base_dir:
|
106
124
|
main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
|
107
125
|
main_paragraphs_path.write_text(
|
@@ -117,23 +135,11 @@ def parse_encrypted_chapter(
|
|
117
135
|
encoding="utf-8",
|
118
136
|
)
|
119
137
|
|
120
|
-
|
121
|
-
end_number = parse_end_number(main_paragraphs, paragraph_names)
|
122
|
-
if debug_base_dir:
|
123
|
-
paragraphs_rules_path = debug_base_dir / "paragraph_names_debug.txt"
|
124
|
-
temp = f"names:\n{paragraph_names}\n\nend_number: {end_number}"
|
125
|
-
paragraphs_rules_path.write_text(
|
126
|
-
temp,
|
127
|
-
encoding="utf-8",
|
128
|
-
)
|
129
|
-
if not end_number:
|
130
|
-
logger.warning(
|
131
|
-
f"[Parser] No end_number found after parsing chapter '{chapter_id}'"
|
132
|
-
)
|
133
|
-
return None
|
134
|
-
|
138
|
+
end_number = parse_end_number(main_paragraphs, paragraphs_rules)
|
135
139
|
paragraphs_str, refl_list = render_paragraphs(
|
136
|
-
main_paragraphs,
|
140
|
+
main_paragraphs,
|
141
|
+
paragraphs_rules,
|
142
|
+
end_number,
|
137
143
|
)
|
138
144
|
if debug_base_dir:
|
139
145
|
paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
|
@@ -173,7 +179,7 @@ def parse_encrypted_chapter(
|
|
173
179
|
)
|
174
180
|
return {
|
175
181
|
"id": str(chapter_id),
|
176
|
-
"title": title,
|
182
|
+
"title": str(title),
|
177
183
|
"content": final_paragraphs_str,
|
178
184
|
"extra": {
|
179
185
|
"author_say": author_say.strip() if author_say else "",
|
@@ -183,6 +189,7 @@ def parse_encrypted_chapter(
|
|
183
189
|
"word_count": word_count,
|
184
190
|
"seq": seq,
|
185
191
|
"volume": volume,
|
192
|
+
"encrypted": True,
|
186
193
|
},
|
187
194
|
}
|
188
195
|
|
@@ -194,48 +201,46 @@ def parse_encrypted_chapter(
|
|
194
201
|
|
195
202
|
|
196
203
|
def extract_paragraphs_recursively(
|
197
|
-
|
204
|
+
html_str: str,
|
205
|
+
chapter_id: str,
|
198
206
|
) -> list[dict[str, Any]]:
|
199
|
-
|
200
|
-
|
201
|
-
|
207
|
+
def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
|
208
|
+
class_attr = elem.attrib.get("class", "")
|
209
|
+
class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
|
210
|
+
if "review" in class_list:
|
211
|
+
return {}
|
212
|
+
|
213
|
+
# Build attrs with class as list
|
214
|
+
attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
|
215
|
+
|
216
|
+
node: dict[str, Any] = {
|
217
|
+
"tag": elem.tag,
|
218
|
+
"attrs": attrs,
|
219
|
+
"data": [],
|
220
|
+
}
|
202
221
|
|
203
|
-
|
204
|
-
|
222
|
+
# Append entire elem.text if present (no splitting)
|
223
|
+
if elem.text:
|
224
|
+
node["data"].append(elem.text)
|
205
225
|
|
206
|
-
|
207
|
-
|
226
|
+
# Recurse into children
|
227
|
+
for child in elem.iterchildren(tag=None):
|
228
|
+
child_dict = parse_element(child)
|
229
|
+
if child_dict:
|
230
|
+
node["data"].append(child_dict)
|
208
231
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
result["data"].append(text)
|
222
|
-
return result
|
223
|
-
|
224
|
-
if chapter_id:
|
225
|
-
main_id = f"c-{chapter_id}"
|
226
|
-
main_tag = soup.find("main", id=main_id)
|
227
|
-
if not isinstance(main_tag, Tag):
|
228
|
-
return []
|
229
|
-
else:
|
230
|
-
main_tag = soup
|
231
|
-
|
232
|
-
result = []
|
233
|
-
for p in main_tag.find_all("p"):
|
234
|
-
parsed_p = parse_element(p)
|
235
|
-
if parsed_p:
|
236
|
-
result.append(parsed_p)
|
237
|
-
|
238
|
-
return result
|
232
|
+
# Append entire tail string (no split)
|
233
|
+
if child.tail:
|
234
|
+
node["data"].append(child.tail)
|
235
|
+
|
236
|
+
return node
|
237
|
+
|
238
|
+
tree = html.fromstring(html_str)
|
239
|
+
|
240
|
+
# Try to find <main id="c-{chapter_id}">
|
241
|
+
main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
|
242
|
+
search_root = main_elem[0] if main_elem else tree
|
243
|
+
return [parse_element(p) for p in search_root.findall(".//p")]
|
239
244
|
|
240
245
|
|
241
246
|
def parse_rule(css_str: str) -> dict[str, Any]:
|
@@ -317,68 +322,10 @@ def parse_rule(css_str: str) -> dict[str, Any]:
|
|
317
322
|
return {"rules": rules, "orders": orders}
|
318
323
|
|
319
324
|
|
320
|
-
def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
|
321
|
-
"""
|
322
|
-
Extract all paragraph selector names from parsed rules, excluding "sy".
|
323
|
-
"""
|
324
|
-
paragraph_names = set()
|
325
|
-
for group, group_rules in rules.get("rules", {}).items():
|
326
|
-
if group == "sy":
|
327
|
-
continue
|
328
|
-
paragraph_names.update(group_rules.keys())
|
329
|
-
return paragraph_names
|
330
|
-
|
331
|
-
|
332
|
-
def parse_end_number(
|
333
|
-
main_paragraphs: list[dict[str, Any]], paragraph_names: set[str]
|
334
|
-
) -> int | None:
|
335
|
-
"""
|
336
|
-
Find the most frequent numeric suffix from tag names
|
337
|
-
matched by given paragraph prefixes.
|
338
|
-
"""
|
339
|
-
end_numbers: dict[int, int] = {}
|
340
|
-
sorted_names = sorted(paragraph_names, key=len, reverse=True)
|
341
|
-
|
342
|
-
def rec_parse(item: list[Any] | dict[str, Any]) -> None:
|
343
|
-
if isinstance(item, list):
|
344
|
-
for element in item:
|
345
|
-
rec_parse(element)
|
346
|
-
elif isinstance(item, dict):
|
347
|
-
tag = item.get("tag")
|
348
|
-
if isinstance(tag, str):
|
349
|
-
for prefix in sorted_names:
|
350
|
-
if tag.startswith(prefix):
|
351
|
-
remain = tag[len(prefix) :]
|
352
|
-
if remain.isdigit():
|
353
|
-
num = int(remain)
|
354
|
-
end_numbers[num] = end_numbers.get(num, 0) + 1
|
355
|
-
break
|
356
|
-
for val in item.values():
|
357
|
-
if isinstance(val, (list | dict)):
|
358
|
-
rec_parse(val)
|
359
|
-
|
360
|
-
rec_parse(main_paragraphs)
|
361
|
-
|
362
|
-
if not end_numbers:
|
363
|
-
logger.warning("[Parser] No valid ending numbers found")
|
364
|
-
return None
|
365
|
-
|
366
|
-
sorted_numbers = sorted(
|
367
|
-
end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
|
368
|
-
)
|
369
|
-
|
370
|
-
logger.debug(
|
371
|
-
"[Parser] Top 3 end numbers:\n%s",
|
372
|
-
"\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
|
373
|
-
)
|
374
|
-
|
375
|
-
return sorted_numbers[0][0]
|
376
|
-
|
377
|
-
|
378
325
|
def render_paragraphs(
|
379
326
|
main_paragraphs: list[dict[str, Any]],
|
380
327
|
rules: dict[str, Any],
|
381
|
-
end_number:
|
328
|
+
end_number: str = "",
|
382
329
|
) -> tuple[str, list[str]]:
|
383
330
|
"""
|
384
331
|
Applies the parsed CSS rules to the paragraph structure and
|
@@ -392,7 +339,6 @@ def render_paragraphs(
|
|
392
339
|
:param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
|
393
340
|
- rules['orders']: List of (selector, id) tuples.
|
394
341
|
- rules['rules']: Nested dict containing transformation rules.
|
395
|
-
:param end_number: HTML tag suffix (e.g. span123 -> 123).
|
396
342
|
|
397
343
|
:return:
|
398
344
|
- A reconstructed paragraph string with line breaks.
|
@@ -488,3 +434,84 @@ def render_paragraphs(
|
|
488
434
|
paragraphs_str += "\n\n"
|
489
435
|
|
490
436
|
return paragraphs_str, refl_list
|
437
|
+
|
438
|
+
|
439
|
+
def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
    """
    Collect every selector name found under ``rules["rules"]``.

    The ``"sy"`` group is skipped; the keys of every other group are merged
    into one set.

    :param rules: Parsed CSS rules; the nested mapping is read from the
                  ``"rules"`` key (a missing key is treated as empty).
    :return: Set of selector names from all groups except ``"sy"``.
    """
    paragraph_names: set[str] = set()
    for group, group_rules in rules.get("rules", {}).items():
        if group != "sy":
            paragraph_names.update(group_rules.keys())
    return paragraph_names
|
449
|
+
|
450
|
+
|
451
|
+
def parse_end_number(
    main_paragraphs: list[dict[str, Any]],
    rules: dict[str, Any],
) -> str:
    """
    Find the dominant numeric suffix of tags that start with a paragraph name.

    The paragraph tree is walked recursively. For every dict whose ``"tag"``
    is a string starting with one of the names returned by
    ``parse_paragraph_names(rules)``, the remainder after the prefix is
    counted when it is purely numeric.

    :param main_paragraphs: Nested paragraph nodes (dicts / lists / strings).
    :param rules: Parsed CSS rules, forwarded to ``parse_paragraph_names``.
    :return: The most frequent suffix as a string, or ``""`` when no numeric
             suffix was seen or the top suffix accounts for no more than
             half of all prefix matches.
    """
    paragraph_names = parse_paragraph_names(rules)
    end_numbers: dict[int, int] = {}
    prefix_hits = 0
    # Longest names first, so a longer name is tried before a shorter one
    # that it also starts with.
    sorted_names = sorted(paragraph_names, key=len, reverse=True)

    def rec_parse(item: list[Any] | dict[str, Any]) -> None:
        nonlocal prefix_hits
        if isinstance(item, list):
            for element in item:
                rec_parse(element)
        elif isinstance(item, dict):
            tag = item.get("tag")
            if isinstance(tag, str):
                for prefix in sorted_names:
                    if tag.startswith(prefix):
                        # Every prefix match counts toward the threshold,
                        # even when the remainder is not numeric.
                        prefix_hits += 1
                        remain = tag[len(prefix) :]
                        if remain.isdigit():
                            num = int(remain)
                            end_numbers[num] = end_numbers.get(num, 0) + 1
                        break
            for val in item.values():
                # Tuple form instead of ``list | dict``: the union operator
                # on classes only exists at runtime from Python 3.10 on.
                if isinstance(val, (list, dict)):
                    rec_parse(val)

    rec_parse(main_paragraphs)

    if not end_numbers:
        logger.debug("[Parser] No valid ending numbers found")
        return ""

    # Highest count first; ties are broken by the larger number.
    sorted_numbers = sorted(
        end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
    )

    logger.debug(
        "[Parser] Top 3 end numbers:\n%s",
        "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
    )
    most_common_number, most_common_count = sorted_numbers[0]
    if most_common_count <= prefix_hits / 2:
        logger.debug(
            "[Parser] Top number (%s) does not exceed 50%% threshold: %d of %d",
            most_common_number,
            most_common_count,
            prefix_hits,
        )
        return ""

    # NOTE: the suffix went through int(), so leading zeros are not kept.
    return str(most_common_number)
|
509
|
+
|
510
|
+
|
511
|
+
def contains_keywords(paragraphs: list[dict[str, Any]], keywords: set[str]) -> bool:
    """
    Report whether any top-level text fragment contains one of ``keywords``.

    Only string entries sitting directly in each paragraph's ``"data"`` list
    are inspected; nested element dicts are not descended into.

    :param paragraphs: Paragraph nodes; a node without ``"data"`` is skipped.
    :param keywords: Substrings to look for.
    :return: True as soon as one fragment contains a keyword, else False.
    """
    return any(
        isinstance(item, str) and any(kw in item for kw in keywords)
        for para in paragraphs
        for item in para.get("data", [])
    )
|
@@ -0,0 +1,150 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.parsers.qidian.chapter_normal
|
4
|
+
---------------------------------------------------
|
5
|
+
|
6
|
+
Parser logic for extracting readable text from Qidian chapters
|
7
|
+
that use plain (non-encrypted) browser-rendered HTML.
|
8
|
+
"""
|
9
|
+
|
10
|
+
from __future__ import annotations
|
11
|
+
|
12
|
+
import logging
|
13
|
+
from typing import TYPE_CHECKING
|
14
|
+
|
15
|
+
from lxml import html
|
16
|
+
|
17
|
+
from novel_downloader.models import ChapterDict
|
18
|
+
|
19
|
+
from .utils import (
|
20
|
+
extract_chapter_info,
|
21
|
+
find_ssr_page_context,
|
22
|
+
get_decryptor,
|
23
|
+
vip_status,
|
24
|
+
)
|
25
|
+
|
26
|
+
if TYPE_CHECKING:
|
27
|
+
from .main_parser import QidianParser
|
28
|
+
|
29
|
+
logger = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
|
32
|
+
def parse_normal_chapter(
    parser: QidianParser,
    html_str: str,
    chapter_id: str,
) -> ChapterDict | None:
    """
    Extract structured chapter info from a normal (non-encrypted) Qidian page.

    Text is first taken from the rendered page via
    ``_parse_browser_paragraph``; when that yields nothing, the ``content``
    field of the embedded chapter info is parsed via
    ``_parse_session_paragraph`` instead.

    :param parser: Parser instance; only ``parser._fuid`` is read here.
    :param html_str: Chapter HTML.
    :param chapter_id: Chapter identifier (string); replaced by the page's
                       own ``chapterId`` when that field is present.
    :return: A dictionary with ``id``, ``title``, ``content`` and ``extra``,
             or None when the chapter info or the text cannot be obtained.
    """
    try:
        ssr_data = find_ssr_page_context(html_str)
        chapter_info = extract_chapter_info(ssr_data)
        if not chapter_info:
            logger.warning(
                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
            )
            return None

        title = chapter_info.get("chapterName", "Untitled")
        raw_html = chapter_info.get("content", "")
        chapter_id = chapter_info.get("chapterId", chapter_id)
        fkp = chapter_info.get("fkp", "")
        author_say = chapter_info.get("authorSay", "")
        update_time = chapter_info.get("updateTime", "")
        update_timestamp = chapter_info.get("updateTimestamp", 0)
        modify_time = chapter_info.get("modifyTime", 0)
        word_count = chapter_info.get("wordsCount", 0)
        seq = chapter_info.get("seq", None)
        # ``or {}`` also covers an explicit null "extra", which would
        # otherwise raise and discard the whole chapter.
        volume = (chapter_info.get("extra") or {}).get("volumeName", "")

        chapter_text = _parse_browser_paragraph(html_str)
        if not chapter_text:
            # Fall back to the raw content carried in the chapter info.
            chapter_text = _parse_session_paragraph(
                html_str=raw_html,
                is_vip=vip_status(ssr_data),
                chapter_id=chapter_id,
                fkp=fkp,
                fuid=parser._fuid,
            )
        if not chapter_text:
            return None

        return {
            "id": str(chapter_id),
            # Coerced like the id, matching the encrypted-chapter parser.
            "title": str(title),
            "content": chapter_text,
            "extra": {
                "author_say": author_say.strip() if author_say else "",
                "updated_at": update_time,
                "update_timestamp": update_timestamp,
                "modify_time": modify_time,
                "word_count": word_count,
                "seq": seq,
                "volume": volume,
                "encrypted": False,
            },
        }
    except Exception as e:
        logger.warning(
            "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
        )
    return None
|
97
|
+
|
98
|
+
|
99
|
+
def _parse_browser_paragraph(html_str: str) -> str:
    """
    Pull paragraph text out of a browser-rendered chapter page.

    Looks for ``<main>`` under ``div#app`` / ``div#reader-content`` and joins
    the stripped text of every ``span`` whose class contains
    ``content-text``, skipping empty ones.

    :param html_str: Full chapter page HTML.
    :return: Paragraphs joined by blank lines, or ``""`` when the ``<main>``
             element is missing or parsing fails.
    """
    try:
        tree = html.fromstring(html_str)
        main_nodes = tree.xpath('//div[@id="app"]//div[@id="reader-content"]//main')
        if not main_nodes:
            return ""

        content_spans = main_nodes[0].xpath(
            './/span[contains(@class, "content-text")]'
        )
        # Strip once per span, then drop the empty results.
        stripped = (span.text_content().strip() for span in content_spans)
        return "\n\n".join(text for text in stripped if text)

    except Exception as e:
        # Name this helper explicitly so the log line can be told apart from
        # the one emitted by _parse_session_paragraph.
        logger.error("[Parser] _parse_browser_paragraph failed: %s", e)
        return ""
|
121
|
+
|
122
|
+
|
123
|
+
def _parse_session_paragraph(
    html_str: str,
    is_vip: bool,
    chapter_id: str,
    fkp: str,
    fuid: str,
) -> str:
    """
    Pull paragraph text out of the raw chapter content.

    When ``is_vip`` is true the content is first passed through the object
    returned by ``get_decryptor()``; the result is then parsed as HTML and
    the stripped text of every ``<p>`` is joined, skipping empty ones.

    :param html_str: Raw chapter content (HTML fragment).
    :param is_vip: Whether the content has to be decrypted first.
    :param chapter_id: Chapter identifier, forwarded to the decryptor.
    :param fkp: Value forwarded to the decryptor.
    :param fuid: Value forwarded to the decryptor.
    :return: Paragraphs joined by blank lines, or ``""`` when the content is
             empty, decryption fails, or parsing fails.
    """
    try:
        raw_html = html_str

        if is_vip:
            try:
                decryptor = get_decryptor()
                raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, fuid)
            except Exception as e:
                logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
                return ""

        if not raw_html:
            # Nothing to parse; avoid handing an empty document to lxml,
            # which would be reported below as a parse failure.
            return ""

        tree = html.fromstring(raw_html)
        stripped = (p.text_content().strip() for p in tree.xpath(".//p"))
        return "\n\n".join(text for text in stripped if text)

    except Exception as e:
        # Name this helper explicitly so the log line can be told apart from
        # the one emitted by _parse_browser_paragraph.
        logger.error("[Parser] _parse_session_paragraph failed: %s", e)
        return ""
|
@@ -1,9 +1,9 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.qidian.
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.qidian.chapter_router
|
4
|
+
---------------------------------------------------
|
5
5
|
|
6
|
-
Routing logic for selecting the correct chapter parser for Qidian
|
6
|
+
Routing logic for selecting the correct chapter parser for Qidian pages.
|
7
7
|
"""
|
8
8
|
|
9
9
|
from __future__ import annotations
|
@@ -11,50 +11,50 @@ from __future__ import annotations
|
|
11
11
|
import logging
|
12
12
|
from typing import TYPE_CHECKING
|
13
13
|
|
14
|
-
from novel_downloader.
|
14
|
+
from novel_downloader.models import ChapterDict
|
15
15
|
|
16
|
-
from
|
16
|
+
from .chapter_normal import parse_normal_chapter
|
17
|
+
from .utils import (
|
17
18
|
can_view_chapter,
|
18
|
-
|
19
|
+
find_ssr_page_context,
|
19
20
|
is_encrypted,
|
20
21
|
)
|
21
|
-
from .chapter_normal import parse_normal_chapter
|
22
22
|
|
23
23
|
if TYPE_CHECKING:
|
24
|
-
from .main_parser import
|
24
|
+
from .main_parser import QidianParser
|
25
25
|
|
26
26
|
logger = logging.getLogger(__name__)
|
27
27
|
|
28
28
|
|
29
29
|
def parse_chapter(
|
30
|
-
parser:
|
30
|
+
parser: QidianParser,
|
31
31
|
html_str: str,
|
32
32
|
chapter_id: str,
|
33
33
|
) -> ChapterDict | None:
|
34
34
|
"""
|
35
35
|
Extract and return the formatted textual content of chapter.
|
36
36
|
|
37
|
-
:param parser: Instance of
|
37
|
+
:param parser: Instance of QidianParser.
|
38
38
|
:param html_str: Raw HTML content of the chapter page.
|
39
39
|
:param chapter_id: Identifier of the chapter being parsed.
|
40
40
|
:return: Formatted chapter text or empty string if not parsable.
|
41
41
|
"""
|
42
42
|
try:
|
43
|
-
|
43
|
+
ssr_data = find_ssr_page_context(html_str)
|
44
44
|
|
45
|
-
if not can_view_chapter(
|
45
|
+
if not can_view_chapter(ssr_data):
|
46
46
|
logger.warning(
|
47
47
|
"[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
|
48
48
|
)
|
49
49
|
return None
|
50
50
|
|
51
|
-
if is_encrypted(
|
51
|
+
if is_encrypted(ssr_data):
|
52
52
|
if not parser._decode_font:
|
53
53
|
return None
|
54
54
|
try:
|
55
55
|
from .chapter_encrypted import parse_encrypted_chapter
|
56
56
|
|
57
|
-
return parse_encrypted_chapter(parser,
|
57
|
+
return parse_encrypted_chapter(parser, html_str, chapter_id)
|
58
58
|
except ImportError:
|
59
59
|
logger.warning(
|
60
60
|
"[Parser] Encrypted chapter '%s' requires extra dependencies.",
|
@@ -62,7 +62,7 @@ def parse_chapter(
|
|
62
62
|
)
|
63
63
|
return None
|
64
64
|
|
65
|
-
return parse_normal_chapter(
|
65
|
+
return parse_normal_chapter(parser, html_str, chapter_id)
|
66
66
|
except Exception as e:
|
67
67
|
logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
|
68
68
|
return None
|