novel-downloader 2.0.1__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +11 -8
- novel_downloader/cli/export.py +17 -17
- novel_downloader/cli/ui.py +28 -1
- novel_downloader/config/adapter.py +27 -1
- novel_downloader/core/archived/deqixs/fetcher.py +1 -28
- novel_downloader/core/downloaders/__init__.py +2 -0
- novel_downloader/core/downloaders/base.py +34 -85
- novel_downloader/core/downloaders/common.py +147 -171
- novel_downloader/core/downloaders/qianbi.py +30 -64
- novel_downloader/core/downloaders/qidian.py +157 -184
- novel_downloader/core/downloaders/qqbook.py +292 -0
- novel_downloader/core/downloaders/registry.py +2 -2
- novel_downloader/core/exporters/__init__.py +2 -0
- novel_downloader/core/exporters/base.py +37 -59
- novel_downloader/core/exporters/common.py +620 -0
- novel_downloader/core/exporters/linovelib.py +47 -0
- novel_downloader/core/exporters/qidian.py +41 -12
- novel_downloader/core/exporters/qqbook.py +28 -0
- novel_downloader/core/exporters/registry.py +2 -2
- novel_downloader/core/fetchers/__init__.py +4 -2
- novel_downloader/core/fetchers/aaatxt.py +2 -22
- novel_downloader/core/fetchers/b520.py +3 -23
- novel_downloader/core/fetchers/base.py +80 -105
- novel_downloader/core/fetchers/biquyuedu.py +2 -22
- novel_downloader/core/fetchers/dxmwx.py +10 -22
- novel_downloader/core/fetchers/esjzone.py +6 -29
- novel_downloader/core/fetchers/guidaye.py +2 -22
- novel_downloader/core/fetchers/hetushu.py +9 -29
- novel_downloader/core/fetchers/i25zw.py +2 -16
- novel_downloader/core/fetchers/ixdzs8.py +2 -16
- novel_downloader/core/fetchers/jpxs123.py +2 -16
- novel_downloader/core/fetchers/lewenn.py +2 -22
- novel_downloader/core/fetchers/linovelib.py +4 -20
- novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
- novel_downloader/core/fetchers/piaotia.py +2 -16
- novel_downloader/core/fetchers/qbtr.py +2 -16
- novel_downloader/core/fetchers/qianbi.py +1 -20
- novel_downloader/core/fetchers/qidian.py +7 -33
- novel_downloader/core/fetchers/qqbook.py +177 -0
- novel_downloader/core/fetchers/quanben5.py +9 -29
- novel_downloader/core/fetchers/rate_limiter.py +22 -53
- novel_downloader/core/fetchers/sfacg.py +3 -16
- novel_downloader/core/fetchers/shencou.py +2 -16
- novel_downloader/core/fetchers/shuhaige.py +2 -22
- novel_downloader/core/fetchers/tongrenquan.py +2 -22
- novel_downloader/core/fetchers/ttkan.py +3 -14
- novel_downloader/core/fetchers/wanbengo.py +2 -22
- novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
- novel_downloader/core/fetchers/xiguashuwu.py +4 -20
- novel_downloader/core/fetchers/xs63b.py +3 -15
- novel_downloader/core/fetchers/xshbook.py +2 -22
- novel_downloader/core/fetchers/yamibo.py +4 -28
- novel_downloader/core/fetchers/yibige.py +13 -26
- novel_downloader/core/interfaces/exporter.py +19 -7
- novel_downloader/core/interfaces/fetcher.py +21 -47
- novel_downloader/core/parsers/__init__.py +4 -2
- novel_downloader/core/parsers/b520.py +2 -2
- novel_downloader/core/parsers/base.py +4 -39
- novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/core/parsers/{qidian/main_parser.py → qidian.py} +147 -266
- novel_downloader/core/parsers/qqbook.py +709 -0
- novel_downloader/core/parsers/xiguashuwu.py +3 -4
- novel_downloader/core/searchers/__init__.py +2 -2
- novel_downloader/core/searchers/b520.py +1 -1
- novel_downloader/core/searchers/base.py +2 -2
- novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/models/__init__.py +2 -0
- novel_downloader/models/book.py +1 -0
- novel_downloader/models/config.py +12 -0
- novel_downloader/resources/config/settings.toml +23 -5
- novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
- novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
- novel_downloader/utils/constants.py +6 -0
- novel_downloader/utils/crypto_utils/aes_util.py +1 -1
- novel_downloader/utils/epub/constants.py +1 -6
- novel_downloader/utils/fontocr/core.py +2 -0
- novel_downloader/utils/fontocr/loader.py +10 -8
- novel_downloader/utils/node_decryptor/__init__.py +13 -0
- novel_downloader/utils/node_decryptor/decryptor.py +342 -0
- novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
- novel_downloader/web/pages/download.py +1 -1
- novel_downloader/web/pages/search.py +1 -1
- novel_downloader/web/services/task_manager.py +2 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +4 -1
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/RECORD +91 -94
- novel_downloader/core/exporters/common/__init__.py +0 -11
- novel_downloader/core/exporters/common/epub.py +0 -198
- novel_downloader/core/exporters/common/main_exporter.py +0 -64
- novel_downloader/core/exporters/common/txt.py +0 -146
- novel_downloader/core/exporters/epub_util.py +0 -215
- novel_downloader/core/exporters/linovelib/__init__.py +0 -11
- novel_downloader/core/exporters/linovelib/epub.py +0 -349
- novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
- novel_downloader/core/exporters/linovelib/txt.py +0 -139
- novel_downloader/core/exporters/txt_util.py +0 -67
- novel_downloader/core/parsers/qidian/__init__.py +0 -10
- novel_downloader/core/parsers/qidian/utils/__init__.py +0 -11
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.n8novel
|
4
|
+
-------------------------------------
|
5
5
|
|
6
6
|
"""
|
7
7
|
|
@@ -21,9 +21,9 @@ from novel_downloader.models import (
|
|
21
21
|
|
22
22
|
|
23
23
|
@register_parser(
|
24
|
-
site_keys=["
|
24
|
+
site_keys=["n8novel", "8novel"],
|
25
25
|
)
|
26
|
-
class
|
26
|
+
class N8novelParser(BaseParser):
|
27
27
|
"""
|
28
28
|
Parser for 无限轻小说 book pages.
|
29
29
|
"""
|
@@ -177,7 +177,7 @@ class EightnovelParser(BaseParser):
|
|
177
177
|
"id": chapter_id,
|
178
178
|
"title": title,
|
179
179
|
"content": content,
|
180
|
-
"extra": {"site": "
|
180
|
+
"extra": {"site": "n8novel"},
|
181
181
|
}
|
182
182
|
|
183
183
|
@staticmethod
|
@@ -1,16 +1,14 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
novel_downloader.core.parsers.qidian
|
4
|
-
|
3
|
+
novel_downloader.core.parsers.qidian
|
4
|
+
------------------------------------
|
5
5
|
|
6
|
-
Main parser class for handling Qidian HTML
|
7
6
|
"""
|
8
7
|
|
9
8
|
from __future__ import annotations
|
10
9
|
|
11
10
|
import json
|
12
11
|
import logging
|
13
|
-
import re
|
14
12
|
from contextlib import suppress
|
15
13
|
from html import unescape
|
16
14
|
from pathlib import Path
|
@@ -34,10 +32,7 @@ from novel_downloader.utils import (
|
|
34
32
|
from novel_downloader.utils.constants import DATA_DIR
|
35
33
|
from novel_downloader.utils.cookies import get_cookie_value
|
36
34
|
from novel_downloader.utils.fontocr import get_font_ocr
|
37
|
-
|
38
|
-
from .utils import (
|
39
|
-
get_decryptor,
|
40
|
-
)
|
35
|
+
from novel_downloader.utils.node_decryptor import get_decryptor
|
41
36
|
|
42
37
|
logger = logging.getLogger(__name__)
|
43
38
|
|
@@ -69,25 +64,16 @@ class QidianParser(BaseParser):
|
|
69
64
|
Parser for 起点中文网 site.
|
70
65
|
"""
|
71
66
|
|
72
|
-
|
73
|
-
_RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
|
74
|
-
_RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
|
75
|
-
|
76
|
-
def __init__(
|
77
|
-
self,
|
78
|
-
config: ParserConfig,
|
79
|
-
fuid: str = "",
|
80
|
-
):
|
67
|
+
def __init__(self, config: ParserConfig, fuid: str = ""):
|
81
68
|
"""
|
82
69
|
Initialize the QidianParser with the given configuration.
|
83
|
-
|
84
|
-
:param config: ParserConfig object controlling:
|
85
70
|
"""
|
86
71
|
super().__init__(config)
|
87
72
|
|
88
|
-
self.
|
89
|
-
self._fixed_font_dir
|
90
|
-
self.
|
73
|
+
self._rand_path = self._base_cache_dir / "qidian" / "randomFont.ttf"
|
74
|
+
self._fixed_font_dir = self._base_cache_dir / "qidian" / "fixed_fonts"
|
75
|
+
self._fixed_map_dir = self._base_cache_dir / "qidian" / "fixed_font_map"
|
76
|
+
self._debug_dir = Path.cwd() / "debug" / "qidian"
|
91
77
|
|
92
78
|
state_files = [
|
93
79
|
DATA_DIR / "qidian" / "session_state.cookies",
|
@@ -99,12 +85,6 @@ class QidianParser(BaseParser):
|
|
99
85
|
html_list: list[str],
|
100
86
|
**kwargs: Any,
|
101
87
|
) -> BookInfoDict | None:
|
102
|
-
"""
|
103
|
-
Parse a book info page and extract metadata and chapter structure.
|
104
|
-
|
105
|
-
:param html_list: Raw HTML of the book info page.
|
106
|
-
:return: Parsed metadata and chapter structure as a dictionary.
|
107
|
-
"""
|
108
88
|
if not html_list:
|
109
89
|
return None
|
110
90
|
|
@@ -172,57 +152,39 @@ class QidianParser(BaseParser):
|
|
172
152
|
chapter_id: str,
|
173
153
|
**kwargs: Any,
|
174
154
|
) -> ChapterDict | None:
|
175
|
-
"""
|
176
|
-
:param html_list: Raw HTML of the chapter page.
|
177
|
-
:param chapter_id: Identifier of the chapter being parsed.
|
178
|
-
:return: Cleaned chapter content as plain text.
|
179
|
-
"""
|
180
155
|
if not html_list:
|
156
|
+
logger.warning("[Parser] chapter_id=%s :: html_list is empty", chapter_id)
|
181
157
|
return None
|
182
158
|
try:
|
183
159
|
ssr_data = self._find_ssr_page_context(html_list[0])
|
184
160
|
chapter_info = self._extract_chapter_info(ssr_data)
|
185
|
-
if not chapter_info:
|
186
|
-
logger.warning(
|
187
|
-
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
188
|
-
)
|
189
|
-
return None
|
190
|
-
|
191
|
-
if not self._can_view_chapter(chapter_info):
|
192
|
-
logger.warning(
|
193
|
-
"[Parser] Chapter '%s' is not purchased or inaccessible.",
|
194
|
-
chapter_id,
|
195
|
-
)
|
196
|
-
return None
|
197
|
-
|
198
|
-
if self._is_encrypted(ssr_data):
|
199
|
-
if not self._decode_font:
|
200
|
-
return None
|
201
|
-
return self.parse_encrypted_chapter(chapter_info, chapter_id)
|
202
|
-
|
203
|
-
return self.parse_normal_chapter(chapter_info, chapter_id)
|
204
|
-
|
205
161
|
except Exception as e:
|
206
|
-
logger.warning(
|
207
|
-
|
162
|
+
logger.warning(
|
163
|
+
"[Parser] chapter_id=%s :: failed to locate ssr_pageContext block: %s",
|
164
|
+
chapter_id,
|
165
|
+
e,
|
166
|
+
)
|
167
|
+
return None
|
208
168
|
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
169
|
+
if not chapter_info:
|
170
|
+
logger.warning(
|
171
|
+
"[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
|
172
|
+
)
|
173
|
+
return None
|
174
|
+
|
175
|
+
if not self._can_view_chapter(chapter_info):
|
176
|
+
logger.warning(
|
177
|
+
"[Parser] Chapter '%s' is not purchased or inaccessible.",
|
178
|
+
chapter_id,
|
179
|
+
)
|
180
|
+
return None
|
216
181
|
|
217
|
-
:param chapter_info: Parsed chapter info block from ssr data.
|
218
|
-
:param chapter_id: Chapter identifier (string).
|
219
|
-
:return: a dictionary with keys like 'id', 'title', 'content', etc.
|
220
|
-
"""
|
221
182
|
duplicated = self._is_duplicated(chapter_info)
|
183
|
+
encrypted = self._is_encrypted(chapter_info)
|
222
184
|
|
223
185
|
title = chapter_info.get("chapterName", "Untitled")
|
224
186
|
raw_html = chapter_info.get("content", "")
|
225
|
-
|
187
|
+
cid = str(chapter_info.get("chapterId") or chapter_id)
|
226
188
|
fkp = chapter_info.get("fkp", "")
|
227
189
|
author_say = chapter_info.get("authorSay", "").strip()
|
228
190
|
update_time = chapter_info.get("updateTime", "")
|
@@ -234,19 +196,25 @@ class QidianParser(BaseParser):
|
|
234
196
|
|
235
197
|
if self._is_vip(chapter_info):
|
236
198
|
decryptor = get_decryptor()
|
237
|
-
raw_html = decryptor.
|
199
|
+
raw_html = decryptor.decrypt_qd(raw_html, cid, fkp, self._fuid)
|
238
200
|
|
239
|
-
|
240
|
-
|
241
|
-
|
201
|
+
chapter_text = (
|
202
|
+
self._parse_font_encrypted(raw_html, chapter_info, cid)
|
203
|
+
if encrypted
|
204
|
+
else self._parse_normal(raw_html)
|
205
|
+
)
|
242
206
|
if not chapter_text:
|
207
|
+
logger.warning(
|
208
|
+
"[Parser] chapter_id=%s :: content empty after decryption/font-mapping",
|
209
|
+
chapter_id,
|
210
|
+
)
|
243
211
|
return None
|
244
212
|
|
245
213
|
if self._use_truncation and duplicated:
|
246
214
|
chapter_text = truncate_half_lines(chapter_text)
|
247
215
|
|
248
216
|
return {
|
249
|
-
"id":
|
217
|
+
"id": cid,
|
250
218
|
"title": title,
|
251
219
|
"content": chapter_text,
|
252
220
|
"extra": {
|
@@ -258,107 +226,116 @@ class QidianParser(BaseParser):
|
|
258
226
|
"duplicated": duplicated,
|
259
227
|
"seq": seq,
|
260
228
|
"volume": volume,
|
261
|
-
"encrypted":
|
229
|
+
"encrypted": encrypted,
|
262
230
|
},
|
263
231
|
}
|
264
232
|
|
265
|
-
def
|
233
|
+
def _parse_normal(self, raw_html: str) -> str:
|
234
|
+
"""
|
235
|
+
Extract structured chapter content from a normal Qidian page.
|
236
|
+
"""
|
237
|
+
parts = raw_html.split("<p>")
|
238
|
+
paragraphs = [unescape(p).strip() for p in parts if p.strip()]
|
239
|
+
chapter_text = "\n".join(paragraphs)
|
240
|
+
if not chapter_text:
|
241
|
+
return ""
|
242
|
+
return chapter_text
|
243
|
+
|
244
|
+
def _parse_font_encrypted(
|
266
245
|
self,
|
246
|
+
raw_html: str,
|
267
247
|
chapter_info: dict[str, Any],
|
268
|
-
|
269
|
-
) ->
|
248
|
+
cid: str,
|
249
|
+
) -> str:
|
270
250
|
"""
|
271
|
-
Extract and return the formatted textual content of an encrypted chapter.
|
272
|
-
|
273
251
|
Steps:
|
274
252
|
1. Decode and save randomFont bytes; download fixedFont via download().
|
275
253
|
2. Parse CSS rules and save debug JSON.
|
276
254
|
3. Render encrypted paragraphs, then run OCR font-mapping.
|
277
255
|
4. Extracts paragraph texts and formats them.
|
278
|
-
|
279
|
-
:param chapter_info: Parsed chapter info block from ssr data.
|
280
|
-
:return: Formatted chapter text or empty string if not parsable.
|
281
256
|
"""
|
282
|
-
|
257
|
+
if not self._decode_font:
|
258
|
+
logger.warning(
|
259
|
+
"[Parser] chapter_id=%s :: font decryption skipped "
|
260
|
+
"(set `decode_font=True` to enable)",
|
261
|
+
cid,
|
262
|
+
)
|
263
|
+
return ""
|
264
|
+
|
265
|
+
css_str = chapter_info.get("css")
|
266
|
+
random_font_str = chapter_info.get("randomFont")
|
267
|
+
rf = json.loads(random_font_str) if isinstance(random_font_str, str) else None
|
268
|
+
rf_data = rf.get("data") if rf else None
|
269
|
+
fixed_woff2_url = chapter_info.get("fixedFontWoff2")
|
270
|
+
|
271
|
+
if not css_str:
|
272
|
+
logger.warning("[Parser] cid=%s :: css missing or empty", cid)
|
273
|
+
return ""
|
274
|
+
if not rf_data:
|
275
|
+
logger.warning("[Parser] cid=%s :: randomFont.data missing or empty", cid)
|
276
|
+
return ""
|
277
|
+
if not fixed_woff2_url:
|
278
|
+
logger.warning("[Parser] cid=%s :: fixedFontWoff2 missing or empty", cid)
|
279
|
+
return ""
|
280
|
+
|
281
|
+
debug_dir = self._debug_dir / "font_debug" / cid
|
283
282
|
if self._save_font_debug:
|
284
283
|
debug_dir.mkdir(parents=True, exist_ok=True)
|
285
284
|
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
author_say = chapter_info.get("authorSay", "").strip()
|
297
|
-
update_time = chapter_info.get("updateTime", "")
|
298
|
-
update_timestamp = chapter_info.get("updateTimestamp", 0)
|
299
|
-
modify_time = chapter_info.get("modifyTime", 0)
|
300
|
-
word_count = chapter_info.get("actualWords", 0)
|
301
|
-
seq = chapter_info.get("seq")
|
302
|
-
volume = chapter_info.get("extra", {}).get("volumeName", "")
|
303
|
-
|
304
|
-
# extract + save font
|
305
|
-
rf = json.loads(randomFont_str)
|
306
|
-
rand_path = self._base_cache_dir / "randomFont.ttf"
|
307
|
-
rand_path.parent.mkdir(parents=True, exist_ok=True)
|
308
|
-
rand_path.write_bytes(bytes(rf["data"]))
|
285
|
+
try:
|
286
|
+
self._rand_path.parent.mkdir(parents=True, exist_ok=True)
|
287
|
+
self._rand_path.write_bytes(bytes(rf_data))
|
288
|
+
except Exception as e:
|
289
|
+
logger.error(
|
290
|
+
"[Parser] cid=%s :: failed to write randomFont.ttf",
|
291
|
+
cid,
|
292
|
+
exc_info=e,
|
293
|
+
)
|
294
|
+
return ""
|
309
295
|
|
310
296
|
fixed_path = download(
|
311
|
-
url=
|
297
|
+
url=fixed_woff2_url,
|
312
298
|
target_dir=self._fixed_font_dir,
|
299
|
+
on_exist="skip",
|
313
300
|
)
|
314
301
|
if fixed_path is None:
|
315
302
|
logger.warning(
|
316
|
-
"[Parser] failed to download fixedfont for chapter '%s'",
|
317
|
-
)
|
318
|
-
return None
|
319
|
-
|
320
|
-
# Extract and render paragraphs from HTML with CSS rules
|
321
|
-
if self._is_vip(chapter_info):
|
322
|
-
decryptor = get_decryptor()
|
323
|
-
raw_html = decryptor.decrypt(
|
324
|
-
raw_html,
|
325
|
-
chapter_id,
|
326
|
-
fkp,
|
327
|
-
self._fuid,
|
303
|
+
"[Parser] failed to download fixedfont for chapter '%s'", cid
|
328
304
|
)
|
305
|
+
return ""
|
329
306
|
|
330
307
|
css_rules = self._parse_css_rules(css_str)
|
331
308
|
paragraphs_str, refl_list = self._render_visible_text(raw_html, css_rules)
|
332
309
|
if self._save_font_debug:
|
333
|
-
|
334
|
-
|
310
|
+
(debug_dir / f"{cid}_debug.txt").write_text(
|
311
|
+
paragraphs_str, encoding="utf-8"
|
312
|
+
)
|
335
313
|
|
336
314
|
# Run OCR + fallback mapping
|
337
315
|
char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
|
338
316
|
refl_set = set(refl_list)
|
339
317
|
char_set = char_set - refl_set
|
340
318
|
if self._save_font_debug:
|
341
|
-
|
342
|
-
|
343
|
-
char_sets_path.write_text(
|
344
|
-
temp,
|
319
|
+
(debug_dir / "char_set_debug.txt").write_text(
|
320
|
+
f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}",
|
345
321
|
encoding="utf-8",
|
346
322
|
)
|
347
323
|
|
348
324
|
mapping_result = self._generate_font_map(
|
349
325
|
fixed_font_path=fixed_path,
|
350
|
-
random_font_path=
|
326
|
+
random_font_path=self._rand_path,
|
351
327
|
char_set=char_set,
|
352
328
|
refl_set=refl_set,
|
353
|
-
|
354
|
-
batch_size=self._config.batch_size,
|
329
|
+
batch_size=self._batch_size,
|
355
330
|
)
|
356
331
|
if not mapping_result:
|
357
|
-
|
332
|
+
logger.warning(
|
333
|
+
"[Parser] font mapping returned empty result for chapter '%s'", cid
|
334
|
+
)
|
335
|
+
return ""
|
358
336
|
|
359
337
|
if self._save_font_debug:
|
360
|
-
|
361
|
-
mapping_json_path.write_text(
|
338
|
+
(debug_dir / "font_mapping.json").write_text(
|
362
339
|
json.dumps(mapping_result, ensure_ascii=False, indent=2),
|
363
340
|
encoding="utf-8",
|
364
341
|
)
|
@@ -369,28 +346,9 @@ class QidianParser(BaseParser):
|
|
369
346
|
font_map=mapping_result,
|
370
347
|
)
|
371
348
|
|
372
|
-
|
349
|
+
return "\n".join(
|
373
350
|
line.strip() for line in original_text.splitlines() if line.strip()
|
374
351
|
)
|
375
|
-
if self._use_truncation and duplicated:
|
376
|
-
final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
|
377
|
-
|
378
|
-
return {
|
379
|
-
"id": str(chapter_id),
|
380
|
-
"title": str(title),
|
381
|
-
"content": final_paragraphs_str,
|
382
|
-
"extra": {
|
383
|
-
"author_say": author_say,
|
384
|
-
"updated_at": update_time,
|
385
|
-
"update_timestamp": update_timestamp,
|
386
|
-
"modify_time": modify_time,
|
387
|
-
"word_count": word_count,
|
388
|
-
"duplicated": duplicated,
|
389
|
-
"seq": seq,
|
390
|
-
"volume": volume,
|
391
|
-
"encrypted": True,
|
392
|
-
},
|
393
|
-
}
|
394
352
|
|
395
353
|
@staticmethod
|
396
354
|
def _find_ssr_page_context(html_str: str) -> dict[str, Any]:
|
@@ -417,17 +375,6 @@ class QidianParser(BaseParser):
|
|
417
375
|
chapter_info = page_data.get("chapterInfo", {})
|
418
376
|
return chapter_info if isinstance(chapter_info, dict) else {}
|
419
377
|
|
420
|
-
@staticmethod
|
421
|
-
def _is_restricted_page(html_str: str) -> bool:
|
422
|
-
"""
|
423
|
-
Return True if page content indicates access restriction
|
424
|
-
(e.g. not subscribed/purchased).
|
425
|
-
|
426
|
-
:param html_str: Raw HTML string.
|
427
|
-
"""
|
428
|
-
markers = ["这是VIP章节", "需要订阅", "订阅后才能阅读"]
|
429
|
-
return any(m in html_str for m in markers)
|
430
|
-
|
431
378
|
@classmethod
|
432
379
|
def _is_vip(cls, chapter_info: dict[str, Any]) -> bool:
|
433
380
|
"""
|
@@ -458,30 +405,22 @@ class QidianParser(BaseParser):
|
|
458
405
|
return bool(efw_flag == 1)
|
459
406
|
|
460
407
|
@classmethod
|
461
|
-
def _is_encrypted(cls,
|
408
|
+
def _is_encrypted(cls, chapter_info: dict[str, Any]) -> bool:
|
462
409
|
"""
|
463
410
|
Return True if content is encrypted.
|
464
411
|
|
465
412
|
Chapter Encryption Status (cES):
|
466
413
|
* 0: 内容是'明文'
|
467
414
|
* 2: 字体加密
|
468
|
-
|
469
|
-
:param content: HTML content, either as a raw string or a BeautifulSoup object.
|
470
|
-
:return: True if encrypted marker is found, else False.
|
471
415
|
"""
|
472
|
-
ssr_data = (
|
473
|
-
cls._find_ssr_page_context(content) if isinstance(content, str) else content
|
474
|
-
)
|
475
|
-
chapter_info = cls._extract_chapter_info(ssr_data)
|
476
416
|
return int(chapter_info.get("cES", 0)) == 2
|
477
417
|
|
478
|
-
@staticmethod
|
479
418
|
def _generate_font_map(
|
419
|
+
self,
|
480
420
|
fixed_font_path: Path,
|
481
421
|
random_font_path: Path,
|
482
422
|
char_set: set[str],
|
483
423
|
refl_set: set[str],
|
484
|
-
cache_dir: Path,
|
485
424
|
batch_size: int = 32,
|
486
425
|
) -> dict[str, str]:
|
487
426
|
"""
|
@@ -494,17 +433,16 @@ class QidianParser(BaseParser):
|
|
494
433
|
:param random_font_path: random font file.
|
495
434
|
:param char_set: Characters to match directly.
|
496
435
|
:param refl_set: Characters to match in flipped form.
|
497
|
-
:param cache_dir: Directory to save/load cached results.
|
498
436
|
:param batch_size: How many chars to OCR per batch.
|
499
437
|
|
500
438
|
:return: { obf_char: real_char, ... }
|
501
439
|
"""
|
502
|
-
font_ocr = get_font_ocr()
|
440
|
+
font_ocr = get_font_ocr(self._fontocr_cfg)
|
503
441
|
if not font_ocr:
|
504
442
|
return {}
|
505
443
|
|
506
444
|
mapping_result: dict[str, str] = {}
|
507
|
-
fixed_map_file =
|
445
|
+
fixed_map_file = self._fixed_map_dir / f"{fixed_font_path.stem}.json"
|
508
446
|
fixed_map_file.parent.mkdir(parents=True, exist_ok=True)
|
509
447
|
|
510
448
|
# load existing cache
|
@@ -587,9 +525,8 @@ class QidianParser(BaseParser):
|
|
587
525
|
|
588
526
|
Returns None if can't extract a tag.
|
589
527
|
"""
|
590
|
-
sel = selector.strip()
|
591
528
|
# If it has spaces, take the rightmost simple selector
|
592
|
-
last =
|
529
|
+
last = selector.strip().split()[-1]
|
593
530
|
# Drop ::pseudo
|
594
531
|
last = last.split("::", 1)[0]
|
595
532
|
# If it's like 'span[attr=..]' keep 'span'
|
@@ -604,50 +541,12 @@ class QidianParser(BaseParser):
|
|
604
541
|
"""
|
605
542
|
Parse 'name:value;...' inside a block. Tolerates quotes and attr().
|
606
543
|
"""
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
quote = None # track ' or "
|
614
|
-
while i < n:
|
615
|
-
c = block[i]
|
616
|
-
if quote:
|
617
|
-
# inside quotes
|
618
|
-
if c == "\\" and i + 1 < n:
|
619
|
-
# keep escaped char
|
620
|
-
(name if in_name else val).append(c)
|
621
|
-
i += 1
|
622
|
-
(name if in_name else val).append(block[i])
|
623
|
-
elif c == quote:
|
624
|
-
(name if in_name else val).append(c)
|
625
|
-
quote = None
|
626
|
-
else:
|
627
|
-
(name if in_name else val).append(c)
|
628
|
-
else:
|
629
|
-
if c in ("'", '"'):
|
630
|
-
(name if in_name else val).append(c)
|
631
|
-
quote = c
|
632
|
-
elif in_name and c == ":":
|
633
|
-
in_name = False
|
634
|
-
elif c == ";":
|
635
|
-
nm = "".join(name).strip().lower()
|
636
|
-
vl = "".join(val).strip()
|
637
|
-
if nm:
|
638
|
-
decls.append((nm, vl))
|
639
|
-
name.clear()
|
640
|
-
val.clear()
|
641
|
-
in_name = True
|
642
|
-
else:
|
643
|
-
(name if in_name else val).append(c)
|
644
|
-
i += 1
|
645
|
-
|
646
|
-
if name or val:
|
647
|
-
nm = "".join(name).strip().lower()
|
648
|
-
vl = "".join(val).strip()
|
649
|
-
if nm:
|
650
|
-
decls.append((nm, vl))
|
544
|
+
parts = [d.strip() for d in block.split(";") if d.strip()]
|
545
|
+
decls = []
|
546
|
+
for p in parts:
|
547
|
+
if ":" in p:
|
548
|
+
name, val = p.split(":", 1)
|
549
|
+
decls.append((name.strip().lower(), val.strip()))
|
651
550
|
return decls
|
652
551
|
|
653
552
|
@classmethod
|
@@ -661,83 +560,65 @@ class QidianParser(BaseParser):
|
|
661
560
|
rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
|
662
561
|
order_pairs: list[tuple[str, int]] = []
|
663
562
|
|
664
|
-
|
563
|
+
pos = 0
|
665
564
|
while True:
|
666
|
-
b1 = css_str.find("{",
|
565
|
+
b1 = css_str.find("{", pos)
|
667
566
|
if b1 == -1:
|
668
567
|
break
|
669
|
-
selector = css_str[
|
568
|
+
selector = css_str[pos:b1].strip().lower()
|
670
569
|
b2 = css_str.find("}", b1 + 1)
|
671
570
|
if b2 == -1:
|
672
571
|
break
|
673
572
|
block = css_str[b1 + 1 : b2]
|
674
|
-
|
573
|
+
pos = b2 + 1
|
675
574
|
|
676
575
|
decls = cls._parse_decls(block)
|
677
|
-
|
678
576
|
new_rule: Rule = {}
|
679
577
|
order_val: int | None = None
|
680
578
|
|
681
579
|
for name, value in decls:
|
682
580
|
v = value.strip()
|
683
581
|
if name == "font-size" and v == "0":
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
if cls._RE_SCALEX.search(v.replace(" ", "")):
|
690
|
-
new_rule["transform_flip_x"] = True
|
582
|
+
new_rule[
|
583
|
+
"delete_first" if "::first-letter" in selector else "delete_all"
|
584
|
+
] = True
|
585
|
+
elif name == "transform" and "scalex(-1" in v.replace(" ", "").lower():
|
586
|
+
new_rule["transform_flip_x"] = True
|
691
587
|
elif name == "order":
|
692
|
-
with suppress(ValueError
|
588
|
+
with suppress(ValueError):
|
693
589
|
order_val = int(v)
|
694
590
|
elif name == "content":
|
695
|
-
# normalize: remove outer quotes
|
696
591
|
if "::after" in selector:
|
697
|
-
|
698
|
-
|
699
|
-
new_rule["append_end_attr"] = m.group(1)
|
592
|
+
if v.lower().startswith("attr("):
|
593
|
+
new_rule["append_end_attr"] = v[5:-1].strip()
|
700
594
|
else:
|
701
|
-
|
702
|
-
new_rule["append_end_char"] = s
|
595
|
+
new_rule["append_end_char"] = v.strip().strip("\"'")
|
703
596
|
elif "::before" in selector:
|
704
|
-
|
705
|
-
|
706
|
-
new_rule["append_start_attr"] = m.group(1)
|
597
|
+
if v.lower().startswith("attr("):
|
598
|
+
new_rule["append_start_attr"] = v[5:-1].strip()
|
707
599
|
else:
|
708
|
-
|
709
|
-
new_rule["append_start_char"] = s
|
600
|
+
new_rule["append_start_char"] = v.strip().strip("\"'")
|
710
601
|
|
711
|
-
# classification
|
712
602
|
if selector.startswith(".sy-"):
|
713
603
|
key = selector.lstrip(".")
|
714
|
-
|
715
|
-
rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
|
716
|
-
|
604
|
+
rules["sy"][key] = {**rules["sy"].get(key, {}), **new_rule}
|
717
605
|
elif selector.startswith(".p") and " " in selector:
|
718
606
|
p_cls, right = selector.split(" ", 1)
|
719
|
-
p_cls = p_cls.lstrip(".")
|
720
607
|
tag = cls._only_tag(right)
|
721
608
|
if tag:
|
722
|
-
|
723
|
-
rules["p_rules"]
|
724
|
-
|
725
|
-
|
609
|
+
p_cls = p_cls.lstrip(".")
|
610
|
+
rules["p_rules"].setdefault(p_cls, {})
|
611
|
+
rules["p_rules"][p_cls][tag] = {
|
612
|
+
**rules["p_rules"][p_cls].get(tag, {}),
|
613
|
+
**new_rule,
|
614
|
+
}
|
726
615
|
|
727
616
|
if order_val is not None:
|
728
|
-
|
729
|
-
if
|
730
|
-
order_pairs.append((
|
731
|
-
|
732
|
-
|
733
|
-
order_pairs.sort(key=lambda t: t[1])
|
734
|
-
seen = set()
|
735
|
-
orders: list[str] = []
|
736
|
-
for tag, _ in order_pairs:
|
737
|
-
if tag not in seen:
|
738
|
-
seen.add(tag)
|
739
|
-
orders.append(tag)
|
740
|
-
rules["orders"] = orders
|
617
|
+
tag = cls._only_tag(selector)
|
618
|
+
if tag:
|
619
|
+
order_pairs.append((tag, order_val))
|
620
|
+
|
621
|
+
rules["orders"] = [t for t, _ in sorted(order_pairs, key=lambda x: x[1])]
|
741
622
|
return rules
|
742
623
|
|
743
624
|
@staticmethod
|