novel-downloader 2.0.1-py3-none-any.whl → 2.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +11 -8
- novel_downloader/cli/export.py +17 -17
- novel_downloader/cli/ui.py +28 -1
- novel_downloader/config/adapter.py +27 -1
- novel_downloader/core/archived/deqixs/fetcher.py +1 -28
- novel_downloader/core/downloaders/__init__.py +2 -0
- novel_downloader/core/downloaders/base.py +34 -85
- novel_downloader/core/downloaders/common.py +147 -171
- novel_downloader/core/downloaders/qianbi.py +30 -64
- novel_downloader/core/downloaders/qidian.py +157 -184
- novel_downloader/core/downloaders/qqbook.py +292 -0
- novel_downloader/core/downloaders/registry.py +2 -2
- novel_downloader/core/exporters/__init__.py +2 -0
- novel_downloader/core/exporters/base.py +37 -59
- novel_downloader/core/exporters/common.py +620 -0
- novel_downloader/core/exporters/linovelib.py +47 -0
- novel_downloader/core/exporters/qidian.py +41 -12
- novel_downloader/core/exporters/qqbook.py +28 -0
- novel_downloader/core/exporters/registry.py +2 -2
- novel_downloader/core/fetchers/__init__.py +4 -2
- novel_downloader/core/fetchers/aaatxt.py +2 -22
- novel_downloader/core/fetchers/b520.py +3 -23
- novel_downloader/core/fetchers/base.py +80 -105
- novel_downloader/core/fetchers/biquyuedu.py +2 -22
- novel_downloader/core/fetchers/dxmwx.py +10 -22
- novel_downloader/core/fetchers/esjzone.py +6 -29
- novel_downloader/core/fetchers/guidaye.py +2 -22
- novel_downloader/core/fetchers/hetushu.py +9 -29
- novel_downloader/core/fetchers/i25zw.py +2 -16
- novel_downloader/core/fetchers/ixdzs8.py +2 -16
- novel_downloader/core/fetchers/jpxs123.py +2 -16
- novel_downloader/core/fetchers/lewenn.py +2 -22
- novel_downloader/core/fetchers/linovelib.py +4 -20
- novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
- novel_downloader/core/fetchers/piaotia.py +2 -16
- novel_downloader/core/fetchers/qbtr.py +2 -16
- novel_downloader/core/fetchers/qianbi.py +1 -20
- novel_downloader/core/fetchers/qidian.py +7 -33
- novel_downloader/core/fetchers/qqbook.py +177 -0
- novel_downloader/core/fetchers/quanben5.py +9 -29
- novel_downloader/core/fetchers/rate_limiter.py +22 -53
- novel_downloader/core/fetchers/sfacg.py +3 -16
- novel_downloader/core/fetchers/shencou.py +2 -16
- novel_downloader/core/fetchers/shuhaige.py +2 -22
- novel_downloader/core/fetchers/tongrenquan.py +2 -22
- novel_downloader/core/fetchers/ttkan.py +3 -14
- novel_downloader/core/fetchers/wanbengo.py +2 -22
- novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
- novel_downloader/core/fetchers/xiguashuwu.py +4 -20
- novel_downloader/core/fetchers/xs63b.py +3 -15
- novel_downloader/core/fetchers/xshbook.py +2 -22
- novel_downloader/core/fetchers/yamibo.py +4 -28
- novel_downloader/core/fetchers/yibige.py +13 -26
- novel_downloader/core/interfaces/exporter.py +19 -7
- novel_downloader/core/interfaces/fetcher.py +21 -47
- novel_downloader/core/parsers/__init__.py +4 -2
- novel_downloader/core/parsers/b520.py +2 -2
- novel_downloader/core/parsers/base.py +4 -39
- novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/core/parsers/{qidian/main_parser.py → qidian.py} +147 -266
- novel_downloader/core/parsers/qqbook.py +709 -0
- novel_downloader/core/parsers/xiguashuwu.py +3 -4
- novel_downloader/core/searchers/__init__.py +2 -2
- novel_downloader/core/searchers/b520.py +1 -1
- novel_downloader/core/searchers/base.py +2 -2
- novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/models/__init__.py +2 -0
- novel_downloader/models/book.py +1 -0
- novel_downloader/models/config.py +12 -0
- novel_downloader/resources/config/settings.toml +23 -5
- novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
- novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
- novel_downloader/utils/constants.py +6 -0
- novel_downloader/utils/crypto_utils/aes_util.py +1 -1
- novel_downloader/utils/epub/constants.py +1 -6
- novel_downloader/utils/fontocr/core.py +2 -0
- novel_downloader/utils/fontocr/loader.py +10 -8
- novel_downloader/utils/node_decryptor/__init__.py +13 -0
- novel_downloader/utils/node_decryptor/decryptor.py +342 -0
- novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
- novel_downloader/web/pages/download.py +1 -1
- novel_downloader/web/pages/search.py +1 -1
- novel_downloader/web/services/task_manager.py +2 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +4 -1
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/RECORD +91 -94
- novel_downloader/core/exporters/common/__init__.py +0 -11
- novel_downloader/core/exporters/common/epub.py +0 -198
- novel_downloader/core/exporters/common/main_exporter.py +0 -64
- novel_downloader/core/exporters/common/txt.py +0 -146
- novel_downloader/core/exporters/epub_util.py +0 -215
- novel_downloader/core/exporters/linovelib/__init__.py +0 -11
- novel_downloader/core/exporters/linovelib/epub.py +0 -349
- novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
- novel_downloader/core/exporters/linovelib/txt.py +0 -139
- novel_downloader/core/exporters/txt_util.py +0 -67
- novel_downloader/core/parsers/qidian/__init__.py +0 -10
- novel_downloader/core/parsers/qidian/utils/__init__.py +0 -11
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
novel_downloader/core/parsers/qqbook.py (new file)
@@ -0,0 +1,709 @@
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.qqbook
------------------------------------

"""

from __future__ import annotations

import json
import logging
import re
from contextlib import suppress
from pathlib import Path
from typing import Any, TypedDict

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    ParserConfig,
    VolumeInfoDict,
)
from novel_downloader.utils import download
from novel_downloader.utils.fontocr import get_font_ocr
from novel_downloader.utils.node_decryptor import get_decryptor

logger = logging.getLogger(__name__)


class Rule(TypedDict, total=False):
    delete_all: bool
    delete_first: bool
    transform_flip_x: bool
    append_start_char: str
    append_end_char: str
    append_start_attr: str
    append_end_attr: str


class Rules(TypedDict):
    # e.g., orders = ["i", "em", "span"]
    orders: list[str]
    # e.g., sy["sy-3"] -> Rule
    sy: dict[str, Rule]
    # e.g., p_rules["p3"]["i"] -> Rule
    p_rules: dict[str, dict[str, Rule]]


@register_parser(
    site_keys=["qqbook", "qq"],
)
class QqbookParser(BaseParser):
    """
    Parser for QQ 阅读 site.
    """

    _NUXT_BLOCK_RE = re.compile(
        r"window\.__NUXT__\s*=\s*([\s\S]*?);?\s*<\/script>",
        re.S,
    )

    def __init__(self, config: ParserConfig):
        """
        Initialize the QqbookParser with the given configuration.
        """
        super().__init__(config)

        self._rand_path = self._base_cache_dir / "qqbook" / "randomFont.ttf"
        self._fixed_font_dir = self._base_cache_dir / "qqbook" / "fixed_fonts"
        self._fixed_map_dir = self._base_cache_dir / "qqbook" / "fixed_font_map"
        self._debug_dir = Path.cwd() / "debug" / "qqbook"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page and extract metadata and chapter structure.

        Order: [info, catalog]

        :param html_list: Raw HTML of the info page and the catalog JSON.
        :return: Parsed metadata and chapter structure as a dictionary.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_dict = json.loads(html_list[1])

        book_name = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:book_name"]/@content')
        ) or self._first_str(
            info_tree.xpath('//h1[contains(@class, "book-title")]/text()')
        )
        author = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:author"]/@content')
        ) or self._first_str(
            info_tree.xpath(
                '//div[contains(@class,"book-meta")]//a[contains(@class,"author")]/text()'
            ),
            replaces=[(" 著", ""), ("著", "")],
        )
        cover_url = self._first_str(
            info_tree.xpath('//meta[@property="og:image"]/@content')
        ) or self._first_str(
            info_tree.xpath('//div[contains(@class,"book-cover")]//img/@src')
        )
        update_time = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:update_time"]/@content')
        ) or self._first_str(
            info_tree.xpath('//div[contains(@class,"update-time")]/text()'),
            replaces=[("更新时间:", "")],
        )
        serial_status = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:status"]/@content')
        )
        # tags
        tags = [
            t.strip()
            for t in info_tree.xpath(
                '//div[contains(@class,"book-tags")]//a[contains(@class,"tag")]/text()'
            )
            if t.strip()
        ]
        # summary
        summary_raw = "\n".join(
            info_tree.xpath('//div[contains(@class,"book-intro")]//text()')
        )
        summary = (
            self._norm_space(summary_raw)
            if summary_raw
            else self._first_str(
                info_tree.xpath('//meta[@property="og:description"]/@content')
            )
        )

        # book_id for chapter URLs
        read_url = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:read_url"]/@content')
        ) or self._first_str(info_tree.xpath('//meta[@property="og:url"]/@content'))
        book_id = ""
        if read_url:
            book_id = read_url.rstrip("/").split("/")[-1]

        # Chapters from the book_list
        data = catalog_dict.get("data") or []
        chapters: list[ChapterInfoDict] = []
        for item in data:
            cid = str(item.get("cid"))
            title = str(item.get("chapterName", "")).strip()
            accessible = bool(item.get("free") or item.get("purchased"))
            chap: ChapterInfoDict = {
                "title": title,
                "chapterId": cid,
                "url": f"/book-read/{book_id}/{cid}" if book_id and cid else "",
                "accessible": accessible,
            }
            chapters.append(chap)

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        if not html_list:
            logger.warning("[Parser] chapter_id=%s :: html_list is empty", chapter_id)
            return None
        try:
            nuxt_block = self._find_nuxt_block(html_list[0])
            data_list = nuxt_block.get("data")
            if not data_list:
                return None
            data_block = data_list[0]
        except Exception as e:
            logger.warning(
                "[Parser] chapter_id=%s :: failed to locate Nuxt block: %s",
                chapter_id,
                e,
            )
            return None

        curr_content = data_block.get("currentContent") or {}
        if not curr_content:
            logger.warning(
                "[Parser] chapter_id=%s :: currentContent missing or empty", chapter_id
            )
            return None

        content = curr_content.get("content", "")
        if not content:
            logger.warning(
                "[Parser] chapter_id=%s :: raw 'content' missing or empty", chapter_id
            )
            return None

        title = data_block.get("chapterTitle", "Untitled")
        cid = str(data_block.get("cid") or chapter_id)
        bk_cfg = data_block.get("fkConfig") or {}
        encrypt = curr_content.get("encrypt", False)
        font_encrypt = bool(curr_content.get("fontEncrypt"))
        font_resp = curr_content.get("fontResponse") or {}

        update_time = curr_content.get("updateTime") or ""
        word_count = curr_content.get("totalWords") or ""

        logger.debug(
            "[Parser] chapter_id=%s :: meta title=%r encrypt=%s font_encrypt=%s",
            chapter_id,
            title,
            encrypt,
            font_encrypt,
        )

        if encrypt:
            try:
                content = self._parse_encrypted(content=content, cid=cid, bk_cfg=bk_cfg)
            except Exception as e:
                logger.warning(
                    "[Parser] chapter_id=%s :: encrypted content decryption failed: %s",
                    chapter_id,
                    e,
                )
                return None

        if font_encrypt:
            content = self._parse_font_encrypted(
                content=content,
                font_resp=font_resp,
                cid=cid,
            )

        if not content:
            logger.warning(
                "[Parser] chapter_id=%s :: content empty after decryption/font-mapping",
                chapter_id,
            )
            return None

        return {
            "id": cid,
            "title": title,
            "content": content,
            "extra": {
                "site": "qqbook",
                "updated_at": update_time,
                "word_count": word_count,
                "encrypt": encrypt,
                "font_encrypt": font_encrypt,
            },
        }

    def _parse_encrypted(
        self,
        content: str,
        cid: str,
        bk_cfg: dict[str, Any],
    ) -> str:
        decryptor = get_decryptor()
        fkp = bk_cfg.get("fkp", "")
        fuid = bk_cfg.get("fuid", "")
        return decryptor.decrypt_qq(
            ciphertext=content,
            chapter_id=cid,
            fkp=fkp,
            fuid=fuid,
        )

    def _parse_font_encrypted(
        self,
        content: str,
        font_resp: dict[str, Any],
        cid: str,
    ) -> str:
        """
        Steps:
        1. Decode and save randomFont bytes; download fixedFont via download().
        2. Parse CSS rules and save debug JSON.
        3. Render encrypted paragraphs, then run OCR font-mapping.
        4. Extract paragraph texts and format them.
        """
        if not self._decode_font:
            logger.warning(
                "[Parser] chapter_id=%s :: font decryption skipped "
                "(set `decode_font=True` to enable)",
                cid,
            )
            return ""

        css_str = font_resp.get("css")
        random_font = font_resp.get("randomFont") or {}
        rf_data = random_font.get("data") if isinstance(random_font, dict) else None
        fixed_woff2_url = font_resp.get("fixedFontWoff2")

        if not css_str:
            logger.warning("[Parser] cid=%s :: css missing or empty", cid)
            return ""
        if not rf_data:
            logger.warning("[Parser] cid=%s :: randomFont.data missing or empty", cid)
            return ""
        if not fixed_woff2_url:
            logger.warning("[Parser] cid=%s :: fixedFontWoff2 missing or empty", cid)
            return ""

        debug_dir = self._debug_dir / "font_debug" / cid
        if self._save_font_debug:
            debug_dir.mkdir(parents=True, exist_ok=True)

        try:
            self._rand_path.parent.mkdir(parents=True, exist_ok=True)
            self._rand_path.write_bytes(bytes(rf_data))
        except Exception as e:
            logger.error(
                "[Parser] cid=%s :: failed to write randomFont.ttf",
                cid,
                exc_info=e,
            )
            return ""

        fixed_path = download(
            url=fixed_woff2_url,
            target_dir=self._fixed_font_dir,
            on_exist="skip",
        )
        if fixed_path is None:
            logger.warning(
                "[Parser] failed to download fixed font for chapter '%s'", cid
            )
            return ""

        css_rules = self._parse_css_rules(css_str)
        paragraphs_str, refl_list = self._render_visible_text(content, css_rules)
        if self._save_font_debug:
            (debug_dir / f"{cid}_debug.txt").write_text(
                paragraphs_str, encoding="utf-8"
            )

        # Run OCR + fallback mapping
        char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
        refl_set = set(refl_list)
        char_set = char_set - refl_set
        if self._save_font_debug:
            (debug_dir / "char_set_debug.txt").write_text(
                f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}",
                encoding="utf-8",
            )

        mapping_result = self._generate_font_map(
            fixed_font_path=fixed_path,
            random_font_path=self._rand_path,
            char_set=char_set,
            refl_set=refl_set,
            batch_size=self._batch_size,
        )
        if not mapping_result:
            logger.warning(
                "[Parser] font mapping returned empty result for chapter '%s'", cid
            )
            return ""

        if self._save_font_debug:
            (debug_dir / "font_mapping.json").write_text(
                json.dumps(mapping_result, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

        # Reconstruct final readable text
        original_text = self._apply_font_mapping(
            text=paragraphs_str,
            font_map=mapping_result,
        )

        final_paragraphs_str = "\n".join(
            line.strip() for line in original_text.splitlines() if line.strip()
        )

        return final_paragraphs_str

    @classmethod
    def _find_nuxt_block(cls, html_str: str) -> dict[str, Any]:
        m = cls._NUXT_BLOCK_RE.search(html_str)
        if not m:
            return {}
        js_code = m.group(1).rstrip()  # RHS only
        decryptor = get_decryptor()
        return decryptor.eval_to_json(js_code)

    def _generate_font_map(
        self,
        fixed_font_path: Path,
        random_font_path: Path,
        char_set: set[str],
        refl_set: set[str],
        batch_size: int = 32,
    ) -> dict[str, str]:
        """
        Build a mapping from scrambled font chars to real chars.

        Uses OCR to decode and generate a mapping from a fixed obfuscated font
        and a random obfuscated font. Results are cached in JSON.

        :param fixed_font_path: fixed font file.
        :param random_font_path: random font file.
        :param char_set: Characters to match directly.
        :param refl_set: Characters to match in flipped form.
        :param batch_size: How many chars to OCR per batch.

        :return: { obf_char: real_char, ... }
        """
        font_ocr = get_font_ocr(self._fontocr_cfg)
        if not font_ocr:
            return {}

        mapping_result: dict[str, str] = {}
        fixed_map_file = self._fixed_map_dir / f"{fixed_font_path.stem}.json"
        fixed_map_file.parent.mkdir(parents=True, exist_ok=True)

        # load existing cache
        try:
            with open(fixed_map_file, encoding="utf-8") as f:
                fixed_map = json.load(f)
            cached_chars = set(fixed_map.keys())
            mapping_result.update(
                {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
            )
            mapping_result.update(
                {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
            )
            char_set = char_set - cached_chars
            refl_set = refl_set - cached_chars
        except Exception:
            fixed_map = {}
            cached_chars = set()

        # prepare font renderers and cmap sets
        fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
        random_chars = font_ocr.extract_font_charset(random_font_path)
        fixed_font = font_ocr.load_render_font(fixed_font_path)
        random_font = font_ocr.load_render_font(random_font_path)

        # process normal and reflected sets together
        rendered = []
        for chars, reflect in [(char_set, False), (refl_set, True)]:
            for ch in chars:
                if ch in fixed_chars:
                    font = fixed_font
                elif ch in random_chars:
                    font = random_font
                else:
                    continue
                rendered.append(
                    (ch, font_ocr.render_char_image_array(ch, font, reflect))
                )

        if rendered:
            # query OCR+vec simultaneously
            imgs_to_query = [img for _, img in rendered]
            fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)

            # pick best per char, apply threshold + cache
            for (ch, _), preds in zip(rendered, fused, strict=False):
                if not preds:
                    continue
                real_char, _ = preds
                mapping_result[ch] = real_char
                fixed_map[ch] = real_char

        # persist updated fixed_map
        try:
            with open(fixed_map_file, "w", encoding="utf-8") as f:
                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error("[FontOCR] Failed to save fixed map: %s", e)

        return mapping_result

    @staticmethod
    def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
        """
        Replace each character in `text` using `font_map`,
        leaving unmapped characters unchanged.

        :param text: The input string, possibly containing obfuscated font chars.
        :param font_map: A dict mapping obfuscated chars to real chars.
        :return: The de-obfuscated text.
        """
        return "".join(font_map.get(ch, ch) for ch in text)

    @staticmethod
    def _only_tag(selector: str) -> str | None:
        """
        Normalize a selector into just its tag name for ordering.

        Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.

        Returns None if a tag name cannot be extracted.
        """
        # If it has spaces, take the rightmost simple selector
        last = selector.strip().split()[-1]
        # Drop ::pseudo
        last = last.split("::", 1)[0]
        # If it's like 'span[attr=..]' keep 'span'
        last = last.split("[", 1)[0]
        # If it starts with '.', it's not a tag
        if not last or last.startswith("."):
            return None
        return last

    @staticmethod
    def _parse_decls(block: str) -> list[tuple[str, str]]:
        """
        Parse 'name:value;...' inside a block. Tolerates quotes and attr().
        """
        parts = [d.strip() for d in block.split(";") if d.strip()]
        decls = []
        for p in parts:
            if ":" in p:
                name, val = p.split(":", 1)
                decls.append((name.strip().lower(), val.strip()))
        return decls

    @classmethod
    def _parse_css_rules(cls, css_str: str) -> Rules:
        """
        Produces normalized Rules with:
          * orders: list[str] of tag names sorted by numeric 'order'
          * sy: '.sy-*' class rules
          * p_rules: '.p* <tag>' rules, indexed by p-class then tag
        """
        rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
        order_pairs: list[tuple[str, int]] = []

        pos = 0
        while True:
            b1 = css_str.find("{", pos)
            if b1 == -1:
                break
            selector = css_str[pos:b1].strip().lower()
            b2 = css_str.find("}", b1 + 1)
            if b2 == -1:
                break
            block = css_str[b1 + 1 : b2]
            pos = b2 + 1

            decls = cls._parse_decls(block)
            new_rule: Rule = {}
            order_val: int | None = None

            for name, value in decls:
                v = value.strip()
                if name == "font-size" and v == "0":
                    new_rule[
                        "delete_first" if "::first-letter" in selector else "delete_all"
                    ] = True
                elif name == "transform" and "scalex(-1" in v.replace(" ", "").lower():
                    new_rule["transform_flip_x"] = True
                elif name == "order":
                    with suppress(ValueError):
                        order_val = int(v)
                elif name == "content":
                    if "::after" in selector:
                        if v.lower().startswith("attr("):
                            new_rule["append_end_attr"] = v[5:-1].strip()
                        else:
                            new_rule["append_end_char"] = v.strip().strip("\"'")
                    elif "::before" in selector:
                        if v.lower().startswith("attr("):
                            new_rule["append_start_attr"] = v[5:-1].strip()
                        else:
                            new_rule["append_start_char"] = v.strip().strip("\"'")

            if selector.startswith(".sy-"):
                key = selector.lstrip(".")
                rules["sy"][key] = {**rules["sy"].get(key, {}), **new_rule}
            elif selector.startswith(".p") and " " in selector:
                p_cls, right = selector.split(" ", 1)
                tag = cls._only_tag(right)
                if tag:
                    p_cls = p_cls.lstrip(".")
                    rules["p_rules"].setdefault(p_cls, {})
                    rules["p_rules"][p_cls][tag] = {
                        **rules["p_rules"][p_cls].get(tag, {}),
                        **new_rule,
                    }

            if order_val is not None:
                tag = cls._only_tag(selector)
                if tag:
                    order_pairs.append((tag, order_val))

        rules["orders"] = [t for t, _ in sorted(order_pairs, key=lambda x: x[1])]
        return rules

    @staticmethod
    def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
        """
        Render the HTML using pre-parsed Rules.
        """
        tree = html.fromstring(html_str)
        paragraphs_out: list[str] = []
        refl_list: list[str] = []
        orders = rules.get("orders") or []
        p_rules = rules.get("p_rules") or {}
        sy_rules = rules.get("sy") or {}

        def _class_list(el: html.HtmlElement) -> list[str]:
            cls = el.get("class")
            return cls.split() if cls else []

        def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
            if rule.get("delete_all"):
                return ""

            parts: list[str] = []
            if "append_start_char" in rule:
                parts.append(rule["append_start_char"])
            if "append_start_attr" in rule:
                parts.append(el.get(rule["append_start_attr"], ""))

            text = el.text or ""
            if rule.get("delete_first") and text:
                text = text[1:]
            parts.append(text)

            if "append_end_char" in rule:
                parts.append(rule["append_end_char"])
            if "append_end_attr" in rule:
                parts.append(el.get(rule["append_end_attr"], ""))

            s = "".join(parts)

            if rule.get("transform_flip_x") and s:
                refl_list.append(s)

            return s

        for p in tree.findall(".//p"):
            p_classes = _class_list(p)
            p_key = next((c for c in p_classes if c.startswith("p")), None)
            has_ordered_rules = p_key in p_rules

            buf_parts: list[str] = []

            if p.text and not has_ordered_rules:
                buf_parts.append(p.text)

            ordered_cache: dict[str, list[str]] = {}

            for child in p:
                tag = str(child.tag)

                # Handle inline <y class="sy-*"> spans
                if tag == "y" and not has_ordered_rules:
                    y_cls = next(
                        (c for c in _class_list(child) if c.startswith("sy-")), None
                    )
                    if y_cls and y_cls in sy_rules:
                        buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
                    else:
                        buf_parts.append(child.text or "")
                    if child.tail:
                        buf_parts.append(child.tail)
                    continue

                # Handle ordered paragraphs: only cache tags that appear in `orders`
                if p_key and has_ordered_rules and tag in orders:
                    rule = p_rules[p_key].get(tag, {})
                    ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
                    continue

                # Non-ordered, non-<y> nodes: include text + tails as-is
                if not has_ordered_rules:
                    buf_parts.append(child.text or "")
                    if child.tail:
                        buf_parts.append(child.tail)

            # If ordered, flush in global order with all duplicates preserved
            if has_ordered_rules:
                for tag in orders:
                    if tag in ordered_cache:
                        buf_parts.extend(ordered_cache[tag])

            para = "".join(buf_parts)
            if para:
                paragraphs_out.append(para)

        return "\n".join(paragraphs_out), refl_list
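
For reference, a minimal sketch of how the new parser might be driven. This is not code from the package: `config`, `info_html`, `catalog_json`, and `chapter_html` are hypothetical placeholders, and the two-element input order `[info, catalog]` comes from the `parse_book_info` docstring above.

    # Hypothetical driver; all input variables here are placeholders.
    from novel_downloader.core.parsers.qqbook import QqbookParser

    parser = QqbookParser(config)  # config: a ParserConfig instance

    # parse_book_info expects [book info page HTML, catalog JSON string]
    book = parser.parse_book_info([info_html, catalog_json])
    if book:
        for vol in book["volumes"]:
            for chap in vol["chapters"]:
                print(chap["chapterId"], chap["title"], chap["accessible"])

    # parse_chapter expects the chapter page HTML embedding window.__NUXT__
    chapter = parser.parse_chapter([chapter_html], chapter_id="123")
    if chapter:
        print(chapter["title"], chapter["extra"]["word_count"])

The CSS-rule machinery can also be exercised in isolation, since `_parse_css_rules` is a classmethod. The toy stylesheet below is illustrative only (not taken from the site); the expected results follow directly from the parsing logic above.

    css = (
        ".p2 i { font-size: 0 }"                    # delete_all: drop <i> text
        ".p2 em::before { content: attr(data-x) }"  # prepend an attribute value
        ".sy-1 { transform: scaleX(-1) }"           # mirrored glyphs -> refl_list
        "em { order: 1 } span { order: 2 }"         # flush order for ordered <p>
    )
    rules = QqbookParser._parse_css_rules(css)
    assert rules["orders"] == ["em", "span"]
    assert rules["p_rules"]["p2"]["i"] == {"delete_all": True}
    assert rules["p_rules"]["p2"]["em"] == {"append_start_attr": "data-x"}
    assert rules["sy"]["sy-1"] == {"transform_flip_x": True}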