novel-downloader 2.0.1__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +11 -8
  3. novel_downloader/cli/export.py +17 -17
  4. novel_downloader/cli/ui.py +28 -1
  5. novel_downloader/config/adapter.py +27 -1
  6. novel_downloader/core/archived/deqixs/fetcher.py +1 -28
  7. novel_downloader/core/downloaders/__init__.py +2 -0
  8. novel_downloader/core/downloaders/base.py +34 -85
  9. novel_downloader/core/downloaders/common.py +147 -171
  10. novel_downloader/core/downloaders/qianbi.py +30 -64
  11. novel_downloader/core/downloaders/qidian.py +157 -184
  12. novel_downloader/core/downloaders/qqbook.py +292 -0
  13. novel_downloader/core/downloaders/registry.py +2 -2
  14. novel_downloader/core/exporters/__init__.py +2 -0
  15. novel_downloader/core/exporters/base.py +37 -59
  16. novel_downloader/core/exporters/common.py +620 -0
  17. novel_downloader/core/exporters/linovelib.py +47 -0
  18. novel_downloader/core/exporters/qidian.py +41 -12
  19. novel_downloader/core/exporters/qqbook.py +28 -0
  20. novel_downloader/core/exporters/registry.py +2 -2
  21. novel_downloader/core/fetchers/__init__.py +4 -2
  22. novel_downloader/core/fetchers/aaatxt.py +2 -22
  23. novel_downloader/core/fetchers/b520.py +3 -23
  24. novel_downloader/core/fetchers/base.py +80 -105
  25. novel_downloader/core/fetchers/biquyuedu.py +2 -22
  26. novel_downloader/core/fetchers/dxmwx.py +10 -22
  27. novel_downloader/core/fetchers/esjzone.py +6 -29
  28. novel_downloader/core/fetchers/guidaye.py +2 -22
  29. novel_downloader/core/fetchers/hetushu.py +9 -29
  30. novel_downloader/core/fetchers/i25zw.py +2 -16
  31. novel_downloader/core/fetchers/ixdzs8.py +2 -16
  32. novel_downloader/core/fetchers/jpxs123.py +2 -16
  33. novel_downloader/core/fetchers/lewenn.py +2 -22
  34. novel_downloader/core/fetchers/linovelib.py +4 -20
  35. novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
  36. novel_downloader/core/fetchers/piaotia.py +2 -16
  37. novel_downloader/core/fetchers/qbtr.py +2 -16
  38. novel_downloader/core/fetchers/qianbi.py +1 -20
  39. novel_downloader/core/fetchers/qidian.py +7 -33
  40. novel_downloader/core/fetchers/qqbook.py +177 -0
  41. novel_downloader/core/fetchers/quanben5.py +9 -29
  42. novel_downloader/core/fetchers/rate_limiter.py +22 -53
  43. novel_downloader/core/fetchers/sfacg.py +3 -16
  44. novel_downloader/core/fetchers/shencou.py +2 -16
  45. novel_downloader/core/fetchers/shuhaige.py +2 -22
  46. novel_downloader/core/fetchers/tongrenquan.py +2 -22
  47. novel_downloader/core/fetchers/ttkan.py +3 -14
  48. novel_downloader/core/fetchers/wanbengo.py +2 -22
  49. novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
  50. novel_downloader/core/fetchers/xiguashuwu.py +4 -20
  51. novel_downloader/core/fetchers/xs63b.py +3 -15
  52. novel_downloader/core/fetchers/xshbook.py +2 -22
  53. novel_downloader/core/fetchers/yamibo.py +4 -28
  54. novel_downloader/core/fetchers/yibige.py +13 -26
  55. novel_downloader/core/interfaces/exporter.py +19 -7
  56. novel_downloader/core/interfaces/fetcher.py +21 -47
  57. novel_downloader/core/parsers/__init__.py +4 -2
  58. novel_downloader/core/parsers/b520.py +2 -2
  59. novel_downloader/core/parsers/base.py +4 -39
  60. novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +5 -5
  61. novel_downloader/core/parsers/{qidian/main_parser.py → qidian.py} +147 -266
  62. novel_downloader/core/parsers/qqbook.py +709 -0
  63. novel_downloader/core/parsers/xiguashuwu.py +3 -4
  64. novel_downloader/core/searchers/__init__.py +2 -2
  65. novel_downloader/core/searchers/b520.py +1 -1
  66. novel_downloader/core/searchers/base.py +2 -2
  67. novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
  68. novel_downloader/models/__init__.py +2 -0
  69. novel_downloader/models/book.py +1 -0
  70. novel_downloader/models/config.py +12 -0
  71. novel_downloader/resources/config/settings.toml +23 -5
  72. novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
  73. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
  74. novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
  75. novel_downloader/utils/constants.py +6 -0
  76. novel_downloader/utils/crypto_utils/aes_util.py +1 -1
  77. novel_downloader/utils/epub/constants.py +1 -6
  78. novel_downloader/utils/fontocr/core.py +2 -0
  79. novel_downloader/utils/fontocr/loader.py +10 -8
  80. novel_downloader/utils/node_decryptor/__init__.py +13 -0
  81. novel_downloader/utils/node_decryptor/decryptor.py +342 -0
  82. novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
  83. novel_downloader/web/pages/download.py +1 -1
  84. novel_downloader/web/pages/search.py +1 -1
  85. novel_downloader/web/services/task_manager.py +2 -0
  86. {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +4 -1
  87. {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/RECORD +91 -94
  88. novel_downloader/core/exporters/common/__init__.py +0 -11
  89. novel_downloader/core/exporters/common/epub.py +0 -198
  90. novel_downloader/core/exporters/common/main_exporter.py +0 -64
  91. novel_downloader/core/exporters/common/txt.py +0 -146
  92. novel_downloader/core/exporters/epub_util.py +0 -215
  93. novel_downloader/core/exporters/linovelib/__init__.py +0 -11
  94. novel_downloader/core/exporters/linovelib/epub.py +0 -349
  95. novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
  96. novel_downloader/core/exporters/linovelib/txt.py +0 -139
  97. novel_downloader/core/exporters/txt_util.py +0 -67
  98. novel_downloader/core/parsers/qidian/__init__.py +0 -10
  99. novel_downloader/core/parsers/qidian/utils/__init__.py +0 -11
  100. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
  101. {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
  102. {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
  103. {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
  104. {novel_downloader-2.0.1.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,709 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.qqbook
4
+ ------------------------------------
5
+
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ import re
13
+ from contextlib import suppress
14
+ from pathlib import Path
15
+ from typing import Any, TypedDict
16
+
17
+ from lxml import html
18
+
19
+ from novel_downloader.core.parsers.base import BaseParser
20
+ from novel_downloader.core.parsers.registry import register_parser
21
+ from novel_downloader.models import (
22
+ BookInfoDict,
23
+ ChapterDict,
24
+ ChapterInfoDict,
25
+ ParserConfig,
26
+ VolumeInfoDict,
27
+ )
28
+ from novel_downloader.utils import download
29
+ from novel_downloader.utils.fontocr import get_font_ocr
30
+ from novel_downloader.utils.node_decryptor import get_decryptor
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
class Rule(TypedDict, total=False):
    """
    A single CSS-derived transformation rule for one obfuscated element.

    All keys are optional (``total=False``); an absent key means "no effect".
    """

    # Drop the element's entire text (from a 'font-size:0' rule).
    delete_all: bool
    # Drop only the first character (from '::first-letter' + 'font-size:0').
    delete_first: bool
    # Element text is mirrored via 'transform: scaleX(-1)'; such strings are
    # collected separately so OCR can match the flipped glyphs.
    transform_flip_x: bool
    # Literal text inserted by a '::before { content: "..." }' rule.
    append_start_char: str
    # Literal text inserted by an '::after { content: "..." }' rule.
    append_end_char: str
    # Attribute name whose value is inserted by '::before { content: attr(...) }'.
    append_start_attr: str
    # Attribute name whose value is inserted by '::after { content: attr(...) }'.
    append_end_attr: str
43
+
44
+
45
class Rules(TypedDict):
    """
    Normalized set of de-obfuscation rules parsed from a chapter's CSS.
    """

    # Tag names sorted by their numeric CSS 'order' value,
    # e.g. orders = ["i", "em", "span"]
    orders: list[str]
    # Rules for inline '<y class="sy-*">' spans, e.g. sy["sy-3"] -> Rule
    sy: dict[str, Rule]
    # Per-paragraph-class, per-tag rules, e.g. p_rules["p3"]["i"] -> Rule
    p_rules: dict[str, dict[str, Rule]]
52
+
53
+
54
+ @register_parser(
55
+ site_keys=["qqbook", "qq"],
56
+ )
57
+ class QqbookParser(BaseParser):
58
+ """
59
+ Parser for QQ 阅读 site.
60
+ """
61
+
62
+ _NUXT_BLOCK_RE = re.compile(
63
+ r"window\.__NUXT__\s*=\s*([\s\S]*?);?\s*<\/script>",
64
+ re.S,
65
+ )
66
+
67
    def __init__(self, config: ParserConfig):
        """
        Initialize the QqbookParser with the given configuration.

        :param config: Parser configuration forwarded to :class:`BaseParser`.
        """
        super().__init__(config)

        # Destination file for the per-chapter random (obfuscated) font bytes.
        self._rand_path = self._base_cache_dir / "qqbook" / "randomFont.ttf"
        # Cache directory for downloaded fixed woff2 fonts (reused across chapters).
        self._fixed_font_dir = self._base_cache_dir / "qqbook" / "fixed_fonts"
        # Cached OCR results for fixed fonts: one JSON map per font file stem.
        self._fixed_map_dir = self._base_cache_dir / "qqbook" / "fixed_font_map"
        # Debug artifacts (rendered text, char sets, mappings) when enabled.
        # NOTE(review): `_base_cache_dir` is provided by BaseParser — confirm.
        self._debug_dir = Path.cwd() / "debug" / "qqbook"
77
+
78
+ def parse_book_info(
79
+ self,
80
+ html_list: list[str],
81
+ **kwargs: Any,
82
+ ) -> BookInfoDict | None:
83
+ """
84
+ Parse a book info page and extract metadata and chapter structure.
85
+
86
+ Order: [info, catalog]
87
+
88
+ :param html_list: Raw HTML of the book info page.
89
+ :return: Parsed metadata and chapter structure as a dictionary.
90
+ """
91
+ if len(html_list) < 2:
92
+ return None
93
+
94
+ info_tree = html.fromstring(html_list[0])
95
+ catalog_dict = json.loads(html_list[1])
96
+
97
+ book_name = self._first_str(
98
+ info_tree.xpath('//meta[@property="og:novel:book_name"]/@content')
99
+ ) or self._first_str(
100
+ info_tree.xpath('//h1[contains(@class, "book-title")]/text()')
101
+ )
102
+ author = self._first_str(
103
+ info_tree.xpath('//meta[@property="og:novel:author"]/@content')
104
+ ) or self._first_str(
105
+ info_tree.xpath(
106
+ '//div[contains(@class,"book-meta")]//a[contains(@class,"author")]/text()'
107
+ ),
108
+ replaces=[(" 著", ""), ("著", "")],
109
+ )
110
+ cover_url = self._first_str(
111
+ info_tree.xpath('//meta[@property="og:image"]/@content')
112
+ ) or self._first_str(
113
+ info_tree.xpath('//div[contains(@class,"book-cover")]//img/@src')
114
+ )
115
+ update_time = self._first_str(
116
+ info_tree.xpath('//meta[@property="og:novel:update_time"]/@content')
117
+ ) or self._first_str(
118
+ info_tree.xpath('//div[contains(@class,"update-time")]/text()'),
119
+ replaces=[("更新时间:", "")],
120
+ )
121
+ serial_status = self._first_str(
122
+ info_tree.xpath('//meta[@property="og:novel:status"]/@content')
123
+ )
124
+ # tags
125
+ tags = [
126
+ t.strip()
127
+ for t in info_tree.xpath(
128
+ '//div[contains(@class,"book-tags")]//a[contains(@class,"tag")]/text()'
129
+ )
130
+ if t.strip()
131
+ ]
132
+ # summary
133
+ summary_raw = "\n".join(
134
+ info_tree.xpath('//div[contains(@class,"book-intro")]//text()')
135
+ )
136
+ summary = (
137
+ self._norm_space(summary_raw)
138
+ if summary_raw
139
+ else self._first_str(
140
+ info_tree.xpath('//meta[@property="og:description"]/@content')
141
+ )
142
+ )
143
+
144
+ # book_id for chapter URLs
145
+ read_url = self._first_str(
146
+ info_tree.xpath('//meta[@property="og:novel:read_url"]/@content')
147
+ ) or self._first_str(info_tree.xpath('//meta[@property="og:url"]/@content'))
148
+ book_id = ""
149
+ if read_url:
150
+ book_id = read_url.rstrip("/").split("/")[-1]
151
+
152
+ # Chapters from the book_list
153
+ data = catalog_dict.get("data") or []
154
+ chapters: list[ChapterInfoDict] = []
155
+ for item in data:
156
+ cid = str(item.get("cid"))
157
+ title = str(item.get("chapterName", "")).strip()
158
+ accessible = bool(item.get("free") or item.get("purchased"))
159
+ chap: ChapterInfoDict = {
160
+ "title": title,
161
+ "chapterId": cid,
162
+ "url": f"/book-read/{book_id}/{cid}" if book_id and cid else "",
163
+ "accessible": accessible,
164
+ }
165
+ chapters.append(chap)
166
+
167
+ volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
168
+
169
+ return {
170
+ "book_name": book_name,
171
+ "author": author,
172
+ "cover_url": cover_url,
173
+ "update_time": update_time,
174
+ "serial_status": serial_status,
175
+ "tags": tags,
176
+ "summary": summary,
177
+ "volumes": volumes,
178
+ "extra": {},
179
+ }
180
+
181
    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into a ChapterDict.

        Extracts the `window.__NUXT__` payload, then (if flagged by the page)
        decrypts the content via Node and/or resolves font obfuscation via OCR.

        :param html_list: Raw HTML of the chapter page (first element used).
        :param chapter_id: Chapter id, used as fallback when the page has no cid.
        :return: Parsed chapter dict, or None on any failure.
        """
        if not html_list:
            logger.warning("[Parser] chapter_id=%s :: html_list is empty", chapter_id)
            return None
        try:
            # Payload layout: nuxt_block["data"][0] holds this chapter's data.
            nuxt_block = self._find_nuxt_block(html_list[0])
            data_list = nuxt_block.get("data")
            if not data_list:
                return None
            data_block = data_list[0]
        except Exception as e:
            logger.warning(
                "[Parser] chapter_id=%s :: failed to locate Nuxt block: %s",
                chapter_id,
                e,
            )
            return None

        curr_content = data_block.get("currentContent") or {}
        if not curr_content:
            logger.warning(
                "[Parser] chapter_id=%s :: currentContent missing or empty", chapter_id
            )
            return None

        content = curr_content.get("content", "")
        if not content:
            logger.warning(
                "[Parser] chapter_id=%s :: raw 'content' missing or empty", chapter_id
            )
            return None

        title = data_block.get("chapterTitle", "Untitled")
        # Prefer the page's own cid; fall back to the caller-supplied id.
        cid = str(data_block.get("cid") or chapter_id)
        # fkConfig carries the fkp/fuid values needed by the decryptor.
        bk_cfg = data_block.get("fkConfig") or {}
        encrypt = curr_content.get("encrypt", False)
        font_encrypt = bool(curr_content.get("fontEncrypt"))
        font_resp = curr_content.get("fontResponse") or {}

        update_time = curr_content.get("updateTime") or ""
        word_count = curr_content.get("totalWords") or ""

        logger.debug(
            "[Parser]chapter_id=%s :: meta title=%r encrypt=%s font_encrypt=%s",
            chapter_id,
            title,
            encrypt,
            font_encrypt,
        )

        # Stage 1: payload decryption (Node-based), if the page flags it.
        if encrypt:
            try:
                content = self._parse_encrypted(content=content, cid=cid, bk_cfg=bk_cfg)
            except Exception as e:
                logger.warning(
                    "[Parser] chapter_id=%s :: encrypted content decryption failed: %s",
                    chapter_id,
                    e,
                )
                return None

        # Stage 2: font de-obfuscation (OCR-based), if the page flags it.
        # Runs on the (possibly decrypted) content from stage 1.
        if font_encrypt:
            content = self._parse_font_encrypted(
                content=content,
                font_resp=font_resp,
                cid=cid,
            )

        if not content:
            logger.warning(
                "[Parser] chapter_id=%s :: content empty after decryption/font-mapping",
                chapter_id,
            )
            return None

        return {
            "id": cid,
            "title": title,
            "content": content,
            "extra": {
                "site": "qqbook",
                "updated_at": update_time,
                "word_count": word_count,
                "encrypt": encrypt,
                "font_encrypt": font_encrypt,
            },
        }
273
+
274
+ def _parse_encrypted(
275
+ self,
276
+ content: str,
277
+ cid: str,
278
+ bk_cfg: dict[str, Any],
279
+ ) -> str:
280
+ decryptor = get_decryptor()
281
+ fkp = bk_cfg.get("fkp", "")
282
+ fuid = bk_cfg.get("fuid", "")
283
+ return decryptor.decrypt_qq(
284
+ ciphertext=content,
285
+ chapter_id=cid,
286
+ fkp=fkp,
287
+ fuid=fuid,
288
+ )
289
+
290
    def _parse_font_encrypted(
        self,
        content: str,
        font_resp: dict[str, Any],
        cid: str,
    ) -> str:
        """
        Resolve font-obfuscated chapter content into readable text.

        Steps:
          1. Decode and save randomFont bytes; download fixedFont via download().
          2. Parse CSS rules and save debug JSON.
          3. Render encrypted paragraphs, then run OCR font-mapping.
          4. Extracts paragraph texts and formats them.

        :param content: Chapter HTML whose glyphs use obfuscated fonts.
        :param font_resp: The page's fontResponse dict (css, randomFont,
            fixedFontWoff2).
        :param cid: Chapter id, used for cache/debug paths and logging.
        :return: De-obfuscated text, or "" on any failure.
        """
        # `_decode_font`, `_save_font_debug`, `_batch_size` are presumably
        # BaseParser config flags -- confirm against BaseParser.
        if not self._decode_font:
            logger.warning(
                "[Parser] chapter_id=%s :: font decryption skipped "
                "(set `decode_font=True` to enable)",
                cid,
            )
            return ""

        css_str = font_resp.get("css")
        random_font = font_resp.get("randomFont") or {}
        # randomFont.data is expected to be a byte sequence (list of ints).
        rf_data = random_font.get("data") if isinstance(random_font, dict) else None
        fixed_woff2_url = font_resp.get("fixedFontWoff2")

        # All three inputs are required; bail out early when any is missing.
        if not css_str:
            logger.warning("[Parser] cid=%s :: css missing or empty", cid)
            return ""
        if not rf_data:
            logger.warning("[Parser] cid=%s :: randomFont.data missing or empty", cid)
            return ""
        if not fixed_woff2_url:
            logger.warning("[Parser] cid=%s :: fixedFontWoff2 missing or empty", cid)
            return ""

        debug_dir = self._debug_dir / "font_debug" / cid
        if self._save_font_debug:
            debug_dir.mkdir(parents=True, exist_ok=True)

        # Persist the per-chapter random font so the OCR step can render it.
        try:
            self._rand_path.parent.mkdir(parents=True, exist_ok=True)
            self._rand_path.write_bytes(bytes(rf_data))
        except Exception as e:
            logger.error(
                "[Parser] cid=%s :: failed to write randomFont.ttf",
                cid,
                exc_info=e,
            )
            return ""

        # Fixed font is shared between chapters: skip re-download if cached.
        fixed_path = download(
            url=fixed_woff2_url,
            target_dir=self._fixed_font_dir,
            on_exist="skip",
        )
        if fixed_path is None:
            logger.warning(
                "[Parser] failed to download fixedfont for chapter '%s'", cid
            )
            return ""

        # Apply CSS-derived ordering/visibility rules to the raw HTML.
        css_rules = self._parse_css_rules(css_str)
        paragraphs_str, refl_list = self._render_visible_text(content, css_rules)
        if self._save_font_debug:
            (debug_dir / f"{cid}_debug.txt").write_text(
                paragraphs_str, encoding="utf-8"
            )

        # Run OCR + fallback mapping
        # refl chars are matched in mirrored form, so keep the sets disjoint.
        char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
        refl_set = set(refl_list)
        char_set = char_set - refl_set
        if self._save_font_debug:
            (debug_dir / "char_set_debug.txt").write_text(
                f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}",
                encoding="utf-8",
            )

        mapping_result = self._generate_font_map(
            fixed_font_path=fixed_path,
            random_font_path=self._rand_path,
            char_set=char_set,
            refl_set=refl_set,
            batch_size=self._batch_size,
        )
        if not mapping_result:
            logger.warning(
                "[Parser] font mapping returned empty result for chapter '%s'", cid
            )
            return ""

        if self._save_font_debug:
            (debug_dir / "font_mapping.json").write_text(
                json.dumps(mapping_result, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

        # Reconstruct final readable text
        original_text = self._apply_font_mapping(
            text=paragraphs_str,
            font_map=mapping_result,
        )

        # Normalize: strip per-line whitespace and drop blank lines.
        final_paragraphs_str = "\n".join(
            line.strip() for line in original_text.splitlines() if line.strip()
        )

        return final_paragraphs_str
399
+
400
+ @classmethod
401
+ def _find_nuxt_block(cls, html_str: str) -> dict[str, Any]:
402
+ m = cls._NUXT_BLOCK_RE.search(html_str)
403
+ if not m:
404
+ return {}
405
+ js_code = m.group(1).rstrip() # RHS only
406
+ decryptor = get_decryptor()
407
+ return decryptor.eval_to_json(js_code)
408
+
409
    def _generate_font_map(
        self,
        fixed_font_path: Path,
        random_font_path: Path,
        char_set: set[str],
        refl_set: set[str],
        batch_size: int = 32,
    ) -> dict[str, str]:
        """
        Build a mapping from scrambled font chars to real chars.

        Uses OCR to decode and generate mapping from a fixed obfuscated font
        and a random obfuscated font. Results for the fixed font are cached
        in a JSON file keyed by the font file's stem.

        :param fixed_font_path: fixed font file.
        :param random_font_path: random font file.
        :param char_set: Characters to match directly.
        :param refl_set: Characters to match in flipped form.
        :param batch_size: How many chars to OCR per batch.

        :return: { obf_char: real_char, ... }
        """
        font_ocr = get_font_ocr(self._fontocr_cfg)
        if not font_ocr:
            # OCR backend unavailable -- caller treats {} as failure.
            return {}

        mapping_result: dict[str, str] = {}
        fixed_map_file = self._fixed_map_dir / f"{fixed_font_path.stem}.json"
        fixed_map_file.parent.mkdir(parents=True, exist_ok=True)

        # load existing cache; on any failure fall back to an empty cache
        # (best-effort: a corrupt cache file must not abort the mapping).
        try:
            with open(fixed_map_file, encoding="utf-8") as f:
                fixed_map = json.load(f)
            cached_chars = set(fixed_map.keys())
            mapping_result.update(
                {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
            )
            mapping_result.update(
                {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
            )
            # Only OCR the characters the cache could not resolve.
            char_set = char_set - cached_chars
            refl_set = refl_set - cached_chars
        except Exception:
            fixed_map = {}
            cached_chars = set()

        # prepare font renderers and cmap sets
        fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
        random_chars = font_ocr.extract_font_charset(random_font_path)
        fixed_font = font_ocr.load_render_font(fixed_font_path)
        random_font = font_ocr.load_render_font(random_font_path)

        # process normal and reflected sets together; chars covered by
        # neither font's charset are skipped entirely.
        rendered = []
        for chars, reflect in [(char_set, False), (refl_set, True)]:
            for ch in chars:
                if ch in fixed_chars:
                    font = fixed_font
                elif ch in random_chars:
                    font = random_font
                else:
                    continue
                rendered.append(
                    (ch, font_ocr.render_char_image_array(ch, font, reflect))
                )

        if rendered:
            # query OCR+vec simultaneously
            imgs_to_query = [img for _, img in rendered]
            fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)

            # pick best per char, apply threshold + cache
            for (ch, _), preds in zip(rendered, fused, strict=False):
                if not preds:
                    continue
                real_char, _ = preds
                mapping_result[ch] = real_char
                # New results are folded back into the persistent fixed-font
                # cache so later chapters can skip OCR for these chars.
                fixed_map[ch] = real_char

        # persist updated fixed_map (best-effort: failure only logs)
        try:
            with open(fixed_map_file, "w", encoding="utf-8") as f:
                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error("[FontOCR] Failed to save fixed map: %s", e)

        return mapping_result
498
+
499
+ @staticmethod
500
+ def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
501
+ """
502
+ Replace each character in `text` using `font_map`,
503
+ leaving unmapped characters unchanged.
504
+
505
+ :param text: The input string, possibly containing obfuscated font chars.
506
+ :param font_map: A dict mapping obfuscated chars to real chars.
507
+ :return: The de-obfuscated text.
508
+ """
509
+ return "".join(font_map.get(ch, ch) for ch in text)
510
+
511
+ @staticmethod
512
+ def _only_tag(selector: str) -> str | None:
513
+ """
514
+ Normalize a selector into just its tag name for ordering.
515
+
516
+ Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
517
+
518
+ Returns None if can't extract a tag.
519
+ """
520
+ # If it has spaces, take the rightmost simple selector
521
+ last = selector.strip().split()[-1]
522
+ # Drop ::pseudo
523
+ last = last.split("::", 1)[0]
524
+ # If it's like 'span[attr=..]' keep 'span'
525
+ last = last.split("[", 1)[0]
526
+ # If it starts with '.', it's not a tag
527
+ if not last or last.startswith("."):
528
+ return None
529
+ return last
530
+
531
+ @staticmethod
532
+ def _parse_decls(block: str) -> list[tuple[str, str]]:
533
+ """
534
+ Parse 'name:value;...' inside a block. Tolerates quotes and attr().
535
+ """
536
+ parts = [d.strip() for d in block.split(";") if d.strip()]
537
+ decls = []
538
+ for p in parts:
539
+ if ":" in p:
540
+ name, val = p.split(":", 1)
541
+ decls.append((name.strip().lower(), val.strip()))
542
+ return decls
543
+
544
    @classmethod
    def _parse_css_rules(cls, css_str: str) -> Rules:
        """
        Produces normalized Rules with:
          * orders: list[str] of tag names sorted by numeric 'order'
          * sy: '.sy-*' class rules
          * p_rules: '.p* <tag>' rules, indexed by p-class then tag
        """
        rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
        order_pairs: list[tuple[str, int]] = []

        # Linear scan over 'selector { decls }' pairs; no nested-brace
        # support (plain flat CSS is assumed).
        pos = 0
        while True:
            b1 = css_str.find("{", pos)
            if b1 == -1:
                break
            selector = css_str[pos:b1].strip().lower()
            b2 = css_str.find("}", b1 + 1)
            if b2 == -1:
                break
            block = css_str[b1 + 1 : b2]
            pos = b2 + 1

            decls = cls._parse_decls(block)
            new_rule: Rule = {}
            order_val: int | None = None

            for name, value in decls:
                v = value.strip()
                if name == "font-size" and v == "0":
                    # 'font-size:0' hides text; on ::first-letter it hides
                    # only the first character.
                    new_rule[
                        "delete_first" if "::first-letter" in selector else "delete_all"
                    ] = True
                elif name == "transform" and "scalex(-1" in v.replace(" ", "").lower():
                    new_rule["transform_flip_x"] = True
                elif name == "order":
                    # Non-numeric order values are silently ignored.
                    with suppress(ValueError):
                        order_val = int(v)
                elif name == "content":
                    # content on ::before/::after injects either an attr()
                    # lookup (store the attribute name) or a literal char.
                    if "::after" in selector:
                        if v.lower().startswith("attr("):
                            new_rule["append_end_attr"] = v[5:-1].strip()
                        else:
                            new_rule["append_end_char"] = v.strip().strip("\"'")
                    elif "::before" in selector:
                        if v.lower().startswith("attr("):
                            new_rule["append_start_attr"] = v[5:-1].strip()
                        else:
                            new_rule["append_start_char"] = v.strip().strip("\"'")

            # Route the accumulated rule by selector shape; later rules for
            # the same key are merged over earlier ones.
            if selector.startswith(".sy-"):
                key = selector.lstrip(".")
                rules["sy"][key] = {**rules["sy"].get(key, {}), **new_rule}
            elif selector.startswith(".p") and " " in selector:
                p_cls, right = selector.split(" ", 1)
                tag = cls._only_tag(right)
                if tag:
                    p_cls = p_cls.lstrip(".")
                    rules["p_rules"].setdefault(p_cls, {})
                    rules["p_rules"][p_cls][tag] = {
                        **rules["p_rules"][p_cls].get(tag, {}),
                        **new_rule,
                    }

            if order_val is not None:
                tag = cls._only_tag(selector)
                if tag:
                    order_pairs.append((tag, order_val))

        # Global tag ordering, ascending by the numeric 'order' value.
        rules["orders"] = [t for t, _ in sorted(order_pairs, key=lambda x: x[1])]
        return rules
615
+
616
    @staticmethod
    def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
        """
        Render the HTML using pre-parsed Rules.

        :param html_str: Obfuscated chapter HTML.
        :param rules: Normalized rules from :meth:`_parse_css_rules`.
        :return: (rendered paragraphs joined by newlines, list of strings
            that were rendered x-flipped and must be OCR-matched mirrored).
        """
        tree = html.fromstring(html_str)
        paragraphs_out: list[str] = []
        refl_list: list[str] = []
        orders = rules.get("orders") or []
        p_rules = rules.get("p_rules") or {}
        sy_rules = rules.get("sy") or {}

        def _class_list(el: html.HtmlElement) -> list[str]:
            # Split the element's class attribute into individual names.
            cls = el.get("class")
            return cls.split() if cls else []

        def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
            # Apply a single Rule to one element and return its visible text.
            if rule.get("delete_all"):
                return ""

            parts: list[str] = []
            if "append_start_char" in rule:
                parts.append(rule["append_start_char"])
            if "append_start_attr" in rule:
                parts.append(el.get(rule["append_start_attr"], ""))

            text = el.text or ""
            if rule.get("delete_first") and text:
                text = text[1:]
            parts.append(text)

            if "append_end_char" in rule:
                parts.append(rule["append_end_char"])
            if "append_end_attr" in rule:
                parts.append(el.get(rule["append_end_attr"], ""))

            s = "".join(parts)

            # Mirrored strings are collected for flipped OCR matching.
            if rule.get("transform_flip_x") and s:
                refl_list.append(s)

            return s

        for p in tree.findall(".//p"):
            p_classes = _class_list(p)
            # First class starting with "p" selects the paragraph's rule set.
            p_key = next((c for c in p_classes if c.startswith("p")), None)
            has_ordered_rules = p_key in p_rules

            buf_parts: list[str] = []

            # Ordered paragraphs ignore raw text nodes; only ordered
            # children contribute (flushed below in `orders` order).
            if p.text and not has_ordered_rules:
                buf_parts.append(p.text)

            ordered_cache: dict[str, list[str]] = {}

            for child in p:
                tag = str(child.tag)

                # Handle inline <y class="sy-*"> spans
                if tag == "y" and not has_ordered_rules:
                    y_cls = next(
                        (c for c in _class_list(child) if c.startswith("sy-")), None
                    )
                    if y_cls and y_cls in sy_rules:
                        buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
                    else:
                        buf_parts.append(child.text or "")
                    if child.tail:
                        buf_parts.append(child.tail)
                    continue

                # Handle ordered paragraphs: only cache tags that appear in `orders`
                if p_key and has_ordered_rules and tag in orders:
                    rule = p_rules[p_key].get(tag, {})
                    ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
                    continue

                # Non-ordered, non-<y> nodes: include text + tails as-is
                if not has_ordered_rules:
                    buf_parts.append(child.text or "")
                    if child.tail:
                        buf_parts.append(child.tail)

            # If ordered, flush in global orders with all duplicates preserved
            if has_ordered_rules:
                for tag in orders:
                    if tag in ordered_cache:
                        buf_parts.extend(ordered_cache[tag])

            para = "".join(buf_parts)
            if para:
                paragraphs_out.append(para)

        return "\n".join(paragraphs_out), refl_list