novel-downloader 2.0.0-py3-none-any.whl → 2.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +3 -3
  3. novel_downloader/cli/export.py +1 -1
  4. novel_downloader/cli/ui.py +7 -7
  5. novel_downloader/config/adapter.py +191 -154
  6. novel_downloader/core/__init__.py +5 -6
  7. novel_downloader/core/exporters/common/txt.py +9 -9
  8. novel_downloader/core/exporters/linovelib/txt.py +9 -9
  9. novel_downloader/core/fetchers/qidian.py +20 -35
  10. novel_downloader/core/interfaces/fetcher.py +2 -2
  11. novel_downloader/core/interfaces/parser.py +2 -2
  12. novel_downloader/core/parsers/base.py +1 -0
  13. novel_downloader/core/parsers/eightnovel.py +2 -2
  14. novel_downloader/core/parsers/esjzone.py +3 -3
  15. novel_downloader/core/parsers/qidian/main_parser.py +747 -12
  16. novel_downloader/core/parsers/qidian/utils/__init__.py +2 -21
  17. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  18. novel_downloader/core/parsers/xiguashuwu.py +6 -12
  19. novel_downloader/locales/en.json +3 -3
  20. novel_downloader/locales/zh.json +3 -3
  21. novel_downloader/utils/__init__.py +0 -2
  22. novel_downloader/utils/chapter_storage.py +2 -3
  23. novel_downloader/utils/constants.py +1 -3
  24. novel_downloader/utils/cookies.py +32 -17
  25. novel_downloader/utils/crypto_utils/__init__.py +0 -6
  26. novel_downloader/utils/crypto_utils/rc4.py +40 -50
  27. novel_downloader/utils/epub/__init__.py +2 -3
  28. novel_downloader/utils/epub/builder.py +6 -6
  29. novel_downloader/utils/epub/constants.py +5 -5
  30. novel_downloader/utils/epub/documents.py +7 -7
  31. novel_downloader/utils/epub/models.py +8 -8
  32. novel_downloader/utils/epub/utils.py +10 -10
  33. novel_downloader/utils/file_utils/io.py +48 -73
  34. novel_downloader/utils/file_utils/normalize.py +1 -7
  35. novel_downloader/utils/file_utils/sanitize.py +4 -11
  36. novel_downloader/utils/fontocr/__init__.py +13 -0
  37. novel_downloader/utils/{fontocr.py → fontocr/core.py} +70 -61
  38. novel_downloader/utils/fontocr/loader.py +50 -0
  39. novel_downloader/utils/logger.py +80 -56
  40. novel_downloader/utils/network.py +16 -40
  41. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  42. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  43. novel_downloader/utils/time_utils/sleep_utils.py +53 -43
  44. novel_downloader/web/main.py +1 -1
  45. novel_downloader/web/pages/search.py +3 -3
  46. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/METADATA +2 -1
  47. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/RECORD +51 -55
  48. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
  49. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
  50. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
  51. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  52. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
  53. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
  54. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  55. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +0 -0
  56. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  57. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -8,27 +8,59 @@ Main parser class for handling Qidian HTML
 
 from __future__ import annotations
 
+import json
 import logging
+import re
+from contextlib import suppress
+from html import unescape
 from pathlib import Path
-from typing import Any
+from typing import Any, TypedDict
+
+from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
 from novel_downloader.core.parsers.registry import register_parser
 from novel_downloader.models import (
     BookInfoDict,
     ChapterDict,
+    ChapterInfoDict,
     ParserConfig,
+    VolumeInfoDict,
+)
+from novel_downloader.utils import (
+    download,
+    truncate_half_lines,
 )
 from novel_downloader.utils.constants import DATA_DIR
 from novel_downloader.utils.cookies import get_cookie_value
+from novel_downloader.utils.fontocr import get_font_ocr
 
-from .book_info_parser import parse_book_info
-from .chapter_router import parse_chapter
-from .utils import is_encrypted
+from .utils import (
+    get_decryptor,
+)
 
 logger = logging.getLogger(__name__)
 
 
+class Rule(TypedDict, total=False):
+    delete_all: bool
+    delete_first: bool
+    transform_flip_x: bool
+    append_start_char: str
+    append_end_char: str
+    append_start_attr: str
+    append_end_attr: str
+
+
+class Rules(TypedDict):
+    # e.g., orders = ["i", "em", "span"]
+    orders: list[str]
+    # e.g., sy["sy-3"] -> Rule
+    sy: dict[str, Rule]
+    # e.g., p_rules["p3"]["i"] -> Rule
+    p_rules: dict[str, dict[str, Rule]]
+
+
 @register_parser(
     site_keys=["qidian", "qd"],
 )
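
(Aside, not part of the diff: going by the comments on the new `Rule`/`Rules` TypedDicts, a parsed rule set plausibly looks like the sketch below; all values are invented for illustration.)

```python
# Hypothetical Rules value -- keys from the TypedDicts above, values invented.
rules: Rules = {
    "orders": ["i", "em", "span"],  # flush order for tags inside ordered <p> blocks
    "sy": {
        "sy-3": {"transform_flip_x": True},  # <y class="sy-3"> renders mirrored
    },
    "p_rules": {
        "p3": {"i": {"delete_all": True}},  # <i> inside <p class="p3"> is decoy text
    },
}
```
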
@@ -37,6 +69,10 @@ class QidianParser(BaseParser):
     Parser for 起点中文网 site.
     """
 
+    _RE_P_DELIM = re.compile(r"(?i)<\s*p\s*>")
+    _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
+    _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
+
     def __init__(
         self,
         config: ParserConfig,
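
(Aside, not part of the diff: the three class-level patterns in the hunk above can be checked standalone; a minimal sketch with invented inputs.)

```python
import re

# The same three patterns as in the hunk above, exercised on invented inputs.
_RE_P_DELIM = re.compile(r"(?i)<\s*p\s*>")              # paragraph delimiter
_RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)  # CSS attr() argument
_RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)   # horizontal mirror

assert _RE_P_DELIM.split("a<p>b< P >c") == ["a", "b", "c"]
assert _RE_ATTR.search("content: attr(data-x)").group(1) == "data-x"
assert _RE_SCALEX.search("transform: scaleX(-1)") is not None
```
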
@@ -71,7 +107,64 @@ class QidianParser(BaseParser):
         """
         if not html_list:
             return None
-        return parse_book_info(html_list[0])
+
+        doc = html.fromstring(html_list[0])
+
+        book_name = self._first_str(doc.xpath('//h1[@id="bookName"]/text()'))
+        author = self._first_str(doc.xpath('//a[@class="writer-name"]/text()'))
+
+        book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
+        cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"
+
+        update_time = self._first_str(
+            doc.xpath('//span[@class="update-time"]/text()'),
+            replaces=[("更新时间:", "")],
+        )
+        serial_status = self._first_str(
+            doc.xpath('//p[@class="book-attribute"]/span[1]/text()')
+        )
+
+        tags = [
+            t.strip()
+            for t in doc.xpath('//p[contains(@class,"all-label")]//a/text()')
+            if t.strip()
+        ]
+
+        word_count = self._first_str(doc.xpath('//p[@class="count"]/em[1]/text()'))
+        summary_brief = self._first_str(doc.xpath('//p[@class="intro"]/text()'))
+
+        raw_lines = [
+            s.strip()
+            for s in doc.xpath('//p[@id="book-intro-detail"]//text()')
+            if s.strip()
+        ]
+        summary = "\n".join(raw_lines)
+
+        volumes: list[VolumeInfoDict] = []
+        for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
+            vol_name = self._first_str(vol.xpath('.//h3[@class="volume-name"]/text()'))
+            vol_name = vol_name.split(chr(183))[0].strip()
+            chapters: list[ChapterInfoDict] = []
+            for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
+                title = self._first_str(li.xpath('.//a[@class="chapter-name"]/text()'))
+                url = self._first_str(li.xpath('.//a[@class="chapter-name"]/@href'))
+                cid = url.rstrip("/").split("/")[-1] if url else ""
+                chapters.append({"title": title, "url": url, "chapterId": cid})
+            volumes.append({"volume_name": vol_name, "chapters": chapters})
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "serial_status": serial_status,
+            "tags": tags,
+            "summary_brief": summary_brief,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
 
     def parse_chapter(
         self,
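
(Aside, not part of the diff: the book-info parsing inlined above returns a plain dict; a sketch of its shape, with every value invented.)

```python
# Hypothetical parse_book_info result -- keys from the hunk above, values invented.
book_info = {
    "book_name": "Example Book",
    "author": "Example Author",
    "cover_url": "https://bookcover.yuewen.com/qdbimg/349573/1010000000/600.webp",
    "update_time": "2024-01-01 12:00:00",
    "word_count": "100000",
    "serial_status": "Ongoing",
    "tags": ["fantasy"],
    "summary_brief": "One-line teaser.",
    "summary": "Full multi-line description.",
    "volumes": [
        {
            "volume_name": "Volume One",
            "chapters": [
                {
                    "title": "Chapter 1",
                    "url": "/chapter/1010000000/2020000000/",
                    "chapterId": "2020000000",  # last path segment of url
                },
            ],
        },
    ],
    "extra": {},
}
```
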
@@ -86,16 +179,658 @@ class QidianParser(BaseParser):
         """
         if not html_list:
             return None
-        return parse_chapter(self, html_list[0], chapter_id)
+        try:
+            ssr_data = self._find_ssr_page_context(html_list[0])
+            chapter_info = self._extract_chapter_info(ssr_data)
+            if not chapter_info:
+                logger.warning(
+                    "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
+                )
+                return None
+
+            if not self._can_view_chapter(chapter_info):
+                logger.warning(
+                    "[Parser] Chapter '%s' is not purchased or inaccessible.",
+                    chapter_id,
+                )
+                return None
+
+            if self._is_encrypted(ssr_data):
+                if not self._decode_font:
+                    return None
+                return self.parse_encrypted_chapter(chapter_info, chapter_id)
+
+            return self.parse_normal_chapter(chapter_info, chapter_id)
+
+        except Exception as e:
+            logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
+            return None
+
+    def parse_normal_chapter(
+        self,
+        chapter_info: dict[str, Any],
+        chapter_id: str,
+    ) -> ChapterDict | None:
+        """
+        Extract structured chapter info from a normal Qidian page.
+
+        :param chapter_info: Parsed chapter info block from ssr data.
+        :param chapter_id: Chapter identifier (string).
+        :return: A dictionary with keys like 'id', 'title', 'content', etc.
+        """
+        duplicated = self._is_duplicated(chapter_info)
+
+        title = chapter_info.get("chapterName", "Untitled")
+        raw_html = chapter_info.get("content", "")
+        chapter_id = chapter_info.get("chapterId", chapter_id)
+        fkp = chapter_info.get("fkp", "")
+        author_say = chapter_info.get("authorSay", "").strip()
+        update_time = chapter_info.get("updateTime", "")
+        update_timestamp = chapter_info.get("updateTimestamp", 0)
+        modify_time = chapter_info.get("modifyTime", 0)
+        word_count = chapter_info.get("actualWords", 0)
+        seq = chapter_info.get("seq")
+        volume = chapter_info.get("extra", {}).get("volumeName", "")
+
+        if self._is_vip(chapter_info):
+            decryptor = get_decryptor()
+            raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, self._fuid)
+
+        parts = self._RE_P_DELIM.split(raw_html)
+        paragraphs = [unescape(p).strip() for p in parts if p.strip()]
+        chapter_text = "\n".join(paragraphs)
+        if not chapter_text:
+            return None
+
+        if self._use_truncation and duplicated:
+            chapter_text = truncate_half_lines(chapter_text)
+
+        return {
+            "id": str(chapter_id),
+            "title": title,
+            "content": chapter_text,
+            "extra": {
+                "author_say": author_say,
+                "updated_at": update_time,
+                "update_timestamp": update_timestamp,
+                "modify_time": modify_time,
+                "word_count": word_count,
+                "duplicated": duplicated,
+                "seq": seq,
+                "volume": volume,
+                "encrypted": False,
+            },
+        }
+
+    def parse_encrypted_chapter(
+        self,
+        chapter_info: dict[str, Any],
+        chapter_id: str,
+    ) -> ChapterDict | None:
+        """
+        Extract and return the formatted textual content of an encrypted chapter.
+
+        Steps:
+          1. Decode and save randomFont bytes; download fixedFont via download().
+          2. Parse CSS rules and save debug JSON.
+          3. Render encrypted paragraphs, then run OCR font-mapping.
+          4. Extract paragraph texts and format them.
+
+        :param chapter_info: Parsed chapter info block from ssr data.
+        :return: Chapter dict with the restored text, or None if not parsable.
+        """
+        debug_dir = self._debug_dir / "qidian" / "font_debug" / chapter_id
+        if self._save_font_debug:
+            debug_dir.mkdir(parents=True, exist_ok=True)
+
+        duplicated = self._is_duplicated(chapter_info)
+
+        css_str = chapter_info["css"]
+        randomFont_str = chapter_info["randomFont"]
+        fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
+
+        title = chapter_info.get("chapterName", "Untitled")
+        raw_html = chapter_info.get("content", "")
+        chapter_id = chapter_info.get("chapterId", chapter_id)
+        fkp = chapter_info.get("fkp", "")
+        author_say = chapter_info.get("authorSay", "").strip()
+        update_time = chapter_info.get("updateTime", "")
+        update_timestamp = chapter_info.get("updateTimestamp", 0)
+        modify_time = chapter_info.get("modifyTime", 0)
+        word_count = chapter_info.get("actualWords", 0)
+        seq = chapter_info.get("seq")
+        volume = chapter_info.get("extra", {}).get("volumeName", "")
+
+        # extract + save font
+        rf = json.loads(randomFont_str)
+        rand_path = self._base_cache_dir / "randomFont.ttf"
+        rand_path.parent.mkdir(parents=True, exist_ok=True)
+        rand_path.write_bytes(bytes(rf["data"]))
+
+        fixed_path = download(
+            url=fixedFontWoff2_url,
+            target_dir=self._fixed_font_dir,
+        )
+        if fixed_path is None:
+            logger.warning(
+                "[Parser] failed to download fixedfont for chapter '%s'", chapter_id
+            )
+            return None
+
+        # Extract and render paragraphs from HTML with CSS rules
+        if self._is_vip(chapter_info):
+            decryptor = get_decryptor()
+            raw_html = decryptor.decrypt(
+                raw_html,
+                chapter_id,
+                fkp,
+                self._fuid,
+            )
+
+        css_rules = self._parse_css_rules(css_str)
+        paragraphs_str, refl_list = self._render_visible_text(raw_html, css_rules)
+        if self._save_font_debug:
+            paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
+            paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
+
+        # Run OCR + fallback mapping
+        char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
+        refl_set = set(refl_list)
+        char_set = char_set - refl_set
+        if self._save_font_debug:
+            char_sets_path = debug_dir / "char_set_debug.txt"
+            temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
+            char_sets_path.write_text(
+                temp,
+                encoding="utf-8",
+            )
+
+        mapping_result = self._generate_font_map(
+            fixed_font_path=fixed_path,
+            random_font_path=rand_path,
+            char_set=char_set,
+            refl_set=refl_set,
+            cache_dir=self._base_cache_dir,
+            batch_size=self._config.batch_size,
+        )
+        if not mapping_result:
+            return None
+
+        if self._save_font_debug:
+            mapping_json_path = debug_dir / "font_mapping.json"
+            mapping_json_path.write_text(
+                json.dumps(mapping_result, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+
+        # Reconstruct final readable text
+        original_text = self._apply_font_mapping(
+            text=paragraphs_str,
+            font_map=mapping_result,
+        )
+
+        final_paragraphs_str = "\n".join(
+            line.strip() for line in original_text.splitlines() if line.strip()
+        )
+        if self._use_truncation and duplicated:
+            final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
+
+        return {
+            "id": str(chapter_id),
+            "title": str(title),
+            "content": final_paragraphs_str,
+            "extra": {
+                "author_say": author_say,
+                "updated_at": update_time,
+                "update_timestamp": update_timestamp,
+                "modify_time": modify_time,
+                "word_count": word_count,
+                "duplicated": duplicated,
+                "seq": seq,
+                "volume": volume,
+                "encrypted": True,
+            },
+        }
+
+    @staticmethod
+    def _find_ssr_page_context(html_str: str) -> dict[str, Any]:
+        """
+        Extract SSR JSON from <script id="vite-plugin-ssr_pageContext">.
+        """
+        tree = html.fromstring(html_str)
+        script = tree.xpath('//script[@id="vite-plugin-ssr_pageContext"]/text()')
+        return json.loads(script[0].strip()) if script else {}
+
+    @staticmethod
+    def _extract_chapter_info(ssr_data: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract the 'chapterInfo' dictionary from the SSR page context.
+
+        This handles nested key access and returns an empty dict if missing.
+
+        :param ssr_data: The full SSR data object from _find_ssr_page_context().
+        :return: A dict with chapter metadata such as chapterName, authorSay, etc.
+        """
+        page_context = ssr_data.get("pageContext", {})
+        page_props = page_context.get("pageProps", {})
+        page_data = page_props.get("pageData", {})
+        chapter_info = page_data.get("chapterInfo", {})
+        return chapter_info if isinstance(chapter_info, dict) else {}
+
+    @staticmethod
+    def _is_restricted_page(html_str: str) -> bool:
+        """
+        Return True if page content indicates access restriction
+        (e.g. not subscribed/purchased).
+
+        :param html_str: Raw HTML string.
+        """
+        markers = ["这是VIP章节", "需要订阅", "订阅后才能阅读"]
+        return any(m in html_str for m in markers)
 
-    def is_encrypted(self, html_str: str) -> bool:
+    @classmethod
+    def _is_vip(cls, chapter_info: dict[str, Any]) -> bool:
+        """
+        :return: True if VIP, False otherwise.
+        """
+        vip_flag = chapter_info.get("vipStatus", 0)
+        fens_flag = chapter_info.get("fEnS", 0)
+        return bool(vip_flag == 1 and fens_flag != 0)
+
+    @classmethod
+    def _can_view_chapter(cls, chapter_info: dict[str, Any]) -> bool:
+        """
+        A chapter is not viewable if it is marked as VIP
+        and has not been purchased.
+
+        :return: True if viewable, False otherwise.
+        """
+        is_buy = chapter_info.get("isBuy", 0)
+        vip_status = chapter_info.get("vipStatus", 0)
+        return not (vip_status == 1 and is_buy == 0)
+
+    @classmethod
+    def _is_duplicated(cls, chapter_info: dict[str, Any]) -> bool:
+        """
+        Check if chapter is marked as duplicated (eFW = 1).
+        """
+        efw_flag = chapter_info.get("eFW", 0)
+        return bool(efw_flag == 1)
+
+    @classmethod
+    def _is_encrypted(cls, content: str | dict[str, Any]) -> bool:
         """
         Return True if content is encrypted.
 
-        :param html: Raw HTML of the chapter page.
+        Chapter Encryption Status (cES):
+          * 0: plaintext content
+          * 2: font-encrypted content
+
+        :param content: Raw chapter HTML, or the parsed SSR data dict.
+        :return: True if the encrypted marker is found, else False.
+        """
+        ssr_data = (
+            cls._find_ssr_page_context(content) if isinstance(content, str) else content
+        )
+        chapter_info = cls._extract_chapter_info(ssr_data)
+        return int(chapter_info.get("cES", 0)) == 2
+
+    @staticmethod
+    def _generate_font_map(
+        fixed_font_path: Path,
+        random_font_path: Path,
+        char_set: set[str],
+        refl_set: set[str],
+        cache_dir: Path,
+        batch_size: int = 32,
+    ) -> dict[str, str]:
+        """
+        Build a mapping from scrambled font chars to real chars.
+
+        Uses OCR to decode and generate the mapping from a fixed obfuscated font
+        and a random obfuscated font. Results are cached in JSON.
+
+        :param fixed_font_path: fixed font file.
+        :param random_font_path: random font file.
+        :param char_set: Characters to match directly.
+        :param refl_set: Characters to match in flipped form.
+        :param cache_dir: Directory to save/load cached results.
+        :param batch_size: How many chars to OCR per batch.
+
+        :return: { obf_char: real_char, ... }
+        """
+        font_ocr = get_font_ocr()
+        if not font_ocr:
+            return {}
+
+        mapping_result: dict[str, str] = {}
+        fixed_map_file = cache_dir / "fixed_font_map" / f"{fixed_font_path.stem}.json"
+        fixed_map_file.parent.mkdir(parents=True, exist_ok=True)
+
+        # load existing cache
+        try:
+            with open(fixed_map_file, encoding="utf-8") as f:
+                fixed_map = json.load(f)
+            cached_chars = set(fixed_map.keys())
+            mapping_result.update(
+                {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
+            )
+            mapping_result.update(
+                {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
+            )
+            char_set = char_set - cached_chars
+            refl_set = refl_set - cached_chars
+        except Exception:
+            fixed_map = {}
+            cached_chars = set()
+
+        # prepare font renderers and cmap sets
+        fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
+        random_chars = font_ocr.extract_font_charset(random_font_path)
+        fixed_font = font_ocr.load_render_font(fixed_font_path)
+        random_font = font_ocr.load_render_font(random_font_path)
+
+        # process normal and reflected sets together
+        rendered = []
+        for chars, reflect in [(char_set, False), (refl_set, True)]:
+            for ch in chars:
+                if ch in fixed_chars:
+                    font = fixed_font
+                elif ch in random_chars:
+                    font = random_font
+                else:
+                    continue
+                rendered.append(
+                    (ch, font_ocr.render_char_image_array(ch, font, reflect))
+                )
+
+        if rendered:
+            # query OCR + vec simultaneously
+            imgs_to_query = [img for _, img in rendered]
+            fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)
+
+            # pick best per char, apply threshold + cache
+            for (ch, _), preds in zip(rendered, fused, strict=False):
+                if not preds:
+                    continue
+                real_char, _ = preds
+                mapping_result[ch] = real_char
+                fixed_map[ch] = real_char
+
+        # persist updated fixed_map
+        try:
+            with open(fixed_map_file, "w", encoding="utf-8") as f:
+                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
+        except Exception as e:
+            logger.error("[FontOCR] Failed to save fixed map: %s", e)
+
+        return mapping_result
+
+    @staticmethod
+    def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
         """
-        return is_encrypted(html_str)
+        Replace each character in `text` using `font_map`,
+        leaving unmapped characters unchanged.
+
+        :param text: The input string, possibly containing obfuscated font chars.
+        :param font_map: A dict mapping obfuscated chars to real chars.
+        :return: The de-obfuscated text.
+        """
+        return "".join(font_map.get(ch, ch) for ch in text)
+
+    @staticmethod
+    def _only_tag(selector: str) -> str | None:
+        """
+        Normalize a selector into just its tag name for ordering.
+
+        Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
+
+        Returns None if a tag can't be extracted.
+        """
+        sel = selector.strip()
+        # If it has spaces, take the rightmost simple selector
+        last = sel.split()[-1]
+        # Drop ::pseudo
+        last = last.split("::", 1)[0]
+        # If it's like 'span[attr=..]', keep 'span'
+        last = last.split("[", 1)[0]
+        # If it starts with '.', it's not a tag
+        if not last or last.startswith("."):
+            return None
+        return last
+
+    @staticmethod
+    def _parse_decls(block: str) -> list[tuple[str, str]]:
+        """
+        Parse 'name:value;...' inside a block. Tolerates quotes and attr().
+        """
+        decls: list[tuple[str, str]] = []
+        i = 0
+        n = len(block)
+        name: list[str] = []
+        val: list[str] = []
+        in_name = True
+        quote = None  # track ' or "
+        while i < n:
+            c = block[i]
+            if quote:
+                # inside quotes
+                if c == "\\" and i + 1 < n:
+                    # keep escaped char
+                    (name if in_name else val).append(c)
+                    i += 1
+                    (name if in_name else val).append(block[i])
+                elif c == quote:
+                    (name if in_name else val).append(c)
+                    quote = None
+                else:
+                    (name if in_name else val).append(c)
+            else:
+                if c in ("'", '"'):
+                    (name if in_name else val).append(c)
+                    quote = c
+                elif in_name and c == ":":
+                    in_name = False
+                elif c == ";":
+                    nm = "".join(name).strip().lower()
+                    vl = "".join(val).strip()
+                    if nm:
+                        decls.append((nm, vl))
+                    name.clear()
+                    val.clear()
+                    in_name = True
+                else:
+                    (name if in_name else val).append(c)
+            i += 1
+
+        if name or val:
+            nm = "".join(name).strip().lower()
+            vl = "".join(val).strip()
+            if nm:
+                decls.append((nm, vl))
+        return decls
+
+    @classmethod
+    def _parse_css_rules(cls, css_str: str) -> Rules:
+        """
+        Produces normalized Rules with:
+          * orders: list[str] of tag names sorted by numeric 'order'
+          * sy: '.sy-*' class rules
+          * p_rules: '.p* <tag>' rules, indexed by p-class then tag
+        """
+        rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
+        order_pairs: list[tuple[str, int]] = []
+
+        i = 0
+        while True:
+            b1 = css_str.find("{", i)
+            if b1 == -1:
+                break
+            selector = css_str[i:b1].strip().lower()
+            b2 = css_str.find("}", b1 + 1)
+            if b2 == -1:
+                break
+            block = css_str[b1 + 1 : b2]
+            i = b2 + 1
+
+            decls = cls._parse_decls(block)
+
+            new_rule: Rule = {}
+            order_val: int | None = None
+
+            for name, value in decls:
+                v = value.strip()
+                if name == "font-size" and v == "0":
+                    if "::first-letter" in selector:
+                        new_rule["delete_first"] = True
+                    else:
+                        new_rule["delete_all"] = True
+                elif name == "transform":
+                    if cls._RE_SCALEX.search(v.replace(" ", "")):
+                        new_rule["transform_flip_x"] = True
+                elif name == "order":
+                    with suppress(ValueError, TypeError):
+                        order_val = int(v)
+                elif name == "content":
+                    # normalize: remove outer quotes
+                    if "::after" in selector:
+                        m = cls._RE_ATTR.search(v)
+                        if m:
+                            new_rule["append_end_attr"] = m.group(1)
+                        else:
+                            s = v.strip().strip("\"'")
+                            new_rule["append_end_char"] = s
+                    elif "::before" in selector:
+                        m = cls._RE_ATTR.search(v)
+                        if m:
+                            new_rule["append_start_attr"] = m.group(1)
+                        else:
+                            s = v.strip().strip("\"'")
+                            new_rule["append_start_char"] = s
+
+            # classification
+            if selector.startswith(".sy-"):
+                key = selector.lstrip(".")
+                old = rules["sy"].get(key)
+                rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
+
+            elif selector.startswith(".p") and " " in selector:
+                p_cls, right = selector.split(" ", 1)
+                p_cls = p_cls.lstrip(".")
+                tag = cls._only_tag(right)
+                if tag:
+                    prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
+                    rules["p_rules"][p_cls][tag] = (
+                        {**prev, **new_rule} if prev else (new_rule or {})
+                    )
+
+            if order_val is not None:
+                tag_for_order = cls._only_tag(selector)
+                if tag_for_order:
+                    order_pairs.append((tag_for_order, order_val))
+
+        # normalize orders
+        order_pairs.sort(key=lambda t: t[1])
+        seen = set()
+        orders: list[str] = []
+        for tag, _ in order_pairs:
+            if tag not in seen:
+                seen.add(tag)
+                orders.append(tag)
+        rules["orders"] = orders
+        return rules
+
+    @staticmethod
+    def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
+        """
+        Render the HTML using pre-parsed Rules.
+        """
+        tree = html.fromstring(html_str)
+        paragraphs_out: list[str] = []
+        refl_list: list[str] = []
+        orders = rules.get("orders") or []
+        p_rules = rules.get("p_rules") or {}
+        sy_rules = rules.get("sy") or {}
+
+        def _class_list(el: html.HtmlElement) -> list[str]:
+            cls = el.get("class")
+            return cls.split() if cls else []
+
+        def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
+            if rule.get("delete_all"):
+                return ""
+
+            parts: list[str] = []
+            if "append_start_char" in rule:
+                parts.append(rule["append_start_char"])
+            if "append_start_attr" in rule:
+                parts.append(el.get(rule["append_start_attr"], ""))
+
+            text = el.text or ""
+            if rule.get("delete_first") and text:
+                text = text[1:]
+            parts.append(text)
+
+            if "append_end_char" in rule:
+                parts.append(rule["append_end_char"])
+            if "append_end_attr" in rule:
+                parts.append(el.get(rule["append_end_attr"], ""))
+
+            s = "".join(parts)
+
+            if rule.get("transform_flip_x") and s:
+                refl_list.append(s)
+
+            return s
+
+        for p in tree.findall(".//p"):
+            p_classes = _class_list(p)
+            p_key = next((c for c in p_classes if c.startswith("p")), None)
+            has_ordered_rules = p_key in p_rules
+
+            buf_parts: list[str] = []
+
+            if p.text and not has_ordered_rules:
+                buf_parts.append(p.text)
+
+            ordered_cache: dict[str, list[str]] = {}
+
+            for child in p:
+                tag = str(child.tag)
+
+                # Handle inline <y class="sy-*"> spans
+                if tag == "y" and not has_ordered_rules:
+                    y_cls = next(
+                        (c for c in _class_list(child) if c.startswith("sy-")), None
+                    )
+                    if y_cls and y_cls in sy_rules:
+                        buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
+                    else:
+                        buf_parts.append(child.text or "")
+                    if child.tail:
+                        buf_parts.append(child.tail)
+                    continue
+
+                # Handle ordered paragraphs: only cache tags that appear in `orders`
+                if p_key and has_ordered_rules and tag in orders:
+                    rule = p_rules[p_key].get(tag, {})
+                    ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
+                    continue
+
+                # Non-ordered, non-<y> nodes: include text + tails as-is
+                if not has_ordered_rules:
+                    buf_parts.append(child.text or "")
+                    if child.tail:
+                        buf_parts.append(child.tail)
+
+            # If ordered, flush in global order with all duplicates preserved
+            if has_ordered_rules:
+                for tag in orders:
+                    if tag in ordered_cache:
+                        buf_parts.extend(ordered_cache[tag])
+
+            para = "".join(buf_parts)
+            if para:
+                paragraphs_out.append(para)
 
-    @property
-    def save_font_debug(self) -> bool:
-        return self._config.save_font_debug
+        return "\n".join(paragraphs_out), refl_list