novel-downloader 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +14 -11
  3. novel_downloader/cli/export.py +19 -19
  4. novel_downloader/cli/ui.py +35 -8
  5. novel_downloader/config/adapter.py +216 -153
  6. novel_downloader/core/__init__.py +5 -6
  7. novel_downloader/core/archived/deqixs/fetcher.py +1 -28
  8. novel_downloader/core/downloaders/__init__.py +2 -0
  9. novel_downloader/core/downloaders/base.py +34 -85
  10. novel_downloader/core/downloaders/common.py +147 -171
  11. novel_downloader/core/downloaders/qianbi.py +30 -64
  12. novel_downloader/core/downloaders/qidian.py +157 -184
  13. novel_downloader/core/downloaders/qqbook.py +292 -0
  14. novel_downloader/core/downloaders/registry.py +2 -2
  15. novel_downloader/core/exporters/__init__.py +2 -0
  16. novel_downloader/core/exporters/base.py +37 -59
  17. novel_downloader/core/exporters/common.py +620 -0
  18. novel_downloader/core/exporters/linovelib.py +47 -0
  19. novel_downloader/core/exporters/qidian.py +41 -12
  20. novel_downloader/core/exporters/qqbook.py +28 -0
  21. novel_downloader/core/exporters/registry.py +2 -2
  22. novel_downloader/core/fetchers/__init__.py +4 -2
  23. novel_downloader/core/fetchers/aaatxt.py +2 -22
  24. novel_downloader/core/fetchers/b520.py +3 -23
  25. novel_downloader/core/fetchers/base.py +80 -105
  26. novel_downloader/core/fetchers/biquyuedu.py +2 -22
  27. novel_downloader/core/fetchers/dxmwx.py +10 -22
  28. novel_downloader/core/fetchers/esjzone.py +6 -29
  29. novel_downloader/core/fetchers/guidaye.py +2 -22
  30. novel_downloader/core/fetchers/hetushu.py +9 -29
  31. novel_downloader/core/fetchers/i25zw.py +2 -16
  32. novel_downloader/core/fetchers/ixdzs8.py +2 -16
  33. novel_downloader/core/fetchers/jpxs123.py +2 -16
  34. novel_downloader/core/fetchers/lewenn.py +2 -22
  35. novel_downloader/core/fetchers/linovelib.py +4 -20
  36. novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
  37. novel_downloader/core/fetchers/piaotia.py +2 -16
  38. novel_downloader/core/fetchers/qbtr.py +2 -16
  39. novel_downloader/core/fetchers/qianbi.py +1 -20
  40. novel_downloader/core/fetchers/qidian.py +27 -68
  41. novel_downloader/core/fetchers/qqbook.py +177 -0
  42. novel_downloader/core/fetchers/quanben5.py +9 -29
  43. novel_downloader/core/fetchers/rate_limiter.py +22 -53
  44. novel_downloader/core/fetchers/sfacg.py +3 -16
  45. novel_downloader/core/fetchers/shencou.py +2 -16
  46. novel_downloader/core/fetchers/shuhaige.py +2 -22
  47. novel_downloader/core/fetchers/tongrenquan.py +2 -22
  48. novel_downloader/core/fetchers/ttkan.py +3 -14
  49. novel_downloader/core/fetchers/wanbengo.py +2 -22
  50. novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
  51. novel_downloader/core/fetchers/xiguashuwu.py +4 -20
  52. novel_downloader/core/fetchers/xs63b.py +3 -15
  53. novel_downloader/core/fetchers/xshbook.py +2 -22
  54. novel_downloader/core/fetchers/yamibo.py +4 -28
  55. novel_downloader/core/fetchers/yibige.py +13 -26
  56. novel_downloader/core/interfaces/exporter.py +19 -7
  57. novel_downloader/core/interfaces/fetcher.py +23 -49
  58. novel_downloader/core/interfaces/parser.py +2 -2
  59. novel_downloader/core/parsers/__init__.py +4 -2
  60. novel_downloader/core/parsers/b520.py +2 -2
  61. novel_downloader/core/parsers/base.py +5 -39
  62. novel_downloader/core/parsers/esjzone.py +3 -3
  63. novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +7 -7
  64. novel_downloader/core/parsers/qidian.py +717 -0
  65. novel_downloader/core/parsers/qqbook.py +709 -0
  66. novel_downloader/core/parsers/xiguashuwu.py +8 -15
  67. novel_downloader/core/searchers/__init__.py +2 -2
  68. novel_downloader/core/searchers/b520.py +1 -1
  69. novel_downloader/core/searchers/base.py +2 -2
  70. novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
  71. novel_downloader/locales/en.json +3 -3
  72. novel_downloader/locales/zh.json +3 -3
  73. novel_downloader/models/__init__.py +2 -0
  74. novel_downloader/models/book.py +1 -0
  75. novel_downloader/models/config.py +12 -0
  76. novel_downloader/resources/config/settings.toml +23 -5
  77. novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
  78. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
  79. novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
  80. novel_downloader/utils/__init__.py +0 -2
  81. novel_downloader/utils/chapter_storage.py +2 -3
  82. novel_downloader/utils/constants.py +7 -3
  83. novel_downloader/utils/cookies.py +32 -17
  84. novel_downloader/utils/crypto_utils/__init__.py +0 -6
  85. novel_downloader/utils/crypto_utils/aes_util.py +1 -1
  86. novel_downloader/utils/crypto_utils/rc4.py +40 -50
  87. novel_downloader/utils/epub/__init__.py +2 -3
  88. novel_downloader/utils/epub/builder.py +6 -6
  89. novel_downloader/utils/epub/constants.py +1 -6
  90. novel_downloader/utils/epub/documents.py +7 -7
  91. novel_downloader/utils/epub/models.py +8 -8
  92. novel_downloader/utils/epub/utils.py +10 -10
  93. novel_downloader/utils/file_utils/io.py +48 -73
  94. novel_downloader/utils/file_utils/normalize.py +1 -7
  95. novel_downloader/utils/file_utils/sanitize.py +4 -11
  96. novel_downloader/utils/fontocr/__init__.py +13 -0
  97. novel_downloader/utils/{fontocr.py → fontocr/core.py} +72 -61
  98. novel_downloader/utils/fontocr/loader.py +52 -0
  99. novel_downloader/utils/logger.py +80 -56
  100. novel_downloader/utils/network.py +16 -40
  101. novel_downloader/utils/node_decryptor/__init__.py +13 -0
  102. novel_downloader/utils/node_decryptor/decryptor.py +342 -0
  103. novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
  104. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  105. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  106. novel_downloader/utils/time_utils/sleep_utils.py +53 -43
  107. novel_downloader/web/main.py +1 -1
  108. novel_downloader/web/pages/download.py +1 -1
  109. novel_downloader/web/pages/search.py +4 -4
  110. novel_downloader/web/services/task_manager.py +2 -0
  111. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +5 -1
  112. novel_downloader-2.0.2.dist-info/RECORD +203 -0
  113. novel_downloader/core/exporters/common/__init__.py +0 -11
  114. novel_downloader/core/exporters/common/epub.py +0 -198
  115. novel_downloader/core/exporters/common/main_exporter.py +0 -64
  116. novel_downloader/core/exporters/common/txt.py +0 -146
  117. novel_downloader/core/exporters/epub_util.py +0 -215
  118. novel_downloader/core/exporters/linovelib/__init__.py +0 -11
  119. novel_downloader/core/exporters/linovelib/epub.py +0 -349
  120. novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
  121. novel_downloader/core/exporters/linovelib/txt.py +0 -139
  122. novel_downloader/core/exporters/txt_util.py +0 -67
  123. novel_downloader/core/parsers/qidian/__init__.py +0 -10
  124. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
  125. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
  126. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
  127. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  128. novel_downloader/core/parsers/qidian/main_parser.py +0 -101
  129. novel_downloader/core/parsers/qidian/utils/__init__.py +0 -30
  130. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
  131. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
  132. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
  133. novel_downloader-2.0.0.dist-info/RECORD +0 -210
  134. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
  135. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
  136. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
  137. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,717 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.qidian
4
+ ------------------------------------
5
+
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ from contextlib import suppress
13
+ from html import unescape
14
+ from pathlib import Path
15
+ from typing import Any, TypedDict
16
+
17
+ from lxml import html
18
+
19
+ from novel_downloader.core.parsers.base import BaseParser
20
+ from novel_downloader.core.parsers.registry import register_parser
21
+ from novel_downloader.models import (
22
+ BookInfoDict,
23
+ ChapterDict,
24
+ ChapterInfoDict,
25
+ ParserConfig,
26
+ VolumeInfoDict,
27
+ )
28
+ from novel_downloader.utils import (
29
+ download,
30
+ truncate_half_lines,
31
+ )
32
+ from novel_downloader.utils.constants import DATA_DIR
33
+ from novel_downloader.utils.cookies import get_cookie_value
34
+ from novel_downloader.utils.fontocr import get_font_ocr
35
+ from novel_downloader.utils.node_decryptor import get_decryptor
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
class Rule(TypedDict, total=False):
    """Per-selector text transformation derived from one CSS rule block.

    All keys are optional (``total=False``); an absent key means
    "no effect" for that transformation.
    """

    delete_all: bool          # font-size:0 on the element -> drop its text entirely
    delete_first: bool        # font-size:0 on ::first-letter -> drop the first char
    transform_flip_x: bool    # transform: scaleX(-1) -> rendered glyph is mirrored
    append_start_char: str    # ::before { content: "..." } literal to prepend
    append_end_char: str      # ::after { content: "..." } literal to append
    append_start_attr: str    # ::before { content: attr(name) } attribute to prepend
    append_end_attr: str      # ::after { content: attr(name) } attribute to append
48
+
49
+
50
class Rules(TypedDict):
    """Aggregated CSS rules for one chapter's obfuscated markup."""

    # Tag names sorted by their numeric CSS 'order' value,
    # e.g. orders = ["i", "em", "span"]
    orders: list[str]
    # Rules for inline <y class="sy-*"> spans, keyed by class,
    # e.g. sy["sy-3"] -> Rule
    sy: dict[str, Rule]
    # Rules for tags inside '.p*' paragraphs, keyed by p-class then tag,
    # e.g. p_rules["p3"]["i"] -> Rule
    p_rules: dict[str, dict[str, Rule]]
57
+
58
+
59
@register_parser(
    site_keys=["qidian", "qd"],
)
class QidianParser(BaseParser):
    """
    Parser for 起点中文网 site.
    """

    def __init__(self, config: ParserConfig, fuid: str = ""):
        """
        Initialize the QidianParser with the given configuration.

        :param config: Parser configuration forwarded to BaseParser
            (cache directories, font-decode flags, etc.).
        :param fuid: Qidian user id (the "ywguid" cookie value); when
            empty it is looked up in the saved session-state cookie file.
        """
        super().__init__(config)

        # Work files for font decryption live under the shared cache dir.
        self._rand_path = self._base_cache_dir / "qidian" / "randomFont.ttf"
        self._fixed_font_dir = self._base_cache_dir / "qidian" / "fixed_fonts"
        self._fixed_map_dir = self._base_cache_dir / "qidian" / "fixed_font_map"
        # Debug artifacts are written relative to the current working dir.
        self._debug_dir = Path.cwd() / "debug" / "qidian"

        state_files = [
            DATA_DIR / "qidian" / "session_state.cookies",
        ]
        # Prefer the explicit fuid; otherwise read "ywguid" from the cookie jar.
        self._fuid: str = fuid or get_cookie_value(state_files, "ywguid")
82
+
83
def parse_book_info(
    self,
    html_list: list[str],
    **kwargs: Any,
) -> BookInfoDict | None:
    """
    Extract book metadata and the volume/chapter catalog from the
    first HTML page of a Qidian book-info page.

    :param html_list: Raw HTML pages; only the first entry is used.
    :return: Populated BookInfoDict, or None when no HTML was given.
    """
    if not html_list:
        return None

    doc = html.fromstring(html_list[0])

    # The cover URL is derived from the numeric book id on the cover link.
    bid = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]

    # Walk the catalog: one entry per volume, each with its chapter list.
    volumes: list[VolumeInfoDict] = []
    for vol_el in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
        raw_name = self._first_str(vol_el.xpath('.//h3[@class="volume-name"]/text()'))
        chap_items: list[ChapterInfoDict] = []
        for li in vol_el.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
            link = self._first_str(li.xpath('.//a[@class="chapter-name"]/@href'))
            chap_items.append(
                {
                    "title": self._first_str(
                        li.xpath('.//a[@class="chapter-name"]/text()')
                    ),
                    "url": link,
                    # Chapter id is the last path segment of the chapter URL.
                    "chapterId": link.rstrip("/").split("/")[-1] if link else "",
                }
            )
        volumes.append(
            {
                # Volume names carry a '·'-separated suffix; keep the first part.
                "volume_name": raw_name.split(chr(183))[0].strip(),
                "chapters": chap_items,
            }
        )

    intro_lines = [
        s.strip()
        for s in doc.xpath('//p[@id="book-intro-detail"]//text()')
        if s.strip()
    ]

    return {
        "book_name": self._first_str(doc.xpath('//h1[@id="bookName"]/text()')),
        "author": self._first_str(doc.xpath('//a[@class="writer-name"]/text()')),
        "cover_url": f"https://bookcover.yuewen.com/qdbimg/349573/{bid}/600.webp",
        "update_time": self._first_str(
            doc.xpath('//span[@class="update-time"]/text()'),
            replaces=[("更新时间:", "")],
        ),
        "word_count": self._first_str(doc.xpath('//p[@class="count"]/em[1]/text()')),
        "serial_status": self._first_str(
            doc.xpath('//p[@class="book-attribute"]/span[1]/text()')
        ),
        "tags": [
            t.strip()
            for t in doc.xpath('//p[contains(@class,"all-label")]//a/text()')
            if t.strip()
        ],
        "summary_brief": self._first_str(doc.xpath('//p[@class="intro"]/text()')),
        "summary": "\n".join(intro_lines),
        "volumes": volumes,
        "extra": {},
    }
148
+
149
def parse_chapter(
    self,
    html_list: list[str],
    chapter_id: str,
    **kwargs: Any,
) -> ChapterDict | None:
    """
    Parse one chapter page into a ChapterDict.

    Pipeline: locate the SSR page-context JSON, extract chapterInfo,
    verify the chapter is viewable, decrypt VIP content via the Node
    helper, then decode either normal or font-encrypted text.

    :param html_list: Raw HTML pages; only the first entry is used.
    :param chapter_id: Chapter identifier (fallback id and log context).
    :return: Parsed chapter dict, or None on any unrecoverable condition.
    """
    if not html_list:
        logger.warning("[Parser] chapter_id=%s :: html_list is empty", chapter_id)
        return None
    try:
        ssr_data = self._find_ssr_page_context(html_list[0])
        chapter_info = self._extract_chapter_info(ssr_data)
    except Exception as e:
        logger.warning(
            "[Parser] chapter_id=%s :: failed to locate ssr_pageContext block: %s",
            chapter_id,
            e,
        )
        return None

    if not chapter_info:
        logger.warning(
            "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
        )
        return None

    # Unpurchased VIP chapters contain no readable content.
    if not self._can_view_chapter(chapter_info):
        logger.warning(
            "[Parser] Chapter '%s' is not purchased or inaccessible.",
            chapter_id,
        )
        return None

    duplicated = self._is_duplicated(chapter_info)
    encrypted = self._is_encrypted(chapter_info)

    title = chapter_info.get("chapterName", "Untitled")
    raw_html = chapter_info.get("content", "")
    cid = str(chapter_info.get("chapterId") or chapter_id)
    fkp = chapter_info.get("fkp", "")
    author_say = chapter_info.get("authorSay", "").strip()
    update_time = chapter_info.get("updateTime", "")
    update_timestamp = chapter_info.get("updateTimestamp", 0)
    modify_time = chapter_info.get("modifyTime", 0)
    word_count = chapter_info.get("actualWords", 0)
    seq = chapter_info.get("seq")
    volume = chapter_info.get("extra", {}).get("volumeName", "")

    # VIP content arrives encrypted; decrypt it before any text parsing.
    if self._is_vip(chapter_info):
        decryptor = get_decryptor()
        raw_html = decryptor.decrypt_qd(raw_html, cid, fkp, self._fuid)

    # Font-encrypted chapters need the OCR font-mapping path.
    chapter_text = (
        self._parse_font_encrypted(raw_html, chapter_info, cid)
        if encrypted
        else self._parse_normal(raw_html)
    )
    if not chapter_text:
        logger.warning(
            "[Parser] chapter_id=%s :: content empty after decryption/font-mapping",
            chapter_id,
        )
        return None

    # Chapters flagged as duplicated repeat content; truncate when enabled.
    if self._use_truncation and duplicated:
        chapter_text = truncate_half_lines(chapter_text)

    return {
        "id": cid,
        "title": title,
        "content": chapter_text,
        "extra": {
            "author_say": author_say,
            "updated_at": update_time,
            "update_timestamp": update_timestamp,
            "modify_time": modify_time,
            "word_count": word_count,
            "duplicated": duplicated,
            "seq": seq,
            "volume": volume,
            "encrypted": encrypted,
        },
    }
232
+
233
+ def _parse_normal(self, raw_html: str) -> str:
234
+ """
235
+ Extract structured chapter content from a normal Qidian page.
236
+ """
237
+ parts = raw_html.split("<p>")
238
+ paragraphs = [unescape(p).strip() for p in parts if p.strip()]
239
+ chapter_text = "\n".join(paragraphs)
240
+ if not chapter_text:
241
+ return ""
242
+ return chapter_text
243
+
244
def _parse_font_encrypted(
    self,
    raw_html: str,
    chapter_info: dict[str, Any],
    cid: str,
) -> str:
    """
    Decode a font-encrypted chapter into readable text.

    Steps:
      1. Decode and save randomFont bytes; download fixedFont via download().
      2. Parse CSS rules and save debug JSON.
      3. Render encrypted paragraphs, then run OCR font-mapping.
      4. Extracts paragraph texts and formats them.

    :param raw_html: Decrypted chapter markup with obfuscated glyphs.
    :param chapter_info: SSR chapterInfo dict (css, randomFont, fixedFontWoff2).
    :param cid: Chapter id, used for cache paths and log context.
    :return: Readable chapter text, or "" on any failure.
    """
    if not self._decode_font:
        logger.warning(
            "[Parser] chapter_id=%s :: font decryption skipped "
            "(set `decode_font=True` to enable)",
            cid,
        )
        return ""

    css_str = chapter_info.get("css")
    random_font_str = chapter_info.get("randomFont")
    # randomFont arrives as a JSON string whose "data" field holds raw bytes.
    rf = json.loads(random_font_str) if isinstance(random_font_str, str) else None
    rf_data = rf.get("data") if rf else None
    fixed_woff2_url = chapter_info.get("fixedFontWoff2")

    # All three inputs are required for font decoding; bail out early otherwise.
    if not css_str:
        logger.warning("[Parser] cid=%s :: css missing or empty", cid)
        return ""
    if not rf_data:
        logger.warning("[Parser] cid=%s :: randomFont.data missing or empty", cid)
        return ""
    if not fixed_woff2_url:
        logger.warning("[Parser] cid=%s :: fixedFontWoff2 missing or empty", cid)
        return ""

    debug_dir = self._debug_dir / "font_debug" / cid
    if self._save_font_debug:
        debug_dir.mkdir(parents=True, exist_ok=True)

    # Persist the per-chapter random font to disk for the OCR renderer.
    try:
        self._rand_path.parent.mkdir(parents=True, exist_ok=True)
        self._rand_path.write_bytes(bytes(rf_data))
    except Exception as e:
        logger.error(
            "[Parser] cid=%s :: failed to write randomFont.ttf",
            cid,
            exc_info=e,
        )
        return ""

    # The fixed font is shared across chapters; skip re-download if cached.
    fixed_path = download(
        url=fixed_woff2_url,
        target_dir=self._fixed_font_dir,
        on_exist="skip",
    )
    if fixed_path is None:
        logger.warning(
            "[Parser] failed to download fixedfont for chapter '%s'", cid
        )
        return ""

    css_rules = self._parse_css_rules(css_str)
    paragraphs_str, refl_list = self._render_visible_text(raw_html, css_rules)
    if self._save_font_debug:
        (debug_dir / f"{cid}_debug.txt").write_text(
            paragraphs_str, encoding="utf-8"
        )

    # Run OCR + fallback mapping
    # Characters to resolve: everything except whitespace; mirrored
    # characters (refl_list) are OCR'd separately in flipped form.
    char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
    refl_set = set(refl_list)
    char_set = char_set - refl_set
    if self._save_font_debug:
        (debug_dir / "char_set_debug.txt").write_text(
            f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}",
            encoding="utf-8",
        )

    mapping_result = self._generate_font_map(
        fixed_font_path=fixed_path,
        random_font_path=self._rand_path,
        char_set=char_set,
        refl_set=refl_set,
        batch_size=self._batch_size,
    )
    if not mapping_result:
        logger.warning(
            "[Parser] font mapping returned empty result for chapter '%s'", cid
        )
        return ""

    if self._save_font_debug:
        (debug_dir / "font_mapping.json").write_text(
            json.dumps(mapping_result, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    # Reconstruct final readable text
    original_text = self._apply_font_mapping(
        text=paragraphs_str,
        font_map=mapping_result,
    )

    return "\n".join(
        line.strip() for line in original_text.splitlines() if line.strip()
    )
352
+
353
@staticmethod
def _find_ssr_page_context(html_str: str) -> dict[str, Any]:
    """
    Pull the SSR JSON payload out of the
    ``<script id="vite-plugin-ssr_pageContext">`` tag.

    :param html_str: Full chapter page HTML.
    :return: Parsed JSON dict, or {} when the script tag is absent.
    """
    scripts = html.fromstring(html_str).xpath(
        '//script[@id="vite-plugin-ssr_pageContext"]/text()'
    )
    if not scripts:
        return {}
    return json.loads(scripts[0].strip())
361
+
362
+ @staticmethod
363
+ def _extract_chapter_info(ssr_data: dict[str, Any]) -> dict[str, Any]:
364
+ """
365
+ Extract the 'chapterInfo' dictionary from the SSR page context.
366
+
367
+ This handles nested key access and returns an empty dict if missing.
368
+
369
+ :param ssr_data: The full SSR data object from _find_ssr_page_context().
370
+ :return: A dict with chapter metadata such as chapterName, authorSay, etc.
371
+ """
372
+ page_context = ssr_data.get("pageContext", {})
373
+ page_props = page_context.get("pageProps", {})
374
+ page_data = page_props.get("pageData", {})
375
+ chapter_info = page_data.get("chapterInfo", {})
376
+ return chapter_info if isinstance(chapter_info, dict) else {}
377
+
378
+ @classmethod
379
+ def _is_vip(cls, chapter_info: dict[str, Any]) -> bool:
380
+ """
381
+ :return: True if VIP, False otherwise.
382
+ """
383
+ vip_flag = chapter_info.get("vipStatus", 0)
384
+ fens_flag = chapter_info.get("fEnS", 0)
385
+ return bool(vip_flag == 1 and fens_flag != 0)
386
+
387
+ @classmethod
388
+ def _can_view_chapter(cls, chapter_info: dict[str, Any]) -> bool:
389
+ """
390
+ A chapter is not viewable if it is marked as VIP
391
+ and has not been purchased.
392
+
393
+ :return: True if viewable, False otherwise.
394
+ """
395
+ is_buy = chapter_info.get("isBuy", 0)
396
+ vip_status = chapter_info.get("vipStatus", 0)
397
+ return not (vip_status == 1 and is_buy == 0)
398
+
399
+ @classmethod
400
+ def _is_duplicated(cls, chapter_info: dict[str, Any]) -> bool:
401
+ """
402
+ Check if chapter is marked as duplicated (eFW = 1).
403
+ """
404
+ efw_flag = chapter_info.get("eFW", 0)
405
+ return bool(efw_flag == 1)
406
+
407
+ @classmethod
408
+ def _is_encrypted(cls, chapter_info: dict[str, Any]) -> bool:
409
+ """
410
+ Return True if content is encrypted.
411
+
412
+ Chapter Encryption Status (cES):
413
+ * 0: 内容是'明文'
414
+ * 2: 字体加密
415
+ """
416
+ return int(chapter_info.get("cES", 0)) == 2
417
+
418
def _generate_font_map(
    self,
    fixed_font_path: Path,
    random_font_path: Path,
    char_set: set[str],
    refl_set: set[str],
    batch_size: int = 32,
) -> dict[str, str]:
    """
    Build a mapping from scrambled font chars to real chars.

    Uses OCR to decode and generate mapping from a fixed obfuscated font
    and a random obfuscated font. Results for the fixed font are cached
    in JSON so repeat chapters only OCR previously unseen characters.

    :param fixed_font_path: fixed font file.
    :param random_font_path: random font file.
    :param char_set: Characters to match directly.
    :param refl_set: Characters to match in flipped form.
    :param batch_size: How many chars to OCR per batch.

    :return: { obf_char: real_char, ... }
    """
    font_ocr = get_font_ocr(self._fontocr_cfg)
    if not font_ocr:
        # OCR backend unavailable -> no mapping can be produced.
        return {}

    mapping_result: dict[str, str] = {}
    fixed_map_file = self._fixed_map_dir / f"{fixed_font_path.stem}.json"
    fixed_map_file.parent.mkdir(parents=True, exist_ok=True)

    # load existing cache; on any failure fall back to an empty map
    try:
        with open(fixed_map_file, encoding="utf-8") as f:
            fixed_map = json.load(f)
        cached_chars = set(fixed_map.keys())
        mapping_result.update(
            {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
        )
        mapping_result.update(
            {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
        )
        # Only OCR the characters not already resolved by the cache.
        char_set = char_set - cached_chars
        refl_set = refl_set - cached_chars
    except Exception:
        fixed_map = {}
        cached_chars = set()

    # prepare font renderers and cmap sets
    fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
    random_chars = font_ocr.extract_font_charset(random_font_path)
    fixed_font = font_ocr.load_render_font(fixed_font_path)
    random_font = font_ocr.load_render_font(random_font_path)

    # process normal and reflected sets together
    rendered = []
    for chars, reflect in [(char_set, False), (refl_set, True)]:
        for ch in chars:
            # Render with whichever font actually contains the glyph;
            # characters present in neither font are skipped.
            if ch in fixed_chars:
                font = fixed_font
            elif ch in random_chars:
                font = random_font
            else:
                continue
            rendered.append(
                (ch, font_ocr.render_char_image_array(ch, font, reflect))
            )

    if rendered:
        # query OCR+vec simultaneously
        imgs_to_query = [img for _, img in rendered]
        fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)

        # pick best per char, apply threshold + cache
        for (ch, _), preds in zip(rendered, fused, strict=False):
            if not preds:
                continue
            real_char, _ = preds
            mapping_result[ch] = real_char
            fixed_map[ch] = real_char

    # persist updated fixed_map (best-effort: failure is only logged)
    try:
        with open(fixed_map_file, "w", encoding="utf-8") as f:
            json.dump(fixed_map, f, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error("[FontOCR] Failed to save fixed map: %s", e)

    return mapping_result
506
+
507
+ @staticmethod
508
+ def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
509
+ """
510
+ Replace each character in `text` using `font_map`,
511
+ leaving unmapped characters unchanged.
512
+
513
+ :param text: The input string, possibly containing obfuscated font chars.
514
+ :param font_map: A dict mapping obfuscated chars to real chars.
515
+ :return: The de-obfuscated text.
516
+ """
517
+ return "".join(font_map.get(ch, ch) for ch in text)
518
+
519
+ @staticmethod
520
+ def _only_tag(selector: str) -> str | None:
521
+ """
522
+ Normalize a selector into just its tag name for ordering.
523
+
524
+ Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
525
+
526
+ Returns None if can't extract a tag.
527
+ """
528
+ # If it has spaces, take the rightmost simple selector
529
+ last = selector.strip().split()[-1]
530
+ # Drop ::pseudo
531
+ last = last.split("::", 1)[0]
532
+ # If it's like 'span[attr=..]' keep 'span'
533
+ last = last.split("[", 1)[0]
534
+ # If it starts with '.', it's not a tag
535
+ if not last or last.startswith("."):
536
+ return None
537
+ return last
538
+
539
+ @staticmethod
540
+ def _parse_decls(block: str) -> list[tuple[str, str]]:
541
+ """
542
+ Parse 'name:value;...' inside a block. Tolerates quotes and attr().
543
+ """
544
+ parts = [d.strip() for d in block.split(";") if d.strip()]
545
+ decls = []
546
+ for p in parts:
547
+ if ":" in p:
548
+ name, val = p.split(":", 1)
549
+ decls.append((name.strip().lower(), val.strip()))
550
+ return decls
551
+
552
@classmethod
def _parse_css_rules(cls, css_str: str) -> Rules:
    """
    Parse the chapter's obfuscation CSS into normalized Rules.

    Produces normalized Rules with:
      * orders: list[str] of tag names sorted by numeric 'order'
      * sy: '.sy-*' class rules
      * p_rules: '.p* <tag>' rules, indexed by p-class then tag

    :param css_str: Raw CSS text from chapterInfo["css"].
    """
    rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
    order_pairs: list[tuple[str, int]] = []

    # Hand-rolled '{...}' scanner: walk selector/block pairs in order.
    pos = 0
    while True:
        b1 = css_str.find("{", pos)
        if b1 == -1:
            break
        selector = css_str[pos:b1].strip().lower()
        b2 = css_str.find("}", b1 + 1)
        if b2 == -1:
            break
        block = css_str[b1 + 1 : b2]
        pos = b2 + 1

        decls = cls._parse_decls(block)
        new_rule: Rule = {}
        order_val: int | None = None

        for name, value in decls:
            v = value.strip()
            if name == "font-size" and v == "0":
                # font-size:0 hides text; on ::first-letter only the
                # first character is dropped, otherwise the whole node.
                new_rule[
                    "delete_first" if "::first-letter" in selector else "delete_all"
                ] = True
            elif name == "transform" and "scalex(-1" in v.replace(" ", "").lower():
                new_rule["transform_flip_x"] = True
            elif name == "order":
                # Non-numeric 'order' values are silently ignored.
                with suppress(ValueError):
                    order_val = int(v)
            elif name == "content":
                # content may be a literal string or attr(<name>).
                if "::after" in selector:
                    if v.lower().startswith("attr("):
                        new_rule["append_end_attr"] = v[5:-1].strip()
                    else:
                        new_rule["append_end_char"] = v.strip().strip("\"'")
                elif "::before" in selector:
                    if v.lower().startswith("attr("):
                        new_rule["append_start_attr"] = v[5:-1].strip()
                    else:
                        new_rule["append_start_char"] = v.strip().strip("\"'")

        # File the rule under its selector family; later blocks for the
        # same key merge over (and override) earlier ones.
        if selector.startswith(".sy-"):
            key = selector.lstrip(".")
            rules["sy"][key] = {**rules["sy"].get(key, {}), **new_rule}
        elif selector.startswith(".p") and " " in selector:
            p_cls, right = selector.split(" ", 1)
            tag = cls._only_tag(right)
            if tag:
                p_cls = p_cls.lstrip(".")
                rules["p_rules"].setdefault(p_cls, {})
                rules["p_rules"][p_cls][tag] = {
                    **rules["p_rules"][p_cls].get(tag, {}),
                    **new_rule,
                }

        if order_val is not None:
            tag = cls._only_tag(selector)
            if tag:
                order_pairs.append((tag, order_val))

    # Global tag ordering, ascending by the numeric 'order' value.
    rules["orders"] = [t for t, _ in sorted(order_pairs, key=lambda x: x[1])]
    return rules
623
+
624
@staticmethod
def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
    """
    Render the visible chapter text from obfuscated HTML using
    pre-parsed Rules.

    :param html_str: Decrypted chapter markup.
    :param rules: Output of _parse_css_rules().
    :return: (paragraphs joined by newlines, list of x-flipped strings
        that must be OCR'd in mirrored form).
    """
    tree = html.fromstring(html_str)
    paragraphs_out: list[str] = []
    refl_list: list[str] = []
    orders = rules.get("orders") or []
    p_rules = rules.get("p_rules") or {}
    sy_rules = rules.get("sy") or {}

    def _class_list(el: html.HtmlElement) -> list[str]:
        # Split the element's class attribute into individual class names.
        cls = el.get("class")
        return cls.split() if cls else []

    def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
        # Apply one Rule to an element and return its visible text;
        # flipped results are also recorded in refl_list.
        if rule.get("delete_all"):
            return ""

        parts: list[str] = []
        if "append_start_char" in rule:
            parts.append(rule["append_start_char"])
        if "append_start_attr" in rule:
            parts.append(el.get(rule["append_start_attr"], ""))

        text = el.text or ""
        if rule.get("delete_first") and text:
            text = text[1:]
        parts.append(text)

        if "append_end_char" in rule:
            parts.append(rule["append_end_char"])
        if "append_end_attr" in rule:
            parts.append(el.get(rule["append_end_attr"], ""))

        s = "".join(parts)

        if rule.get("transform_flip_x") and s:
            refl_list.append(s)

        return s

    for p in tree.findall(".//p"):
        p_classes = _class_list(p)
        # The paragraph's 'p*' class selects which p_rules apply (if any).
        p_key = next((c for c in p_classes if c.startswith("p")), None)
        has_ordered_rules = p_key in p_rules

        buf_parts: list[str] = []

        # Ordered paragraphs ignore loose text; only rule-matched
        # children contribute, flushed later in `orders` order.
        if p.text and not has_ordered_rules:
            buf_parts.append(p.text)

        ordered_cache: dict[str, list[str]] = {}

        for child in p:
            tag = str(child.tag)

            # Handle inline <y class="sy-*"> spans
            if tag == "y" and not has_ordered_rules:
                y_cls = next(
                    (c for c in _class_list(child) if c.startswith("sy-")), None
                )
                if y_cls and y_cls in sy_rules:
                    buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
                else:
                    buf_parts.append(child.text or "")
                if child.tail:
                    buf_parts.append(child.tail)
                continue

            # Handle ordered paragraphs: only cache tags that appear in `orders`
            if p_key and has_ordered_rules and tag in orders:
                rule = p_rules[p_key].get(tag, {})
                ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
                continue

            # Non-ordered, non-<y> nodes: include text + tails as-is
            if not has_ordered_rules:
                buf_parts.append(child.text or "")
                if child.tail:
                    buf_parts.append(child.tail)

        # If ordered, flush in global orders with all duplicates preserved
        if has_ordered_rules:
            for tag in orders:
                if tag in ordered_cache:
                    buf_parts.extend(ordered_cache[tag])

        para = "".join(buf_parts)
        if para:
            paragraphs_out.append(para)

    return "\n".join(paragraphs_out), refl_list