novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -4
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +27 -104
  5. novel_downloader/cli/download.py +78 -66
  6. novel_downloader/cli/export.py +20 -21
  7. novel_downloader/cli/main.py +3 -1
  8. novel_downloader/cli/search.py +120 -0
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +10 -14
  11. novel_downloader/config/adapter.py +195 -99
  12. novel_downloader/config/{loader.py → file_io.py} +53 -27
  13. novel_downloader/core/__init__.py +14 -13
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/archived/qidian/searcher.py +79 -0
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +8 -30
  21. novel_downloader/core/downloaders/base.py +182 -30
  22. novel_downloader/core/downloaders/common.py +217 -384
  23. novel_downloader/core/downloaders/qianbi.py +332 -4
  24. novel_downloader/core/downloaders/qidian.py +250 -290
  25. novel_downloader/core/downloaders/registry.py +69 -0
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +8 -26
  28. novel_downloader/core/exporters/base.py +107 -31
  29. novel_downloader/core/exporters/common/__init__.py +3 -4
  30. novel_downloader/core/exporters/common/epub.py +92 -171
  31. novel_downloader/core/exporters/common/main_exporter.py +14 -67
  32. novel_downloader/core/exporters/common/txt.py +90 -86
  33. novel_downloader/core/exporters/epub_util.py +184 -1327
  34. novel_downloader/core/exporters/linovelib/__init__.py +3 -2
  35. novel_downloader/core/exporters/linovelib/epub.py +165 -222
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
  37. novel_downloader/core/exporters/linovelib/txt.py +76 -66
  38. novel_downloader/core/exporters/qidian.py +15 -11
  39. novel_downloader/core/exporters/registry.py +55 -0
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/fetchers/__init__.py +57 -56
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
  45. novel_downloader/core/fetchers/biquyuedu.py +83 -0
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +60 -0
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +8 -14
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +26 -0
  81. novel_downloader/core/parsers/__init__.py +58 -22
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
  99. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  100. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  101. novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
  102. novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
  103. novel_downloader/core/parsers/qidian/main_parser.py +19 -57
  104. novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
  105. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
  106. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  107. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  108. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  109. novel_downloader/core/parsers/quanben5.py +103 -0
  110. novel_downloader/core/parsers/registry.py +57 -0
  111. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
  112. novel_downloader/core/parsers/shencou.py +215 -0
  113. novel_downloader/core/parsers/shuhaige.py +111 -0
  114. novel_downloader/core/parsers/tongrenquan.py +116 -0
  115. novel_downloader/core/parsers/ttkan.py +132 -0
  116. novel_downloader/core/parsers/wanbengo.py +191 -0
  117. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  118. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  119. novel_downloader/core/parsers/xs63b.py +161 -0
  120. novel_downloader/core/parsers/xshbook.py +134 -0
  121. novel_downloader/core/parsers/yamibo.py +155 -0
  122. novel_downloader/core/parsers/yibige.py +166 -0
  123. novel_downloader/core/searchers/__init__.py +51 -0
  124. novel_downloader/core/searchers/aaatxt.py +107 -0
  125. novel_downloader/core/searchers/b520.py +84 -0
  126. novel_downloader/core/searchers/base.py +168 -0
  127. novel_downloader/core/searchers/dxmwx.py +105 -0
  128. novel_downloader/core/searchers/eightnovel.py +84 -0
  129. novel_downloader/core/searchers/esjzone.py +102 -0
  130. novel_downloader/core/searchers/hetushu.py +92 -0
  131. novel_downloader/core/searchers/i25zw.py +93 -0
  132. novel_downloader/core/searchers/ixdzs8.py +107 -0
  133. novel_downloader/core/searchers/jpxs123.py +107 -0
  134. novel_downloader/core/searchers/piaotia.py +100 -0
  135. novel_downloader/core/searchers/qbtr.py +106 -0
  136. novel_downloader/core/searchers/qianbi.py +165 -0
  137. novel_downloader/core/searchers/quanben5.py +144 -0
  138. novel_downloader/core/searchers/registry.py +79 -0
  139. novel_downloader/core/searchers/shuhaige.py +124 -0
  140. novel_downloader/core/searchers/tongrenquan.py +110 -0
  141. novel_downloader/core/searchers/ttkan.py +92 -0
  142. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  143. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  144. novel_downloader/core/searchers/xs63b.py +104 -0
  145. novel_downloader/locales/en.json +36 -79
  146. novel_downloader/locales/zh.json +37 -80
  147. novel_downloader/models/__init__.py +23 -50
  148. novel_downloader/models/book.py +44 -0
  149. novel_downloader/models/config.py +16 -43
  150. novel_downloader/models/login.py +1 -1
  151. novel_downloader/models/search.py +21 -0
  152. novel_downloader/resources/config/settings.toml +39 -74
  153. novel_downloader/resources/css_styles/intro.css +83 -0
  154. novel_downloader/resources/css_styles/main.css +30 -89
  155. novel_downloader/resources/json/xiguashuwu.json +718 -0
  156. novel_downloader/utils/__init__.py +43 -0
  157. novel_downloader/utils/chapter_storage.py +247 -226
  158. novel_downloader/utils/constants.py +5 -50
  159. novel_downloader/utils/cookies.py +6 -18
  160. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  161. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  162. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  163. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  164. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  165. novel_downloader/utils/epub/__init__.py +34 -0
  166. novel_downloader/utils/epub/builder.py +377 -0
  167. novel_downloader/utils/epub/constants.py +118 -0
  168. novel_downloader/utils/epub/documents.py +297 -0
  169. novel_downloader/utils/epub/models.py +120 -0
  170. novel_downloader/utils/epub/utils.py +179 -0
  171. novel_downloader/utils/file_utils/__init__.py +5 -30
  172. novel_downloader/utils/file_utils/io.py +9 -150
  173. novel_downloader/utils/file_utils/normalize.py +2 -2
  174. novel_downloader/utils/file_utils/sanitize.py +2 -7
  175. novel_downloader/utils/fontocr.py +207 -0
  176. novel_downloader/utils/i18n.py +2 -0
  177. novel_downloader/utils/logger.py +10 -16
  178. novel_downloader/utils/network.py +111 -252
  179. novel_downloader/utils/state.py +5 -90
  180. novel_downloader/utils/text_utils/__init__.py +16 -21
  181. novel_downloader/utils/text_utils/diff_display.py +6 -9
  182. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  183. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  184. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  185. novel_downloader/utils/time_utils/__init__.py +6 -12
  186. novel_downloader/utils/time_utils/datetime_utils.py +23 -33
  187. novel_downloader/utils/time_utils/sleep_utils.py +5 -10
  188. novel_downloader/web/__init__.py +13 -0
  189. novel_downloader/web/components/__init__.py +11 -0
  190. novel_downloader/web/components/navigation.py +35 -0
  191. novel_downloader/web/main.py +66 -0
  192. novel_downloader/web/pages/__init__.py +17 -0
  193. novel_downloader/web/pages/download.py +78 -0
  194. novel_downloader/web/pages/progress.py +147 -0
  195. novel_downloader/web/pages/search.py +329 -0
  196. novel_downloader/web/services/__init__.py +17 -0
  197. novel_downloader/web/services/client_dialog.py +164 -0
  198. novel_downloader/web/services/cred_broker.py +113 -0
  199. novel_downloader/web/services/cred_models.py +35 -0
  200. novel_downloader/web/services/task_manager.py +264 -0
  201. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  202. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  203. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  204. novel_downloader/config/site_rules.py +0 -94
  205. novel_downloader/core/downloaders/biquge.py +0 -25
  206. novel_downloader/core/downloaders/esjzone.py +0 -25
  207. novel_downloader/core/downloaders/linovelib.py +0 -25
  208. novel_downloader/core/downloaders/sfacg.py +0 -25
  209. novel_downloader/core/downloaders/yamibo.py +0 -25
  210. novel_downloader/core/exporters/biquge.py +0 -25
  211. novel_downloader/core/exporters/esjzone.py +0 -25
  212. novel_downloader/core/exporters/qianbi.py +0 -25
  213. novel_downloader/core/exporters/sfacg.py +0 -25
  214. novel_downloader/core/exporters/yamibo.py +0 -25
  215. novel_downloader/core/factory/__init__.py +0 -20
  216. novel_downloader/core/factory/downloader.py +0 -73
  217. novel_downloader/core/factory/exporter.py +0 -58
  218. novel_downloader/core/factory/fetcher.py +0 -96
  219. novel_downloader/core/factory/parser.py +0 -86
  220. novel_downloader/core/fetchers/base/__init__.py +0 -14
  221. novel_downloader/core/fetchers/base/browser.py +0 -403
  222. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  223. novel_downloader/core/fetchers/common/__init__.py +0 -14
  224. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  225. novel_downloader/core/fetchers/esjzone/browser.py +0 -204
  226. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  227. novel_downloader/core/fetchers/linovelib/browser.py +0 -193
  228. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  229. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  230. novel_downloader/core/fetchers/qidian/browser.py +0 -318
  231. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  232. novel_downloader/core/fetchers/sfacg/browser.py +0 -189
  233. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  234. novel_downloader/core/fetchers/yamibo/browser.py +0 -229
  235. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  236. novel_downloader/core/parsers/biquge/main_parser.py +0 -134
  237. novel_downloader/core/parsers/common/__init__.py +0 -13
  238. novel_downloader/core/parsers/common/helper.py +0 -323
  239. novel_downloader/core/parsers/common/main_parser.py +0 -106
  240. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  241. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  242. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  243. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  244. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  245. novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
  246. novel_downloader/models/browser.py +0 -21
  247. novel_downloader/models/chapter.py +0 -25
  248. novel_downloader/models/site_rules.py +0 -99
  249. novel_downloader/models/tasks.py +0 -33
  250. novel_downloader/models/types.py +0 -15
  251. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  252. novel_downloader/resources/json/replace_word_map.json +0 -4
  253. novel_downloader/resources/text/blacklist.txt +0 -22
  254. novel_downloader/tui/__init__.py +0 -7
  255. novel_downloader/tui/app.py +0 -32
  256. novel_downloader/tui/main.py +0 -17
  257. novel_downloader/tui/screens/__init__.py +0 -14
  258. novel_downloader/tui/screens/home.py +0 -198
  259. novel_downloader/tui/screens/login.py +0 -74
  260. novel_downloader/tui/styles/home_layout.tcss +0 -79
  261. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  262. novel_downloader/utils/cache.py +0 -24
  263. novel_downloader/utils/fontocr/__init__.py +0 -22
  264. novel_downloader/utils/fontocr/model_loader.py +0 -69
  265. novel_downloader/utils/fontocr/ocr_v1.py +0 -303
  266. novel_downloader/utils/fontocr/ocr_v2.py +0 -752
  267. novel_downloader/utils/hash_store.py +0 -279
  268. novel_downloader/utils/hash_utils.py +0 -103
  269. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  270. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  271. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  272. novel_downloader-1.4.5.dist-info/METADATA +0 -196
  273. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  274. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  275. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  276. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,16 +11,15 @@ from __future__ import annotations
11
11
 
12
12
  import json
13
13
  import logging
14
- from pathlib import Path
15
- from typing import TYPE_CHECKING, Any
14
+ import re
15
+ from contextlib import suppress
16
+ from typing import TYPE_CHECKING, TypedDict
16
17
 
17
- import tinycss2
18
18
  from lxml import html
19
19
 
20
20
  from novel_downloader.models import ChapterDict
21
- from novel_downloader.utils.network import download_font_file
22
- from novel_downloader.utils.text_utils import (
23
- apply_font_mapping,
21
+ from novel_downloader.utils import (
22
+ download,
24
23
  truncate_half_lines,
25
24
  )
26
25
 
@@ -31,13 +30,36 @@ from .utils import (
31
30
  is_duplicated,
32
31
  vip_status,
33
32
  )
33
+ from .utils.fontmap_recover import (
34
+ apply_font_mapping,
35
+ generate_font_map,
36
+ )
34
37
 
35
38
  if TYPE_CHECKING:
36
39
  from .main_parser import QidianParser
37
40
 
38
41
  logger = logging.getLogger(__name__)
39
- IGNORED_CLASS_LISTS = {"title", "review"}
40
- NON_CONTENT_KEYWORDS = {"旧版", "反馈", "扫码"}
42
+ _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
43
+ _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
44
+
45
+
46
+ class Rule(TypedDict, total=False):
47
+ delete_all: bool
48
+ delete_first: bool
49
+ transform_flip_x: bool
50
+ append_start_char: str
51
+ append_end_char: str
52
+ append_start_attr: str
53
+ append_end_attr: str
54
+
55
+
56
+ class Rules(TypedDict):
57
+ # e.g., orders = ["i", "em", "span"]
58
+ orders: list[str]
59
+ # e.g., sy["sy-3"] -> Rule
60
+ sy: dict[str, Rule]
61
+ # e.g., p_rules["p3"]["i"] -> Rule
62
+ p_rules: dict[str, dict[str, Rule]]
41
63
 
42
64
 
43
65
  def parse_encrypted_chapter(
@@ -60,7 +82,7 @@ def parse_encrypted_chapter(
60
82
  :return: Formatted chapter text or empty string if not parsable.
61
83
  """
62
84
  try:
63
- if not (parser._decode_font and parser._font_ocr):
85
+ if not parser._decode_font:
64
86
  return None
65
87
  ssr_data = find_ssr_page_context(html_str)
66
88
  chapter_info = extract_chapter_info(ssr_data)
@@ -70,10 +92,9 @@ def parse_encrypted_chapter(
70
92
  )
71
93
  return None
72
94
 
73
- debug_base_dir: Path | None = None
74
- if parser._font_debug_dir:
75
- debug_base_dir = parser._font_debug_dir / chapter_id
76
- debug_base_dir.mkdir(parents=True, exist_ok=True)
95
+ debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
96
+ if parser.save_font_debug:
97
+ debug_dir.mkdir(parents=True, exist_ok=True)
77
98
 
78
99
  css_str = chapter_info["css"]
79
100
  randomFont_str = chapter_info["randomFont"]
@@ -98,88 +119,71 @@ def parse_encrypted_chapter(
98
119
  rand_path.parent.mkdir(parents=True, exist_ok=True)
99
120
  rand_path.write_bytes(bytes(rf["data"]))
100
121
 
101
- fixed_path = download_font_file(
102
- url=fixedFontWoff2_url, target_folder=parser._fixed_font_dir
122
+ fixed_path = download(
123
+ url=fixedFontWoff2_url,
124
+ target_dir=parser._fixed_font_dir,
125
+ stream=True,
103
126
  )
104
127
  if fixed_path is None:
105
128
  raise ValueError("fixed_path is None: failed to download font")
106
129
 
107
130
  # Extract and render paragraphs from HTML with CSS rules
108
- main_paragraphs = extract_paragraphs_recursively(html_str, chapter_id)
109
- if not main_paragraphs or contains_keywords(
110
- main_paragraphs, NON_CONTENT_KEYWORDS
111
- ):
112
- if vip_status(ssr_data):
113
- try:
114
- decryptor = get_decryptor()
115
- raw_html = decryptor.decrypt(
116
- raw_html,
117
- chapter_id,
118
- fkp,
119
- parser._fuid,
120
- )
121
- except Exception as e:
122
- logger.error(
123
- "[Parser] decryption failed for '%s': %s", chapter_id, e
124
- )
125
- return None
126
- main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
127
-
128
- if debug_base_dir:
129
- main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
130
- main_paragraphs_path.write_text(
131
- json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
132
- encoding="utf-8",
133
- )
134
-
135
- paragraphs_rules = parse_rule(css_str)
136
- if debug_base_dir:
137
- paragraphs_rules_path = debug_base_dir / "paragraphs_rules_debug.json"
138
- paragraphs_rules_path.write_text(
139
- json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
140
- encoding="utf-8",
141
- )
142
-
143
- end_number = parse_end_number(main_paragraphs, paragraphs_rules)
144
- paragraphs_str, refl_list = render_paragraphs(
145
- main_paragraphs,
146
- paragraphs_rules,
147
- end_number,
148
- )
149
- if debug_base_dir:
150
- paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
131
+ if vip_status(ssr_data):
132
+ try:
133
+ decryptor = get_decryptor()
134
+ raw_html = decryptor.decrypt(
135
+ raw_html,
136
+ chapter_id,
137
+ fkp,
138
+ parser._fuid,
139
+ )
140
+ except Exception as e:
141
+ logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
142
+ return None
143
+
144
+ css_rules = parse_css_rules(css_str)
145
+ paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
146
+ if parser.save_font_debug:
147
+ paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
151
148
  paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
152
149
 
153
150
  # Run OCR + fallback mapping
154
151
  char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
155
152
  refl_set = set(refl_list)
156
153
  char_set = char_set - refl_set
157
- if debug_base_dir:
158
- char_sets_path = debug_base_dir / "char_set_debug.txt"
154
+ if parser.save_font_debug:
155
+ char_sets_path = debug_dir / "char_set_debug.txt"
159
156
  temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
160
157
  char_sets_path.write_text(
161
158
  temp,
162
159
  encoding="utf-8",
163
160
  )
164
161
 
165
- mapping_result = parser._font_ocr.generate_font_map(
162
+ mapping_result = generate_font_map(
166
163
  fixed_font_path=fixed_path,
167
164
  random_font_path=rand_path,
168
165
  char_set=char_set,
169
166
  refl_set=refl_set,
170
- chapter_id=chapter_id,
167
+ cache_dir=parser._base_cache_dir,
168
+ batch_size=parser._config.batch_size,
171
169
  )
172
- if debug_base_dir:
173
- mapping_json_path = debug_base_dir / "font_mapping.json"
170
+ if not mapping_result:
171
+ return None
172
+
173
+ if parser.save_font_debug:
174
+ mapping_json_path = debug_dir / "font_mapping.json"
174
175
  mapping_json_path.write_text(
175
176
  json.dumps(mapping_result, ensure_ascii=False, indent=2),
176
177
  encoding="utf-8",
177
178
  )
178
179
 
179
180
  # Reconstruct final readable text
180
- original_text = apply_font_mapping(paragraphs_str, mapping_result)
181
+ original_text = apply_font_mapping(
182
+ text=paragraphs_str,
183
+ font_map=mapping_result,
184
+ )
181
185
 
182
- final_paragraphs_str = "\n\n".join(
186
+ final_paragraphs_str = "\n".join(
183
187
  line.strip() for line in original_text.splitlines() if line.strip()
184
188
  )
185
189
  if parser._use_truncation and duplicated:
@@ -209,318 +213,258 @@ def parse_encrypted_chapter(
209
213
  return None
210
214
 
211
215
 
212
- def extract_paragraphs_recursively(
213
- html_str: str,
214
- chapter_id: str,
215
- ) -> list[dict[str, Any]]:
216
- def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
217
- class_attr = elem.attrib.get("class", "")
218
- class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
219
- if "review" in class_list:
220
- return {}
221
-
222
- # Build attrs with class as list
223
- attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
224
-
225
- node: dict[str, Any] = {
226
- "tag": elem.tag,
227
- "attrs": attrs,
228
- "data": [],
229
- }
230
-
231
- # Append entire elem.text if present (no splitting)
232
- if elem.text:
233
- node["data"].append(elem.text)
234
-
235
- # Recurse into children
236
- for child in elem.iterchildren(tag=None):
237
- child_dict = parse_element(child)
238
- if child_dict:
239
- node["data"].append(child_dict)
240
-
241
- # Append entire tail string (no split)
242
- if child.tail:
243
- node["data"].append(child.tail)
244
-
245
- return node
246
-
247
- tree = html.fromstring(html_str)
248
-
249
- # Try to find <main id="c-{chapter_id}">
250
- main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
251
- search_root = main_elem[0] if main_elem else tree
252
- return [parse_element(p) for p in search_root.findall(".//p")]
253
-
254
-
255
- def parse_rule(css_str: str) -> dict[str, Any]:
216
+ def _only_tag(selector: str) -> str | None:
256
217
  """
257
- Parse a CSS string and extract style rules for rendering.
218
+ Normalize a selector into just its tag name for ordering.
258
219
 
259
- Handles:
260
- - font-size:0 (mark for deletion)
261
- - scaleX(-1) (mark as mirrored)
262
- - ::before / ::after with content or attr()
263
- - class + tag selector mapping
264
- - custom rendering order via 'order'
220
+ Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
265
221
 
266
- :param css_str: Raw CSS stylesheet string.
267
- :return: Dict with "rules" and "orders" for rendering.
222
+ Returns None if can't extract a tag.
268
223
  """
269
-
270
- rules: dict[str, Any] = {}
271
- orders = []
272
-
273
- stylesheet = tinycss2.parse_stylesheet(
274
- css_str, skip_comments=True, skip_whitespace=True
275
- )
276
-
277
- for rule in stylesheet:
278
- if rule.type != "qualified-rule":
279
- continue
280
-
281
- selector = tinycss2.serialize(rule.prelude).strip()
282
- declarations = tinycss2.parse_declaration_list(rule.content)
283
-
284
- parsed = {}
285
- order_val = None
286
-
287
- for decl in declarations:
288
- if decl.type != "declaration":
289
- continue
290
- name = decl.lower_name
291
- value = tinycss2.serialize(decl.value).strip()
292
-
293
- if name == "font-size" and value == "0":
224
+ sel = selector.strip()
225
+ # If it has spaces, take the rightmost simple selector
226
+ last = sel.split()[-1]
227
+ # Drop ::pseudo
228
+ last = last.split("::", 1)[0]
229
+ # If it's like 'span[attr=..]' keep 'span'
230
+ last = last.split("[", 1)[0]
231
+ # If it starts with '.', it's not a tag
232
+ if not last or last.startswith("."):
233
+ return None
234
+ return last
235
+
236
+
237
+ def _parse_decls(block: str) -> list[tuple[str, str]]:
238
+ """
239
+ Parse 'name:value;...' inside a block. Tolerates quotes and attr().
240
+ """
241
+ decls: list[tuple[str, str]] = []
242
+ i = 0
243
+ n = len(block)
244
+ name: list[str] = []
245
+ val: list[str] = []
246
+ in_name = True
247
+ quote = None # track ' or "
248
+ while i < n:
249
+ c = block[i]
250
+ if quote:
251
+ # inside quotes
252
+ if c == "\\" and i + 1 < n:
253
+ # keep escaped char
254
+ (name if in_name else val).append(c)
255
+ i += 1
256
+ (name if in_name else val).append(block[i])
257
+ elif c == quote:
258
+ (name if in_name else val).append(c)
259
+ quote = None
260
+ else:
261
+ (name if in_name else val).append(c)
262
+ else:
263
+ if c in ("'", '"'):
264
+ (name if in_name else val).append(c)
265
+ quote = c
266
+ elif in_name and c == ":":
267
+ in_name = False
268
+ elif c == ";":
269
+ nm = "".join(name).strip().lower()
270
+ vl = "".join(val).strip()
271
+ if nm:
272
+ decls.append((nm, vl))
273
+ name.clear()
274
+ val.clear()
275
+ in_name = True
276
+ else:
277
+ (name if in_name else val).append(c)
278
+ i += 1
279
+
280
+ if name or val:
281
+ nm = "".join(name).strip().lower()
282
+ vl = "".join(val).strip()
283
+ if nm:
284
+ decls.append((nm, vl))
285
+ return decls
286
+
287
+
288
+ def parse_css_rules(css_str: str) -> Rules:
289
+ """
290
+ Produces normalized Rules with:
291
+ - orders: list[str] of tag names sorted by numeric 'order'
292
+ - sy: '.sy-*' class rules
293
+ - p_rules: '.p* <tag>' rules, indexed by p-class then tag
294
+ """
295
+ rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
296
+ order_pairs: list[tuple[str, int]] = []
297
+
298
+ i = 0
299
+ while True:
300
+ b1 = css_str.find("{", i)
301
+ if b1 == -1:
302
+ break
303
+ selector = css_str[i:b1].strip().lower()
304
+ b2 = css_str.find("}", b1 + 1)
305
+ if b2 == -1:
306
+ break
307
+ block = css_str[b1 + 1 : b2]
308
+ i = b2 + 1
309
+
310
+ decls = _parse_decls(block)
311
+
312
+ new_rule: Rule = {}
313
+ order_val: int | None = None
314
+
315
+ for name, value in decls:
316
+ v = value.strip()
317
+ if name == "font-size" and v == "0":
294
318
  if "::first-letter" in selector:
295
- parsed["delete-first"] = True
319
+ new_rule["delete_first"] = True
296
320
  else:
297
- parsed["delete-all"] = True
298
- elif name == "transform" and value.lower() == "scalex(-1)":
299
- parsed["transform-x_-1"] = True
321
+ new_rule["delete_all"] = True
322
+ elif name == "transform":
323
+ if _RE_SCALEX.search(v.replace(" ", "")):
324
+ new_rule["transform_flip_x"] = True
300
325
  elif name == "order":
301
- order_val = value
326
+ with suppress(ValueError, TypeError):
327
+ order_val = int(v)
302
328
  elif name == "content":
329
+ # normalize: remove outer quotes
303
330
  if "::after" in selector:
304
- if "attr(" in value:
305
- parsed["append-end-attr"] = value.split("attr(")[1].split(")")[
306
- 0
307
- ]
331
+ m = _RE_ATTR.search(v)
332
+ if m:
333
+ new_rule["append_end_attr"] = m.group(1)
308
334
  else:
309
- parsed["append-end-char"] = value.strip("\"'")
335
+ s = v.strip().strip("\"'")
336
+ new_rule["append_end_char"] = s
310
337
  elif "::before" in selector:
311
- if "attr(" in value:
312
- parsed["append-start-attr"] = value.split("attr(")[1].split(
313
- ")"
314
- )[0]
338
+ m = _RE_ATTR.search(v)
339
+ if m:
340
+ new_rule["append_start_attr"] = m.group(1)
315
341
  else:
316
- parsed["append-start-char"] = value.strip("\"'")
342
+ s = v.strip().strip("\"'")
343
+ new_rule["append_start_char"] = s
317
344
 
318
- # Store in structure
345
+ # classification
319
346
  if selector.startswith(".sy-"):
320
- rules.setdefault("sy", {})[selector[1:]] = parsed
347
+ key = selector.lstrip(".")
348
+ old = rules["sy"].get(key)
349
+ rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
350
+
321
351
  elif selector.startswith(".p") and " " in selector:
322
- class_str, tag_part = selector.split(" ", 1)
323
- class_str = class_str.lstrip(".")
324
- tag_part = tag_part.split("::")[0]
325
- rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
352
+ p_cls, right = selector.split(" ", 1)
353
+ p_cls = p_cls.lstrip(".")
354
+ tag = _only_tag(right)
355
+ if tag:
356
+ prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
357
+ rules["p_rules"][p_cls][tag] = (
358
+ {**prev, **new_rule} if prev else (new_rule or {})
359
+ )
360
+
361
+ if order_val is not None:
362
+ tag_for_order = _only_tag(selector)
363
+ if tag_for_order:
364
+ order_pairs.append((tag_for_order, order_val))
365
+
366
+ # normalize orders
367
+ order_pairs.sort(key=lambda t: t[1])
368
+ seen = set()
369
+ orders: list[str] = []
370
+ for tag, _num in order_pairs:
371
+ if tag not in seen:
372
+ seen.add(tag)
373
+ orders.append(tag)
374
+ rules["orders"] = orders
375
+ return rules
376
+
377
+
378
def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
    """
    Render the HTML using pre-parsed Rules.

    Every ``<p>`` element is rendered independently, in one of two modes:

    - Ordered: the first class of the paragraph that starts with ``"p"``
      is a key of ``rules["p_rules"]``. Only child elements whose tag is
      listed in ``rules["orders"]`` are kept; each one is rendered with
      its per-tag rule (an empty rule if none is registered) and the
      results are emitted grouped by tag, following ``rules["orders"]``.
      The paragraph's own text, child tails and all other children are
      dropped.
    - Free-flow: any other paragraph. The paragraph text and each child's
      text and tail are emitted in document order; ``<y>`` children whose
      ``sy-*`` class has an entry in ``rules["sy"]`` are rendered through
      that rule instead of contributing their raw text.

    :param html_str: HTML markup to render. Empty or whitespace-only input
        yields an empty result instead of a parser error.
    :param rules: Pre-parsed rules; the keys ``"orders"``, ``"p_rules"``
        and ``"sy"`` are read, and a missing or falsy value is treated as
        empty.

    :return:
        - The non-empty rendered paragraphs joined by single newlines.
        - Every non-empty string produced by a rule that has
          ``transform_flip_x`` set, in the order they were rendered.
    """
    # Nothing to parse: the HTML parser raises on an empty document, so
    # short-circuit to an empty render instead.
    if not html_str or not html_str.strip():
        return "", []

    tree = html.fromstring(html_str)
    paragraphs_out: list[str] = []
    refl_list: list[str] = []
    orders = rules.get("orders") or []
    p_rules = rules.get("p_rules") or {}
    sy_rules = rules.get("sy") or {}
    # Membership is tested once per child element; hash it once up front.
    ordered_tags = frozenset(orders)

    def _class_list(el: html.HtmlElement) -> list[str]:
        # Whitespace-separated class names, or [] when there is no class.
        cls = el.get("class")
        return cls.split() if cls else []

    def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
        # Render one element: prefix + (possibly trimmed) text + suffix.
        if rule.get("delete_all"):
            return ""

        parts: list[str] = []
        if "append_start_char" in rule:
            parts.append(rule["append_start_char"])
        if "append_start_attr" in rule:
            parts.append(el.get(rule["append_start_attr"], ""))

        text = el.text or ""
        if rule.get("delete_first") and text:
            text = text[1:]
        parts.append(text)

        if "append_end_char" in rule:
            parts.append(rule["append_end_char"])
        if "append_end_attr" in rule:
            parts.append(el.get(rule["append_end_attr"], ""))

        s = "".join(parts)

        # Flipped strings are reported separately to the caller.
        if rule.get("transform_flip_x") and s:
            refl_list.append(s)

        return s

    # `iter` (unlike `findall(".//p")`) also yields the root element, so a
    # fragment consisting of a single <p> is not silently dropped.
    for p in tree.iter("p"):
        p_key = next((c for c in _class_list(p) if c.startswith("p")), None)
        has_ordered_rules = p_key in p_rules

        buf_parts: list[str] = []

        if has_ordered_rules:
            # Ordered paragraph: collect per-tag output, then flush it in
            # the global order with duplicates of a tag preserved.
            tag_rules = p_rules[p_key]
            ordered_cache: dict[str, list[str]] = {}
            for child in p:
                tag = str(child.tag)
                if tag in ordered_tags:
                    rule = tag_rules.get(tag, {})
                    ordered_cache.setdefault(tag, []).append(
                        _apply_rule(child, rule)
                    )
            for tag in orders:
                if tag in ordered_cache:
                    buf_parts.extend(ordered_cache[tag])
        else:
            # Free-flow paragraph: keep text and tails in document order.
            if p.text:
                buf_parts.append(p.text)
            for child in p:
                y_cls = None
                if str(child.tag) == "y":
                    # Inline <y class="sy-*"> spans may carry their own rule.
                    y_cls = next(
                        (c for c in _class_list(child) if c.startswith("sy-")),
                        None,
                    )
                if y_cls and y_cls in sy_rules:
                    buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
                else:
                    buf_parts.append(child.text or "")
                if child.tail:
                    buf_parts.append(child.tail)

        para = "".join(buf_parts)
        if para:
            paragraphs_out.append(para)

    return "\n".join(paragraphs_out), refl_list