novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +77 -64
  6. novel_downloader/cli/export.py +16 -20
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +65 -105
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +1 -0
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +14 -9
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +17 -11
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +61 -66
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  100. novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
  101. novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
  102. novel_downloader/core/parsers/qidian/main_parser.py +11 -38
  103. novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
  104. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  105. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  106. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  107. novel_downloader/core/parsers/quanben5.py +103 -0
  108. novel_downloader/core/parsers/registry.py +5 -16
  109. novel_downloader/core/parsers/sfacg.py +38 -45
  110. novel_downloader/core/parsers/shencou.py +215 -0
  111. novel_downloader/core/parsers/shuhaige.py +111 -0
  112. novel_downloader/core/parsers/tongrenquan.py +116 -0
  113. novel_downloader/core/parsers/ttkan.py +132 -0
  114. novel_downloader/core/parsers/wanbengo.py +191 -0
  115. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  116. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  117. novel_downloader/core/parsers/xs63b.py +161 -0
  118. novel_downloader/core/parsers/xshbook.py +134 -0
  119. novel_downloader/core/parsers/yamibo.py +87 -131
  120. novel_downloader/core/parsers/yibige.py +166 -0
  121. novel_downloader/core/searchers/__init__.py +34 -3
  122. novel_downloader/core/searchers/aaatxt.py +107 -0
  123. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  124. novel_downloader/core/searchers/base.py +112 -36
  125. novel_downloader/core/searchers/dxmwx.py +105 -0
  126. novel_downloader/core/searchers/eightnovel.py +84 -0
  127. novel_downloader/core/searchers/esjzone.py +43 -25
  128. novel_downloader/core/searchers/hetushu.py +92 -0
  129. novel_downloader/core/searchers/i25zw.py +93 -0
  130. novel_downloader/core/searchers/ixdzs8.py +107 -0
  131. novel_downloader/core/searchers/jpxs123.py +107 -0
  132. novel_downloader/core/searchers/piaotia.py +100 -0
  133. novel_downloader/core/searchers/qbtr.py +106 -0
  134. novel_downloader/core/searchers/qianbi.py +74 -40
  135. novel_downloader/core/searchers/quanben5.py +144 -0
  136. novel_downloader/core/searchers/registry.py +24 -8
  137. novel_downloader/core/searchers/shuhaige.py +124 -0
  138. novel_downloader/core/searchers/tongrenquan.py +110 -0
  139. novel_downloader/core/searchers/ttkan.py +92 -0
  140. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  141. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  142. novel_downloader/core/searchers/xs63b.py +104 -0
  143. novel_downloader/locales/en.json +31 -82
  144. novel_downloader/locales/zh.json +32 -83
  145. novel_downloader/models/__init__.py +21 -22
  146. novel_downloader/models/book.py +44 -0
  147. novel_downloader/models/config.py +4 -37
  148. novel_downloader/models/login.py +1 -1
  149. novel_downloader/models/search.py +5 -0
  150. novel_downloader/resources/config/settings.toml +8 -70
  151. novel_downloader/resources/json/xiguashuwu.json +718 -0
  152. novel_downloader/utils/__init__.py +13 -22
  153. novel_downloader/utils/chapter_storage.py +3 -2
  154. novel_downloader/utils/constants.py +4 -29
  155. novel_downloader/utils/cookies.py +6 -18
  156. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  157. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  158. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  159. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  160. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  161. novel_downloader/utils/epub/__init__.py +1 -1
  162. novel_downloader/utils/epub/constants.py +57 -16
  163. novel_downloader/utils/epub/documents.py +88 -194
  164. novel_downloader/utils/epub/models.py +0 -14
  165. novel_downloader/utils/epub/utils.py +63 -96
  166. novel_downloader/utils/file_utils/__init__.py +2 -23
  167. novel_downloader/utils/file_utils/io.py +3 -113
  168. novel_downloader/utils/file_utils/sanitize.py +0 -4
  169. novel_downloader/utils/fontocr.py +207 -0
  170. novel_downloader/utils/logger.py +8 -16
  171. novel_downloader/utils/network.py +2 -2
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/time_utils/__init__.py +5 -11
  176. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  177. novel_downloader/utils/time_utils/sleep_utils.py +4 -8
  178. novel_downloader/web/__init__.py +13 -0
  179. novel_downloader/web/components/__init__.py +11 -0
  180. novel_downloader/web/components/navigation.py +35 -0
  181. novel_downloader/web/main.py +66 -0
  182. novel_downloader/web/pages/__init__.py +17 -0
  183. novel_downloader/web/pages/download.py +78 -0
  184. novel_downloader/web/pages/progress.py +147 -0
  185. novel_downloader/web/pages/search.py +329 -0
  186. novel_downloader/web/services/__init__.py +17 -0
  187. novel_downloader/web/services/client_dialog.py +164 -0
  188. novel_downloader/web/services/cred_broker.py +113 -0
  189. novel_downloader/web/services/cred_models.py +35 -0
  190. novel_downloader/web/services/task_manager.py +264 -0
  191. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  192. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  193. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  194. novel_downloader/core/downloaders/biquge.py +0 -29
  195. novel_downloader/core/downloaders/esjzone.py +0 -29
  196. novel_downloader/core/downloaders/linovelib.py +0 -29
  197. novel_downloader/core/downloaders/sfacg.py +0 -29
  198. novel_downloader/core/downloaders/yamibo.py +0 -29
  199. novel_downloader/core/exporters/biquge.py +0 -22
  200. novel_downloader/core/exporters/esjzone.py +0 -22
  201. novel_downloader/core/exporters/qianbi.py +0 -22
  202. novel_downloader/core/exporters/sfacg.py +0 -22
  203. novel_downloader/core/exporters/yamibo.py +0 -22
  204. novel_downloader/core/fetchers/base/__init__.py +0 -14
  205. novel_downloader/core/fetchers/base/browser.py +0 -422
  206. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  207. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  208. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  209. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  210. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  211. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  212. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  213. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  214. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  215. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  216. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  217. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  218. novel_downloader/core/parsers/biquge.py +0 -139
  219. novel_downloader/models/chapter.py +0 -25
  220. novel_downloader/models/types.py +0 -13
  221. novel_downloader/tui/__init__.py +0 -7
  222. novel_downloader/tui/app.py +0 -32
  223. novel_downloader/tui/main.py +0 -17
  224. novel_downloader/tui/screens/__init__.py +0 -14
  225. novel_downloader/tui/screens/home.py +0 -198
  226. novel_downloader/tui/screens/login.py +0 -74
  227. novel_downloader/tui/styles/home_layout.tcss +0 -79
  228. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  229. novel_downloader/utils/cache.py +0 -24
  230. novel_downloader/utils/fontocr/__init__.py +0 -22
  231. novel_downloader/utils/fontocr/hash_store.py +0 -280
  232. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  233. novel_downloader/utils/fontocr/model_loader.py +0 -69
  234. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  235. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  236. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  237. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  238. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  239. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  240. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  241. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,9 +11,10 @@ from __future__ import annotations
11
11
 
12
12
  import json
13
13
  import logging
14
- from typing import TYPE_CHECKING, Any
14
+ import re
15
+ from contextlib import suppress
16
+ from typing import TYPE_CHECKING, TypedDict
15
17
 
16
- import tinycss2
17
18
  from lxml import html
18
19
 
19
20
  from novel_downloader.models import ChapterDict
@@ -29,13 +30,36 @@ from .utils import (
29
30
  is_duplicated,
30
31
  vip_status,
31
32
  )
33
+ from .utils.fontmap_recover import (
34
+ apply_font_mapping,
35
+ generate_font_map,
36
+ )
32
37
 
33
38
  if TYPE_CHECKING:
34
39
  from .main_parser import QidianParser
35
40
 
36
41
  logger = logging.getLogger(__name__)
37
- IGNORED_CLASS_LISTS = {"title", "review"}
38
- NON_CONTENT_KEYWORDS = {"旧版", "反馈", "扫码"}
42
+ _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
43
+ _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
44
+
45
+
46
+ class Rule(TypedDict, total=False):
47
+ delete_all: bool
48
+ delete_first: bool
49
+ transform_flip_x: bool
50
+ append_start_char: str
51
+ append_end_char: str
52
+ append_start_attr: str
53
+ append_end_attr: str
54
+
55
+
56
+ class Rules(TypedDict):
57
+ # e.g., orders = ["i", "em", "span"]
58
+ orders: list[str]
59
+ # e.g., sy["sy-3"] -> Rule
60
+ sy: dict[str, Rule]
61
+ # e.g., p_rules["p3"]["i"] -> Rule
62
+ p_rules: dict[str, dict[str, Rule]]
39
63
 
40
64
 
41
65
  def parse_encrypted_chapter(
@@ -58,7 +82,7 @@ def parse_encrypted_chapter(
58
82
  :return: Formatted chapter text or empty string if not parsable.
59
83
  """
60
84
  try:
61
- if not (parser._decode_font and parser._font_ocr):
85
+ if not parser._decode_font:
62
86
  return None
63
87
  ssr_data = find_ssr_page_context(html_str)
64
88
  chapter_info = extract_chapter_info(ssr_data)
@@ -104,47 +128,21 @@ def parse_encrypted_chapter(
104
128
  raise ValueError("fixed_path is None: failed to download font")
105
129
 
106
130
  # Extract and render paragraphs from HTML with CSS rules
107
- main_paragraphs = extract_paragraphs_recursively(html_str, chapter_id)
108
- if not main_paragraphs or contains_keywords(
109
- main_paragraphs, NON_CONTENT_KEYWORDS
110
- ):
111
- if vip_status(ssr_data):
112
- try:
113
- decryptor = get_decryptor()
114
- raw_html = decryptor.decrypt(
115
- raw_html,
116
- chapter_id,
117
- fkp,
118
- parser._fuid,
119
- )
120
- except Exception as e:
121
- logger.error(
122
- "[Parser] decryption failed for '%s': %s", chapter_id, e
123
- )
124
- return None
125
- main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
126
-
127
- if parser.save_font_debug:
128
- main_paragraphs_path = debug_dir / "main_paragraphs_debug.json"
129
- main_paragraphs_path.write_text(
130
- json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
131
- encoding="utf-8",
132
- )
133
-
134
- paragraphs_rules = parse_rule(css_str)
135
- if parser.save_font_debug:
136
- paragraphs_rules_path = debug_dir / "paragraphs_rules_debug.json"
137
- paragraphs_rules_path.write_text(
138
- json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
139
- encoding="utf-8",
140
- )
141
-
142
- end_number = parse_end_number(main_paragraphs, paragraphs_rules)
143
- paragraphs_str, refl_list = render_paragraphs(
144
- main_paragraphs,
145
- paragraphs_rules,
146
- end_number,
147
- )
131
+ if vip_status(ssr_data):
132
+ try:
133
+ decryptor = get_decryptor()
134
+ raw_html = decryptor.decrypt(
135
+ raw_html,
136
+ chapter_id,
137
+ fkp,
138
+ parser._fuid,
139
+ )
140
+ except Exception as e:
141
+ logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
142
+ return None
143
+
144
+ css_rules = parse_css_rules(css_str)
145
+ paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
148
146
  if parser.save_font_debug:
149
147
  paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
150
148
  paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
@@ -161,13 +159,17 @@ def parse_encrypted_chapter(
161
159
  encoding="utf-8",
162
160
  )
163
161
 
164
- mapping_result = parser._font_ocr.generate_font_map(
162
+ mapping_result = generate_font_map(
165
163
  fixed_font_path=fixed_path,
166
164
  random_font_path=rand_path,
167
165
  char_set=char_set,
168
166
  refl_set=refl_set,
169
- chapter_id=chapter_id,
167
+ cache_dir=parser._base_cache_dir,
168
+ batch_size=parser._config.batch_size,
170
169
  )
170
+ if not mapping_result:
171
+ return None
172
+
171
173
  if parser.save_font_debug:
172
174
  mapping_json_path = debug_dir / "font_mapping.json"
173
175
  mapping_json_path.write_text(
@@ -176,12 +178,12 @@ def parse_encrypted_chapter(
176
178
  )
177
179
 
178
180
  # Reconstruct final readable text
179
- original_text = parser._font_ocr.apply_font_mapping(
181
+ original_text = apply_font_mapping(
180
182
  text=paragraphs_str,
181
183
  font_map=mapping_result,
182
184
  )
183
185
 
184
- final_paragraphs_str = "\n\n".join(
186
+ final_paragraphs_str = "\n".join(
185
187
  line.strip() for line in original_text.splitlines() if line.strip()
186
188
  )
187
189
  if parser._use_truncation and duplicated:
@@ -211,318 +213,258 @@ def parse_encrypted_chapter(
211
213
  return None
212
214
 
213
215
 
214
- def extract_paragraphs_recursively(
215
- html_str: str,
216
- chapter_id: str,
217
- ) -> list[dict[str, Any]]:
218
- def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
219
- class_attr = elem.attrib.get("class", "")
220
- class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
221
- if "review" in class_list:
222
- return {}
223
-
224
- # Build attrs with class as list
225
- attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
226
-
227
- node: dict[str, Any] = {
228
- "tag": elem.tag,
229
- "attrs": attrs,
230
- "data": [],
231
- }
232
-
233
- # Append entire elem.text if present (no splitting)
234
- if elem.text:
235
- node["data"].append(elem.text)
236
-
237
- # Recurse into children
238
- for child in elem.iterchildren(tag=None):
239
- child_dict = parse_element(child)
240
- if child_dict:
241
- node["data"].append(child_dict)
242
-
243
- # Append entire tail string (no split)
244
- if child.tail:
245
- node["data"].append(child.tail)
246
-
247
- return node
248
-
249
- tree = html.fromstring(html_str)
250
-
251
- # Try to find <main id="c-{chapter_id}">
252
- main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
253
- search_root = main_elem[0] if main_elem else tree
254
- return [parse_element(p) for p in search_root.findall(".//p")]
255
-
256
-
257
- def parse_rule(css_str: str) -> dict[str, Any]:
216
+ def _only_tag(selector: str) -> str | None:
258
217
  """
259
- Parse a CSS string and extract style rules for rendering.
218
+ Normalize a selector into just its tag name for ordering.
260
219
 
261
- Handles:
262
- - font-size:0 (mark for deletion)
263
- - scaleX(-1) (mark as mirrored)
264
- - ::before / ::after with content or attr()
265
- - class + tag selector mapping
266
- - custom rendering order via 'order'
220
+ Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
267
221
 
268
- :param css_str: Raw CSS stylesheet string.
269
- :return: Dict with "rules" and "orders" for rendering.
222
+ Returns None if a tag cannot be extracted.
270
223
  """
271
-
272
- rules: dict[str, Any] = {}
273
- orders = []
274
-
275
- stylesheet = tinycss2.parse_stylesheet(
276
- css_str, skip_comments=True, skip_whitespace=True
277
- )
278
-
279
- for rule in stylesheet:
280
- if rule.type != "qualified-rule":
281
- continue
282
-
283
- selector = tinycss2.serialize(rule.prelude).strip()
284
- declarations = tinycss2.parse_declaration_list(rule.content)
285
-
286
- parsed = {}
287
- order_val = None
288
-
289
- for decl in declarations:
290
- if decl.type != "declaration":
291
- continue
292
- name = decl.lower_name
293
- value = tinycss2.serialize(decl.value).strip()
294
-
295
- if name == "font-size" and value == "0":
224
+ sel = selector.strip()
225
+ # If it has spaces, take the rightmost simple selector
226
+ last = sel.split()[-1]
227
+ # Drop ::pseudo
228
+ last = last.split("::", 1)[0]
229
+ # If it's like 'span[attr=..]' keep 'span'
230
+ last = last.split("[", 1)[0]
231
+ # If it starts with '.', it's not a tag
232
+ if not last or last.startswith("."):
233
+ return None
234
+ return last
235
+
236
+
237
+ def _parse_decls(block: str) -> list[tuple[str, str]]:
238
+ """
239
+ Parse 'name:value;...' inside a block. Tolerates quotes and attr().
240
+ """
241
+ decls: list[tuple[str, str]] = []
242
+ i = 0
243
+ n = len(block)
244
+ name: list[str] = []
245
+ val: list[str] = []
246
+ in_name = True
247
+ quote = None # track ' or "
248
+ while i < n:
249
+ c = block[i]
250
+ if quote:
251
+ # inside quotes
252
+ if c == "\\" and i + 1 < n:
253
+ # keep escaped char
254
+ (name if in_name else val).append(c)
255
+ i += 1
256
+ (name if in_name else val).append(block[i])
257
+ elif c == quote:
258
+ (name if in_name else val).append(c)
259
+ quote = None
260
+ else:
261
+ (name if in_name else val).append(c)
262
+ else:
263
+ if c in ("'", '"'):
264
+ (name if in_name else val).append(c)
265
+ quote = c
266
+ elif in_name and c == ":":
267
+ in_name = False
268
+ elif c == ";":
269
+ nm = "".join(name).strip().lower()
270
+ vl = "".join(val).strip()
271
+ if nm:
272
+ decls.append((nm, vl))
273
+ name.clear()
274
+ val.clear()
275
+ in_name = True
276
+ else:
277
+ (name if in_name else val).append(c)
278
+ i += 1
279
+
280
+ if name or val:
281
+ nm = "".join(name).strip().lower()
282
+ vl = "".join(val).strip()
283
+ if nm:
284
+ decls.append((nm, vl))
285
+ return decls
286
+
287
+
288
+ def parse_css_rules(css_str: str) -> Rules:
289
+ """
290
+ Produces normalized Rules with:
291
+ - orders: list[str] of tag names sorted by numeric 'order'
292
+ - sy: '.sy-*' class rules
293
+ - p_rules: '.p* <tag>' rules, indexed by p-class then tag
294
+ """
295
+ rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
296
+ order_pairs: list[tuple[str, int]] = []
297
+
298
+ i = 0
299
+ while True:
300
+ b1 = css_str.find("{", i)
301
+ if b1 == -1:
302
+ break
303
+ selector = css_str[i:b1].strip().lower()
304
+ b2 = css_str.find("}", b1 + 1)
305
+ if b2 == -1:
306
+ break
307
+ block = css_str[b1 + 1 : b2]
308
+ i = b2 + 1
309
+
310
+ decls = _parse_decls(block)
311
+
312
+ new_rule: Rule = {}
313
+ order_val: int | None = None
314
+
315
+ for name, value in decls:
316
+ v = value.strip()
317
+ if name == "font-size" and v == "0":
296
318
  if "::first-letter" in selector:
297
- parsed["delete-first"] = True
319
+ new_rule["delete_first"] = True
298
320
  else:
299
- parsed["delete-all"] = True
300
- elif name == "transform" and value.lower() == "scalex(-1)":
301
- parsed["transform-x_-1"] = True
321
+ new_rule["delete_all"] = True
322
+ elif name == "transform":
323
+ if _RE_SCALEX.search(v.replace(" ", "")):
324
+ new_rule["transform_flip_x"] = True
302
325
  elif name == "order":
303
- order_val = value
326
+ with suppress(ValueError, TypeError):
327
+ order_val = int(v)
304
328
  elif name == "content":
329
+ # normalize: remove outer quotes
305
330
  if "::after" in selector:
306
- if "attr(" in value:
307
- parsed["append-end-attr"] = value.split("attr(")[1].split(")")[
308
- 0
309
- ]
331
+ m = _RE_ATTR.search(v)
332
+ if m:
333
+ new_rule["append_end_attr"] = m.group(1)
310
334
  else:
311
- parsed["append-end-char"] = value.strip("\"'")
335
+ s = v.strip().strip("\"'")
336
+ new_rule["append_end_char"] = s
312
337
  elif "::before" in selector:
313
- if "attr(" in value:
314
- parsed["append-start-attr"] = value.split("attr(")[1].split(
315
- ")"
316
- )[0]
338
+ m = _RE_ATTR.search(v)
339
+ if m:
340
+ new_rule["append_start_attr"] = m.group(1)
317
341
  else:
318
- parsed["append-start-char"] = value.strip("\"'")
342
+ s = v.strip().strip("\"'")
343
+ new_rule["append_start_char"] = s
319
344
 
320
- # Store in structure
345
+ # classification
321
346
  if selector.startswith(".sy-"):
322
- rules.setdefault("sy", {})[selector[1:]] = parsed
347
+ key = selector.lstrip(".")
348
+ old = rules["sy"].get(key)
349
+ rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
350
+
323
351
  elif selector.startswith(".p") and " " in selector:
324
- class_str, tag_part = selector.split(" ", 1)
325
- class_str = class_str.lstrip(".")
326
- tag_part = tag_part.split("::")[0]
327
- rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
352
+ p_cls, right = selector.split(" ", 1)
353
+ p_cls = p_cls.lstrip(".")
354
+ tag = _only_tag(right)
355
+ if tag:
356
+ prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
357
+ rules["p_rules"][p_cls][tag] = (
358
+ {**prev, **new_rule} if prev else (new_rule or {})
359
+ )
360
+
361
+ if order_val is not None:
362
+ tag_for_order = _only_tag(selector)
363
+ if tag_for_order:
364
+ order_pairs.append((tag_for_order, order_val))
365
+
366
+ # normalize orders
367
+ order_pairs.sort(key=lambda t: t[1])
368
+ seen = set()
369
+ orders: list[str] = []
370
+ for tag, _num in order_pairs:
371
+ if tag not in seen:
372
+ seen.add(tag)
373
+ orders.append(tag)
374
+ rules["orders"] = orders
375
+ return rules
376
+
377
+
378
+ def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
379
+ """
380
+ Render the HTML using pre-parsed Rules.
381
+ """
382
+ tree = html.fromstring(html_str)
383
+ paragraphs_out: list[str] = []
384
+ refl_list: list[str] = []
385
+ orders = rules.get("orders") or []
386
+ p_rules = rules.get("p_rules") or {}
387
+ sy_rules = rules.get("sy") or {}
328
388
 
329
- if order_val:
330
- orders.append((selector, order_val))
389
+ def _class_list(el: html.HtmlElement) -> list[str]:
390
+ cls = el.get("class")
391
+ return cls.split() if cls else []
331
392
 
332
- orders.sort(key=lambda x: int(x[1]))
333
- return {"rules": rules, "orders": orders}
393
+ def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
394
+ if rule.get("delete_all"):
395
+ return ""
334
396
 
397
+ parts: list[str] = []
398
+ if "append_start_char" in rule:
399
+ parts.append(rule["append_start_char"])
400
+ if "append_start_attr" in rule:
401
+ parts.append(el.get(rule["append_start_attr"], ""))
335
402
 
336
- def render_paragraphs(
337
- main_paragraphs: list[dict[str, Any]],
338
- rules: dict[str, Any],
339
- end_number: str = "",
340
- ) -> tuple[str, list[str]]:
341
- """
342
- Applies the parsed CSS rules to the paragraph structure and
343
- reconstructs the visible text.
403
+ text = el.text or ""
404
+ if rule.get("delete_first") and text:
405
+ text = text[1:]
406
+ parts.append(text)
344
407
 
345
- Handles special class styles like .sy-*, text order control,
346
- mirrored characters, etc.
408
+ if "append_end_char" in rule:
409
+ parts.append(rule["append_end_char"])
410
+ if "append_end_attr" in rule:
411
+ parts.append(el.get(rule["append_end_attr"], ""))
347
412
 
348
- :param main_paragraphs: A list of paragraph dictionaries, each with 'attrs'
349
- and 'data' fields representing structured content.
350
- :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
351
- - rules['orders']: List of (selector, id) tuples.
352
- - rules['rules']: Nested dict containing transformation rules.
413
+ s = "".join(parts)
353
414
 
354
- :return:
355
- - A reconstructed paragraph string with line breaks.
356
- - A list of mirrored (reflected) characters for later OCR processing.
357
- """
358
- orders: list[tuple[str, str]] = rules.get("orders", [])
359
- rules = rules.get("rules", {})
360
- refl_list: list[str] = []
415
+ if rule.get("transform_flip_x") and s:
416
+ refl_list.append(s)
361
417
 
362
- def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
363
- if rule.get("delete-all", False):
364
- return ""
418
+ return s
365
419
 
366
- curr_str = ""
367
- if isinstance(data.get("data"), list) and data["data"]:
368
- first_data = data["data"][0]
369
- if isinstance(first_data, str):
370
- curr_str += first_data
420
+ for p in tree.findall(".//p"):
421
+ p_classes = _class_list(p)
422
+ p_key = next((c for c in p_classes if c.startswith("p")), None)
423
+ has_ordered_rules = p_key in p_rules
371
424
 
372
- if rule.get("delete-first", False):
373
- curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
425
+ buf_parts: list[str] = []
374
426
 
375
- curr_str += rule.get("append-end-char", "")
427
+ if p.text and not has_ordered_rules:
428
+ buf_parts.append(p.text)
376
429
 
377
- attr_name = rule.get("append-end-attr", "")
378
- if attr_name:
379
- curr_str += data.get("attrs", {}).get(f"{attr_name}{end_number}", "")
430
+ ordered_cache: dict[str, list[str]] = {}
380
431
 
381
- curr_str = rule.get("append-start-char", "") + curr_str
432
+ for child in p:
433
+ tag = str(child.tag)
382
434
 
383
- attr_name = rule.get("append-start-attr", "")
384
- if attr_name:
385
- curr_str = (
386
- data.get("attrs", {}).get(f"{attr_name}{end_number}", "") + curr_str
387
- )
435
+ # Handle inline <y class="sy-*"> spans
436
+ if tag == "y" and not has_ordered_rules:
437
+ y_cls = next(
438
+ (c for c in _class_list(child) if c.startswith("sy-")), None
439
+ )
440
+ if y_cls and y_cls in sy_rules:
441
+ buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
442
+ else:
443
+ buf_parts.append(child.text or "")
444
+ if child.tail:
445
+ buf_parts.append(child.tail)
446
+ continue
388
447
 
389
- if rule.get("transform-x_-1", False):
390
- refl_list.append(curr_str)
391
- return curr_str
392
-
393
- paragraphs_str = ""
394
- for paragraph in main_paragraphs:
395
- class_list = paragraph.get("attrs", {}).get("class", [])
396
- p_class_str = next((c for c in class_list if c.startswith("p")), None)
397
- curr_datas = paragraph.get("data", [])
398
-
399
- ordered_cache = {}
400
- for data in curr_datas:
401
- # 文本节点直接加
402
- if isinstance(data, str):
403
- paragraphs_str += data
448
+ # Handle ordered paragraphs: only cache tags that appear in `orders`
449
+ if p_key and has_ordered_rules and tag in orders:
450
+ rule = p_rules[p_key].get(tag, {})
451
+ ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
404
452
  continue
405
453
 
406
- if isinstance(data, dict):
407
- tag = data.get("tag", "")
408
- attrs = data.get("attrs", {})
409
-
410
- # 跳过 span.review
411
- if tag == "span" and "class" in attrs and "review" in attrs["class"]:
412
- continue
413
-
414
- # sy 类型标签处理
415
- if tag == "y":
416
- tag_class_list = attrs.get("class", [])
417
- tag_class = next(
418
- (c for c in tag_class_list if c.startswith("sy-")), None
419
- )
420
-
421
- if tag_class in rules.get("sy", {}):
422
- curr_rule = rules["sy"][tag_class]
423
- paragraphs_str += apply_rule(data, curr_rule)
424
- continue
425
-
426
- if not p_class_str:
427
- if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
428
- continue
429
- logger.debug(f"[parser] not find p_class_str: {class_list}")
430
- continue
431
- # 普通标签处理,根据 orders 顺序匹配
432
- for ord_selector, _ in orders:
433
- tag_name = f"{ord_selector}{end_number}"
434
- if data.get("tag") != tag_name:
435
- continue
436
- curr_rule = rules.get(p_class_str, {}).get(ord_selector)
437
- curr_rule = curr_rule if curr_rule else {}
438
- ordered_cache[ord_selector] = apply_rule(data, curr_rule)
439
- break
440
- # 最后按 orders 顺序拼接
441
- for ord_selector, _ in orders:
442
- if ord_selector in ordered_cache:
443
- paragraphs_str += ordered_cache[ord_selector]
444
-
445
- paragraphs_str += "\n\n"
446
-
447
- return paragraphs_str, refl_list
448
-
449
-
450
- def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
451
- """
452
- Extract all paragraph selector names from parsed rules, excluding "sy".
453
- """
454
- paragraph_names = set()
455
- for group, group_rules in rules.get("rules", {}).items():
456
- if group == "sy":
457
- continue
458
- paragraph_names.update(group_rules.keys())
459
- return paragraph_names
460
-
461
-
462
- def parse_end_number(
463
- main_paragraphs: list[dict[str, Any]],
464
- rules: dict[str, Any],
465
- ) -> str:
466
- """
467
- Find the most frequent numeric suffix from tag names
468
- matched by given paragraph prefixes.
469
- """
470
- paragraph_names = parse_paragraph_names(rules)
471
- end_numbers: dict[int, int] = {}
472
- prefix_hits = 0
473
- sorted_names = sorted(paragraph_names, key=len, reverse=True)
474
-
475
- def rec_parse(item: list[Any] | dict[str, Any]) -> None:
476
- nonlocal prefix_hits
477
- if isinstance(item, list):
478
- for element in item:
479
- rec_parse(element)
480
- elif isinstance(item, dict):
481
- tag = item.get("tag")
482
- if isinstance(tag, str):
483
- for prefix in sorted_names:
484
- if tag.startswith(prefix):
485
- prefix_hits += 1
486
- remain = tag[len(prefix) :]
487
- if remain.isdigit():
488
- num = int(remain)
489
- end_numbers[num] = end_numbers.get(num, 0) + 1
490
- break
491
- for val in item.values():
492
- if isinstance(val, (list | dict)):
493
- rec_parse(val)
494
-
495
- rec_parse(main_paragraphs)
496
-
497
- if not end_numbers:
498
- logger.debug("[Parser] No valid ending numbers found")
499
- return ""
500
-
501
- sorted_numbers = sorted(
502
- end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
503
- )
504
-
505
- logger.debug(
506
- "[Parser] Top 3 end numbers:\n%s",
507
- "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
508
- )
509
- most_common_number, most_common_count = sorted_numbers[0]
510
- if most_common_count <= prefix_hits / 2:
511
- logger.debug(
512
- "[Parser] Top number (%s) does not exceed 50%% threshold: %d of %d",
513
- most_common_number,
514
- most_common_count,
515
- prefix_hits,
516
- )
517
- return ""
454
+ # Non-ordered, non-<y> nodes: include text + tails as-is
455
+ if not has_ordered_rules:
456
+ buf_parts.append(child.text or "")
457
+ if child.tail:
458
+ buf_parts.append(child.tail)
518
459
 
519
- return str(most_common_number)
460
+ # If ordered, flush in global orders with all duplicates preserved
461
+ if has_ordered_rules:
462
+ for tag in orders:
463
+ if tag in ordered_cache:
464
+ buf_parts.extend(ordered_cache[tag])
520
465
 
466
+ para = "".join(buf_parts)
467
+ if para:
468
+ paragraphs_out.append(para)
521
469
 
522
- def contains_keywords(paragraphs: list[dict[str, Any]], keywords: set[str]) -> bool:
523
- for para in paragraphs:
524
- data = para.get("data", [])
525
- for item in data:
526
- if isinstance(item, str) and any(kw in item for kw in keywords):
527
- return True
528
- return False
470
+ return "\n".join(paragraphs_out), refl_list