novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +79 -66
  6. novel_downloader/cli/export.py +17 -21
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +206 -209
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +5 -5
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +17 -12
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +20 -14
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +6 -19
  79. novel_downloader/core/interfaces/parser.py +7 -8
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +64 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +64 -69
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/main_parser.py +756 -48
  100. novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
  101. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  102. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  103. novel_downloader/core/parsers/quanben5.py +103 -0
  104. novel_downloader/core/parsers/registry.py +5 -16
  105. novel_downloader/core/parsers/sfacg.py +38 -45
  106. novel_downloader/core/parsers/shencou.py +215 -0
  107. novel_downloader/core/parsers/shuhaige.py +111 -0
  108. novel_downloader/core/parsers/tongrenquan.py +116 -0
  109. novel_downloader/core/parsers/ttkan.py +132 -0
  110. novel_downloader/core/parsers/wanbengo.py +191 -0
  111. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  112. novel_downloader/core/parsers/xiguashuwu.py +429 -0
  113. novel_downloader/core/parsers/xs63b.py +161 -0
  114. novel_downloader/core/parsers/xshbook.py +134 -0
  115. novel_downloader/core/parsers/yamibo.py +87 -131
  116. novel_downloader/core/parsers/yibige.py +166 -0
  117. novel_downloader/core/searchers/__init__.py +34 -3
  118. novel_downloader/core/searchers/aaatxt.py +107 -0
  119. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  120. novel_downloader/core/searchers/base.py +112 -36
  121. novel_downloader/core/searchers/dxmwx.py +105 -0
  122. novel_downloader/core/searchers/eightnovel.py +84 -0
  123. novel_downloader/core/searchers/esjzone.py +43 -25
  124. novel_downloader/core/searchers/hetushu.py +92 -0
  125. novel_downloader/core/searchers/i25zw.py +93 -0
  126. novel_downloader/core/searchers/ixdzs8.py +107 -0
  127. novel_downloader/core/searchers/jpxs123.py +107 -0
  128. novel_downloader/core/searchers/piaotia.py +100 -0
  129. novel_downloader/core/searchers/qbtr.py +106 -0
  130. novel_downloader/core/searchers/qianbi.py +74 -40
  131. novel_downloader/core/searchers/quanben5.py +144 -0
  132. novel_downloader/core/searchers/registry.py +24 -8
  133. novel_downloader/core/searchers/shuhaige.py +124 -0
  134. novel_downloader/core/searchers/tongrenquan.py +110 -0
  135. novel_downloader/core/searchers/ttkan.py +92 -0
  136. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  137. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  138. novel_downloader/core/searchers/xs63b.py +104 -0
  139. novel_downloader/locales/en.json +34 -85
  140. novel_downloader/locales/zh.json +35 -86
  141. novel_downloader/models/__init__.py +21 -22
  142. novel_downloader/models/book.py +44 -0
  143. novel_downloader/models/config.py +4 -37
  144. novel_downloader/models/login.py +1 -1
  145. novel_downloader/models/search.py +5 -0
  146. novel_downloader/resources/config/settings.toml +8 -70
  147. novel_downloader/resources/json/xiguashuwu.json +718 -0
  148. novel_downloader/utils/__init__.py +13 -24
  149. novel_downloader/utils/chapter_storage.py +5 -5
  150. novel_downloader/utils/constants.py +4 -31
  151. novel_downloader/utils/cookies.py +38 -35
  152. novel_downloader/utils/crypto_utils/__init__.py +7 -0
  153. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  154. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  155. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  156. novel_downloader/utils/crypto_utils/rc4.py +54 -0
  157. novel_downloader/utils/epub/__init__.py +3 -4
  158. novel_downloader/utils/epub/builder.py +6 -6
  159. novel_downloader/utils/epub/constants.py +62 -21
  160. novel_downloader/utils/epub/documents.py +95 -201
  161. novel_downloader/utils/epub/models.py +8 -22
  162. novel_downloader/utils/epub/utils.py +73 -106
  163. novel_downloader/utils/file_utils/__init__.py +2 -23
  164. novel_downloader/utils/file_utils/io.py +53 -188
  165. novel_downloader/utils/file_utils/normalize.py +1 -7
  166. novel_downloader/utils/file_utils/sanitize.py +4 -15
  167. novel_downloader/utils/fontocr/__init__.py +5 -14
  168. novel_downloader/utils/fontocr/core.py +216 -0
  169. novel_downloader/utils/fontocr/loader.py +50 -0
  170. novel_downloader/utils/logger.py +81 -65
  171. novel_downloader/utils/network.py +17 -41
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  176. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  177. novel_downloader/utils/time_utils/__init__.py +5 -11
  178. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  179. novel_downloader/utils/time_utils/sleep_utils.py +55 -49
  180. novel_downloader/web/__init__.py +13 -0
  181. novel_downloader/web/components/__init__.py +11 -0
  182. novel_downloader/web/components/navigation.py +35 -0
  183. novel_downloader/web/main.py +66 -0
  184. novel_downloader/web/pages/__init__.py +17 -0
  185. novel_downloader/web/pages/download.py +78 -0
  186. novel_downloader/web/pages/progress.py +147 -0
  187. novel_downloader/web/pages/search.py +329 -0
  188. novel_downloader/web/services/__init__.py +17 -0
  189. novel_downloader/web/services/client_dialog.py +164 -0
  190. novel_downloader/web/services/cred_broker.py +113 -0
  191. novel_downloader/web/services/cred_models.py +35 -0
  192. novel_downloader/web/services/task_manager.py +264 -0
  193. novel_downloader-2.0.1.dist-info/METADATA +172 -0
  194. novel_downloader-2.0.1.dist-info/RECORD +206 -0
  195. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
  196. novel_downloader/core/downloaders/biquge.py +0 -29
  197. novel_downloader/core/downloaders/esjzone.py +0 -29
  198. novel_downloader/core/downloaders/linovelib.py +0 -29
  199. novel_downloader/core/downloaders/sfacg.py +0 -29
  200. novel_downloader/core/downloaders/yamibo.py +0 -29
  201. novel_downloader/core/exporters/biquge.py +0 -22
  202. novel_downloader/core/exporters/esjzone.py +0 -22
  203. novel_downloader/core/exporters/qianbi.py +0 -22
  204. novel_downloader/core/exporters/sfacg.py +0 -22
  205. novel_downloader/core/exporters/yamibo.py +0 -22
  206. novel_downloader/core/fetchers/base/__init__.py +0 -14
  207. novel_downloader/core/fetchers/base/browser.py +0 -422
  208. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  209. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  210. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  211. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  212. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  213. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  214. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  215. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  216. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  217. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  218. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  219. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  220. novel_downloader/core/parsers/biquge.py +0 -139
  221. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
  222. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
  223. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
  224. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  225. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
  226. novel_downloader/models/chapter.py +0 -25
  227. novel_downloader/models/types.py +0 -13
  228. novel_downloader/tui/__init__.py +0 -7
  229. novel_downloader/tui/app.py +0 -32
  230. novel_downloader/tui/main.py +0 -17
  231. novel_downloader/tui/screens/__init__.py +0 -14
  232. novel_downloader/tui/screens/home.py +0 -198
  233. novel_downloader/tui/screens/login.py +0 -74
  234. novel_downloader/tui/styles/home_layout.tcss +0 -79
  235. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  236. novel_downloader/utils/cache.py +0 -24
  237. novel_downloader/utils/crypto_utils.py +0 -71
  238. novel_downloader/utils/fontocr/hash_store.py +0 -280
  239. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  240. novel_downloader/utils/fontocr/model_loader.py +0 -69
  241. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  242. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  243. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  244. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  245. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  246. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  247. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  248. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -1,528 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_encrypted
4
- ------------------------------------------------------
5
-
6
- Support for parsing encrypted chapters from Qidian using font OCR mapping,
7
- CSS rules, and custom rendering logic.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import json
13
- import logging
14
- from typing import TYPE_CHECKING, Any
15
-
16
- import tinycss2
17
- from lxml import html
18
-
19
- from novel_downloader.models import ChapterDict
20
- from novel_downloader.utils import (
21
- download,
22
- truncate_half_lines,
23
- )
24
-
25
- from .utils import (
26
- extract_chapter_info,
27
- find_ssr_page_context,
28
- get_decryptor,
29
- is_duplicated,
30
- vip_status,
31
- )
32
-
33
- if TYPE_CHECKING:
34
- from .main_parser import QidianParser
35
-
36
- logger = logging.getLogger(__name__)
37
- IGNORED_CLASS_LISTS = {"title", "review"}
38
- NON_CONTENT_KEYWORDS = {"旧版", "反馈", "扫码"}
39
-
40
-
41
- def parse_encrypted_chapter(
42
- parser: QidianParser,
43
- html_str: str,
44
- chapter_id: str,
45
- ) -> ChapterDict | None:
46
- """
47
- Extract and return the formatted textual content of an encrypted chapter.
48
-
49
- Steps:
50
- 1. Load SSR JSON context for CSS, fonts, and metadata.
51
- 3. Decode and save randomFont bytes; download fixedFont via download_font().
52
- 4. Extract paragraph structures and save debug JSON.
53
- 5. Parse CSS rules and save debug JSON.
54
- 6. Render encrypted paragraphs, then run OCR font-mapping.
55
- 7. Extracts paragraph texts and formats them.
56
-
57
- :param html_str: Raw HTML content of the chapter page.
58
- :return: Formatted chapter text or empty string if not parsable.
59
- """
60
- try:
61
- if not (parser._decode_font and parser._font_ocr):
62
- return None
63
- ssr_data = find_ssr_page_context(html_str)
64
- chapter_info = extract_chapter_info(ssr_data)
65
- if not chapter_info:
66
- logger.warning(
67
- "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
68
- )
69
- return None
70
-
71
- debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
72
- if parser.save_font_debug:
73
- debug_dir.mkdir(parents=True, exist_ok=True)
74
-
75
- css_str = chapter_info["css"]
76
- randomFont_str = chapter_info["randomFont"]
77
- fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
78
-
79
- title = chapter_info.get("chapterName", "Untitled")
80
- duplicated = is_duplicated(ssr_data)
81
- raw_html = chapter_info.get("content", "")
82
- chapter_id = chapter_info.get("chapterId", chapter_id)
83
- fkp = chapter_info.get("fkp", "")
84
- author_say = chapter_info.get("authorSay", "")
85
- update_time = chapter_info.get("updateTime", "")
86
- update_timestamp = chapter_info.get("updateTimestamp", 0)
87
- modify_time = chapter_info.get("modifyTime", 0)
88
- word_count = chapter_info.get("actualWords", 0)
89
- seq = chapter_info.get("seq", None)
90
- volume = chapter_info.get("extra", {}).get("volumeName", "")
91
-
92
- # extract + save font
93
- rf = json.loads(randomFont_str)
94
- rand_path = parser._base_cache_dir / "randomFont.ttf"
95
- rand_path.parent.mkdir(parents=True, exist_ok=True)
96
- rand_path.write_bytes(bytes(rf["data"]))
97
-
98
- fixed_path = download(
99
- url=fixedFontWoff2_url,
100
- target_dir=parser._fixed_font_dir,
101
- stream=True,
102
- )
103
- if fixed_path is None:
104
- raise ValueError("fixed_path is None: failed to download font")
105
-
106
- # Extract and render paragraphs from HTML with CSS rules
107
- main_paragraphs = extract_paragraphs_recursively(html_str, chapter_id)
108
- if not main_paragraphs or contains_keywords(
109
- main_paragraphs, NON_CONTENT_KEYWORDS
110
- ):
111
- if vip_status(ssr_data):
112
- try:
113
- decryptor = get_decryptor()
114
- raw_html = decryptor.decrypt(
115
- raw_html,
116
- chapter_id,
117
- fkp,
118
- parser._fuid,
119
- )
120
- except Exception as e:
121
- logger.error(
122
- "[Parser] decryption failed for '%s': %s", chapter_id, e
123
- )
124
- return None
125
- main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
126
-
127
- if parser.save_font_debug:
128
- main_paragraphs_path = debug_dir / "main_paragraphs_debug.json"
129
- main_paragraphs_path.write_text(
130
- json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
131
- encoding="utf-8",
132
- )
133
-
134
- paragraphs_rules = parse_rule(css_str)
135
- if parser.save_font_debug:
136
- paragraphs_rules_path = debug_dir / "paragraphs_rules_debug.json"
137
- paragraphs_rules_path.write_text(
138
- json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
139
- encoding="utf-8",
140
- )
141
-
142
- end_number = parse_end_number(main_paragraphs, paragraphs_rules)
143
- paragraphs_str, refl_list = render_paragraphs(
144
- main_paragraphs,
145
- paragraphs_rules,
146
- end_number,
147
- )
148
- if parser.save_font_debug:
149
- paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
150
- paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
151
-
152
- # Run OCR + fallback mapping
153
- char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
154
- refl_set = set(refl_list)
155
- char_set = char_set - refl_set
156
- if parser.save_font_debug:
157
- char_sets_path = debug_dir / "char_set_debug.txt"
158
- temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
159
- char_sets_path.write_text(
160
- temp,
161
- encoding="utf-8",
162
- )
163
-
164
- mapping_result = parser._font_ocr.generate_font_map(
165
- fixed_font_path=fixed_path,
166
- random_font_path=rand_path,
167
- char_set=char_set,
168
- refl_set=refl_set,
169
- chapter_id=chapter_id,
170
- )
171
- if parser.save_font_debug:
172
- mapping_json_path = debug_dir / "font_mapping.json"
173
- mapping_json_path.write_text(
174
- json.dumps(mapping_result, ensure_ascii=False, indent=2),
175
- encoding="utf-8",
176
- )
177
-
178
- # Reconstruct final readable text
179
- original_text = parser._font_ocr.apply_font_mapping(
180
- text=paragraphs_str,
181
- font_map=mapping_result,
182
- )
183
-
184
- final_paragraphs_str = "\n\n".join(
185
- line.strip() for line in original_text.splitlines() if line.strip()
186
- )
187
- if parser._use_truncation and duplicated:
188
- final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
189
-
190
- return {
191
- "id": str(chapter_id),
192
- "title": str(title),
193
- "content": final_paragraphs_str,
194
- "extra": {
195
- "author_say": author_say.strip() if author_say else "",
196
- "updated_at": update_time,
197
- "update_timestamp": update_timestamp,
198
- "modify_time": modify_time,
199
- "word_count": word_count,
200
- "duplicated": duplicated,
201
- "seq": seq,
202
- "volume": volume,
203
- "encrypted": True,
204
- },
205
- }
206
-
207
- except Exception as e:
208
- logger.warning(
209
- "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
210
- )
211
- return None
212
-
213
-
214
- def extract_paragraphs_recursively(
215
- html_str: str,
216
- chapter_id: str,
217
- ) -> list[dict[str, Any]]:
218
- def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
219
- class_attr = elem.attrib.get("class", "")
220
- class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
221
- if "review" in class_list:
222
- return {}
223
-
224
- # Build attrs with class as list
225
- attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
226
-
227
- node: dict[str, Any] = {
228
- "tag": elem.tag,
229
- "attrs": attrs,
230
- "data": [],
231
- }
232
-
233
- # Append entire elem.text if present (no splitting)
234
- if elem.text:
235
- node["data"].append(elem.text)
236
-
237
- # Recurse into children
238
- for child in elem.iterchildren(tag=None):
239
- child_dict = parse_element(child)
240
- if child_dict:
241
- node["data"].append(child_dict)
242
-
243
- # Append entire tail string (no split)
244
- if child.tail:
245
- node["data"].append(child.tail)
246
-
247
- return node
248
-
249
- tree = html.fromstring(html_str)
250
-
251
- # Try to find <main id="c-{chapter_id}">
252
- main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
253
- search_root = main_elem[0] if main_elem else tree
254
- return [parse_element(p) for p in search_root.findall(".//p")]
255
-
256
-
257
- def parse_rule(css_str: str) -> dict[str, Any]:
258
- """
259
- Parse a CSS string and extract style rules for rendering.
260
-
261
- Handles:
262
- - font-size:0 (mark for deletion)
263
- - scaleX(-1) (mark as mirrored)
264
- - ::before / ::after with content or attr()
265
- - class + tag selector mapping
266
- - custom rendering order via 'order'
267
-
268
- :param css_str: Raw CSS stylesheet string.
269
- :return: Dict with "rules" and "orders" for rendering.
270
- """
271
-
272
- rules: dict[str, Any] = {}
273
- orders = []
274
-
275
- stylesheet = tinycss2.parse_stylesheet(
276
- css_str, skip_comments=True, skip_whitespace=True
277
- )
278
-
279
- for rule in stylesheet:
280
- if rule.type != "qualified-rule":
281
- continue
282
-
283
- selector = tinycss2.serialize(rule.prelude).strip()
284
- declarations = tinycss2.parse_declaration_list(rule.content)
285
-
286
- parsed = {}
287
- order_val = None
288
-
289
- for decl in declarations:
290
- if decl.type != "declaration":
291
- continue
292
- name = decl.lower_name
293
- value = tinycss2.serialize(decl.value).strip()
294
-
295
- if name == "font-size" and value == "0":
296
- if "::first-letter" in selector:
297
- parsed["delete-first"] = True
298
- else:
299
- parsed["delete-all"] = True
300
- elif name == "transform" and value.lower() == "scalex(-1)":
301
- parsed["transform-x_-1"] = True
302
- elif name == "order":
303
- order_val = value
304
- elif name == "content":
305
- if "::after" in selector:
306
- if "attr(" in value:
307
- parsed["append-end-attr"] = value.split("attr(")[1].split(")")[
308
- 0
309
- ]
310
- else:
311
- parsed["append-end-char"] = value.strip("\"'")
312
- elif "::before" in selector:
313
- if "attr(" in value:
314
- parsed["append-start-attr"] = value.split("attr(")[1].split(
315
- ")"
316
- )[0]
317
- else:
318
- parsed["append-start-char"] = value.strip("\"'")
319
-
320
- # Store in structure
321
- if selector.startswith(".sy-"):
322
- rules.setdefault("sy", {})[selector[1:]] = parsed
323
- elif selector.startswith(".p") and " " in selector:
324
- class_str, tag_part = selector.split(" ", 1)
325
- class_str = class_str.lstrip(".")
326
- tag_part = tag_part.split("::")[0]
327
- rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
328
-
329
- if order_val:
330
- orders.append((selector, order_val))
331
-
332
- orders.sort(key=lambda x: int(x[1]))
333
- return {"rules": rules, "orders": orders}
334
-
335
-
336
- def render_paragraphs(
337
- main_paragraphs: list[dict[str, Any]],
338
- rules: dict[str, Any],
339
- end_number: str = "",
340
- ) -> tuple[str, list[str]]:
341
- """
342
- Applies the parsed CSS rules to the paragraph structure and
343
- reconstructs the visible text.
344
-
345
- Handles special class styles like .sy-*, text order control,
346
- mirrored characters, etc.
347
-
348
- :param main_paragraphs: A list of paragraph dictionaries, each with 'attrs'
349
- and 'data' fields representing structured content.
350
- :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
351
- - rules['orders']: List of (selector, id) tuples.
352
- - rules['rules']: Nested dict containing transformation rules.
353
-
354
- :return:
355
- - A reconstructed paragraph string with line breaks.
356
- - A list of mirrored (reflected) characters for later OCR processing.
357
- """
358
- orders: list[tuple[str, str]] = rules.get("orders", [])
359
- rules = rules.get("rules", {})
360
- refl_list: list[str] = []
361
-
362
- def apply_rule(data: dict[str, Any], rule: dict[str, Any]) -> str:
363
- if rule.get("delete-all", False):
364
- return ""
365
-
366
- curr_str = ""
367
- if isinstance(data.get("data"), list) and data["data"]:
368
- first_data = data["data"][0]
369
- if isinstance(first_data, str):
370
- curr_str += first_data
371
-
372
- if rule.get("delete-first", False):
373
- curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
374
-
375
- curr_str += rule.get("append-end-char", "")
376
-
377
- attr_name = rule.get("append-end-attr", "")
378
- if attr_name:
379
- curr_str += data.get("attrs", {}).get(f"{attr_name}{end_number}", "")
380
-
381
- curr_str = rule.get("append-start-char", "") + curr_str
382
-
383
- attr_name = rule.get("append-start-attr", "")
384
- if attr_name:
385
- curr_str = (
386
- data.get("attrs", {}).get(f"{attr_name}{end_number}", "") + curr_str
387
- )
388
-
389
- if rule.get("transform-x_-1", False):
390
- refl_list.append(curr_str)
391
- return curr_str
392
-
393
- paragraphs_str = ""
394
- for paragraph in main_paragraphs:
395
- class_list = paragraph.get("attrs", {}).get("class", [])
396
- p_class_str = next((c for c in class_list if c.startswith("p")), None)
397
- curr_datas = paragraph.get("data", [])
398
-
399
- ordered_cache = {}
400
- for data in curr_datas:
401
- # 文本节点直接加
402
- if isinstance(data, str):
403
- paragraphs_str += data
404
- continue
405
-
406
- if isinstance(data, dict):
407
- tag = data.get("tag", "")
408
- attrs = data.get("attrs", {})
409
-
410
- # 跳过 span.review
411
- if tag == "span" and "class" in attrs and "review" in attrs["class"]:
412
- continue
413
-
414
- # sy 类型标签处理
415
- if tag == "y":
416
- tag_class_list = attrs.get("class", [])
417
- tag_class = next(
418
- (c for c in tag_class_list if c.startswith("sy-")), None
419
- )
420
-
421
- if tag_class in rules.get("sy", {}):
422
- curr_rule = rules["sy"][tag_class]
423
- paragraphs_str += apply_rule(data, curr_rule)
424
- continue
425
-
426
- if not p_class_str:
427
- if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
428
- continue
429
- logger.debug(f"[parser] not find p_class_str: {class_list}")
430
- continue
431
- # 普通标签处理,根据 orders 顺序匹配
432
- for ord_selector, _ in orders:
433
- tag_name = f"{ord_selector}{end_number}"
434
- if data.get("tag") != tag_name:
435
- continue
436
- curr_rule = rules.get(p_class_str, {}).get(ord_selector)
437
- curr_rule = curr_rule if curr_rule else {}
438
- ordered_cache[ord_selector] = apply_rule(data, curr_rule)
439
- break
440
- # 最后按 orders 顺序拼接
441
- for ord_selector, _ in orders:
442
- if ord_selector in ordered_cache:
443
- paragraphs_str += ordered_cache[ord_selector]
444
-
445
- paragraphs_str += "\n\n"
446
-
447
- return paragraphs_str, refl_list
448
-
449
-
450
- def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
451
- """
452
- Extract all paragraph selector names from parsed rules, excluding "sy".
453
- """
454
- paragraph_names = set()
455
- for group, group_rules in rules.get("rules", {}).items():
456
- if group == "sy":
457
- continue
458
- paragraph_names.update(group_rules.keys())
459
- return paragraph_names
460
-
461
-
462
- def parse_end_number(
463
- main_paragraphs: list[dict[str, Any]],
464
- rules: dict[str, Any],
465
- ) -> str:
466
- """
467
- Find the most frequent numeric suffix from tag names
468
- matched by given paragraph prefixes.
469
- """
470
- paragraph_names = parse_paragraph_names(rules)
471
- end_numbers: dict[int, int] = {}
472
- prefix_hits = 0
473
- sorted_names = sorted(paragraph_names, key=len, reverse=True)
474
-
475
- def rec_parse(item: list[Any] | dict[str, Any]) -> None:
476
- nonlocal prefix_hits
477
- if isinstance(item, list):
478
- for element in item:
479
- rec_parse(element)
480
- elif isinstance(item, dict):
481
- tag = item.get("tag")
482
- if isinstance(tag, str):
483
- for prefix in sorted_names:
484
- if tag.startswith(prefix):
485
- prefix_hits += 1
486
- remain = tag[len(prefix) :]
487
- if remain.isdigit():
488
- num = int(remain)
489
- end_numbers[num] = end_numbers.get(num, 0) + 1
490
- break
491
- for val in item.values():
492
- if isinstance(val, (list | dict)):
493
- rec_parse(val)
494
-
495
- rec_parse(main_paragraphs)
496
-
497
- if not end_numbers:
498
- logger.debug("[Parser] No valid ending numbers found")
499
- return ""
500
-
501
- sorted_numbers = sorted(
502
- end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
503
- )
504
-
505
- logger.debug(
506
- "[Parser] Top 3 end numbers:\n%s",
507
- "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
508
- )
509
- most_common_number, most_common_count = sorted_numbers[0]
510
- if most_common_count <= prefix_hits / 2:
511
- logger.debug(
512
- "[Parser] Top number (%s) does not exceed 50%% threshold: %d of %d",
513
- most_common_number,
514
- most_common_count,
515
- prefix_hits,
516
- )
517
- return ""
518
-
519
- return str(most_common_number)
520
-
521
-
522
- def contains_keywords(paragraphs: list[dict[str, Any]], keywords: set[str]) -> bool:
523
- for para in paragraphs:
524
- data = para.get("data", [])
525
- for item in data:
526
- if isinstance(item, str) and any(kw in item for kw in keywords):
527
- return True
528
- return False
@@ -1,157 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_normal
4
- ---------------------------------------------------
5
-
6
- Parser logic for extracting readable text from Qidian chapters
7
- that use plain (non-encrypted) browser-rendered HTML.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import logging
13
- from typing import TYPE_CHECKING
14
-
15
- from lxml import html
16
-
17
- from novel_downloader.models import ChapterDict
18
- from novel_downloader.utils import truncate_half_lines
19
-
20
- from .utils import (
21
- extract_chapter_info,
22
- find_ssr_page_context,
23
- get_decryptor,
24
- is_duplicated,
25
- vip_status,
26
- )
27
-
28
- if TYPE_CHECKING:
29
- from .main_parser import QidianParser
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
-
34
def parse_normal_chapter(
    parser: QidianParser,
    html_str: str,
    chapter_id: str,
) -> ChapterDict | None:
    """
    Build a chapter record from a plain (non-encrypted) Qidian chapter page.

    The text is taken from the browser-rendered markup when that yields
    anything; otherwise the ``content`` field of the embedded SSR chapter
    info is parsed via ``_parse_session_paragraph``.

    :param parser: Parser instance; its ``_fuid`` and ``_use_truncation``
        attributes are read.
    :param html_str: Chapter HTML.
    :param chapter_id: Chapter identifier (string); replaced by the SSR
        ``chapterId`` when the page provides one.
    :return: a dictionary with keys 'id', 'title', 'content' and 'extra',
        or None when chapter info or text is missing or any error occurs.
    """
    try:
        page_context = find_ssr_page_context(html_str)
        info = extract_chapter_info(page_context)
        if not info:
            logger.warning(
                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
            )
            return None

        chapter_title = info.get("chapterName", "Untitled")
        is_dup = is_duplicated(page_context)
        session_html = info.get("content", "")
        # Rebinding ``chapter_id`` on purpose: later log messages and the
        # decryptor both use the id reported by the page when present.
        chapter_id = info.get("chapterId", chapter_id)
        fkp = info.get("fkp", "")
        author_note = info.get("authorSay", "")
        updated_at = info.get("updateTime", "")
        updated_ts = info.get("updateTimestamp", 0)
        modified = info.get("modifyTime", 0)
        words = info.get("actualWords", 0)
        sequence = info.get("seq", None)
        volume_name = info.get("extra", {}).get("volumeName", "")

        # Rendered markup first; the session content is only parsed (and the
        # VIP status only queried) when the rendered markup gave no text.
        text = _parse_browser_paragraph(html_str) or _parse_session_paragraph(
            html_str=session_html,
            is_vip=vip_status(page_context),
            chapter_id=chapter_id,
            fkp=fkp,
            fuid=parser._fuid,
        )
        if not text:
            return None

        if parser._use_truncation and is_dup:
            text = truncate_half_lines(text)

        extra = {
            "author_say": author_note.strip() if author_note else "",
            "updated_at": updated_at,
            "update_timestamp": updated_ts,
            "modify_time": modified,
            "word_count": words,
            "duplicated": is_dup,
            "seq": sequence,
            "volume": volume_name,
            "encrypted": False,
        }
        return {
            "id": str(chapter_id),
            "title": chapter_title,
            "content": text,
            "extra": extra,
        }
    except Exception as e:
        logger.warning(
            "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
        )
    return None
104
-
105
-
106
- def _parse_browser_paragraph(html_str: str) -> str:
107
- try:
108
- tree = html.fromstring(html_str)
109
- main = tree.xpath('//div[@id="app"]//div[@id="reader-content"]//main')
110
- if not main:
111
- return ""
112
- main = main[0]
113
-
114
- content_spans = main.xpath('.//span[contains(@class, "content-text")]')
115
-
116
- paragraph_texts = [
117
- span.text_content().strip()
118
- for span in content_spans
119
- if span.text_content().strip()
120
- ]
121
-
122
- chapter_text = "\n\n".join(paragraph_texts)
123
- return chapter_text
124
-
125
- except Exception as e:
126
- logger.error("[Parser] _parse_paragraph failed: %s", e)
127
- return ""
128
-
129
-
130
- def _parse_session_paragraph(
131
- html_str: str,
132
- is_vip: bool,
133
- chapter_id: str,
134
- fkp: str,
135
- fuid: str,
136
- ) -> str:
137
- try:
138
- raw_html = html_str
139
-
140
- if is_vip:
141
- try:
142
- decryptor = get_decryptor()
143
- raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, fuid)
144
- except Exception as e:
145
- logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
146
- return ""
147
-
148
- tree = html.fromstring(raw_html)
149
- paras = tree.xpath(".//p")
150
- paragraph_texts = [
151
- p.text_content().strip() for p in paras if p.text_content().strip()
152
- ]
153
- return "\n\n".join(paragraph_texts)
154
-
155
- except Exception as e:
156
- logger.error("[Parser] _parse_paragraph failed: %s", e)
157
- return ""