novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -4
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +27 -104
  5. novel_downloader/cli/download.py +78 -66
  6. novel_downloader/cli/export.py +20 -21
  7. novel_downloader/cli/main.py +3 -1
  8. novel_downloader/cli/search.py +120 -0
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +10 -14
  11. novel_downloader/config/adapter.py +195 -99
  12. novel_downloader/config/{loader.py → file_io.py} +53 -27
  13. novel_downloader/core/__init__.py +14 -13
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/archived/qidian/searcher.py +79 -0
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +8 -30
  21. novel_downloader/core/downloaders/base.py +182 -30
  22. novel_downloader/core/downloaders/common.py +217 -384
  23. novel_downloader/core/downloaders/qianbi.py +332 -4
  24. novel_downloader/core/downloaders/qidian.py +250 -290
  25. novel_downloader/core/downloaders/registry.py +69 -0
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +8 -26
  28. novel_downloader/core/exporters/base.py +107 -31
  29. novel_downloader/core/exporters/common/__init__.py +3 -4
  30. novel_downloader/core/exporters/common/epub.py +92 -171
  31. novel_downloader/core/exporters/common/main_exporter.py +14 -67
  32. novel_downloader/core/exporters/common/txt.py +90 -86
  33. novel_downloader/core/exporters/epub_util.py +184 -1327
  34. novel_downloader/core/exporters/linovelib/__init__.py +3 -2
  35. novel_downloader/core/exporters/linovelib/epub.py +165 -222
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
  37. novel_downloader/core/exporters/linovelib/txt.py +76 -66
  38. novel_downloader/core/exporters/qidian.py +15 -11
  39. novel_downloader/core/exporters/registry.py +55 -0
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/fetchers/__init__.py +57 -56
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
  45. novel_downloader/core/fetchers/biquyuedu.py +83 -0
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +60 -0
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +8 -14
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +26 -0
  81. novel_downloader/core/parsers/__init__.py +58 -22
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
  99. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  100. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  101. novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
  102. novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
  103. novel_downloader/core/parsers/qidian/main_parser.py +19 -57
  104. novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
  105. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
  106. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  107. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  108. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  109. novel_downloader/core/parsers/quanben5.py +103 -0
  110. novel_downloader/core/parsers/registry.py +57 -0
  111. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
  112. novel_downloader/core/parsers/shencou.py +215 -0
  113. novel_downloader/core/parsers/shuhaige.py +111 -0
  114. novel_downloader/core/parsers/tongrenquan.py +116 -0
  115. novel_downloader/core/parsers/ttkan.py +132 -0
  116. novel_downloader/core/parsers/wanbengo.py +191 -0
  117. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  118. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  119. novel_downloader/core/parsers/xs63b.py +161 -0
  120. novel_downloader/core/parsers/xshbook.py +134 -0
  121. novel_downloader/core/parsers/yamibo.py +155 -0
  122. novel_downloader/core/parsers/yibige.py +166 -0
  123. novel_downloader/core/searchers/__init__.py +51 -0
  124. novel_downloader/core/searchers/aaatxt.py +107 -0
  125. novel_downloader/core/searchers/b520.py +84 -0
  126. novel_downloader/core/searchers/base.py +168 -0
  127. novel_downloader/core/searchers/dxmwx.py +105 -0
  128. novel_downloader/core/searchers/eightnovel.py +84 -0
  129. novel_downloader/core/searchers/esjzone.py +102 -0
  130. novel_downloader/core/searchers/hetushu.py +92 -0
  131. novel_downloader/core/searchers/i25zw.py +93 -0
  132. novel_downloader/core/searchers/ixdzs8.py +107 -0
  133. novel_downloader/core/searchers/jpxs123.py +107 -0
  134. novel_downloader/core/searchers/piaotia.py +100 -0
  135. novel_downloader/core/searchers/qbtr.py +106 -0
  136. novel_downloader/core/searchers/qianbi.py +165 -0
  137. novel_downloader/core/searchers/quanben5.py +144 -0
  138. novel_downloader/core/searchers/registry.py +79 -0
  139. novel_downloader/core/searchers/shuhaige.py +124 -0
  140. novel_downloader/core/searchers/tongrenquan.py +110 -0
  141. novel_downloader/core/searchers/ttkan.py +92 -0
  142. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  143. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  144. novel_downloader/core/searchers/xs63b.py +104 -0
  145. novel_downloader/locales/en.json +36 -79
  146. novel_downloader/locales/zh.json +37 -80
  147. novel_downloader/models/__init__.py +23 -50
  148. novel_downloader/models/book.py +44 -0
  149. novel_downloader/models/config.py +16 -43
  150. novel_downloader/models/login.py +1 -1
  151. novel_downloader/models/search.py +21 -0
  152. novel_downloader/resources/config/settings.toml +39 -74
  153. novel_downloader/resources/css_styles/intro.css +83 -0
  154. novel_downloader/resources/css_styles/main.css +30 -89
  155. novel_downloader/resources/json/xiguashuwu.json +718 -0
  156. novel_downloader/utils/__init__.py +43 -0
  157. novel_downloader/utils/chapter_storage.py +247 -226
  158. novel_downloader/utils/constants.py +5 -50
  159. novel_downloader/utils/cookies.py +6 -18
  160. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  161. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  162. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  163. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  164. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  165. novel_downloader/utils/epub/__init__.py +34 -0
  166. novel_downloader/utils/epub/builder.py +377 -0
  167. novel_downloader/utils/epub/constants.py +118 -0
  168. novel_downloader/utils/epub/documents.py +297 -0
  169. novel_downloader/utils/epub/models.py +120 -0
  170. novel_downloader/utils/epub/utils.py +179 -0
  171. novel_downloader/utils/file_utils/__init__.py +5 -30
  172. novel_downloader/utils/file_utils/io.py +9 -150
  173. novel_downloader/utils/file_utils/normalize.py +2 -2
  174. novel_downloader/utils/file_utils/sanitize.py +2 -7
  175. novel_downloader/utils/fontocr.py +207 -0
  176. novel_downloader/utils/i18n.py +2 -0
  177. novel_downloader/utils/logger.py +10 -16
  178. novel_downloader/utils/network.py +111 -252
  179. novel_downloader/utils/state.py +5 -90
  180. novel_downloader/utils/text_utils/__init__.py +16 -21
  181. novel_downloader/utils/text_utils/diff_display.py +6 -9
  182. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  183. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  184. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  185. novel_downloader/utils/time_utils/__init__.py +6 -12
  186. novel_downloader/utils/time_utils/datetime_utils.py +23 -33
  187. novel_downloader/utils/time_utils/sleep_utils.py +5 -10
  188. novel_downloader/web/__init__.py +13 -0
  189. novel_downloader/web/components/__init__.py +11 -0
  190. novel_downloader/web/components/navigation.py +35 -0
  191. novel_downloader/web/main.py +66 -0
  192. novel_downloader/web/pages/__init__.py +17 -0
  193. novel_downloader/web/pages/download.py +78 -0
  194. novel_downloader/web/pages/progress.py +147 -0
  195. novel_downloader/web/pages/search.py +329 -0
  196. novel_downloader/web/services/__init__.py +17 -0
  197. novel_downloader/web/services/client_dialog.py +164 -0
  198. novel_downloader/web/services/cred_broker.py +113 -0
  199. novel_downloader/web/services/cred_models.py +35 -0
  200. novel_downloader/web/services/task_manager.py +264 -0
  201. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  202. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  203. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  204. novel_downloader/config/site_rules.py +0 -94
  205. novel_downloader/core/downloaders/biquge.py +0 -25
  206. novel_downloader/core/downloaders/esjzone.py +0 -25
  207. novel_downloader/core/downloaders/linovelib.py +0 -25
  208. novel_downloader/core/downloaders/sfacg.py +0 -25
  209. novel_downloader/core/downloaders/yamibo.py +0 -25
  210. novel_downloader/core/exporters/biquge.py +0 -25
  211. novel_downloader/core/exporters/esjzone.py +0 -25
  212. novel_downloader/core/exporters/qianbi.py +0 -25
  213. novel_downloader/core/exporters/sfacg.py +0 -25
  214. novel_downloader/core/exporters/yamibo.py +0 -25
  215. novel_downloader/core/factory/__init__.py +0 -20
  216. novel_downloader/core/factory/downloader.py +0 -73
  217. novel_downloader/core/factory/exporter.py +0 -58
  218. novel_downloader/core/factory/fetcher.py +0 -96
  219. novel_downloader/core/factory/parser.py +0 -86
  220. novel_downloader/core/fetchers/base/__init__.py +0 -14
  221. novel_downloader/core/fetchers/base/browser.py +0 -403
  222. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  223. novel_downloader/core/fetchers/common/__init__.py +0 -14
  224. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  225. novel_downloader/core/fetchers/esjzone/browser.py +0 -204
  226. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  227. novel_downloader/core/fetchers/linovelib/browser.py +0 -193
  228. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  229. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  230. novel_downloader/core/fetchers/qidian/browser.py +0 -318
  231. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  232. novel_downloader/core/fetchers/sfacg/browser.py +0 -189
  233. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  234. novel_downloader/core/fetchers/yamibo/browser.py +0 -229
  235. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  236. novel_downloader/core/parsers/biquge/main_parser.py +0 -134
  237. novel_downloader/core/parsers/common/__init__.py +0 -13
  238. novel_downloader/core/parsers/common/helper.py +0 -323
  239. novel_downloader/core/parsers/common/main_parser.py +0 -106
  240. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  241. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  242. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  243. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  244. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  245. novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
  246. novel_downloader/models/browser.py +0 -21
  247. novel_downloader/models/chapter.py +0 -25
  248. novel_downloader/models/site_rules.py +0 -99
  249. novel_downloader/models/tasks.py +0 -33
  250. novel_downloader/models/types.py +0 -15
  251. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  252. novel_downloader/resources/json/replace_word_map.json +0 -4
  253. novel_downloader/resources/text/blacklist.txt +0 -22
  254. novel_downloader/tui/__init__.py +0 -7
  255. novel_downloader/tui/app.py +0 -32
  256. novel_downloader/tui/main.py +0 -17
  257. novel_downloader/tui/screens/__init__.py +0 -14
  258. novel_downloader/tui/screens/home.py +0 -198
  259. novel_downloader/tui/screens/login.py +0 -74
  260. novel_downloader/tui/styles/home_layout.tcss +0 -79
  261. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  262. novel_downloader/utils/cache.py +0 -24
  263. novel_downloader/utils/fontocr/__init__.py +0 -22
  264. novel_downloader/utils/fontocr/model_loader.py +0 -69
  265. novel_downloader/utils/fontocr/ocr_v1.py +0 -303
  266. novel_downloader/utils/fontocr/ocr_v2.py +0 -752
  267. novel_downloader/utils/hash_store.py +0 -279
  268. novel_downloader/utils/hash_utils.py +0 -103
  269. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  270. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  271. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  272. novel_downloader-1.4.5.dist-info/METADATA +0 -196
  273. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  274. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  275. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  276. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,435 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.xiguashuwu
4
+ ----------------------------------------
5
+
6
+ """
7
+
8
+ import base64
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import re
13
+ import urllib.parse
14
+ from typing import Any
15
+
16
+ import requests
17
+ from lxml import html
18
+
19
+ from novel_downloader.core.parsers.base import BaseParser
20
+ from novel_downloader.core.parsers.registry import register_parser
21
+ from novel_downloader.models import (
22
+ BookInfoDict,
23
+ ChapterDict,
24
+ ChapterInfoDict,
25
+ VolumeInfoDict,
26
+ )
27
+ from novel_downloader.utils.constants import (
28
+ DEFAULT_USER_HEADERS,
29
+ XIGUASHUWU_FONT_MAP_PATH,
30
+ )
31
+ from novel_downloader.utils.crypto_utils.aes_util import aes_cbc_decrypt
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
@register_parser(
    site_keys=["xiguashuwu"],
)
class XiguashuwuParser(BaseParser):
    """
    Parser for 西瓜书屋 book pages.

    Chapter content arrives in three formats, dispatched by page index
    in :meth:`parse_chapter`: plain HTML (page 1), JS-shuffled
    paragraphs with glyph images (page 2), and an AES-encrypted payload
    (pages 3 and beyond).
    """

    # Site root; relative hrefs and image srcs are joined onto this.
    BASE_URL = "https://www.xiguashuwu.com"
    # Minimum OCR confidence required to accept a recognized glyph.
    _CONF_THRESHOLD = 0.60
    # Bundled image-filename -> character map, loaded once at class creation.
    _FONT_MAP: dict[str, str] = json.loads(
        XIGUASHUWU_FONT_MAP_PATH.read_text(encoding="utf-8")
    )
    # Runtime cache of OCR results, keyed by glyph image URL (class-wide).
    _GLYPH_CACHE: dict[str, str] = {}

    # Matches `var codeurl = "7";` -> modulus used to restore paragraph order.
    _CODEURL_PATTERN = re.compile(
        r"""var\s+codeurl\s*=\s*['"]?(\d+)['"]?;?""", re.IGNORECASE
    )

    # Matches `var nrid = "FGQSWYBCK";` -> element id of the content container.
    _NRID_PATTERN = re.compile(
        r"""var\s+nrid\s*=\s*['"]?([A-Za-z0-9]+)['"]?;?""", re.IGNORECASE
    )

    # Matches `let newcon = decodeURIComponent("...");` -> encrypted payload.
    _NEWCON_PATTERN = re.compile(
        r"""let\s+newcon\s*=\s*decodeURIComponent\(\s*['"](.+?)['"]\s*\);?""",
        re.IGNORECASE,
    )

    # Matches `d(newcon, "<32 hex chars>");` -> AES key material for _decrypt_d.
    _D_CALL_PATTERN = re.compile(
        r"""d\(\s*[^,]+,\s*['"]([0-9A-Fa-f]{32})['"]\s*\);?""", re.IGNORECASE
    )
68
+ def parse_book_info(
69
+ self,
70
+ html_list: list[str],
71
+ **kwargs: Any,
72
+ ) -> BookInfoDict | None:
73
+ """
74
+ Parse a book info page and extract metadata and chapter structure.
75
+
76
+ :param html_list: Raw HTML of the book info page.
77
+ :return: Parsed metadata and chapter structure as a dictionary.
78
+ """
79
+ if not html_list:
80
+ return None
81
+ info_tree = html.fromstring(html_list[0])
82
+
83
+ book_name = self._first_str(info_tree.xpath('//p[@class="title"]/text()'))
84
+
85
+ author = self._first_str(info_tree.xpath('//p[@class="author"]//a/text()'))
86
+
87
+ cover_rel = info_tree.xpath(
88
+ '//div[@class="BGsectionOne-top-left"]//img/@_src'
89
+ ) or info_tree.xpath('//div[@class="BGsectionOne-top-left"]//img/@src')
90
+ cover_url = self.BASE_URL + self._first_str(cover_rel)
91
+
92
+ tags = [
93
+ self._first_str(info_tree.xpath('//p[@class="category"]/span[1]/a/text()'))
94
+ ]
95
+
96
+ update_time = self._first_str(info_tree.xpath('//p[@class="time"]/span/text()'))
97
+
98
+ paras = info_tree.xpath('//section[@id="intro"]//p')
99
+ summary = "\n".join(p.xpath("string()").strip() for p in paras).strip()
100
+
101
+ chapters: list[ChapterInfoDict] = []
102
+ for catalog_html in html_list[1:]:
103
+ cat_tree = html.fromstring(catalog_html)
104
+ links = cat_tree.xpath(
105
+ '//section[contains(@class,"BCsectionTwo")]'
106
+ '[.//h3[text()="正文"]]//ol//li/a'
107
+ )
108
+ for a in links:
109
+ title = a.xpath("string()").strip()
110
+ href = a.get("href", "").strip()
111
+ # chapterId is filename sans extension
112
+ chapter_id = href.rsplit("/", 1)[-1].split(".", 1)[0]
113
+ chapters.append(
114
+ ChapterInfoDict(
115
+ title=title,
116
+ url=self.BASE_URL + href,
117
+ chapterId=chapter_id,
118
+ )
119
+ )
120
+
121
+ volumes: list[VolumeInfoDict] = [
122
+ VolumeInfoDict(volume_name="正文", chapters=chapters)
123
+ ]
124
+
125
+ return BookInfoDict(
126
+ book_name=book_name,
127
+ author=author,
128
+ cover_url=cover_url,
129
+ update_time=update_time,
130
+ tags=tags,
131
+ summary=summary,
132
+ volumes=volumes,
133
+ extra={},
134
+ )
135
+
136
+ def parse_chapter(
137
+ self,
138
+ html_list: list[str],
139
+ chapter_id: str,
140
+ **kwargs: Any,
141
+ ) -> ChapterDict | None:
142
+ """
143
+ Parse chapter pages and extract clean text or simplified HTML.
144
+
145
+ :param html_list: Raw HTML of the chapter page.
146
+ :param chapter_id: Identifier of the chapter being parsed.
147
+ :return: Cleaned chapter content as plain text or minimal HTML.
148
+ """
149
+ if not html_list:
150
+ return None
151
+
152
+ title_text = ""
153
+ paragraphs: list[str] = []
154
+
155
+ for page_idx, html_str in enumerate(html_list, start=1):
156
+ if page_idx == 1:
157
+ tree = html.fromstring(html_str)
158
+ title_text = self._extract_chapter_title(tree)
159
+ paragraphs.extend(self._parse_chapter_page1(tree))
160
+ elif page_idx == 2:
161
+ paragraphs.extend(self._parse_chapter_page2(html_str))
162
+ else:
163
+ paragraphs.extend(self._parse_chapter_page3plus(html_str))
164
+
165
+ content = "\n".join(paragraphs).strip()
166
+ if not content:
167
+ return None
168
+
169
+ return {
170
+ "id": chapter_id,
171
+ "title": title_text,
172
+ "content": content,
173
+ "extra": {"site": "xiguashuwu"},
174
+ }
175
+
176
+ @classmethod
177
+ def _parse_chapter_page1(cls, tree: html.HtmlElement) -> list[str]:
178
+ """
179
+ Parse page 1 of the chapter: plain text, no encryption or obfuscation.
180
+
181
+ This method extracts all visible text from the element with id="C0NTENT",
182
+ removes known ad sections
183
+
184
+ :param tree: Parsed HTML element tree of the chapter page.
185
+ :return: List of text lines in reading order.
186
+ """
187
+ try:
188
+ # note: 'C0NTENT' contains a zero, not the letter 'O'
189
+ content_div = tree.xpath('//*[@id="C0NTENT"]')
190
+ if not content_div:
191
+ return []
192
+ content_div = content_div[0]
193
+
194
+ # Remove advertisement or irrelevant sections
195
+ for ad in content_div.xpath('.//div[@class="s_m"]'):
196
+ ad.getparent().remove(ad)
197
+
198
+ lines = content_div.xpath(".//text()")
199
+ return [line.strip() for line in lines if line.strip()]
200
+ except Exception as e:
201
+ logger.warning("Failed to parse chapter page 1: %s", e)
202
+ return []
203
+
204
+ def _parse_chapter_page2(self, html_str: str) -> list[str]:
205
+ """
206
+ Parse page 2 of the chapter: content order shuffled by JavaScript,
207
+ and text replaced with images.
208
+
209
+ :param html_str: Raw HTML string of the chapter page.
210
+ :return: List of text lines extracted in correct reading order.
211
+ """
212
+ try:
213
+ tree = html.fromstring(html_str)
214
+ # Extract ordering metadata
215
+ order_raw = self._parse_client_meta(tree)
216
+ codeurl = self._parse_codeurl(html_str)
217
+ nrid = self._parse_nrid(html_str)
218
+ order_list = self._restore_order(order_raw, codeurl)
219
+
220
+ # Extract paragraphs in raw order
221
+ content_divs = tree.xpath(f'//*[@id="{nrid}"]')
222
+ if not content_divs:
223
+ return []
224
+ paragraphs = self._rebuild_paragraphs(content_divs[0])
225
+
226
+ # Reorder paragraphs
227
+ reordered: list[str] = []
228
+ for idx in order_list:
229
+ if 0 <= idx < len(paragraphs):
230
+ reordered.append(paragraphs[idx])
231
+ return reordered
232
+ except Exception as e:
233
+ logger.warning("Failed to parse chapter page 2: %s", e)
234
+ return []
235
+
236
+ def _parse_chapter_page3plus(self, html_str: str) -> list[str]:
237
+ """
238
+ Parse pages 3 and beyond of the chapter: AES-encrypted text
239
+ replaced with images.
240
+
241
+ :param html_str: Raw HTML string of the chapter page.
242
+ :return: List of decrypted text lines in reading order.
243
+ """
244
+ try:
245
+ newcon = self._parse_newcon(html_str)
246
+ d_key = self._parse_d_key(html_str)
247
+ full_html = self._decrypt_d(newcon, d_key)
248
+ tree = html.fromstring(full_html)
249
+ paragraphs = self._rebuild_paragraphs(tree)
250
+ return paragraphs
251
+ except Exception as e:
252
+ logger.warning("Failed to parse chapter page 3+: %s", e)
253
+ return []
254
+
255
+ @classmethod
256
+ def _extract_chapter_title(cls, tree: html.HtmlElement) -> str:
257
+ """
258
+ Extract the chapter title from the HTML tree.
259
+
260
+ The title is expected to be located inside:
261
+ <h1 id="chapterTitle">...</h1>
262
+
263
+ :param tree: Parsed HTML element tree of the chapter page.
264
+ :return: Chapter title as a string, or an empty string if not found.
265
+ """
266
+ return cls._first_str(tree.xpath('//h1[@id="chapterTitle"]/text()'))
267
+
268
+ def _char_from_img(self, url: str) -> str:
269
+ """
270
+ Given an <img> src URL, return the mapped character if this image
271
+ represents a single glyph.
272
+ """
273
+ fname = url.split("/")[-1].split("?", 1)[0]
274
+ char = self._FONT_MAP.get(fname)
275
+ if char:
276
+ return char
277
+ if url in self._GLYPH_CACHE:
278
+ return self._GLYPH_CACHE[url]
279
+ if self._decode_font:
280
+ char = self._recognize_glyph_from_url(url)
281
+ if char:
282
+ self._GLYPH_CACHE[url] = char
283
+ return char
284
+ return f'<img src="{url}" />'
285
+
286
+ @classmethod
287
+ def _recognize_glyph_from_url(cls, url: str) -> str | None:
288
+ """
289
+ Download the glyph image at `url` and run the font OCR on it.
290
+
291
+ :param url: Fully-qualified <img src="..."> URL to a single-glyph image.
292
+ :return: The recognized character (top-1) if OCR succeeds, otherwise None.
293
+ """
294
+ try:
295
+ import io
296
+
297
+ import numpy as np
298
+ from PIL import Image
299
+
300
+ from novel_downloader.utils.fontocr import get_font_ocr
301
+
302
+ resp = requests.get(url, headers=DEFAULT_USER_HEADERS, timeout=15)
303
+ resp.raise_for_status()
304
+
305
+ im = Image.open(io.BytesIO(resp.content)).convert("RGB")
306
+ img_np = np.asarray(im)
307
+
308
+ ocr = get_font_ocr(batch_size=1)
309
+ char, score = ocr.predict([img_np], top_k=1)[0][0]
310
+
311
+ return char if score >= cls._CONF_THRESHOLD else None
312
+
313
+ except ImportError:
314
+ logger.warning("[Parser] FontOCR not available, font decoding will skip")
315
+ except Exception as e:
316
+ logger.warning("[Parser] Failed to ocr glyph image %s: %s", url, e)
317
+ return None
318
+
319
+ @classmethod
320
+ def _parse_codeurl(cls, text: str) -> int:
321
+ """
322
+ Extract the integer from `var codeurl="7";`.
323
+
324
+ Raises ValueError if not found.
325
+ """
326
+ m = cls._CODEURL_PATTERN.search(text)
327
+ if not m:
328
+ raise ValueError("codeurl not found")
329
+ return int(m.group(1))
330
+
331
+ @classmethod
332
+ def _parse_nrid(cls, text: str) -> str:
333
+ """
334
+ Extract the string from `var nrid="FGQSWYBCK";`.
335
+
336
+ Raises ValueError if not found.
337
+ """
338
+ m = cls._NRID_PATTERN.search(text)
339
+ if not m:
340
+ raise ValueError("nrid not found")
341
+ return m.group(1)
342
+
343
+ @classmethod
344
+ def _parse_newcon(cls, text: str) -> str:
345
+ """
346
+ Extract and decode the percent-encoded argument of
347
+ `let newcon=decodeURIComponent("...");`.
348
+
349
+ Raises ValueError if not found.
350
+ """
351
+ m = cls._NEWCON_PATTERN.search(text)
352
+ if not m:
353
+ raise ValueError("newcon not found")
354
+ return urllib.parse.unquote(m.group(1))
355
+
356
+ @classmethod
357
+ def _parse_d_key(cls, text: str) -> str:
358
+ """
359
+ Extract the second argument (the hex key) from `d(newcon, "...");`.
360
+
361
+ Raises ValueError if not found.
362
+ """
363
+ m = cls._D_CALL_PATTERN.search(text)
364
+ if not m:
365
+ raise ValueError("d() call with key not found")
366
+ return m.group(1)
367
+
368
+ @classmethod
369
+ def _parse_client_meta(cls, tree: html.HtmlElement) -> str:
370
+ """
371
+ Given an lxml.html tree, return the `content` of
372
+ <meta name="client" content="..."/> in <head>.
373
+
374
+ Raises ValueError if missing.
375
+ """
376
+ vals = tree.xpath("//head/meta[@name='client']/@content")
377
+ if not vals:
378
+ raise ValueError("client meta not found")
379
+ return str(vals[0])
380
+
381
+ @staticmethod
382
+ def _restore_order(raw_b64: str, code: int) -> list[int]:
383
+ decoded = base64.b64decode(raw_b64).decode("utf-8")
384
+ fragments = re.split(r"[A-Z]+%", decoded)
385
+
386
+ order = [0] * len(fragments)
387
+ for i, m in enumerate(fragments):
388
+ # UpWz logic: k = ceil(parseInt(m) - ceil((i+1) % codeurl))
389
+ k = int(m) - ((i + 1) % code)
390
+ order[k] = i
391
+ return order
392
+
393
+ @staticmethod
394
+ def _decrypt_d(a: str, b: str) -> str:
395
+ digest = hashlib.md5(b.encode("utf-8")).hexdigest() # 32 hex chars
396
+
397
+ iv = digest[:16].encode("utf-8")
398
+ key = digest[16:].encode("utf-8")
399
+
400
+ ct = base64.b64decode(a)
401
+ plaintext = aes_cbc_decrypt(key, iv, ct, block_size=32)
402
+
403
+ return plaintext.decode("utf-8")
404
+
405
+ def _rebuild_paragraphs(self, content_div: html.HtmlElement) -> list[str]:
406
+ """
407
+ Given a content container element, reconstruct each paragraph by
408
+ interleaving normal text nodes and <img>-based glyphs.
409
+
410
+ Uses `_char_from_img` to map image glyphs to characters.
411
+
412
+ :param content_div: The HTML element containing <p> paragraphs.
413
+ :return: List of reconstructed paragraph strings.
414
+ """
415
+ paragraphs: list[str] = []
416
+ for p in content_div.xpath(".//p"):
417
+ parts: list[str] = []
418
+
419
+ # Leading text before any children
420
+ if p.text and p.text.strip():
421
+ parts.append(p.text.strip())
422
+
423
+ for child in p:
424
+ tag = child.tag.lower()
425
+ if tag == "img":
426
+ src = (child.get("src") or "").strip()
427
+ full = src if src.startswith("http") else self.BASE_URL + src
428
+ parts.append(self._char_from_img(full))
429
+ # Append any tail text after this child
430
+ if child.tail and child.tail.strip():
431
+ parts.append(child.tail.strip())
432
+
433
+ paragraph = "".join(parts).strip()
434
+ paragraphs.append(paragraph)
435
+ return paragraphs
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.xs63b
4
+ -----------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
+ @register_parser(
24
+ site_keys=["xs63b"],
25
+ )
26
+ class Xs63bParser(BaseParser):
27
+ """
28
+ Parser for 小说路上 book pages.
29
+ """
30
+
31
+ TITLE_SELECTOR = "//div[@class='block_txt2']//h2/text()"
32
+ AUTHOR_SELECTOR = "//p[contains(., '作者')]/a/text()"
33
+ TYPE_SELECTOR = "//p[contains(., '分类')]/a/text()"
34
+ STATUS_SELECTOR = "//p[contains(., '状态')]/text()"
35
+ UPDATE_SELECTOR = "//p[contains(., '更新')]/text()"
36
+ COVER_SELECTOR = "//div[@class='block_img2']//img/@src"
37
+ SUMMARY_SELECTOR = (
38
+ "//div[@class='intro' and contains(., '小说简介')]"
39
+ "/following-sibling::div[@class='intro_info'][1]"
40
+ )
41
+ CATALOG_ANCHORS = (
42
+ "//h2[contains(., '正文')]/following-sibling::div[@class='book_list'][1]//a"
43
+ )
44
+
45
+ CHAPTER_TITLE_SELECTOR = "//h1[@id='_52mb_h1']/text()"
46
+ CHAPTER_PARAGRAPHS = "//div[@id='nr1']//p"
47
+
48
+ _RE_STRIP_DIV = re.compile(r"^<div[^>]*>|</div>$", re.I)
49
+ _RE_STRIP_JIANJIE = re.compile(r"^\s*简介\s*[::]\s*", re.I)
50
+ _RE_SPACES = re.compile(r"[ \t]+")
51
+
52
+ ADS = {"如章节缺失", "本章未完", "下一页继续阅读", "xs63b.com"}
53
+
54
+ def parse_book_info(
55
+ self,
56
+ html_list: list[str],
57
+ **kwargs: Any,
58
+ ) -> BookInfoDict | None:
59
+ if len(html_list) < 2:
60
+ return None
61
+
62
+ info_tree = html.fromstring(html_list[0])
63
+ catalog_tree = html.fromstring(html_list[1])
64
+
65
+ book_name = self._first_str(info_tree.xpath(self.TITLE_SELECTOR))
66
+ author = self._first_str(info_tree.xpath(self.AUTHOR_SELECTOR))
67
+ book_type = self._first_str(info_tree.xpath(self.TYPE_SELECTOR))
68
+
69
+ serial_status = self._first_str(
70
+ info_tree.xpath(self.STATUS_SELECTOR),
71
+ replaces=[("状态:", "")],
72
+ )
73
+ serial_status = self._norm_space(serial_status)
74
+
75
+ update_time = self._first_str(
76
+ info_tree.xpath(self.UPDATE_SELECTOR),
77
+ replaces=[("更新:", "")],
78
+ )
79
+ cover_url = self._first_str(info_tree.xpath(self.COVER_SELECTOR))
80
+
81
+ # Summary: keep first <br> segment, then cut at "{author}的作品集"
82
+ summary = ""
83
+ nodes = info_tree.xpath(self.SUMMARY_SELECTOR)
84
+ if nodes:
85
+ node_html = html.tostring(nodes[0], method="html", encoding="unicode")
86
+ node_html = self._RE_STRIP_DIV.sub("", node_html).strip()
87
+ first_seg = node_html.split("<br", 1)[0]
88
+ text = html.fromstring(f"<div>{first_seg}</div>").text_content()
89
+ text = self._RE_STRIP_JIANJIE.sub("", text).strip()
90
+ if author:
91
+ text = text.split(f"{author}的作品集")[0].strip()
92
+ summary = text
93
+
94
+ tags = [book_type] if book_type else []
95
+
96
+ chapters: list[ChapterInfoDict] = []
97
+ for a in catalog_tree.xpath(self.CATALOG_ANCHORS):
98
+ href = a.get("href") or ""
99
+ title = (a.text_content() or "").strip()
100
+ if not href or not title:
101
+ continue
102
+ # 'https://www.xs63b.com/xuanhuan/wanyuzhiwang/29546477.html' -> '29546477'
103
+ chap_id = href.rsplit("/", 1)[-1].split(".")[0]
104
+ chapters.append({"title": title, "url": href, "chapterId": chap_id})
105
+
106
+ volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
107
+
108
+ return {
109
+ "book_name": book_name,
110
+ "author": author,
111
+ "cover_url": cover_url,
112
+ "update_time": update_time,
113
+ "serial_status": serial_status,
114
+ "summary": summary,
115
+ "tags": tags,
116
+ "volumes": volumes,
117
+ "extra": {},
118
+ }
119
+
120
+ def parse_chapter(
121
+ self,
122
+ html_list: list[str],
123
+ chapter_id: str,
124
+ **kwargs: Any,
125
+ ) -> ChapterDict | None:
126
+ if not html_list:
127
+ return None
128
+
129
+ title = ""
130
+ paragraphs: list[str] = []
131
+
132
+ for html_str in html_list:
133
+ tree = html.fromstring(html_str)
134
+
135
+ if not title:
136
+ h1 = self._first_str(tree.xpath(self.CHAPTER_TITLE_SELECTOR))
137
+ title = h1.rsplit(" ", 1)[0].strip() if (" " in h1) else h1
138
+
139
+ for p in tree.xpath(self.CHAPTER_PARAGRAPHS):
140
+ cls = p.get("class") or ""
141
+ pid = p.get("id") or ""
142
+ if "hid-pages" in cls or "pages" in cls or "contentTip" in pid:
143
+ continue
144
+
145
+ txt = (p.text_content() or "").replace("\xa0", " ")
146
+ txt = self._RE_SPACES.sub(" ", txt).strip()
147
+ if not txt or self._is_ad_line(txt):
148
+ continue
149
+
150
+ paragraphs.append(txt)
151
+
152
+ content = "\n".join(paragraphs).strip()
153
+ if not content:
154
+ return None
155
+
156
+ return {
157
+ "id": chapter_id,
158
+ "title": title,
159
+ "content": content,
160
+ "extra": {"site": "xs63b"},
161
+ }