novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +79 -66
  6. novel_downloader/cli/export.py +17 -21
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +206 -209
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +5 -5
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +17 -12
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +20 -14
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +6 -19
  79. novel_downloader/core/interfaces/parser.py +7 -8
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +64 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +64 -69
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/main_parser.py +756 -48
  100. novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
  101. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  102. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  103. novel_downloader/core/parsers/quanben5.py +103 -0
  104. novel_downloader/core/parsers/registry.py +5 -16
  105. novel_downloader/core/parsers/sfacg.py +38 -45
  106. novel_downloader/core/parsers/shencou.py +215 -0
  107. novel_downloader/core/parsers/shuhaige.py +111 -0
  108. novel_downloader/core/parsers/tongrenquan.py +116 -0
  109. novel_downloader/core/parsers/ttkan.py +132 -0
  110. novel_downloader/core/parsers/wanbengo.py +191 -0
  111. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  112. novel_downloader/core/parsers/xiguashuwu.py +429 -0
  113. novel_downloader/core/parsers/xs63b.py +161 -0
  114. novel_downloader/core/parsers/xshbook.py +134 -0
  115. novel_downloader/core/parsers/yamibo.py +87 -131
  116. novel_downloader/core/parsers/yibige.py +166 -0
  117. novel_downloader/core/searchers/__init__.py +34 -3
  118. novel_downloader/core/searchers/aaatxt.py +107 -0
  119. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  120. novel_downloader/core/searchers/base.py +112 -36
  121. novel_downloader/core/searchers/dxmwx.py +105 -0
  122. novel_downloader/core/searchers/eightnovel.py +84 -0
  123. novel_downloader/core/searchers/esjzone.py +43 -25
  124. novel_downloader/core/searchers/hetushu.py +92 -0
  125. novel_downloader/core/searchers/i25zw.py +93 -0
  126. novel_downloader/core/searchers/ixdzs8.py +107 -0
  127. novel_downloader/core/searchers/jpxs123.py +107 -0
  128. novel_downloader/core/searchers/piaotia.py +100 -0
  129. novel_downloader/core/searchers/qbtr.py +106 -0
  130. novel_downloader/core/searchers/qianbi.py +74 -40
  131. novel_downloader/core/searchers/quanben5.py +144 -0
  132. novel_downloader/core/searchers/registry.py +24 -8
  133. novel_downloader/core/searchers/shuhaige.py +124 -0
  134. novel_downloader/core/searchers/tongrenquan.py +110 -0
  135. novel_downloader/core/searchers/ttkan.py +92 -0
  136. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  137. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  138. novel_downloader/core/searchers/xs63b.py +104 -0
  139. novel_downloader/locales/en.json +34 -85
  140. novel_downloader/locales/zh.json +35 -86
  141. novel_downloader/models/__init__.py +21 -22
  142. novel_downloader/models/book.py +44 -0
  143. novel_downloader/models/config.py +4 -37
  144. novel_downloader/models/login.py +1 -1
  145. novel_downloader/models/search.py +5 -0
  146. novel_downloader/resources/config/settings.toml +8 -70
  147. novel_downloader/resources/json/xiguashuwu.json +718 -0
  148. novel_downloader/utils/__init__.py +13 -24
  149. novel_downloader/utils/chapter_storage.py +5 -5
  150. novel_downloader/utils/constants.py +4 -31
  151. novel_downloader/utils/cookies.py +38 -35
  152. novel_downloader/utils/crypto_utils/__init__.py +7 -0
  153. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  154. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  155. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  156. novel_downloader/utils/crypto_utils/rc4.py +54 -0
  157. novel_downloader/utils/epub/__init__.py +3 -4
  158. novel_downloader/utils/epub/builder.py +6 -6
  159. novel_downloader/utils/epub/constants.py +62 -21
  160. novel_downloader/utils/epub/documents.py +95 -201
  161. novel_downloader/utils/epub/models.py +8 -22
  162. novel_downloader/utils/epub/utils.py +73 -106
  163. novel_downloader/utils/file_utils/__init__.py +2 -23
  164. novel_downloader/utils/file_utils/io.py +53 -188
  165. novel_downloader/utils/file_utils/normalize.py +1 -7
  166. novel_downloader/utils/file_utils/sanitize.py +4 -15
  167. novel_downloader/utils/fontocr/__init__.py +5 -14
  168. novel_downloader/utils/fontocr/core.py +216 -0
  169. novel_downloader/utils/fontocr/loader.py +50 -0
  170. novel_downloader/utils/logger.py +81 -65
  171. novel_downloader/utils/network.py +17 -41
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  176. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  177. novel_downloader/utils/time_utils/__init__.py +5 -11
  178. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  179. novel_downloader/utils/time_utils/sleep_utils.py +55 -49
  180. novel_downloader/web/__init__.py +13 -0
  181. novel_downloader/web/components/__init__.py +11 -0
  182. novel_downloader/web/components/navigation.py +35 -0
  183. novel_downloader/web/main.py +66 -0
  184. novel_downloader/web/pages/__init__.py +17 -0
  185. novel_downloader/web/pages/download.py +78 -0
  186. novel_downloader/web/pages/progress.py +147 -0
  187. novel_downloader/web/pages/search.py +329 -0
  188. novel_downloader/web/services/__init__.py +17 -0
  189. novel_downloader/web/services/client_dialog.py +164 -0
  190. novel_downloader/web/services/cred_broker.py +113 -0
  191. novel_downloader/web/services/cred_models.py +35 -0
  192. novel_downloader/web/services/task_manager.py +264 -0
  193. novel_downloader-2.0.1.dist-info/METADATA +172 -0
  194. novel_downloader-2.0.1.dist-info/RECORD +206 -0
  195. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
  196. novel_downloader/core/downloaders/biquge.py +0 -29
  197. novel_downloader/core/downloaders/esjzone.py +0 -29
  198. novel_downloader/core/downloaders/linovelib.py +0 -29
  199. novel_downloader/core/downloaders/sfacg.py +0 -29
  200. novel_downloader/core/downloaders/yamibo.py +0 -29
  201. novel_downloader/core/exporters/biquge.py +0 -22
  202. novel_downloader/core/exporters/esjzone.py +0 -22
  203. novel_downloader/core/exporters/qianbi.py +0 -22
  204. novel_downloader/core/exporters/sfacg.py +0 -22
  205. novel_downloader/core/exporters/yamibo.py +0 -22
  206. novel_downloader/core/fetchers/base/__init__.py +0 -14
  207. novel_downloader/core/fetchers/base/browser.py +0 -422
  208. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  209. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  210. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  211. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  212. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  213. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  214. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  215. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  216. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  217. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  218. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  219. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  220. novel_downloader/core/parsers/biquge.py +0 -139
  221. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
  222. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
  223. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
  224. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  225. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
  226. novel_downloader/models/chapter.py +0 -25
  227. novel_downloader/models/types.py +0 -13
  228. novel_downloader/tui/__init__.py +0 -7
  229. novel_downloader/tui/app.py +0 -32
  230. novel_downloader/tui/main.py +0 -17
  231. novel_downloader/tui/screens/__init__.py +0 -14
  232. novel_downloader/tui/screens/home.py +0 -198
  233. novel_downloader/tui/screens/login.py +0 -74
  234. novel_downloader/tui/styles/home_layout.tcss +0 -79
  235. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  236. novel_downloader/utils/cache.py +0 -24
  237. novel_downloader/utils/crypto_utils.py +0 -71
  238. novel_downloader/utils/fontocr/hash_store.py +0 -280
  239. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  240. novel_downloader/utils/fontocr/model_loader.py +0 -69
  241. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  242. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  243. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  244. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  245. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  246. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  247. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  248. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,429 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.xiguashuwu
4
+ ----------------------------------------
5
+
6
+ """
7
+
8
+ import base64
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import re
13
+ import urllib.parse
14
+ from typing import Any
15
+
16
+ import requests
17
+ from lxml import html
18
+
19
+ from novel_downloader.core.parsers.base import BaseParser
20
+ from novel_downloader.core.parsers.registry import register_parser
21
+ from novel_downloader.models import (
22
+ BookInfoDict,
23
+ ChapterDict,
24
+ ChapterInfoDict,
25
+ VolumeInfoDict,
26
+ )
27
+ from novel_downloader.utils.constants import (
28
+ DEFAULT_USER_HEADERS,
29
+ XIGUASHUWU_FONT_MAP_PATH,
30
+ )
31
+ from novel_downloader.utils.crypto_utils.aes_util import aes_cbc_decrypt
32
+ from novel_downloader.utils.fontocr import get_font_ocr
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
@register_parser(
    site_keys=["xiguashuwu"],
)
class XiguashuwuParser(BaseParser):
    """
    Parser for 西瓜书屋 book pages.

    Chapter pages use a different protection scheme per page index:

    * page 1 -- plain HTML text,
    * page 2 -- paragraph order shuffled client-side (order metadata in a
      ``<meta name="client">`` tag), some glyphs replaced with images,
    * page 3+ -- AES-CBC encrypted HTML embedded in inline JavaScript.
    """

    BASE_URL = "https://www.xiguashuwu.com"
    # Minimum OCR confidence required to accept a recognized glyph.
    _CONF_THRESHOLD = 0.60
    # Static filename -> character table for glyph images, shipped as JSON.
    _FONT_MAP: dict[str, str] = json.loads(
        XIGUASHUWU_FONT_MAP_PATH.read_text(encoding="utf-8")
    )
    # Class-level OCR result cache keyed by full image URL (shared by design).
    _GLYPH_CACHE: dict[str, str] = {}

    # Matches `var codeurl = "7";`
    _CODEURL_PATTERN = re.compile(
        r"""var\s+codeurl\s*=\s*['"]?(\d+)['"]?;?""", re.IGNORECASE
    )

    # Matches `var nrid = "FGQSWYBCK";`
    _NRID_PATTERN = re.compile(
        r"""var\s+nrid\s*=\s*['"]?([A-Za-z0-9]+)['"]?;?""", re.IGNORECASE
    )

    # Matches `let newcon = decodeURIComponent("...");`
    _NEWCON_PATTERN = re.compile(
        r"""let\s+newcon\s*=\s*decodeURIComponent\(\s*['"](.+?)['"]\s*\);?""",
        re.IGNORECASE,
    )

    # Matches `d(newcon, "<32 hex chars>");`
    _D_CALL_PATTERN = re.compile(
        r"""d\(\s*[^,]+,\s*['"]([0-9A-Fa-f]{32})['"]\s*\);?""", re.IGNORECASE
    )

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html_list: Raw HTML of the book info page, followed by any
            number of catalog pages.
        :return: Parsed metadata and chapter structure, or None if empty.
        """
        if not html_list:
            return None
        info_tree = html.fromstring(html_list[0])

        book_name = self._first_str(info_tree.xpath('//p[@class="title"]/text()'))

        author = self._first_str(info_tree.xpath('//p[@class="author"]//a/text()'))

        # Lazy-loaded covers keep the real URL in `_src`; fall back to `src`.
        cover_rel = info_tree.xpath(
            '//div[@class="BGsectionOne-top-left"]//img/@_src'
        ) or info_tree.xpath('//div[@class="BGsectionOne-top-left"]//img/@src')
        cover_url = self.BASE_URL + self._first_str(cover_rel)

        tags = [
            self._first_str(info_tree.xpath('//p[@class="category"]/span[1]/a/text()'))
        ]

        update_time = self._first_str(info_tree.xpath('//p[@class="time"]/span/text()'))

        paras = info_tree.xpath('//section[@id="intro"]//p')
        summary = "\n".join(p.xpath("string()").strip() for p in paras).strip()

        chapters: list[ChapterInfoDict] = []
        for catalog_html in html_list[1:]:
            cat_tree = html.fromstring(catalog_html)
            links = cat_tree.xpath(
                '//section[contains(@class,"BCsectionTwo")]'
                '[.//h3[text()="正文"]]//ol//li/a'
            )
            for a in links:
                title = a.xpath("string()").strip()
                href = a.get("href", "").strip()
                # chapterId is filename sans extension
                chapter_id = href.rsplit("/", 1)[-1].split(".", 1)[0]
                chapters.append(
                    ChapterInfoDict(
                        title=title,
                        url=self.BASE_URL + href,
                        chapterId=chapter_id,
                    )
                )

        volumes: list[VolumeInfoDict] = [
            VolumeInfoDict(volume_name="正文", chapters=chapters)
        ]

        return BookInfoDict(
            book_name=book_name,
            author=author,
            cover_url=cover_url,
            update_time=update_time,
            tags=tags,
            summary=summary,
            volumes=volumes,
            extra={},
        )

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse chapter pages and extract clean text.

        Dispatches each page to the page-index-specific parser (see class
        docstring for the three protection schemes).

        :param html_list: Raw HTML of the chapter pages, in order.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Cleaned chapter content, or None when nothing was extracted.
        """
        if not html_list:
            return None

        title_text = ""
        paragraphs: list[str] = []

        for page_idx, html_str in enumerate(html_list, start=1):
            if page_idx == 1:
                tree = html.fromstring(html_str)
                title_text = self._extract_chapter_title(tree)
                paragraphs.extend(self._parse_chapter_page1(tree))
            elif page_idx == 2:
                paragraphs.extend(self._parse_chapter_page2(html_str))
            else:
                paragraphs.extend(self._parse_chapter_page3plus(html_str))

        content = "\n".join(paragraphs).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title_text,
            "content": content,
            "extra": {"site": "xiguashuwu"},
        }

    @classmethod
    def _parse_chapter_page1(cls, tree: html.HtmlElement) -> list[str]:
        """
        Parse page 1 of the chapter: plain text, no encryption or obfuscation.

        This method extracts all visible text from the element with
        id="C0NTENT" and removes known ad sections.

        :param tree: Parsed HTML element tree of the chapter page.
        :return: List of text lines in reading order.
        """
        try:
            # note: 'C0NTENT' contains a zero, not the letter 'O'
            content_div = tree.xpath('//*[@id="C0NTENT"]')
            if not content_div:
                return []
            content_div = content_div[0]

            # Remove advertisement or irrelevant sections
            for ad in content_div.xpath('.//div[@class="s_m"]'):
                ad.getparent().remove(ad)

            lines = content_div.xpath(".//text()")
            return [line.strip() for line in lines if line.strip()]
        except Exception as e:
            logger.warning("Failed to parse chapter page 1: %s", e)
            return []

    def _parse_chapter_page2(self, html_str: str) -> list[str]:
        """
        Parse page 2 of the chapter: content order shuffled by JavaScript,
        and text replaced with images.

        :param html_str: Raw HTML string of the chapter page.
        :return: List of text lines extracted in correct reading order.
        """
        try:
            tree = html.fromstring(html_str)
            # Extract ordering metadata
            order_raw = self._parse_client_meta(tree)
            codeurl = self._parse_codeurl(html_str)
            nrid = self._parse_nrid(html_str)
            order_list = self._restore_order(order_raw, codeurl)

            # Extract paragraphs in raw (shuffled) order
            content_divs = tree.xpath(f'//*[@id="{nrid}"]')
            if not content_divs:
                return []
            paragraphs = self._rebuild_paragraphs(content_divs[0])

            # Reorder paragraphs; ignore indices the page didn't provide
            reordered: list[str] = []
            for idx in order_list:
                if 0 <= idx < len(paragraphs):
                    reordered.append(paragraphs[idx])
            return reordered
        except Exception as e:
            logger.warning("Failed to parse chapter page 2: %s", e)
            return []

    def _parse_chapter_page3plus(self, html_str: str) -> list[str]:
        """
        Parse pages 3 and beyond of the chapter: AES-encrypted text
        replaced with images.

        :param html_str: Raw HTML string of the chapter page.
        :return: List of decrypted text lines in reading order.
        """
        try:
            newcon = self._parse_newcon(html_str)
            d_key = self._parse_d_key(html_str)
            full_html = self._decrypt_d(newcon, d_key)
            tree = html.fromstring(full_html)
            paragraphs = self._rebuild_paragraphs(tree)
            return paragraphs
        except Exception as e:
            logger.warning("Failed to parse chapter page 3+: %s", e)
            return []

    @classmethod
    def _extract_chapter_title(cls, tree: html.HtmlElement) -> str:
        """
        Extract the chapter title from the HTML tree.

        The title is expected to be located inside:
            <h1 id="chapterTitle">...</h1>

        :param tree: Parsed HTML element tree of the chapter page.
        :return: Chapter title as a string, or an empty string if not found.
        """
        return cls._first_str(tree.xpath('//h1[@id="chapterTitle"]/text()'))

    def _char_from_img(self, url: str) -> str:
        """
        Given an <img> src URL, return the mapped character if this image
        represents a single glyph.

        Lookup order: static font map -> OCR cache -> live OCR (only when
        font decoding is enabled). Falls back to an inline <img> tag so the
        glyph is not silently lost.
        """
        fname = url.split("/")[-1].split("?", 1)[0]
        char = self._FONT_MAP.get(fname)
        if char:
            return char
        if url in self._GLYPH_CACHE:
            return self._GLYPH_CACHE[url]
        if self._decode_font:
            char = self._recognize_glyph_from_url(url)
            if char:
                self._GLYPH_CACHE[url] = char
                return char
        return f'<img src="{url}" />'

    @classmethod
    def _recognize_glyph_from_url(cls, url: str) -> str | None:
        """
        Download the glyph image at `url` and run the font OCR on it.

        :param url: Fully-qualified <img src="..."> URL to a single-glyph image.
        :return: The recognized character (top-1) if OCR succeeds, otherwise None.
        """
        try:
            ocr = get_font_ocr()
            if not ocr:
                return None

            resp = requests.get(url, headers=DEFAULT_USER_HEADERS, timeout=15)
            resp.raise_for_status()

            img_np = ocr.load_image_array_from_bytes(resp.content)

            char, score = ocr.predict([img_np])[0]

            # Reject low-confidence predictions rather than cache bad glyphs.
            return char if score >= cls._CONF_THRESHOLD else None

        except Exception as e:
            logger.warning("[Parser] Failed to ocr glyph image %s: %s", url, e)
            return None

    @classmethod
    def _parse_codeurl(cls, text: str) -> int:
        """
        Extract the integer from `var codeurl="7";`.

        Raises ValueError if not found.
        """
        m = cls._CODEURL_PATTERN.search(text)
        if not m:
            raise ValueError("codeurl not found")
        return int(m.group(1))

    @classmethod
    def _parse_nrid(cls, text: str) -> str:
        """
        Extract the string from `var nrid="FGQSWYBCK";`.

        Raises ValueError if not found.
        """
        m = cls._NRID_PATTERN.search(text)
        if not m:
            raise ValueError("nrid not found")
        return m.group(1)

    @classmethod
    def _parse_newcon(cls, text: str) -> str:
        """
        Extract and decode the percent-encoded argument of
        `let newcon=decodeURIComponent("...");`.

        Raises ValueError if not found.
        """
        m = cls._NEWCON_PATTERN.search(text)
        if not m:
            raise ValueError("newcon not found")
        return urllib.parse.unquote(m.group(1))

    @classmethod
    def _parse_d_key(cls, text: str) -> str:
        """
        Extract the second argument (the hex key) from `d(newcon, "...");`.

        Raises ValueError if not found.
        """
        m = cls._D_CALL_PATTERN.search(text)
        if not m:
            raise ValueError("d() call with key not found")
        return m.group(1)

    @classmethod
    def _parse_client_meta(cls, tree: html.HtmlElement) -> str:
        """
        Given an lxml.html tree, return the `content` of
        <meta name="client" content="..."/> in <head>.

        Raises ValueError if missing.
        """
        vals = tree.xpath("//head/meta[@name='client']/@content")
        if not vals:
            raise ValueError("client meta not found")
        return str(vals[0])

    @staticmethod
    def _restore_order(raw_b64: str, code: int) -> list[int]:
        """
        Recover the paragraph reading order from the base64 order string.

        :param raw_b64: Base64 payload from the `client` meta tag.
        :param code: The `codeurl` modulus extracted from the page script.
        :return: order[k] = i means shuffled paragraph i belongs at slot k.
        """
        decoded = base64.b64decode(raw_b64).decode("utf-8")
        fragments = re.split(r"[A-Z]+%", decoded)
        # A trailing separator makes re.split emit empty fragments at the
        # end, which would crash int(""); drop them without disturbing the
        # enumerate() indices of the remaining fragments.
        while fragments and not fragments[-1]:
            fragments.pop()

        order = [0] * len(fragments)
        for i, m in enumerate(fragments):
            # UpWz logic: k = ceil(parseInt(m) - ceil((i+1) % codeurl))
            k = int(m) - ((i + 1) % code)
            order[k] = i
        return order

    @staticmethod
    def _decrypt_d(a: str, b: str) -> str:
        """
        Decrypt the page 3+ payload: AES-CBC with IV/key derived from the
        MD5 hex digest of `b` (first 16 chars -> IV, last 16 -> key).
        """
        digest = hashlib.md5(b.encode("utf-8")).hexdigest()  # 32 hex chars

        iv = digest[:16].encode("utf-8")
        key = digest[16:].encode("utf-8")

        ct = base64.b64decode(a)
        plaintext = aes_cbc_decrypt(key, iv, ct, block_size=32)

        return plaintext.decode("utf-8")

    def _rebuild_paragraphs(self, content_div: html.HtmlElement) -> list[str]:
        """
        Given a content container element, reconstruct each paragraph by
        interleaving normal text nodes and <img>-based glyphs.

        Uses `_char_from_img` to map image glyphs to characters.

        Empty paragraphs are kept on purpose: page 2 reorders paragraphs by
        index, so dropping one would shift every later index.

        :param content_div: The HTML element containing <p> paragraphs.
        :return: List of reconstructed paragraph strings.
        """
        paragraphs: list[str] = []
        for p in content_div.xpath(".//p"):
            parts: list[str] = []

            # Leading text before any children
            if p.text and p.text.strip():
                parts.append(p.text.strip())

            for child in p:
                # Comments / processing instructions have a non-string .tag
                # (a callable in lxml); calling .lower() on it would raise.
                # Skip such nodes but still keep their tail text.
                if isinstance(child.tag, str) and child.tag.lower() == "img":
                    src = (child.get("src") or "").strip()
                    full = src if src.startswith("http") else self.BASE_URL + src
                    parts.append(self._char_from_img(full))
                # Append any tail text after this child
                if child.tail and child.tail.strip():
                    parts.append(child.tail.strip())

            paragraph = "".join(parts).strip()
            paragraphs.append(paragraph)
        return paragraphs
@@ -0,0 +1,161 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.xs63b
4
+ -----------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
@register_parser(
    site_keys=["xs63b"],
)
class Xs63bParser(BaseParser):
    """
    Parser for 小说路上 book pages.
    """

    TITLE_SELECTOR = "//div[@class='block_txt2']//h2/text()"
    AUTHOR_SELECTOR = "//p[contains(., '作者')]/a/text()"
    TYPE_SELECTOR = "//p[contains(., '分类')]/a/text()"
    STATUS_SELECTOR = "//p[contains(., '状态')]/text()"
    UPDATE_SELECTOR = "//p[contains(., '更新')]/text()"
    COVER_SELECTOR = "//div[@class='block_img2']//img/@src"
    SUMMARY_SELECTOR = (
        "//div[@class='intro' and contains(., '小说简介')]"
        "/following-sibling::div[@class='intro_info'][1]"
    )
    CATALOG_ANCHORS = (
        "//h2[contains(., '正文')]/following-sibling::div[@class='book_list'][1]//a"
    )

    CHAPTER_TITLE_SELECTOR = "//h1[@id='_52mb_h1']/text()"
    CHAPTER_PARAGRAPHS = "//div[@id='nr1']//p"

    _RE_STRIP_DIV = re.compile(r"^<div[^>]*>|</div>$", re.I)
    _RE_STRIP_JIANJIE = re.compile(r"^\s*简介\s*[::]\s*", re.I)
    _RE_SPACES = re.compile(r"[ \t]+")

    ADS = {"如章节缺失", "本章未完", "下一页继续阅读", "xs63b.com"}

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Build book metadata from the info page and the catalog page.

        :param html_list: [info page HTML, catalog page HTML].
        :return: Parsed book info, or None when either page is missing.
        """
        if len(html_list) < 2:
            return None

        info_doc = html.fromstring(html_list[0])
        catalog_doc = html.fromstring(html_list[1])

        name = self._first_str(info_doc.xpath(self.TITLE_SELECTOR))
        writer = self._first_str(info_doc.xpath(self.AUTHOR_SELECTOR))
        category = self._first_str(info_doc.xpath(self.TYPE_SELECTOR))

        status = self._norm_space(
            self._first_str(
                info_doc.xpath(self.STATUS_SELECTOR),
                replaces=[("状态:", "")],
            )
        )
        updated = self._first_str(
            info_doc.xpath(self.UPDATE_SELECTOR),
            replaces=[("更新:", "")],
        )
        cover = self._first_str(info_doc.xpath(self.COVER_SELECTOR))

        synopsis = self._extract_summary(info_doc, writer)

        chapter_items: list[ChapterInfoDict] = []
        for anchor in catalog_doc.xpath(self.CATALOG_ANCHORS):
            link = anchor.get("href") or ""
            label = (anchor.text_content() or "").strip()
            if not link or not label:
                continue
            # 'https://www.xs63b.com/xuanhuan/wanyuzhiwang/29546477.html' -> '29546477'
            cid = link.rsplit("/", 1)[-1].split(".")[0]
            chapter_items.append({"title": label, "url": link, "chapterId": cid})

        sections: list[VolumeInfoDict] = [
            {"volume_name": "正文", "chapters": chapter_items}
        ]

        return {
            "book_name": name,
            "author": writer,
            "cover_url": cover,
            "update_time": updated,
            "serial_status": status,
            "summary": synopsis,
            "tags": [category] if category else [],
            "volumes": sections,
            "extra": {},
        }

    def _extract_summary(self, info_doc: html.HtmlElement, author: str) -> str:
        """
        Extract the intro text: keep only the segment before the first <br>,
        strip the leading "简介:" label, then cut at "{author}的作品集".

        :param info_doc: Parsed info-page tree.
        :param author: Author name used as the cut marker (may be empty).
        :return: Cleaned summary, or "" when no intro block exists.
        """
        found = info_doc.xpath(self.SUMMARY_SELECTOR)
        if not found:
            return ""
        raw = html.tostring(found[0], method="html", encoding="unicode")
        raw = self._RE_STRIP_DIV.sub("", raw).strip()
        head, _, _ = raw.partition("<br")
        text = html.fromstring(f"<div>{head}</div>").text_content()
        text = self._RE_STRIP_JIANJIE.sub("", text).strip()
        if author:
            text = text.split(f"{author}的作品集")[0].strip()
        return text

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Merge all pages of a chapter into one cleaned text body.

        :param html_list: Raw HTML of each chapter page, in order.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict, or None when no usable content was found.
        """
        if not html_list:
            return None

        heading = ""
        lines: list[str] = []

        for page_html in html_list:
            page = html.fromstring(page_html)

            if not heading:
                raw_title = self._first_str(page.xpath(self.CHAPTER_TITLE_SELECTOR))
                if " " in raw_title:
                    # Keep everything before the last space-separated token.
                    heading = raw_title.rsplit(" ", 1)[0].strip()
                else:
                    heading = raw_title

            for para in page.xpath(self.CHAPTER_PARAGRAPHS):
                klass = para.get("class") or ""
                para_id = para.get("id") or ""
                # Skip pagination controls and the content-tip paragraph.
                if "hid-pages" in klass or "pages" in klass or "contentTip" in para_id:
                    continue

                text = (para.text_content() or "").replace("\xa0", " ")
                text = self._RE_SPACES.sub(" ", text).strip()
                if text and not self._is_ad_line(text):
                    lines.append(text)

        body = "\n".join(lines).strip()
        if not body:
            return None

        return {
            "id": chapter_id,
            "title": heading,
            "content": body,
            "extra": {"site": "xs63b"},
        }
+ }