novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -4
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +27 -104
  5. novel_downloader/cli/download.py +78 -66
  6. novel_downloader/cli/export.py +20 -21
  7. novel_downloader/cli/main.py +3 -1
  8. novel_downloader/cli/search.py +120 -0
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +10 -14
  11. novel_downloader/config/adapter.py +195 -99
  12. novel_downloader/config/{loader.py → file_io.py} +53 -27
  13. novel_downloader/core/__init__.py +14 -13
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/archived/qidian/searcher.py +79 -0
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +8 -30
  21. novel_downloader/core/downloaders/base.py +182 -30
  22. novel_downloader/core/downloaders/common.py +217 -384
  23. novel_downloader/core/downloaders/qianbi.py +332 -4
  24. novel_downloader/core/downloaders/qidian.py +250 -290
  25. novel_downloader/core/downloaders/registry.py +69 -0
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +8 -26
  28. novel_downloader/core/exporters/base.py +107 -31
  29. novel_downloader/core/exporters/common/__init__.py +3 -4
  30. novel_downloader/core/exporters/common/epub.py +92 -171
  31. novel_downloader/core/exporters/common/main_exporter.py +14 -67
  32. novel_downloader/core/exporters/common/txt.py +90 -86
  33. novel_downloader/core/exporters/epub_util.py +184 -1327
  34. novel_downloader/core/exporters/linovelib/__init__.py +3 -2
  35. novel_downloader/core/exporters/linovelib/epub.py +165 -222
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
  37. novel_downloader/core/exporters/linovelib/txt.py +76 -66
  38. novel_downloader/core/exporters/qidian.py +15 -11
  39. novel_downloader/core/exporters/registry.py +55 -0
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/fetchers/__init__.py +57 -56
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
  45. novel_downloader/core/fetchers/biquyuedu.py +83 -0
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +60 -0
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +8 -14
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +26 -0
  81. novel_downloader/core/parsers/__init__.py +58 -22
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
  99. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  100. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  101. novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
  102. novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
  103. novel_downloader/core/parsers/qidian/main_parser.py +19 -57
  104. novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
  105. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
  106. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  107. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  108. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  109. novel_downloader/core/parsers/quanben5.py +103 -0
  110. novel_downloader/core/parsers/registry.py +57 -0
  111. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
  112. novel_downloader/core/parsers/shencou.py +215 -0
  113. novel_downloader/core/parsers/shuhaige.py +111 -0
  114. novel_downloader/core/parsers/tongrenquan.py +116 -0
  115. novel_downloader/core/parsers/ttkan.py +132 -0
  116. novel_downloader/core/parsers/wanbengo.py +191 -0
  117. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  118. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  119. novel_downloader/core/parsers/xs63b.py +161 -0
  120. novel_downloader/core/parsers/xshbook.py +134 -0
  121. novel_downloader/core/parsers/yamibo.py +155 -0
  122. novel_downloader/core/parsers/yibige.py +166 -0
  123. novel_downloader/core/searchers/__init__.py +51 -0
  124. novel_downloader/core/searchers/aaatxt.py +107 -0
  125. novel_downloader/core/searchers/b520.py +84 -0
  126. novel_downloader/core/searchers/base.py +168 -0
  127. novel_downloader/core/searchers/dxmwx.py +105 -0
  128. novel_downloader/core/searchers/eightnovel.py +84 -0
  129. novel_downloader/core/searchers/esjzone.py +102 -0
  130. novel_downloader/core/searchers/hetushu.py +92 -0
  131. novel_downloader/core/searchers/i25zw.py +93 -0
  132. novel_downloader/core/searchers/ixdzs8.py +107 -0
  133. novel_downloader/core/searchers/jpxs123.py +107 -0
  134. novel_downloader/core/searchers/piaotia.py +100 -0
  135. novel_downloader/core/searchers/qbtr.py +106 -0
  136. novel_downloader/core/searchers/qianbi.py +165 -0
  137. novel_downloader/core/searchers/quanben5.py +144 -0
  138. novel_downloader/core/searchers/registry.py +79 -0
  139. novel_downloader/core/searchers/shuhaige.py +124 -0
  140. novel_downloader/core/searchers/tongrenquan.py +110 -0
  141. novel_downloader/core/searchers/ttkan.py +92 -0
  142. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  143. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  144. novel_downloader/core/searchers/xs63b.py +104 -0
  145. novel_downloader/locales/en.json +36 -79
  146. novel_downloader/locales/zh.json +37 -80
  147. novel_downloader/models/__init__.py +23 -50
  148. novel_downloader/models/book.py +44 -0
  149. novel_downloader/models/config.py +16 -43
  150. novel_downloader/models/login.py +1 -1
  151. novel_downloader/models/search.py +21 -0
  152. novel_downloader/resources/config/settings.toml +39 -74
  153. novel_downloader/resources/css_styles/intro.css +83 -0
  154. novel_downloader/resources/css_styles/main.css +30 -89
  155. novel_downloader/resources/json/xiguashuwu.json +718 -0
  156. novel_downloader/utils/__init__.py +43 -0
  157. novel_downloader/utils/chapter_storage.py +247 -226
  158. novel_downloader/utils/constants.py +5 -50
  159. novel_downloader/utils/cookies.py +6 -18
  160. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  161. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  162. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  163. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  164. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  165. novel_downloader/utils/epub/__init__.py +34 -0
  166. novel_downloader/utils/epub/builder.py +377 -0
  167. novel_downloader/utils/epub/constants.py +118 -0
  168. novel_downloader/utils/epub/documents.py +297 -0
  169. novel_downloader/utils/epub/models.py +120 -0
  170. novel_downloader/utils/epub/utils.py +179 -0
  171. novel_downloader/utils/file_utils/__init__.py +5 -30
  172. novel_downloader/utils/file_utils/io.py +9 -150
  173. novel_downloader/utils/file_utils/normalize.py +2 -2
  174. novel_downloader/utils/file_utils/sanitize.py +2 -7
  175. novel_downloader/utils/fontocr.py +207 -0
  176. novel_downloader/utils/i18n.py +2 -0
  177. novel_downloader/utils/logger.py +10 -16
  178. novel_downloader/utils/network.py +111 -252
  179. novel_downloader/utils/state.py +5 -90
  180. novel_downloader/utils/text_utils/__init__.py +16 -21
  181. novel_downloader/utils/text_utils/diff_display.py +6 -9
  182. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  183. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  184. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  185. novel_downloader/utils/time_utils/__init__.py +6 -12
  186. novel_downloader/utils/time_utils/datetime_utils.py +23 -33
  187. novel_downloader/utils/time_utils/sleep_utils.py +5 -10
  188. novel_downloader/web/__init__.py +13 -0
  189. novel_downloader/web/components/__init__.py +11 -0
  190. novel_downloader/web/components/navigation.py +35 -0
  191. novel_downloader/web/main.py +66 -0
  192. novel_downloader/web/pages/__init__.py +17 -0
  193. novel_downloader/web/pages/download.py +78 -0
  194. novel_downloader/web/pages/progress.py +147 -0
  195. novel_downloader/web/pages/search.py +329 -0
  196. novel_downloader/web/services/__init__.py +17 -0
  197. novel_downloader/web/services/client_dialog.py +164 -0
  198. novel_downloader/web/services/cred_broker.py +113 -0
  199. novel_downloader/web/services/cred_models.py +35 -0
  200. novel_downloader/web/services/task_manager.py +264 -0
  201. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  202. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  203. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  204. novel_downloader/config/site_rules.py +0 -94
  205. novel_downloader/core/downloaders/biquge.py +0 -25
  206. novel_downloader/core/downloaders/esjzone.py +0 -25
  207. novel_downloader/core/downloaders/linovelib.py +0 -25
  208. novel_downloader/core/downloaders/sfacg.py +0 -25
  209. novel_downloader/core/downloaders/yamibo.py +0 -25
  210. novel_downloader/core/exporters/biquge.py +0 -25
  211. novel_downloader/core/exporters/esjzone.py +0 -25
  212. novel_downloader/core/exporters/qianbi.py +0 -25
  213. novel_downloader/core/exporters/sfacg.py +0 -25
  214. novel_downloader/core/exporters/yamibo.py +0 -25
  215. novel_downloader/core/factory/__init__.py +0 -20
  216. novel_downloader/core/factory/downloader.py +0 -73
  217. novel_downloader/core/factory/exporter.py +0 -58
  218. novel_downloader/core/factory/fetcher.py +0 -96
  219. novel_downloader/core/factory/parser.py +0 -86
  220. novel_downloader/core/fetchers/base/__init__.py +0 -14
  221. novel_downloader/core/fetchers/base/browser.py +0 -403
  222. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  223. novel_downloader/core/fetchers/common/__init__.py +0 -14
  224. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  225. novel_downloader/core/fetchers/esjzone/browser.py +0 -204
  226. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  227. novel_downloader/core/fetchers/linovelib/browser.py +0 -193
  228. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  229. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  230. novel_downloader/core/fetchers/qidian/browser.py +0 -318
  231. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  232. novel_downloader/core/fetchers/sfacg/browser.py +0 -189
  233. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  234. novel_downloader/core/fetchers/yamibo/browser.py +0 -229
  235. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  236. novel_downloader/core/parsers/biquge/main_parser.py +0 -134
  237. novel_downloader/core/parsers/common/__init__.py +0 -13
  238. novel_downloader/core/parsers/common/helper.py +0 -323
  239. novel_downloader/core/parsers/common/main_parser.py +0 -106
  240. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  241. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  242. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  243. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  244. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  245. novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
  246. novel_downloader/models/browser.py +0 -21
  247. novel_downloader/models/chapter.py +0 -25
  248. novel_downloader/models/site_rules.py +0 -99
  249. novel_downloader/models/tasks.py +0 -33
  250. novel_downloader/models/types.py +0 -15
  251. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  252. novel_downloader/resources/json/replace_word_map.json +0 -4
  253. novel_downloader/resources/text/blacklist.txt +0 -22
  254. novel_downloader/tui/__init__.py +0 -7
  255. novel_downloader/tui/app.py +0 -32
  256. novel_downloader/tui/main.py +0 -17
  257. novel_downloader/tui/screens/__init__.py +0 -14
  258. novel_downloader/tui/screens/home.py +0 -198
  259. novel_downloader/tui/screens/login.py +0 -74
  260. novel_downloader/tui/styles/home_layout.tcss +0 -79
  261. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  262. novel_downloader/utils/cache.py +0 -24
  263. novel_downloader/utils/fontocr/__init__.py +0 -22
  264. novel_downloader/utils/fontocr/model_loader.py +0 -69
  265. novel_downloader/utils/fontocr/ocr_v1.py +0 -303
  266. novel_downloader/utils/fontocr/ocr_v2.py +0 -752
  267. novel_downloader/utils/hash_store.py +0 -279
  268. novel_downloader/utils/hash_utils.py +0 -103
  269. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  270. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  271. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  272. novel_downloader-1.4.5.dist-info/METADATA +0 -196
  273. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  274. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  275. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  276. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.ttkan
-----------------------------------

Parser for 天天看小說 (ttkan) book-info and chapter pages.
"""

from datetime import datetime
from typing import Any

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["ttkan"],
)
class TtkanParser(BaseParser):
    """
    Parser for 天天看小說 book pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book-info page into structured metadata.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: The parsed ``BookInfoDict``, or ``None`` if ``html_list``
            is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # Book metadata
        book_name = self._first_str(
            tree.xpath('//div[contains(@class,"novel_info")]//h1/text()')
        )

        author = self._first_str(
            tree.xpath(
                '//div[contains(@class,"novel_info")]//li[span/text()="作者:"]/a/text()'
            )
        )

        cover_url = self._first_str(
            tree.xpath('//div[contains(@class,"novel_info")]//amp-img/@src')
        )

        serial_status = self._first_str(
            tree.xpath(
                '//div[contains(@class,"novel_info")]//span[contains(@class,"state_serial")]/text()'
            )
        )

        # The page exposes no reliable update timestamp, so use "now".
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Summary
        summary_nodes = tree.xpath('//div[@class="description"]//p/text()')
        summary = "".join(summary_nodes).strip()

        # Single "正文" volume with all chapter links
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[@class="full_chapters"]/div[1]/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            # '/novel/pagea/wushenzhuzai-anmoshi_6094.html' -> '6094'
            # FIX: use removesuffix, not rstrip(".html") -- rstrip strips a
            # *character set* and would corrupt ids ending in h/t/m/l/'.'.
            chap_id = url.removesuffix(".html").split("_")[-1]
            chapters.append(
                {
                    "chapterId": chap_id,
                    "title": title,
                    "url": url,
                }
            )

        volumes: list[VolumeInfoDict] = [
            {
                "volume_name": "正文",
                "chapters": chapters,
            }
        ]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title + cleaned text content.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: The parsed ``ChapterDict``, or ``None`` when the input is
            empty or no non-blank paragraph text is found.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        # Title
        title_nodes = tree.xpath('//div[@class="title"]/h1/text()')
        title = title_nodes[0].strip() if title_nodes else ""

        # Content paragraphs under <div class="content">
        paras = tree.xpath('//div[@class="content"]/p')
        lines = []
        for p in paras:
            text = p.text_content().strip()
            if text:
                lines.append(text)

        content = "\n".join(lines).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "ttkan"},
        }
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.wanbengo
--------------------------------------

Parser for 完本神站 (wanbengo.com) book-info and chapter pages.
"""

import re
from datetime import datetime
from html import unescape
from typing import Any
from urllib.parse import urljoin

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["wanbengo"],
)
class WanbengoParser(BaseParser):
    """
    Parser for 完本神站 book pages.
    """

    # Base URL used to absolutize relative chapter links.
    BASE = "https://www.wanbengo.com"

    # XPaths for the book info page
    X_BOOK_NAME = "//div[@class='detailTopMid']//h1/text()"
    X_AUTHOR = "//div[@class='detailTopMid']//div[@class='writer']//a/text()"
    X_COVER = "//div[@class='detailTopLeft']//img/@src"
    X_STATUS = "//div[@class='detailTopLeft']//span[contains(@class,'end')]/text()"
    X_WORDS = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'字数')]]/td[last()]/text()"  # noqa: E501
    X_SUMMARY = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'简介')]]/td[last()]//text()"  # noqa: E501
    X_TAG = "//div[@class='route']/a[2]//text()"
    X_UPDATE_TXT = "//div[@class='chapterTitle']//span//text()"
    X_CHAPTERS = "//div[@class='chapter']//ul//li/a"

    # XPaths / regexes for the chapter page
    X_CHAP_TITLE = "//div[contains(@class,'readerTitle')]//h2/text()"
    # Split chapter HTML into paragraph candidates on <p>/</p>/<br> tags.
    _CHAP_SPLIT_RE = re.compile(r"(?:</p\s*>|<p\b[^>]*>|<br\s*/?>)", re.I)
    # Capture the inner HTML of the <div class="...readerCon..."> container.
    _CHAP_READERCON_RE = re.compile(
        r'<div[^>]*class=(?:"[^"]*readerCon[^"]*"|\'[^\']*readerCon[^\']*\')[^>]*>(.*?)</div>',
        re.I | re.S,
    )
    _TAGS_RE = re.compile(r"<[^>]+>")
    # Runs of injected '?' / '_' filler characters.
    _SCRUB_RUNS_RE = re.compile(r"[_?]{2,}")
    # Trailing "未完待续..." boilerplate at the end of a line.
    # FIX: the previous pattern r"\s*(未完待续.*?$" had an unbalanced '('
    # and raised re.error when the class body was executed at import time.
    _SCRUB_TAIL_RE = re.compile(r"\s*(未完待续.*?)$")

    # fmt: off
    # Substrings that mark a line as site boilerplate / advertising.
    ADS = {
        "完本神站", "本站网址", "报错", "键盘", "客户端", "收藏", "书架",
        "猜你喜欢", "上一章", "下一章", "章节目录", "LastRead", "贴吧",
        "倾心打造", "全文无错", "分享本站", "点此章节报错", "温馨提示", "域名",
        "wanbentxt.com", "wanbengo.com",
    }
    # fmt: on
    # Matches a line consisting solely of whitespace/punctuation.
    _PUNCT_ONLY = re.compile(
        r"^[\s\W_·—\-・。,、;;::!!??\(\)()【】《》“”\"'…·]+$"
    )  # noqa: E501

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book-info page into structured metadata.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: The parsed ``BookInfoDict``, or ``None`` if ``html_list``
            is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath(self.X_BOOK_NAME))
        author = self._first_str(tree.xpath(self.X_AUTHOR))
        cover_url = self._first_str(tree.xpath(self.X_COVER))
        # Default to "连载中" (ongoing) when no status badge is present.
        serial_status = (
            self._norm_space(self._first_str(tree.xpath(self.X_STATUS))) or "连载中"
        )
        word_count = self._norm_space("".join(tree.xpath(self.X_WORDS)))
        summary = self._norm_space("".join(tree.xpath(self.X_SUMMARY)))

        # The breadcrumb category doubles as the single tag.
        book_type = self._norm_space("".join(tree.xpath(self.X_TAG)))
        tags = [book_type] if book_type else []

        update_time = self._extract_update_date(tree.xpath(self.X_UPDATE_TXT))

        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath(self.X_CHAPTERS):
            title = self._norm_space("".join(a.xpath(".//text()")))
            href = a.get("href") or ""
            url = urljoin(self.BASE, href)
            # "/129/103950.html" -> "103950"
            # FIX: use removesuffix, not rstrip(".html") -- rstrip strips a
            # *character set* and would corrupt ids ending in h/t/m/l/'.'.
            cid = url.removesuffix(".html").split("/")[-1]
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "summary": summary,
            "tags": tags,
            "volumes": volumes,
            "serial_status": serial_status,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title + cleaned text content.

        The body is extracted with a regex (not the DOM) so that raw <br>/<p>
        boundaries inside the readerCon container can be split reliably.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: The parsed ``ChapterDict``, or ``None`` when the container
            is missing or no usable text survives cleaning.
        """
        if not html_list:
            return None

        inner = self._CHAP_READERCON_RE.search(html_list[0])
        if not inner:
            return None

        tree = html.fromstring(html_list[0])
        title = self._first_str(tree.xpath(self.X_CHAP_TITLE))

        parts = self._CHAP_SPLIT_RE.split(inner.group(1))
        lines: list[str] = []
        for part in parts:
            if not part:
                continue
            # Strip residual tags, decode entities, normalize NBSP.
            s = self._TAGS_RE.sub("", part)
            s = unescape(s).replace("\xa0", " ")
            if self._is_noise_line(s):
                continue
            s = self._norm_space(self._scrub_ascii_gibberish(s.strip()))
            if s:
                lines.append(s)

        content = "\n".join(lines)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "wanbengo"},
        }

    @staticmethod
    def _extract_update_date(texts: list[str]) -> str:
        """
        Find a YYYY-MM-DD anywhere in the provided text nodes.

        If none found, return today's date.
        """
        joined = " ".join(t for t in texts if t)
        m = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", joined)
        if m:
            return m.group(1)
        return datetime.now().strftime("%Y-%m-%d")

    def _is_noise_line(self, s: str) -> bool:
        """Heuristic to drop obvious ad/footer/noise lines."""
        if not s.strip():
            return True
        if self._is_ad_line(s):
            return True
        if self._PUNCT_ONLY.match(s):
            return True
        return False

    @classmethod
    def _scrub_ascii_gibberish(cls, s: str) -> str:
        """
        Remove common injected ASCII junk like long runs of '?' or '_'
        while keeping normal text intact.
        """
        s = s.replace("()?()", "").replace("[(.)]", "")
        # NOTE(review): this replace is a no-op as written; presumably it was
        # meant to map a fullwidth period to '.' -- TODO confirm upstream.
        s = s.replace(".", ".")
        s = cls._SCRUB_RUNS_RE.sub("", s)  # drop runs like ???? or ____
        s = cls._SCRUB_TAIL_RE.sub("", s)
        return s.strip()
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.xiaoshuowu
----------------------------------------

Parser for 小说屋 (xiaoshuoge.info) book-info and chapter pages.
"""

from typing import Any

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    VolumeInfoDict,
)


@register_parser(
    site_keys=["xiaoshuowu", "xiaoshuoge"],
)
class XiaoshuowuParser(BaseParser):
    """
    Parser for 小说屋 (xiaoshuoge.info).
    """

    # Watermark string injected into chapter bodies; lines containing it
    # are dropped during chapter parsing.
    AD_STR: str = "小说屋 www.xiaoshuoge.info"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Build book metadata from the info page and the catalog page.

        :param html_list: [info page HTML, catalog page HTML].
        :return: The parsed ``BookInfoDict``, or ``None`` when fewer than
            two pages are supplied.
        """
        if len(html_list) < 2:
            return None

        # Separate trees: metadata lives on page 0, the chapter list on page 1.
        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        book_name = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:book_name"]/@content')
        )
        author = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:author"]/@content')
        )

        # Category meta tag becomes the single tag entry (if present).
        category = self._first_str(
            info_tree.xpath('//meta[@property="og:novel:category"]/@content')
        )
        tags = [category] if category else []

        word_count = self._first_str(
            info_tree.xpath(
                '//table[@class="hide"]//td[contains(text(),"全文字数")]/text()'
            ),
            replaces=[("全文字数:", "")],
        )
        update_time = self._first_str(
            info_tree.xpath(
                '//table[@class="hide"]//td[contains(text(),"最后更新")]/text()'
            ),
            replaces=[("最后更新:", "")],
        )
        serial_status = self._first_str(
            info_tree.xpath(
                '//table[@class="hide"]//td[contains(text(),"连载状态")]/text()'
            ),
            replaces=[("连载状态:", "")],
        )

        cover_url = self._first_str(
            info_tree.xpath('//meta[@property="og:image"]/@content')
        )

        # Summary: first tabvalue block's inner div, when available.
        summary_nodes = info_tree.xpath('//div[@class="tabvalue"][1]//div')
        summary: str = (
            summary_nodes[0].text_content().strip() if summary_nodes else ""
        )

        # Chapter list: one flat volume built from the catalog page.
        chapters: list[ChapterInfoDict] = []
        for link in catalog_tree.xpath(
            '//ul[contains(@class,"chapters")]//li[contains(@class,"chapter")]/a'
        ):
            chapter_url = link.get("href", "").strip()
            # chapterId is the numeric filename before ".html"
            chapters.append(
                {
                    "title": link.text_content().strip(),
                    "url": chapter_url,
                    "chapterId": chapter_url.rsplit("/", 1)[-1].split(".")[0],
                }
            )

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract a chapter title and body from the reader page.

        Paragraphs are collected from tail-text after the ``content_tip``
        anchor and after each ``<br/>``, stopping at the first footer div.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier recorded in the returned dict.
        :return: The parsed ``ChapterDict``, or ``None`` when the container
            or all paragraph text is missing.
        """
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])
        # Main container; bail out when absent.
        matches = doc.xpath('//div[@id="acontent"]')
        if not matches:
            return None
        container = matches[0]

        # Chapter title from the container's <h1>, if any.
        heading = container.find("h1")
        title = "" if heading is None else heading.text_content().strip()

        paragraphs: list[str] = []
        collecting = False
        for child in container.xpath("./*"):
            # The <div id="content_tip"> marks where real text begins;
            # its tail holds the first paragraph.
            if child.tag == "div" and child.get("id") == "content_tip":
                tail_text = child.tail or ""
                # drop any "(小说屋 ...)" prefix before the real text
                if ")" in tail_text:
                    tail_text = tail_text.split(")", 1)[1]
                opening = tail_text.lstrip("\ufeff").strip()
                if opening:
                    paragraphs.append(opening)
                collecting = True
                continue

            if not collecting:
                continue

            # Footer divs terminate the chapter body.
            css = child.get("class") or ""
            if child.tag == "div" and any(
                marker in css for marker in ("tishi", "footlink", "fullbar")
            ):
                break

            # Each <br/> tail is one paragraph, minus watermark lines.
            if child.tag == "br":
                fragment = (child.tail or "").strip()
                if fragment and self.AD_STR not in fragment:
                    paragraphs.append(fragment)

        if not paragraphs:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": "\n".join(paragraphs),
            "extra": {"site": "xiaoshuowu"},
        }