novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -4
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +27 -104
  5. novel_downloader/cli/download.py +78 -66
  6. novel_downloader/cli/export.py +20 -21
  7. novel_downloader/cli/main.py +3 -1
  8. novel_downloader/cli/search.py +120 -0
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +10 -14
  11. novel_downloader/config/adapter.py +195 -99
  12. novel_downloader/config/{loader.py → file_io.py} +53 -27
  13. novel_downloader/core/__init__.py +14 -13
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/archived/qidian/searcher.py +79 -0
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +8 -30
  21. novel_downloader/core/downloaders/base.py +182 -30
  22. novel_downloader/core/downloaders/common.py +217 -384
  23. novel_downloader/core/downloaders/qianbi.py +332 -4
  24. novel_downloader/core/downloaders/qidian.py +250 -290
  25. novel_downloader/core/downloaders/registry.py +69 -0
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +8 -26
  28. novel_downloader/core/exporters/base.py +107 -31
  29. novel_downloader/core/exporters/common/__init__.py +3 -4
  30. novel_downloader/core/exporters/common/epub.py +92 -171
  31. novel_downloader/core/exporters/common/main_exporter.py +14 -67
  32. novel_downloader/core/exporters/common/txt.py +90 -86
  33. novel_downloader/core/exporters/epub_util.py +184 -1327
  34. novel_downloader/core/exporters/linovelib/__init__.py +3 -2
  35. novel_downloader/core/exporters/linovelib/epub.py +165 -222
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
  37. novel_downloader/core/exporters/linovelib/txt.py +76 -66
  38. novel_downloader/core/exporters/qidian.py +15 -11
  39. novel_downloader/core/exporters/registry.py +55 -0
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/fetchers/__init__.py +57 -56
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
  45. novel_downloader/core/fetchers/biquyuedu.py +83 -0
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +60 -0
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +8 -14
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +26 -0
  81. novel_downloader/core/parsers/__init__.py +58 -22
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
  99. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  100. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  101. novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
  102. novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
  103. novel_downloader/core/parsers/qidian/main_parser.py +19 -57
  104. novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
  105. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
  106. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  107. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  108. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  109. novel_downloader/core/parsers/quanben5.py +103 -0
  110. novel_downloader/core/parsers/registry.py +57 -0
  111. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
  112. novel_downloader/core/parsers/shencou.py +215 -0
  113. novel_downloader/core/parsers/shuhaige.py +111 -0
  114. novel_downloader/core/parsers/tongrenquan.py +116 -0
  115. novel_downloader/core/parsers/ttkan.py +132 -0
  116. novel_downloader/core/parsers/wanbengo.py +191 -0
  117. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  118. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  119. novel_downloader/core/parsers/xs63b.py +161 -0
  120. novel_downloader/core/parsers/xshbook.py +134 -0
  121. novel_downloader/core/parsers/yamibo.py +155 -0
  122. novel_downloader/core/parsers/yibige.py +166 -0
  123. novel_downloader/core/searchers/__init__.py +51 -0
  124. novel_downloader/core/searchers/aaatxt.py +107 -0
  125. novel_downloader/core/searchers/b520.py +84 -0
  126. novel_downloader/core/searchers/base.py +168 -0
  127. novel_downloader/core/searchers/dxmwx.py +105 -0
  128. novel_downloader/core/searchers/eightnovel.py +84 -0
  129. novel_downloader/core/searchers/esjzone.py +102 -0
  130. novel_downloader/core/searchers/hetushu.py +92 -0
  131. novel_downloader/core/searchers/i25zw.py +93 -0
  132. novel_downloader/core/searchers/ixdzs8.py +107 -0
  133. novel_downloader/core/searchers/jpxs123.py +107 -0
  134. novel_downloader/core/searchers/piaotia.py +100 -0
  135. novel_downloader/core/searchers/qbtr.py +106 -0
  136. novel_downloader/core/searchers/qianbi.py +165 -0
  137. novel_downloader/core/searchers/quanben5.py +144 -0
  138. novel_downloader/core/searchers/registry.py +79 -0
  139. novel_downloader/core/searchers/shuhaige.py +124 -0
  140. novel_downloader/core/searchers/tongrenquan.py +110 -0
  141. novel_downloader/core/searchers/ttkan.py +92 -0
  142. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  143. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  144. novel_downloader/core/searchers/xs63b.py +104 -0
  145. novel_downloader/locales/en.json +36 -79
  146. novel_downloader/locales/zh.json +37 -80
  147. novel_downloader/models/__init__.py +23 -50
  148. novel_downloader/models/book.py +44 -0
  149. novel_downloader/models/config.py +16 -43
  150. novel_downloader/models/login.py +1 -1
  151. novel_downloader/models/search.py +21 -0
  152. novel_downloader/resources/config/settings.toml +39 -74
  153. novel_downloader/resources/css_styles/intro.css +83 -0
  154. novel_downloader/resources/css_styles/main.css +30 -89
  155. novel_downloader/resources/json/xiguashuwu.json +718 -0
  156. novel_downloader/utils/__init__.py +43 -0
  157. novel_downloader/utils/chapter_storage.py +247 -226
  158. novel_downloader/utils/constants.py +5 -50
  159. novel_downloader/utils/cookies.py +6 -18
  160. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  161. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  162. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  163. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  164. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  165. novel_downloader/utils/epub/__init__.py +34 -0
  166. novel_downloader/utils/epub/builder.py +377 -0
  167. novel_downloader/utils/epub/constants.py +118 -0
  168. novel_downloader/utils/epub/documents.py +297 -0
  169. novel_downloader/utils/epub/models.py +120 -0
  170. novel_downloader/utils/epub/utils.py +179 -0
  171. novel_downloader/utils/file_utils/__init__.py +5 -30
  172. novel_downloader/utils/file_utils/io.py +9 -150
  173. novel_downloader/utils/file_utils/normalize.py +2 -2
  174. novel_downloader/utils/file_utils/sanitize.py +2 -7
  175. novel_downloader/utils/fontocr.py +207 -0
  176. novel_downloader/utils/i18n.py +2 -0
  177. novel_downloader/utils/logger.py +10 -16
  178. novel_downloader/utils/network.py +111 -252
  179. novel_downloader/utils/state.py +5 -90
  180. novel_downloader/utils/text_utils/__init__.py +16 -21
  181. novel_downloader/utils/text_utils/diff_display.py +6 -9
  182. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  183. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  184. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  185. novel_downloader/utils/time_utils/__init__.py +6 -12
  186. novel_downloader/utils/time_utils/datetime_utils.py +23 -33
  187. novel_downloader/utils/time_utils/sleep_utils.py +5 -10
  188. novel_downloader/web/__init__.py +13 -0
  189. novel_downloader/web/components/__init__.py +11 -0
  190. novel_downloader/web/components/navigation.py +35 -0
  191. novel_downloader/web/main.py +66 -0
  192. novel_downloader/web/pages/__init__.py +17 -0
  193. novel_downloader/web/pages/download.py +78 -0
  194. novel_downloader/web/pages/progress.py +147 -0
  195. novel_downloader/web/pages/search.py +329 -0
  196. novel_downloader/web/services/__init__.py +17 -0
  197. novel_downloader/web/services/client_dialog.py +164 -0
  198. novel_downloader/web/services/cred_broker.py +113 -0
  199. novel_downloader/web/services/cred_models.py +35 -0
  200. novel_downloader/web/services/task_manager.py +264 -0
  201. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  202. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  203. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  204. novel_downloader/config/site_rules.py +0 -94
  205. novel_downloader/core/downloaders/biquge.py +0 -25
  206. novel_downloader/core/downloaders/esjzone.py +0 -25
  207. novel_downloader/core/downloaders/linovelib.py +0 -25
  208. novel_downloader/core/downloaders/sfacg.py +0 -25
  209. novel_downloader/core/downloaders/yamibo.py +0 -25
  210. novel_downloader/core/exporters/biquge.py +0 -25
  211. novel_downloader/core/exporters/esjzone.py +0 -25
  212. novel_downloader/core/exporters/qianbi.py +0 -25
  213. novel_downloader/core/exporters/sfacg.py +0 -25
  214. novel_downloader/core/exporters/yamibo.py +0 -25
  215. novel_downloader/core/factory/__init__.py +0 -20
  216. novel_downloader/core/factory/downloader.py +0 -73
  217. novel_downloader/core/factory/exporter.py +0 -58
  218. novel_downloader/core/factory/fetcher.py +0 -96
  219. novel_downloader/core/factory/parser.py +0 -86
  220. novel_downloader/core/fetchers/base/__init__.py +0 -14
  221. novel_downloader/core/fetchers/base/browser.py +0 -403
  222. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  223. novel_downloader/core/fetchers/common/__init__.py +0 -14
  224. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  225. novel_downloader/core/fetchers/esjzone/browser.py +0 -204
  226. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  227. novel_downloader/core/fetchers/linovelib/browser.py +0 -193
  228. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  229. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  230. novel_downloader/core/fetchers/qidian/browser.py +0 -318
  231. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  232. novel_downloader/core/fetchers/sfacg/browser.py +0 -189
  233. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  234. novel_downloader/core/fetchers/yamibo/browser.py +0 -229
  235. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  236. novel_downloader/core/parsers/biquge/main_parser.py +0 -134
  237. novel_downloader/core/parsers/common/__init__.py +0 -13
  238. novel_downloader/core/parsers/common/helper.py +0 -323
  239. novel_downloader/core/parsers/common/main_parser.py +0 -106
  240. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  241. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  242. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  243. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  244. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  245. novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
  246. novel_downloader/models/browser.py +0 -21
  247. novel_downloader/models/chapter.py +0 -25
  248. novel_downloader/models/site_rules.py +0 -99
  249. novel_downloader/models/tasks.py +0 -33
  250. novel_downloader/models/types.py +0 -15
  251. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  252. novel_downloader/resources/json/replace_word_map.json +0 -4
  253. novel_downloader/resources/text/blacklist.txt +0 -22
  254. novel_downloader/tui/__init__.py +0 -7
  255. novel_downloader/tui/app.py +0 -32
  256. novel_downloader/tui/main.py +0 -17
  257. novel_downloader/tui/screens/__init__.py +0 -14
  258. novel_downloader/tui/screens/home.py +0 -198
  259. novel_downloader/tui/screens/login.py +0 -74
  260. novel_downloader/tui/styles/home_layout.tcss +0 -79
  261. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  262. novel_downloader/utils/cache.py +0 -24
  263. novel_downloader/utils/fontocr/__init__.py +0 -22
  264. novel_downloader/utils/fontocr/model_loader.py +0 -69
  265. novel_downloader/utils/fontocr/ocr_v1.py +0 -303
  266. novel_downloader/utils/fontocr/ocr_v2.py +0 -752
  267. novel_downloader/utils/hash_store.py +0 -279
  268. novel_downloader/utils/hash_utils.py +0 -103
  269. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  270. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  271. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  272. novel_downloader-1.4.5.dist-info/METADATA +0 -196
  273. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  274. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  275. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  276. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.piaotia
4
+ -------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
@register_parser(
    site_keys=["piaotia"],
)
class PiaotiaParser(BaseParser):
    """
    Parser for 飘天文学网 book pages.

    The site serves malformed markup (unbalanced tags, markup emitted via
    ``document.write``, curly quotes around attribute values), so the
    chapter parser repairs the raw HTML text before building a tree.
    """

    # Matches the unclosed <div id="device" ...> tag; the id may be wrapped
    # in straight or curly quotes, or none at all.
    _RE_DEVICE_DIV = re.compile(
        r'<div\s+id=[\'"“”]?device[\'"“”]?[^>]*>',
        flags=re.IGNORECASE,
    )

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract metadata and the chapter catalog.

        :param html_list: [info page HTML, catalog page HTML].
        :return: Parsed book info, or None unless both pages are present.
        """
        if len(html_list) < 2:
            return None

        info_doc = html.fromstring(html_list[0])
        catalog_doc = html.fromstring(html_list[1])

        # Metadata cells share one cleanup recipe: drop NBSPs and spaces,
        # then strip the field label.
        def cell(xpath: str, label: str) -> str:
            return self._first_str(
                info_doc.xpath(xpath),
                replaces=[(chr(0xA0), ""), (" ", ""), (label, "")],
            )

        book_name = self._first_str(info_doc.xpath("//span[@style]//h1/text()"))
        author = cell(
            '//td[contains(text(),"作") and contains(text(),"者")]/text()',
            "作者:",
        )
        category = cell(
            '//td[contains(text(),"类") and contains(text(),"别")]/text()',
            "类别:",
        )
        tags = [category] if category else []
        word_count = cell('//td[contains(text(),"全文长度")]/text()', "全文长度:")
        update_time = cell('//td[contains(text(),"最后更新")]/text()', "最后更新:")
        serial_status = cell('//td[contains(text(),"文章状态")]/text()', "文章状态:")

        cover_url = self._first_str(info_doc.xpath('//td[@width="80%"]//img/@src'))

        # Summary sits after the "内容简介:" label inside the wide cell's div.
        intro_divs = info_doc.xpath('//td[@width="80%"]/div')
        summary = (
            str(intro_divs[0].text_content()).split("内容简介:")[-1].strip()
            if intro_divs
            else ""
        )

        # The catalog has no volume structure; collect a flat chapter list.
        chapters: list[ChapterInfoDict] = [
            {
                "title": (anchor.text or "").strip(),
                "url": (href := anchor.get("href", "").strip()),
                "chapterId": href.split(".")[0],
            }
            for anchor in catalog_doc.xpath('//div[@class="centent"]//ul/li/a')
        ]
        volumes: list[VolumeInfoDict] = [
            {"volume_name": "正文", "chapters": chapters}
        ]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "summary": summary,
            "volumes": volumes,
            "tags": tags,
            "word_count": word_count,
            "serial_status": serial_status,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse one chapter page into plain text.

        The source markup is badly broken: ``<head>`` has no closing tag,
        ``</body>`` has no opening one, part of the layout is written by
        inline JavaScript, and some divs use curly quotes around attribute
        values and are never closed. The raw text is therefore patched
        before parsing.

        :param html_list: The HTML list of the chapter pages.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: The chapter's data, or None when no content was found.
        """
        if not html_list:
            return None

        # Repair step: remove the unclosed "device" div and substitute the
        # two script tags with the divs their JS would have written.
        cleaned = self._RE_DEVICE_DIV.sub("", html_list[0])
        cleaned = cleaned.replace(
            '<script language="javascript">GetMode();</script>',
            '<div id="main" class="colors1 sidebar">',
        )
        cleaned = cleaned.replace(
            '<script language="javascript">GetFont();</script>',
            '<div id="content">',
        )

        doc = html.fromstring(cleaned)
        content_divs = doc.xpath('//div[@id="content"]')
        scope = content_divs[0] if content_divs else doc

        # The <h1> holds the book link plus the chapter title; removing the
        # anchor's text leaves just the title.
        heading = scope.find(".//h1")
        if heading is None:
            title = ""
        else:
            whole = heading.text_content().strip()
            link_texts = heading.xpath("./a/text()")
            if link_texts:
                title = whole.replace(link_texts[0].strip(), "").strip()
            else:
                title = whole

        # Body text hangs as tails of <br> siblings after the first
        # centered, bordered table.
        marker_tables = scope.xpath('.//table[@align="center" and @border]')
        if not marker_tables:
            return None

        paragraphs: list[str] = []
        sibling = marker_tables[0].getnext()
        while sibling is not None:
            hit_end_table = sibling.tag == "table" and sibling.get("border")
            hit_nav_div = (
                sibling.tag == "div"
                and sibling.get("class", "").endswith("link")
            )
            if hit_end_table or hit_nav_div:
                break
            if sibling.tag == "br":
                text = (sibling.tail or "").replace("\xa0", " ").strip()
                if text:
                    paragraphs.append(text)
            sibling = sibling.getnext()

        body = "\n".join(paragraphs).strip()
        if not body:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": body,
            "extra": {"site": "piaotia"},
        }
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.qbtr
4
+ ----------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
@register_parser(
    site_keys=["qbtr"],
)
class QbtrParser(BaseParser):
    """
    Parser for 全本同人小说 (www.qbtr.cc) book pages.
    """

    # Absolute prefix used to resolve the relative download link.
    BASE_URL = "https://www.qbtr.cc"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book info page and, when present, the download page.

        :param html_list: [info page HTML, optional download page HTML].
        :return: Parsed metadata, or None when no HTML is given.
        """
        if not html_list:
            return None

        # Parse the main info page
        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))

        # Tags: the second breadcrumb (e.g., "同人小说")
        tag = self._first_str(
            tree.xpath('//div[contains(@class,"menNav")]/a[2]/text()')
        )
        tags = [tag] if tag else []

        # Author & update_time live in the free text of the date div.
        # Fix: guard the lookup — the previous code indexed date_div[0]
        # unconditionally and raised IndexError on pages without that div.
        author = ""
        update_time = ""
        date_div = tree.xpath('//div[@class="date"]')
        if date_div:
            date_text = html.tostring(date_div[0], encoding="unicode", method="text")
            # "[^日]+" stops the author name just before the "日期" label.
            author_match = re.search(r"作者[::]\s*([^日]+)", date_text)
            if author_match:
                author = author_match.group(1).strip()
            date_match = re.search(r"日期[::]\s*([\d-]+)", date_text)
            if date_match:
                update_time = date_match.group(1)

        # Summary from the <p> inside infos
        paras = tree.xpath('//div[@class="infos"]/p//text()')
        summary = "\n".join(p.strip() for p in paras if p.strip())

        # Chapters from the book_list; URLs look like
        # /{category}/{bookId}/{chapterId}.html
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[contains(@class,"book_list")]//li/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            m = re.search(r"^/[^/]+/\d+/(\d+)\.html$", url)
            cid = m.group(1) if m else ""
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        # Optional second page carries the txt download link.
        download_url = ""
        if len(html_list) > 1 and html_list[1]:
            dtree = html.fromstring(html_list[1])
            a = dtree.xpath('//a[@id="dowloadnUrl"]')
            if a:
                # The link may be stashed in a "link" attribute or in href.
                link = a[0].get("link") or a[0].get("href") or ""
                download_url = self._fix_download_link(link)

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": "",
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {"download_url": download_url},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into plain text.

        :param html_list: [chapter page HTML].
        :param chapter_id: Identifier of the chapter being parsed.
        :return: The chapter's data, or None when no content was found.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        raw_title = self._first_str(
            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
        )

        # The breadcrumb's last entry is the book name; the <h1> repeats it,
        # so strip it out of the title.
        crumbs = tree.xpath('//div[contains(@class,"readTop")]//a/text()')
        book_name = crumbs[-1].strip() if crumbs else ""

        title = raw_title.replace(book_name, "").strip()

        paragraphs = tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
        texts: list[str] = []
        for p in paragraphs:
            txt = p.text_content().strip()
            if txt:
                texts.append(txt)

        content = "\n".join(texts)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "qbtr"},
        }

    @classmethod
    def _fix_download_link(cls, link: str) -> str:
        """
        Rewrite the obfuscated "qb../" prefix to the real "/e/DownSys/"
        path and make the link absolute against BASE_URL.
        """
        true_link = link.replace("qb../", "/e/DownSys/")
        return f"{cls.BASE_URL}{true_link}"
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- novel_downloader.core.parsers.qianbi.main_parser
4
- ------------------------------------------------
3
+ novel_downloader.core.parsers.qianbi
4
+ ------------------------------------
5
5
 
6
6
  """
7
7
 
@@ -11,59 +11,61 @@ from typing import Any
11
11
  from lxml import html
12
12
 
13
13
  from novel_downloader.core.parsers.base import BaseParser
14
- from novel_downloader.models import ChapterDict
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ VolumeInfoDict,
19
+ )
15
20
 
16
21
 
22
+ @register_parser(
23
+ site_keys=["qianbi"],
24
+ )
17
25
  class QianbiParser(BaseParser):
18
- """ """
26
+ """
27
+ Parser for 铅笔小说 book pages.
28
+ """
19
29
 
20
30
  def parse_book_info(
21
31
  self,
22
32
  html_list: list[str],
23
33
  **kwargs: Any,
24
- ) -> dict[str, Any]:
25
- """
26
- Parse a book info page and extract metadata and chapter structure.
27
-
28
- :param html_list: Raw HTML of the book info pages.
29
- :return: Parsed metadata and chapter structure as a dictionary.
30
- """
34
+ ) -> BookInfoDict | None:
31
35
  if len(html_list) < 2:
32
- return {}
36
+ return None
33
37
 
34
38
  info_tree = html.fromstring(html_list[0])
35
39
  catalog_tree = html.fromstring(html_list[1])
36
- result: dict[str, Any] = {}
37
-
38
- title = info_tree.xpath('//h1[@class="page-title"]/text()')
39
- result["book_name"] = title[0].strip() if title else ""
40
-
41
- author = info_tree.xpath('//a[contains(@href,"/author/")]/@title')
42
- result["author"] = author[0].strip() if author else ""
43
40
 
44
- cover = info_tree.xpath('//div[@class="novel-cover"]//img/@data-src')
45
- result["cover_url"] = cover[0].strip() if cover else ""
46
-
47
- status = info_tree.xpath(
48
- '//a[@class="tag-link" and (text()="完结" or text()="连载")]/text()'
41
+ book_name = self._first_str(info_tree.xpath('//h1[@class="page-title"]/text()'))
42
+ author = self._first_str(
43
+ info_tree.xpath('//a[contains(@href,"/author/")]/@title')
44
+ )
45
+ cover_url = self._first_str(
46
+ info_tree.xpath('//div[@class="novel-cover"]//img/@data-src')
47
+ )
48
+ serial_status = self._first_str(
49
+ info_tree.xpath(
50
+ '//a[@class="tag-link" and (text()="完结" or text()="连载")]/text()'
51
+ )
52
+ )
53
+ word_count = self._first_str(
54
+ info_tree.xpath('//span[contains(text(), "字")]/text()')
49
55
  )
50
- result["serial_status"] = status[0] if status else ""
51
-
52
- word_count_raw = info_tree.xpath('//span[contains(text(), "万字")]/text()')
53
- result["word_count"] = word_count_raw[0].strip() if word_count_raw else ""
54
56
 
55
57
  summary_node = info_tree.xpath(
56
58
  '//div[@class="novel-info-item novel-info-content"]/span'
57
59
  )
58
60
  if summary_node and summary_node[0] is not None:
59
- result["summary"] = summary_node[0].text_content().strip()
61
+ summary = str(summary_node[0].text_content()).strip()
60
62
  else:
61
- result["summary"] = ""
63
+ summary = ""
62
64
 
63
- result["update_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
65
+ update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
64
66
 
65
- volumes: list[dict[str, Any]] = []
66
- current_volume = None
67
+ volumes: list[VolumeInfoDict] = []
68
+ current_volume: VolumeInfoDict | None = None
67
69
 
68
70
  for elem in catalog_tree.xpath('//div[@class="box"]/*'):
69
71
  class_attr = elem.get("class", "")
@@ -99,9 +101,17 @@ class QianbiParser(BaseParser):
99
101
  if current_volume:
100
102
  volumes.append(current_volume)
101
103
 
102
- result["volumes"] = volumes
103
-
104
- return result
104
+ return {
105
+ "book_name": book_name,
106
+ "author": author,
107
+ "cover_url": cover_url,
108
+ "update_time": update_time,
109
+ "word_count": word_count,
110
+ "serial_status": serial_status,
111
+ "summary": summary,
112
+ "volumes": volumes,
113
+ "extra": {},
114
+ }
105
115
 
106
116
  def parse_chapter(
107
117
  self,
@@ -109,31 +119,24 @@ class QianbiParser(BaseParser):
109
119
  chapter_id: str,
110
120
  **kwargs: Any,
111
121
  ) -> ChapterDict | None:
112
- """
113
- Parse a single chapter page and extract clean text or simplified HTML.
114
-
115
- :param html_list: Raw HTML of the chapter page.
116
- :param chapter_id: Identifier of the chapter being parsed.
117
- :return: Cleaned chapter content as plain text or minimal HTML.
118
- """
119
122
  if not html_list:
120
123
  return None
121
124
  tree = html.fromstring(html_list[0])
122
125
 
126
+ # Content paragraphs
123
127
  paras = tree.xpath('//div[@class="article-content"]/p/text()')
124
- content_text = "\n\n".join(p.strip() for p in paras if p.strip())
128
+ content_text = "\n".join(p.strip() for p in paras if p.strip())
125
129
  if not content_text:
126
130
  return None
127
131
 
128
- title = tree.xpath('//h1[@class="article-title"]/text()')
129
- title_text = title[0].strip() if title else ""
130
-
131
- volume = tree.xpath('//h3[@class="text-muted"]/text()')
132
- volume_text = volume[0].strip() if volume else ""
132
+ title_text = self._first_str(tree.xpath('//h1[@class="article-title"]/text()'))
133
+ volume_text = self._first_str(tree.xpath('//h3[@class="text-muted"]/text()'))
133
134
 
134
- next_href = tree.xpath('//div[@class="footer"]/a[@class="f-right"]/@href')
135
+ next_href = self._first_str(
136
+ tree.xpath('//div[@class="footer"]/a[@class="f-right"]/@href')
137
+ )
135
138
  next_chapter_id = (
136
- next_href[0].split("/")[-1].replace(".html", "") if next_href else ""
139
+ next_href.split("/")[-1].replace(".html", "") if next_href else ""
137
140
  )
138
141
 
139
142
  return {
@@ -5,6 +5,6 @@ novel_downloader.core.parsers.qidian
5
5
 
6
6
  """
7
7
 
8
- from .main_parser import QidianParser
9
-
10
8
  __all__ = ["QidianParser"]
9
+
10
+ from .main_parser import QidianParser
@@ -12,10 +12,11 @@ time, status, word count, summary, and volume-chapter structure.
12
12
  import logging
13
13
  import re
14
14
  from datetime import datetime
15
- from typing import Any
16
15
 
17
16
  from lxml import html
18
17
 
18
+ from novel_downloader.models import BookInfoDict, ChapterInfoDict, VolumeInfoDict
19
+
19
20
  logger = logging.getLogger(__name__)
20
21
 
21
22
 
@@ -23,7 +24,7 @@ def _chapter_url_to_id(url: str) -> str:
23
24
  return url.rstrip("/").split("/")[-1]
24
25
 
25
26
 
26
- def parse_book_info(html_str: str) -> dict[str, Any]:
27
+ def parse_book_info(html_str: str) -> BookInfoDict | None:
27
28
  """
28
29
  Extract metadata: title, author, cover_url, update_time, status,
29
30
  word_count, summary, and volumes with chapters.
@@ -31,60 +32,58 @@ def parse_book_info(html_str: str) -> dict[str, Any]:
31
32
  :param html_str: Raw HTML of the book info page.
32
33
  :return: A dict containing book metadata.
33
34
  """
34
- info: dict[str, Any] = {}
35
- try:
36
- doc = html.fromstring(html_str)
37
-
38
- info["book_name"] = doc.xpath('string(//h1[@id="bookName"])').strip()
39
-
40
- info["author"] = doc.xpath('string(//a[@class="writer-name"])').strip()
41
-
42
- book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
43
- info[
44
- "cover_url"
45
- ] = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"
46
-
47
- ut = (
48
- doc.xpath('string(//span[@class="update-time"])')
49
- .replace("更新时间:", "")
50
- .strip()
51
- )
52
- if re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", ut):
53
- info["update_time"] = ut
54
- else:
55
- info["update_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
56
-
57
- info["serial_status"] = doc.xpath(
58
- 'string(//p[@class="book-attribute"]/span[1])'
59
- ).strip()
60
-
61
- tags = doc.xpath('//p[contains(@class,"all-label")]//a/text()')
62
- info["tags"] = [t.strip() for t in tags if t.strip()]
63
-
64
- info["word_count"] = doc.xpath('string(//p[@class="count"]/em[1])').strip()
65
-
66
- summary = doc.xpath('string(//p[@class="intro"])').strip()
67
- info["summary_brief"] = summary
68
-
69
- raw = doc.xpath('//p[@id="book-intro-detail"]//text()')
70
- info["summary"] = "\n".join(line.strip() for line in raw if line.strip())
71
-
72
- volumes = []
73
- for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
74
- vol_name = vol.xpath('string(.//h3[@class="volume-name"])').strip()
75
- vol_name = vol_name.split(chr(183))[0].strip()
76
- chapters = []
77
- for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
78
- a = li.xpath('.//a[@class="chapter-name"]')[0]
79
- title = a.text.strip()
80
- url = a.get("href")
81
- chapters.append(
82
- {"title": title, "url": url, "chapterId": _chapter_url_to_id(url)}
83
- )
84
- volumes.append({"volume_name": vol_name, "chapters": chapters})
85
- info["volumes"] = volumes
86
-
87
- except Exception as e:
88
- logger.warning("[Parser] Error parsing book info: %s", e)
89
-
90
- return info
35
+ doc = html.fromstring(html_str)
36
+
37
+ book_name = doc.xpath('string(//h1[@id="bookName"])').strip()
38
+
39
+ author = doc.xpath('string(//a[@class="writer-name"])').strip()
40
+
41
+ book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
42
+ cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"
43
+
44
+ ut = doc.xpath('string(//span[@class="update-time"])')
45
+ ut = ut.replace("更新时间:", "").strip()
46
+ if re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", ut):
47
+ update_time = ut
48
+ else:
49
+ update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
50
+
51
+ serial_status = doc.xpath('string(//p[@class="book-attribute"]/span[1])').strip()
52
+
53
+ tags_elem = doc.xpath('//p[contains(@class,"all-label")]//a/text()')
54
+ tags = [t.strip() for t in tags_elem if t.strip()]
55
+
56
+ word_count = doc.xpath('string(//p[@class="count"]/em[1])').strip()
57
+
58
+ summary_brief = doc.xpath('string(//p[@class="intro"])').strip()
59
+
60
+ raw = doc.xpath('//p[@id="book-intro-detail"]//text()')
61
+ summary = "\n".join(line.strip() for line in raw if line.strip())
62
+
63
+ volumes: list[VolumeInfoDict] = []
64
+ for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
65
+ vol_name = vol.xpath('string(.//h3[@class="volume-name"])').strip()
66
+ vol_name = vol_name.split(chr(183))[0].strip()
67
+ chapters: list[ChapterInfoDict] = []
68
+ for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
69
+ a = li.xpath('.//a[@class="chapter-name"]')[0]
70
+ title = a.text.strip()
71
+ url = a.get("href")
72
+ chapters.append(
73
+ {"title": title, "url": url, "chapterId": _chapter_url_to_id(url)}
74
+ )
75
+ volumes.append({"volume_name": vol_name, "chapters": chapters})
76
+
77
+ return {
78
+ "book_name": book_name,
79
+ "author": author,
80
+ "cover_url": cover_url,
81
+ "update_time": update_time,
82
+ "word_count": word_count,
83
+ "serial_status": serial_status,
84
+ "tags": tags,
85
+ "summary_brief": summary_brief,
86
+ "summary": summary,
87
+ "volumes": volumes,
88
+ "extra": {},
89
+ }