novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +79 -66
  6. novel_downloader/cli/export.py +17 -21
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +206 -209
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +5 -5
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +17 -12
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +20 -14
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +6 -19
  79. novel_downloader/core/interfaces/parser.py +7 -8
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +64 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +64 -69
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/main_parser.py +756 -48
  100. novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
  101. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  102. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  103. novel_downloader/core/parsers/quanben5.py +103 -0
  104. novel_downloader/core/parsers/registry.py +5 -16
  105. novel_downloader/core/parsers/sfacg.py +38 -45
  106. novel_downloader/core/parsers/shencou.py +215 -0
  107. novel_downloader/core/parsers/shuhaige.py +111 -0
  108. novel_downloader/core/parsers/tongrenquan.py +116 -0
  109. novel_downloader/core/parsers/ttkan.py +132 -0
  110. novel_downloader/core/parsers/wanbengo.py +191 -0
  111. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  112. novel_downloader/core/parsers/xiguashuwu.py +429 -0
  113. novel_downloader/core/parsers/xs63b.py +161 -0
  114. novel_downloader/core/parsers/xshbook.py +134 -0
  115. novel_downloader/core/parsers/yamibo.py +87 -131
  116. novel_downloader/core/parsers/yibige.py +166 -0
  117. novel_downloader/core/searchers/__init__.py +34 -3
  118. novel_downloader/core/searchers/aaatxt.py +107 -0
  119. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  120. novel_downloader/core/searchers/base.py +112 -36
  121. novel_downloader/core/searchers/dxmwx.py +105 -0
  122. novel_downloader/core/searchers/eightnovel.py +84 -0
  123. novel_downloader/core/searchers/esjzone.py +43 -25
  124. novel_downloader/core/searchers/hetushu.py +92 -0
  125. novel_downloader/core/searchers/i25zw.py +93 -0
  126. novel_downloader/core/searchers/ixdzs8.py +107 -0
  127. novel_downloader/core/searchers/jpxs123.py +107 -0
  128. novel_downloader/core/searchers/piaotia.py +100 -0
  129. novel_downloader/core/searchers/qbtr.py +106 -0
  130. novel_downloader/core/searchers/qianbi.py +74 -40
  131. novel_downloader/core/searchers/quanben5.py +144 -0
  132. novel_downloader/core/searchers/registry.py +24 -8
  133. novel_downloader/core/searchers/shuhaige.py +124 -0
  134. novel_downloader/core/searchers/tongrenquan.py +110 -0
  135. novel_downloader/core/searchers/ttkan.py +92 -0
  136. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  137. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  138. novel_downloader/core/searchers/xs63b.py +104 -0
  139. novel_downloader/locales/en.json +34 -85
  140. novel_downloader/locales/zh.json +35 -86
  141. novel_downloader/models/__init__.py +21 -22
  142. novel_downloader/models/book.py +44 -0
  143. novel_downloader/models/config.py +4 -37
  144. novel_downloader/models/login.py +1 -1
  145. novel_downloader/models/search.py +5 -0
  146. novel_downloader/resources/config/settings.toml +8 -70
  147. novel_downloader/resources/json/xiguashuwu.json +718 -0
  148. novel_downloader/utils/__init__.py +13 -24
  149. novel_downloader/utils/chapter_storage.py +5 -5
  150. novel_downloader/utils/constants.py +4 -31
  151. novel_downloader/utils/cookies.py +38 -35
  152. novel_downloader/utils/crypto_utils/__init__.py +7 -0
  153. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  154. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  155. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  156. novel_downloader/utils/crypto_utils/rc4.py +54 -0
  157. novel_downloader/utils/epub/__init__.py +3 -4
  158. novel_downloader/utils/epub/builder.py +6 -6
  159. novel_downloader/utils/epub/constants.py +62 -21
  160. novel_downloader/utils/epub/documents.py +95 -201
  161. novel_downloader/utils/epub/models.py +8 -22
  162. novel_downloader/utils/epub/utils.py +73 -106
  163. novel_downloader/utils/file_utils/__init__.py +2 -23
  164. novel_downloader/utils/file_utils/io.py +53 -188
  165. novel_downloader/utils/file_utils/normalize.py +1 -7
  166. novel_downloader/utils/file_utils/sanitize.py +4 -15
  167. novel_downloader/utils/fontocr/__init__.py +5 -14
  168. novel_downloader/utils/fontocr/core.py +216 -0
  169. novel_downloader/utils/fontocr/loader.py +50 -0
  170. novel_downloader/utils/logger.py +81 -65
  171. novel_downloader/utils/network.py +17 -41
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  176. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  177. novel_downloader/utils/time_utils/__init__.py +5 -11
  178. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  179. novel_downloader/utils/time_utils/sleep_utils.py +55 -49
  180. novel_downloader/web/__init__.py +13 -0
  181. novel_downloader/web/components/__init__.py +11 -0
  182. novel_downloader/web/components/navigation.py +35 -0
  183. novel_downloader/web/main.py +66 -0
  184. novel_downloader/web/pages/__init__.py +17 -0
  185. novel_downloader/web/pages/download.py +78 -0
  186. novel_downloader/web/pages/progress.py +147 -0
  187. novel_downloader/web/pages/search.py +329 -0
  188. novel_downloader/web/services/__init__.py +17 -0
  189. novel_downloader/web/services/client_dialog.py +164 -0
  190. novel_downloader/web/services/cred_broker.py +113 -0
  191. novel_downloader/web/services/cred_models.py +35 -0
  192. novel_downloader/web/services/task_manager.py +264 -0
  193. novel_downloader-2.0.1.dist-info/METADATA +172 -0
  194. novel_downloader-2.0.1.dist-info/RECORD +206 -0
  195. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
  196. novel_downloader/core/downloaders/biquge.py +0 -29
  197. novel_downloader/core/downloaders/esjzone.py +0 -29
  198. novel_downloader/core/downloaders/linovelib.py +0 -29
  199. novel_downloader/core/downloaders/sfacg.py +0 -29
  200. novel_downloader/core/downloaders/yamibo.py +0 -29
  201. novel_downloader/core/exporters/biquge.py +0 -22
  202. novel_downloader/core/exporters/esjzone.py +0 -22
  203. novel_downloader/core/exporters/qianbi.py +0 -22
  204. novel_downloader/core/exporters/sfacg.py +0 -22
  205. novel_downloader/core/exporters/yamibo.py +0 -22
  206. novel_downloader/core/fetchers/base/__init__.py +0 -14
  207. novel_downloader/core/fetchers/base/browser.py +0 -422
  208. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  209. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  210. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  211. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  212. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  213. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  214. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  215. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  216. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  217. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  218. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  219. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  220. novel_downloader/core/parsers/biquge.py +0 -139
  221. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
  222. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
  223. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
  224. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  225. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
  226. novel_downloader/models/chapter.py +0 -25
  227. novel_downloader/models/types.py +0 -13
  228. novel_downloader/tui/__init__.py +0 -7
  229. novel_downloader/tui/app.py +0 -32
  230. novel_downloader/tui/main.py +0 -17
  231. novel_downloader/tui/screens/__init__.py +0 -14
  232. novel_downloader/tui/screens/home.py +0 -198
  233. novel_downloader/tui/screens/login.py +0 -74
  234. novel_downloader/tui/styles/home_layout.tcss +0 -79
  235. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  236. novel_downloader/utils/cache.py +0 -24
  237. novel_downloader/utils/crypto_utils.py +0 -71
  238. novel_downloader/utils/fontocr/hash_store.py +0 -280
  239. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  240. novel_downloader/utils/fontocr/model_loader.py +0 -69
  241. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  242. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  243. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  244. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  245. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  246. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  247. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  248. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.tongrenquan
4
+ -----------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["tongrenquan"],
)
class TongrenquanParser(BaseParser):
    """
    Parser for 同人圈 (tongrenquan.org) book and chapter pages.
    """

    # Site root; cover image URLs on the info page are site-relative.
    BASE_URL = "https://www.tongrenquan.org"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page into book metadata.

        :param html_list: Raw HTML pages; only the first entry (info page) is used.
        :return: Metadata with a single "正文" volume, or None when html_list is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # Metadata
        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))
        author = self._first_str(
            tree.xpath('//div[@class="date"]/span/text()'),
            replaces=[("作者:", "")],
        )
        cover_url = self.BASE_URL + self._first_str(
            tree.xpath('//div[@class="pic"]//img/@src')
        )
        update_time = self._first_str(
            tree.xpath('//div[@class="date"]/text()'),
            replaces=[("日期:", "")],
        )

        # Summary (collapse text within the <p> tag)
        paras = tree.xpath('//div[@class="infos"]/p//text()')
        summary = "\n".join(p.strip() for p in paras if p.strip())

        # Chapters extraction
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[contains(@class,"book_list")]//ul//li/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            # General pattern: /category/bookId/chapterId.html
            # '/tongren/7562/462.html' -> '462'
            # Fix: removesuffix() drops the literal ".html" once; the previous
            # rstrip(".html") stripped any trailing run of the chars
            # {.,h,t,m,l} and would corrupt ids ending in those letters.
            chapter_id = url.removesuffix(".html").split("/")[-1]
            chapters.append({"title": title, "url": url, "chapterId": chapter_id})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": ["同人小说"],
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into a ChapterDict.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Chapter identifier, echoed back in the result.
        :return: Chapter dict, or None when the page has no usable content.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        raw_title = self._first_str(
            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
        )

        book_name = self._first_str(
            tree.xpath('//div[contains(@class,"readTop")]//a[last()]/text()')
        )

        # Chapter headings are often prefixed with the book name; strip it off.
        title = raw_title.replace(book_name, "").strip()

        # Extract paragraphs of content
        paras = tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
        texts = [p.text_content().strip() for p in paras if p.text_content().strip()]
        content = "\n".join(texts)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "tongrenquan"},
        }
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.ttkan
4
+ -----------------------------------
5
+
6
+ """
7
+
8
+ from datetime import datetime
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
@register_parser(
    site_keys=["ttkan"],
)
class TtkanParser(BaseParser):
    """
    Parser for 天天看小說 book pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book info page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: Metadata with a single "正文" volume, or None when html_list is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # Book metadata
        book_name = self._first_str(
            tree.xpath('//div[contains(@class,"novel_info")]//h1/text()')
        )

        author = self._first_str(
            tree.xpath(
                '//div[contains(@class,"novel_info")]//li[span/text()="作者:"]/a/text()'
            )
        )

        cover_url = self._first_str(
            tree.xpath('//div[contains(@class,"novel_info")]//amp-img/@src')
        )

        serial_status = self._first_str(
            tree.xpath(
                '//div[contains(@class,"novel_info")]//span[contains(@class,"state_serial")]/text()'
            )
        )

        # The page exposes no reliable update timestamp, so record scrape time.
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Summary
        summary_nodes = tree.xpath('//div[@class="description"]//p/text()')
        summary = "".join(summary_nodes).strip()

        # Single "正文" volume with all chapter links
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[@class="full_chapters"]/div[1]/a'):
            url = a.get("href", "").strip()
            title = a.text_content().strip()
            # '/novel/pagea/wushenzhuzai-anmoshi_6094.html' -> '6094'
            # Fix: removesuffix() drops the literal ".html" once; the previous
            # rstrip(".html") stripped any trailing run of the chars
            # {.,h,t,m,l} and would corrupt ids ending in those letters.
            chap_id = url.removesuffix(".html").split("_")[-1]
            chapters.append(
                {
                    "chapterId": chap_id,
                    "title": title,
                    "url": url,
                }
            )

        volumes: list[VolumeInfoDict] = [
            {
                "volume_name": "正文",
                "chapters": chapters,
            }
        ]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into a ChapterDict.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Chapter identifier, echoed back in the result.
        :return: Chapter dict, or None when the page has no usable content.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        # Title
        title_nodes = tree.xpath('//div[@class="title"]/h1/text()')
        title = title_nodes[0].strip() if title_nodes else ""

        # Content paragraphs under <div class="content">
        paras = tree.xpath('//div[@class="content"]/p')
        lines = []
        for p in paras:
            text = p.text_content().strip()
            if text:
                lines.append(text)

        content = "\n".join(lines).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "ttkan"},
        }
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.wanbengo
4
+ --------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from datetime import datetime
10
+ from html import unescape
11
+ from typing import Any
12
+ from urllib.parse import urljoin
13
+
14
+ from lxml import html
15
+
16
+ from novel_downloader.core.parsers.base import BaseParser
17
+ from novel_downloader.core.parsers.registry import register_parser
18
+ from novel_downloader.models import (
19
+ BookInfoDict,
20
+ ChapterDict,
21
+ ChapterInfoDict,
22
+ VolumeInfoDict,
23
+ )
24
+
25
+
26
@register_parser(
    site_keys=["wanbengo"],
)
class WanbengoParser(BaseParser):
    """
    Parser for 完本神站 book pages.
    """

    # Site root; chapter hrefs are site-relative.
    BASE = "https://www.wanbengo.com"

    # XPaths for the book info page
    X_BOOK_NAME = "//div[@class='detailTopMid']//h1/text()"
    X_AUTHOR = "//div[@class='detailTopMid']//div[@class='writer']//a/text()"
    X_COVER = "//div[@class='detailTopLeft']//img/@src"
    X_STATUS = "//div[@class='detailTopLeft']//span[contains(@class,'end')]/text()"
    X_WORDS = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'字数')]]/td[last()]/text()"  # noqa: E501
    X_SUMMARY = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'简介')]]/td[last()]//text()"  # noqa: E501
    X_TAG = "//div[@class='route']/a[2]//text()"
    X_UPDATE_TXT = "//div[@class='chapterTitle']//span//text()"
    X_CHAPTERS = "//div[@class='chapter']//ul//li/a"

    # XPaths / regexes for the chapter page
    X_CHAP_TITLE = "//div[contains(@class,'readerTitle')]//h2/text()"
    # Split the raw chapter HTML into paragraph candidates on <p>, </p>, <br>.
    _CHAP_SPLIT_RE = re.compile(r"(?:</p\s*>|<p\b[^>]*>|<br\s*/?>)", re.I)
    # Capture the inner HTML of the <div class="...readerCon...">.
    _CHAP_READERCON_RE = re.compile(
        r'<div[^>]*class=(?:"[^"]*readerCon[^"]*"|\'[^\']*readerCon[^\']*\')[^>]*>(.*?)</div>',
        re.I | re.S,
    )
    _TAGS_RE = re.compile(r"<[^>]+>")
    _SCRUB_RUNS_RE = re.compile(r"[_?]{2,}")
    # Fix: the previous pattern r"\s*(未完待续.*?$" had an unbalanced '(' and
    # raised re.error at import time.  Intent: drop a trailing
    # "未完待续..." (to-be-continued) filler from a line.
    _SCRUB_TAIL_RE = re.compile(r"\s*(?:未完待续).*?$")

    # fmt: off
    ADS = {
        "完本神站", "本站网址", "报错", "键盘", "客户端", "收藏", "书架",
        "猜你喜欢", "上一章", "下一章", "章节目录", "LastRead", "贴吧",
        "倾心打造", "全文无错", "分享本站", "点此章节报错", "温馨提示", "域名",
        "wanbentxt.com", "wanbengo.com",
    }
    # fmt: on
    # Lines made purely of whitespace/punctuation (CJK and ASCII) are noise.
    _PUNCT_ONLY = re.compile(
        r"^[\s\W_·—\-・。,、;;::!!??\(\)()【】《》“”\"'…·]+$"
    )  # noqa: E501

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book info page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: Metadata with a single "正文" volume, or None when html_list is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath(self.X_BOOK_NAME))
        author = self._first_str(tree.xpath(self.X_AUTHOR))
        cover_url = self._first_str(tree.xpath(self.X_COVER))
        # Default to "连载中" (ongoing) when the status badge is absent.
        serial_status = (
            self._norm_space(self._first_str(tree.xpath(self.X_STATUS))) or "连载中"
        )
        word_count = self._norm_space("".join(tree.xpath(self.X_WORDS)))
        summary = self._norm_space("".join(tree.xpath(self.X_SUMMARY)))

        # Breadcrumb category doubles as the single tag.
        book_type = self._norm_space("".join(tree.xpath(self.X_TAG)))
        tags = [book_type] if book_type else []

        update_time = self._extract_update_date(tree.xpath(self.X_UPDATE_TXT))

        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath(self.X_CHAPTERS):
            title = self._norm_space("".join(a.xpath(".//text()")))
            href = a.get("href") or ""
            url = urljoin(self.BASE, href)
            # "/129/103950.html" -> "103950"
            # Fix: removesuffix() drops the literal ".html" once; the previous
            # rstrip(".html") stripped any trailing run of the chars
            # {.,h,t,m,l} and would corrupt ids ending in those letters.
            cid = url.removesuffix(".html").split("/")[-1]
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "summary": summary,
            "tags": tags,
            "volumes": volumes,
            "serial_status": serial_status,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page, scrubbing injected ads and gibberish.

        The body is located with a regex (the site's markup is too messy for a
        clean XPath), split on paragraph/line-break tags, then each line is
        de-tagged, unescaped, and filtered through the noise heuristics.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Chapter identifier, echoed back in the result.
        :return: Chapter dict, or None when no readable content survives.
        """
        if not html_list:
            return None

        inner = self._CHAP_READERCON_RE.search(html_list[0])
        if not inner:
            return None

        tree = html.fromstring(html_list[0])
        title = self._first_str(tree.xpath(self.X_CHAP_TITLE))

        parts = self._CHAP_SPLIT_RE.split(inner.group(1))
        lines: list[str] = []
        for part in parts:
            if not part:
                continue
            s = self._TAGS_RE.sub("", part)
            s = unescape(s).replace("\xa0", " ")
            if self._is_noise_line(s):
                continue
            s = self._norm_space(self._scrub_ascii_gibberish(s.strip()))
            if s:
                lines.append(s)

        content = "\n".join(lines)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "wanbengo"},
        }

    @staticmethod
    def _extract_update_date(texts: list[str]) -> str:
        """
        Find a YYYY-MM-DD anywhere in the provided text nodes.

        If none found, return today's date.
        """
        joined = " ".join(t for t in texts if t)
        m = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", joined)
        if m:
            return m.group(1)
        return datetime.now().strftime("%Y-%m-%d")

    def _is_noise_line(self, s: str) -> bool:
        """Heuristic to drop obvious ad/footer/noise lines."""
        if not s.strip():
            return True
        if self._is_ad_line(s):
            return True
        if self._PUNCT_ONLY.match(s):
            return True
        return False

    @classmethod
    def _scrub_ascii_gibberish(cls, s: str) -> str:
        """
        Remove common injected ASCII junk like long runs of '?' or '_'
        while keeping normal text intact.
        """
        s = s.replace("()?()", "").replace("[(.)]", "")
        # NOTE(review): upstream also did `s = s.replace(".", ".")` -- a no-op
        # as written, presumably a mojibake'd fullwidth-period normalization;
        # confirm against the live site before restoring it.
        s = cls._SCRUB_RUNS_RE.sub("", s)  # drop runs like ???? or ____
        s = cls._SCRUB_TAIL_RE.sub("", s)
        return s.strip()
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.xiaoshuowu
4
+ ----------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["xiaoshuowu", "xiaoshuoge"],
)
class XiaoshuowuParser(BaseParser):
    """
    Parser for 小说屋 (xiaoshuoge.info).
    """

    # Watermark string injected into chapter text; lines containing it are dropped.
    AD_STR: str = "小说屋 www.xiaoshuoge.info"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse book metadata from the info page plus the catalog page.

        :param html_list: [info_page_html, catalog_page_html].
        :return: Book metadata, or None when fewer than two pages are given.
        """
        if len(html_list) < 2:
            return None

        info = html.fromstring(html_list[0])
        catalog = html.fromstring(html_list[1])

        # Core metadata comes from OpenGraph <meta> tags.
        book_name = self._first_str(
            info.xpath('//meta[@property="og:novel:book_name"]/@content')
        )
        author = self._first_str(
            info.xpath('//meta[@property="og:novel:author"]/@content')
        )
        cover_url = self._first_str(
            info.xpath('//meta[@property="og:image"]/@content')
        )

        # Category -> tags
        category = self._first_str(
            info.xpath('//meta[@property="og:novel:category"]/@content')
        )
        tags = [category] if category else []

        # Remaining stats live in a hidden <table class="hide">.
        word_count = self._first_str(
            info.xpath(
                '//table[@class="hide"]//td[contains(text(),"全文字数")]/text()'
            ),
            replaces=[("全文字数:", "")],
        )
        update_time = self._first_str(
            info.xpath(
                '//table[@class="hide"]//td[contains(text(),"最后更新")]/text()'
            ),
            replaces=[("最后更新:", "")],
        )
        serial_status = self._first_str(
            info.xpath(
                '//table[@class="hide"]//td[contains(text(),"连载状态")]/text()'
            ),
            replaces=[("连载状态:", "")],
        )

        # Summary: first tab panel's inner div.
        summary_nodes = info.xpath('//div[@class="tabvalue"][1]//div')
        summary: str = summary_nodes[0].text_content().strip() if summary_nodes else ""

        # All chapters are flattened into a single "正文" volume.
        chapters: list[ChapterInfoDict] = []
        for link in catalog.xpath(
            '//ul[contains(@class,"chapters")]//li[contains(@class,"chapter")]/a'
        ):
            href = link.get("href", "").strip()
            label = link.text_content().strip()
            # chapterId is the numeric filename before ".html"
            cid = href.rsplit("/", 1)[-1].split(".")[0]
            chapters.append({"title": label, "url": href, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse chapter text out of the '#acontent' container.

        Paragraph collection starts at the <div id="content_tip"> marker and
        stops at the first trailing nav/footer div; the tails of <br/> nodes
        are the paragraphs.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Chapter identifier, echoed back in the result.
        :return: Chapter dict, or None when no paragraphs are found.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])
        containers = tree.xpath('//div[@id="acontent"]')
        if not containers:
            return None
        root = containers[0]

        h1 = root.find("h1")
        title = h1.text_content().strip() if h1 is not None else ""

        collected: list[str] = []
        collecting = False
        for child in root.xpath("./*"):
            # anchor: first <div id="content_tip"> marks where content begins
            if child.tag == "div" and child.get("id") == "content_tip":
                head = child.tail or ""
                # drop any "(小说屋 ...)" prefix before the real text
                if ")" in head:
                    head = head.split(")", 1)[1]
                head = head.lstrip("\ufeff").strip()
                if head:
                    collected.append(head)
                collecting = True
                continue

            if not collecting:
                continue

            if child.tag == "div":
                # trailing nav/footer divs mark the end of the chapter body
                marker_classes = child.get("class") or ""
                if any(
                    marker in marker_classes
                    for marker in ("tishi", "footlink", "fullbar")
                ):
                    break

            if child.tag == "br":
                # each <br/> tail is one paragraph; skip watermark lines
                text = (child.tail or "").strip()
                if text and self.AD_STR not in text:
                    collected.append(text)

        if not collected:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": "\n".join(collected),
            "extra": {"site": "xiaoshuowu"},
        }