novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +77 -64
  6. novel_downloader/cli/export.py +16 -20
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +65 -105
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +1 -0
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +14 -9
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +17 -11
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +61 -66
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  100. novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
  101. novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
  102. novel_downloader/core/parsers/qidian/main_parser.py +11 -38
  103. novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
  104. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  105. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  106. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  107. novel_downloader/core/parsers/quanben5.py +103 -0
  108. novel_downloader/core/parsers/registry.py +5 -16
  109. novel_downloader/core/parsers/sfacg.py +38 -45
  110. novel_downloader/core/parsers/shencou.py +215 -0
  111. novel_downloader/core/parsers/shuhaige.py +111 -0
  112. novel_downloader/core/parsers/tongrenquan.py +116 -0
  113. novel_downloader/core/parsers/ttkan.py +132 -0
  114. novel_downloader/core/parsers/wanbengo.py +191 -0
  115. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  116. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  117. novel_downloader/core/parsers/xs63b.py +161 -0
  118. novel_downloader/core/parsers/xshbook.py +134 -0
  119. novel_downloader/core/parsers/yamibo.py +87 -131
  120. novel_downloader/core/parsers/yibige.py +166 -0
  121. novel_downloader/core/searchers/__init__.py +34 -3
  122. novel_downloader/core/searchers/aaatxt.py +107 -0
  123. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  124. novel_downloader/core/searchers/base.py +112 -36
  125. novel_downloader/core/searchers/dxmwx.py +105 -0
  126. novel_downloader/core/searchers/eightnovel.py +84 -0
  127. novel_downloader/core/searchers/esjzone.py +43 -25
  128. novel_downloader/core/searchers/hetushu.py +92 -0
  129. novel_downloader/core/searchers/i25zw.py +93 -0
  130. novel_downloader/core/searchers/ixdzs8.py +107 -0
  131. novel_downloader/core/searchers/jpxs123.py +107 -0
  132. novel_downloader/core/searchers/piaotia.py +100 -0
  133. novel_downloader/core/searchers/qbtr.py +106 -0
  134. novel_downloader/core/searchers/qianbi.py +74 -40
  135. novel_downloader/core/searchers/quanben5.py +144 -0
  136. novel_downloader/core/searchers/registry.py +24 -8
  137. novel_downloader/core/searchers/shuhaige.py +124 -0
  138. novel_downloader/core/searchers/tongrenquan.py +110 -0
  139. novel_downloader/core/searchers/ttkan.py +92 -0
  140. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  141. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  142. novel_downloader/core/searchers/xs63b.py +104 -0
  143. novel_downloader/locales/en.json +31 -82
  144. novel_downloader/locales/zh.json +32 -83
  145. novel_downloader/models/__init__.py +21 -22
  146. novel_downloader/models/book.py +44 -0
  147. novel_downloader/models/config.py +4 -37
  148. novel_downloader/models/login.py +1 -1
  149. novel_downloader/models/search.py +5 -0
  150. novel_downloader/resources/config/settings.toml +8 -70
  151. novel_downloader/resources/json/xiguashuwu.json +718 -0
  152. novel_downloader/utils/__init__.py +13 -22
  153. novel_downloader/utils/chapter_storage.py +3 -2
  154. novel_downloader/utils/constants.py +4 -29
  155. novel_downloader/utils/cookies.py +6 -18
  156. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  157. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  158. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  159. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  160. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  161. novel_downloader/utils/epub/__init__.py +1 -1
  162. novel_downloader/utils/epub/constants.py +57 -16
  163. novel_downloader/utils/epub/documents.py +88 -194
  164. novel_downloader/utils/epub/models.py +0 -14
  165. novel_downloader/utils/epub/utils.py +63 -96
  166. novel_downloader/utils/file_utils/__init__.py +2 -23
  167. novel_downloader/utils/file_utils/io.py +3 -113
  168. novel_downloader/utils/file_utils/sanitize.py +0 -4
  169. novel_downloader/utils/fontocr.py +207 -0
  170. novel_downloader/utils/logger.py +8 -16
  171. novel_downloader/utils/network.py +2 -2
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/time_utils/__init__.py +5 -11
  176. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  177. novel_downloader/utils/time_utils/sleep_utils.py +4 -8
  178. novel_downloader/web/__init__.py +13 -0
  179. novel_downloader/web/components/__init__.py +11 -0
  180. novel_downloader/web/components/navigation.py +35 -0
  181. novel_downloader/web/main.py +66 -0
  182. novel_downloader/web/pages/__init__.py +17 -0
  183. novel_downloader/web/pages/download.py +78 -0
  184. novel_downloader/web/pages/progress.py +147 -0
  185. novel_downloader/web/pages/search.py +329 -0
  186. novel_downloader/web/services/__init__.py +17 -0
  187. novel_downloader/web/services/client_dialog.py +164 -0
  188. novel_downloader/web/services/cred_broker.py +113 -0
  189. novel_downloader/web/services/cred_models.py +35 -0
  190. novel_downloader/web/services/task_manager.py +264 -0
  191. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  192. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  193. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  194. novel_downloader/core/downloaders/biquge.py +0 -29
  195. novel_downloader/core/downloaders/esjzone.py +0 -29
  196. novel_downloader/core/downloaders/linovelib.py +0 -29
  197. novel_downloader/core/downloaders/sfacg.py +0 -29
  198. novel_downloader/core/downloaders/yamibo.py +0 -29
  199. novel_downloader/core/exporters/biquge.py +0 -22
  200. novel_downloader/core/exporters/esjzone.py +0 -22
  201. novel_downloader/core/exporters/qianbi.py +0 -22
  202. novel_downloader/core/exporters/sfacg.py +0 -22
  203. novel_downloader/core/exporters/yamibo.py +0 -22
  204. novel_downloader/core/fetchers/base/__init__.py +0 -14
  205. novel_downloader/core/fetchers/base/browser.py +0 -422
  206. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  207. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  208. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  209. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  210. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  211. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  212. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  213. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  214. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  215. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  216. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  217. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  218. novel_downloader/core/parsers/biquge.py +0 -139
  219. novel_downloader/models/chapter.py +0 -25
  220. novel_downloader/models/types.py +0 -13
  221. novel_downloader/tui/__init__.py +0 -7
  222. novel_downloader/tui/app.py +0 -32
  223. novel_downloader/tui/main.py +0 -17
  224. novel_downloader/tui/screens/__init__.py +0 -14
  225. novel_downloader/tui/screens/home.py +0 -198
  226. novel_downloader/tui/screens/login.py +0 -74
  227. novel_downloader/tui/styles/home_layout.tcss +0 -79
  228. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  229. novel_downloader/utils/cache.py +0 -24
  230. novel_downloader/utils/fontocr/__init__.py +0 -22
  231. novel_downloader/utils/fontocr/hash_store.py +0 -280
  232. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  233. novel_downloader/utils/fontocr/model_loader.py +0 -69
  234. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  235. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  236. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  237. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  238. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  239. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  240. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  241. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.wanbengo
4
+ --------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from datetime import datetime
10
+ from html import unescape
11
+ from typing import Any
12
+ from urllib.parse import urljoin
13
+
14
+ from lxml import html
15
+
16
+ from novel_downloader.core.parsers.base import BaseParser
17
+ from novel_downloader.core.parsers.registry import register_parser
18
+ from novel_downloader.models import (
19
+ BookInfoDict,
20
+ ChapterDict,
21
+ ChapterInfoDict,
22
+ VolumeInfoDict,
23
+ )
24
+
25
+
26
@register_parser(
    site_keys=["wanbengo"],
)
class WanbengoParser(BaseParser):
    """
    Parser for 完本神站 book pages.

    Book metadata is read from the info page with XPath. Chapter text is
    cut out of the raw HTML with regular expressions, split into lines and
    filtered through the noise/ad heuristics defined on this class.

    ``_first_str``, ``_norm_space`` and ``_is_ad_line`` are not defined in
    this class; they are presumably inherited from ``BaseParser``.
    """

    BASE = "https://www.wanbengo.com"

    # XPaths for the book info page
    X_BOOK_NAME = "//div[@class='detailTopMid']//h1/text()"
    X_AUTHOR = "//div[@class='detailTopMid']//div[@class='writer']//a/text()"
    X_COVER = "//div[@class='detailTopLeft']//img/@src"
    X_STATUS = "//div[@class='detailTopLeft']//span[contains(@class,'end')]/text()"
    X_WORDS = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'字数')]]/td[last()]/text()"  # noqa: E501
    X_SUMMARY = "//div[@class='detailTopMid']//table//tr[td/span[contains(text(),'简介')]]/td[last()]//text()"  # noqa: E501
    X_TAG = "//div[@class='route']/a[2]//text()"
    X_UPDATE_TXT = "//div[@class='chapterTitle']//span//text()"
    X_CHAPTERS = "//div[@class='chapter']//ul//li/a"

    # XPaths / patterns for the chapter page
    X_CHAP_TITLE = "//div[contains(@class,'readerTitle')]//h2/text()"
    # Paragraph boundaries: closing/opening <p> tags and <br> variants.
    _CHAP_SPLIT_RE = re.compile(r"(?:</p\s*>|<p\b[^>]*>|<br\s*/?>)", re.I)
    # Inner HTML of the first element whose class contains "readerCon".
    # NOTE(review): the lazy match stops at the first "</div>", so a nested
    # <div> inside the reader container would truncate the chapter.
    _CHAP_READERCON_RE = re.compile(
        r'<div[^>]*class=(?:"[^"]*readerCon[^"]*"|\'[^\']*readerCon[^\']*\')[^>]*>(.*?)</div>',
        re.I | re.S,
    )
    _TAGS_RE = re.compile(r"<[^>]+>")
    _SCRUB_RUNS_RE = re.compile(r"[_?]{2,}")
    # Trailing "(未完待续..." marker. The opening parenthesis is matched as a
    # literal (ASCII or full-width U+FF08) inside a character class; a bare
    # "(" here would open an unterminated group and fail to compile.
    _SCRUB_TAIL_RE = re.compile(r"\s*[(\uff08]未完待续.*?$")

    # fmt: off
    ADS = {
        "完本神站", "本站网址", "报错", "键盘", "客户端", "收藏", "书架",
        "猜你喜欢", "上一章", "下一章", "章节目录", "LastRead", "贴吧",
        "倾心打造", "全文无错", "分享本站", "点此章节报错", "温馨提示", "域名",
        "wanbentxt.com", "wanbengo.com",
    }
    # fmt: on
    _PUNCT_ONLY = re.compile(
        r"^[\s\W_·—\-・。,、;;::!!??\(\)()【】《》“”\"'…·]+$"
    )  # noqa: E501

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse the book info page into a ``BookInfoDict``.

        Only ``html_list[0]`` is read. All chapters are placed in a single
        volume named "正文".

        :param html_list: Raw HTML pages; the first one is the info page.
        :return: The parsed book info, or ``None`` if ``html_list`` is empty.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath(self.X_BOOK_NAME))
        author = self._first_str(tree.xpath(self.X_AUTHOR))
        cover_url = self._first_str(tree.xpath(self.X_COVER))
        # Fall back to "连载中" when the status badge yields no text.
        serial_status = (
            self._norm_space(self._first_str(tree.xpath(self.X_STATUS))) or "连载中"
        )
        word_count = self._norm_space("".join(tree.xpath(self.X_WORDS)))
        summary = self._norm_space("".join(tree.xpath(self.X_SUMMARY)))

        book_type = self._norm_space("".join(tree.xpath(self.X_TAG)))
        tags = [book_type] if book_type else []

        update_time = self._extract_update_date(tree.xpath(self.X_UPDATE_TXT))

        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath(self.X_CHAPTERS):
            title = self._norm_space("".join(a.xpath(".//text()")))
            href = a.get("href") or ""
            url = urljoin(self.BASE, href)
            # "/129/103950.html" -> "103950"
            # removesuffix() drops the literal ".html" extension only;
            # rstrip(".html") would strip any trailing run of the characters
            # ".", "h", "t", "m", "l" and could eat part of the id itself.
            cid = url.rsplit("/", 1)[-1].removesuffix(".html")
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "summary": summary,
            "tags": tags,
            "volumes": volumes,
            "serial_status": serial_status,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into a ``ChapterDict``.

        Only ``html_list[0]`` is read. The reader container is located with
        a regex on the raw HTML, split on paragraph/line-break tags, and each
        fragment is stripped of tags, unescaped, noise-filtered and scrubbed.

        :param html_list: Raw HTML pages; the first one is the chapter page.
        :param chapter_id: Identifier stored as ``"id"`` in the result.
        :return: The parsed chapter, or ``None`` if ``html_list`` is empty,
            the reader container is not found, or no text lines survive.
        """
        if not html_list:
            return None

        inner = self._CHAP_READERCON_RE.search(html_list[0])
        if not inner:
            return None

        tree = html.fromstring(html_list[0])
        title = self._first_str(tree.xpath(self.X_CHAP_TITLE))

        parts = self._CHAP_SPLIT_RE.split(inner.group(1))
        lines: list[str] = []
        for part in parts:
            if not part:
                continue
            # Drop any remaining inline tags, then decode HTML entities and
            # turn non-breaking spaces into regular spaces.
            s = self._TAGS_RE.sub("", part)
            s = unescape(s).replace("\xa0", " ")
            if self._is_noise_line(s):
                continue
            s = self._norm_space(self._scrub_ascii_gibberish(s.strip()))
            if s:
                lines.append(s)

        content = "\n".join(lines)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "wanbengo"},
        }

    @staticmethod
    def _extract_update_date(texts: list[str]) -> str:
        """
        Find a YYYY-MM-DD anywhere in the provided text nodes.

        If none found, return today's date.
        """
        joined = " ".join(t for t in texts if t)
        m = re.search(r"\b(\d{4}-\d{2}-\d{2})\b", joined)
        if m:
            return m.group(1)
        return datetime.now().strftime("%Y-%m-%d")

    def _is_noise_line(self, s: str) -> bool:
        """
        Heuristic to drop obvious ad/footer/noise lines.

        A line is noise when it is blank, when ``_is_ad_line`` flags it, or
        when it consists solely of whitespace/punctuation.
        """
        if not s.strip():
            return True
        if self._is_ad_line(s):
            return True
        if self._PUNCT_ONLY.match(s):
            return True
        return False

    @classmethod
    def _scrub_ascii_gibberish(cls, s: str) -> str:
        """
        Remove common injected ASCII junk like long runs of '?' or '_'
        while keeping normal text intact.

        Also removes a trailing "(未完待续..." marker and returns the
        result stripped of surrounding whitespace.
        """
        s = s.replace("()?()", "").replace("[(.)]", "")
        # Normalise the full-width full stop (U+FF0E) to an ASCII period.
        # NOTE(review): the displayed source replaced "." with ".", a no-op;
        # U+FF0E is assumed to be the intended character - confirm.
        s = s.replace("\uff0e", ".")
        s = cls._SCRUB_RUNS_RE.sub("", s)  # drop runs like ???? or ____
        s = cls._SCRUB_TAIL_RE.sub("", s)
        return s.strip()
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.xiaoshuowu
4
+ ----------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["xiaoshuowu", "xiaoshuoge"],
)
class XiaoshuowuParser(BaseParser):
    """
    Parser for 小说屋 (xiaoshuoge.info).

    ``parse_book_info`` needs two pages (info page, then catalog page);
    ``parse_chapter`` reads a single chapter page.
    """

    AD_STR: str = "小说屋 www.xiaoshuoge.info"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Build a ``BookInfoDict`` from the info page and the catalog page.

        :param html_list: ``[info_page_html, catalog_page_html, ...]``.
        :return: Parsed book info, or ``None`` when fewer than two pages
            are supplied.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])
        first = self._first_str
        query = info_tree.xpath

        # Open Graph metadata
        book_name = first(query('//meta[@property="og:novel:book_name"]/@content'))
        author = first(query('//meta[@property="og:novel:author"]/@content'))

        # The category, when present, is the only tag
        category = first(query('//meta[@property="og:novel:category"]/@content'))
        tags = []
        if category:
            tags.append(category)

        # Labelled cells of the hidden info table; the label is removed
        word_count = first(
            query('//table[@class="hide"]//td[contains(text(),"全文字数")]/text()'),
            replaces=[("全文字数:", "")],
        )
        update_time = first(
            query('//table[@class="hide"]//td[contains(text(),"最后更新")]/text()'),
            replaces=[("最后更新:", "")],
        )
        serial_status = first(
            query('//table[@class="hide"]//td[contains(text(),"连载状态")]/text()'),
            replaces=[("连载状态:", "")],
        )

        cover_url = first(query('//meta[@property="og:image"]/@content'))

        # Summary: text of the first matching <div>, or empty when absent
        summary_nodes = query('//div[@class="tabvalue"][1]//div')
        summary: str
        if summary_nodes:
            summary = summary_nodes[0].text_content().strip()
        else:
            summary = ""

        # Catalog: every chapter link goes into one volume
        chapters: list[ChapterInfoDict] = []
        for link in catalog_tree.xpath(
            '//ul[contains(@class,"chapters")]//li[contains(@class,"chapter")]/a'
        ):
            href = link.get("href", "").strip()
            label = link.text_content().strip()
            # id = last path segment up to its first "."
            filename = href.rsplit("/", 1)[-1]
            chapters.append(
                {"title": label, "url": href, "chapterId": filename.split(".")[0]}
            )

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract title and paragraphs from a chapter page.

        Text is gathered from the direct children of ``<div id="acontent">``:
        collection starts at ``<div id="content_tip">`` (whose tail holds the
        first paragraph) and each later ``<br>`` tail is one paragraph.

        :param html_list: Raw HTML pages; only the first one is read.
        :param chapter_id: Identifier stored as ``"id"`` in the result.
        :return: Parsed chapter, or ``None`` when ``html_list`` is empty, the
            container is missing, or no paragraph was collected.
        """
        if not html_list:
            return None

        matches = html.fromstring(html_list[0]).xpath('//div[@id="acontent"]')
        if not matches:
            return None
        container = matches[0]

        # Title comes from an <h1> directly under the container, if any
        heading = container.find("h1")
        title = "" if heading is None else heading.text_content().strip()

        stop_markers = ("tishi", "footlink", "fullbar")
        collected: list[str] = []
        collecting = False
        for child in container.xpath("./*"):
            tag = child.tag
            if tag == "div" and child.get("id") == "content_tip":
                # The text following the tip div is the first paragraph;
                # anything up to and including the first ")" is discarded.
                tail = child.tail or ""
                _, found, remainder = tail.partition(")")
                if found:
                    tail = remainder
                lead = tail.lstrip("\ufeff").strip()
                if lead:
                    collected.append(lead)
                collecting = True
            elif collecting:
                if tag == "div":
                    # Only divs whose class names one of the markers end
                    # the chapter body; other divs are ignored.
                    css = child.get("class") or ""
                    if any(marker in css for marker in stop_markers):
                        break
                elif tag == "br":
                    # Each <br/> tail is a paragraph; skip blanks and ads
                    text = (child.tail or "").strip()
                    if text and self.AD_STR not in text:
                        collected.append(text)

        if not collected:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": "\n".join(collected),
            "extra": {"site": "xiaoshuowu"},
        }