novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +77 -64
  6. novel_downloader/cli/export.py +16 -20
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +65 -105
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +1 -0
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +14 -9
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +17 -11
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +61 -66
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  100. novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
  101. novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
  102. novel_downloader/core/parsers/qidian/main_parser.py +11 -38
  103. novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
  104. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  105. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  106. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  107. novel_downloader/core/parsers/quanben5.py +103 -0
  108. novel_downloader/core/parsers/registry.py +5 -16
  109. novel_downloader/core/parsers/sfacg.py +38 -45
  110. novel_downloader/core/parsers/shencou.py +215 -0
  111. novel_downloader/core/parsers/shuhaige.py +111 -0
  112. novel_downloader/core/parsers/tongrenquan.py +116 -0
  113. novel_downloader/core/parsers/ttkan.py +132 -0
  114. novel_downloader/core/parsers/wanbengo.py +191 -0
  115. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  116. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  117. novel_downloader/core/parsers/xs63b.py +161 -0
  118. novel_downloader/core/parsers/xshbook.py +134 -0
  119. novel_downloader/core/parsers/yamibo.py +87 -131
  120. novel_downloader/core/parsers/yibige.py +166 -0
  121. novel_downloader/core/searchers/__init__.py +34 -3
  122. novel_downloader/core/searchers/aaatxt.py +107 -0
  123. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  124. novel_downloader/core/searchers/base.py +112 -36
  125. novel_downloader/core/searchers/dxmwx.py +105 -0
  126. novel_downloader/core/searchers/eightnovel.py +84 -0
  127. novel_downloader/core/searchers/esjzone.py +43 -25
  128. novel_downloader/core/searchers/hetushu.py +92 -0
  129. novel_downloader/core/searchers/i25zw.py +93 -0
  130. novel_downloader/core/searchers/ixdzs8.py +107 -0
  131. novel_downloader/core/searchers/jpxs123.py +107 -0
  132. novel_downloader/core/searchers/piaotia.py +100 -0
  133. novel_downloader/core/searchers/qbtr.py +106 -0
  134. novel_downloader/core/searchers/qianbi.py +74 -40
  135. novel_downloader/core/searchers/quanben5.py +144 -0
  136. novel_downloader/core/searchers/registry.py +24 -8
  137. novel_downloader/core/searchers/shuhaige.py +124 -0
  138. novel_downloader/core/searchers/tongrenquan.py +110 -0
  139. novel_downloader/core/searchers/ttkan.py +92 -0
  140. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  141. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  142. novel_downloader/core/searchers/xs63b.py +104 -0
  143. novel_downloader/locales/en.json +31 -82
  144. novel_downloader/locales/zh.json +32 -83
  145. novel_downloader/models/__init__.py +21 -22
  146. novel_downloader/models/book.py +44 -0
  147. novel_downloader/models/config.py +4 -37
  148. novel_downloader/models/login.py +1 -1
  149. novel_downloader/models/search.py +5 -0
  150. novel_downloader/resources/config/settings.toml +8 -70
  151. novel_downloader/resources/json/xiguashuwu.json +718 -0
  152. novel_downloader/utils/__init__.py +13 -22
  153. novel_downloader/utils/chapter_storage.py +3 -2
  154. novel_downloader/utils/constants.py +4 -29
  155. novel_downloader/utils/cookies.py +6 -18
  156. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  157. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  158. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  159. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  160. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  161. novel_downloader/utils/epub/__init__.py +1 -1
  162. novel_downloader/utils/epub/constants.py +57 -16
  163. novel_downloader/utils/epub/documents.py +88 -194
  164. novel_downloader/utils/epub/models.py +0 -14
  165. novel_downloader/utils/epub/utils.py +63 -96
  166. novel_downloader/utils/file_utils/__init__.py +2 -23
  167. novel_downloader/utils/file_utils/io.py +3 -113
  168. novel_downloader/utils/file_utils/sanitize.py +0 -4
  169. novel_downloader/utils/fontocr.py +207 -0
  170. novel_downloader/utils/logger.py +8 -16
  171. novel_downloader/utils/network.py +2 -2
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/time_utils/__init__.py +5 -11
  176. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  177. novel_downloader/utils/time_utils/sleep_utils.py +4 -8
  178. novel_downloader/web/__init__.py +13 -0
  179. novel_downloader/web/components/__init__.py +11 -0
  180. novel_downloader/web/components/navigation.py +35 -0
  181. novel_downloader/web/main.py +66 -0
  182. novel_downloader/web/pages/__init__.py +17 -0
  183. novel_downloader/web/pages/download.py +78 -0
  184. novel_downloader/web/pages/progress.py +147 -0
  185. novel_downloader/web/pages/search.py +329 -0
  186. novel_downloader/web/services/__init__.py +17 -0
  187. novel_downloader/web/services/client_dialog.py +164 -0
  188. novel_downloader/web/services/cred_broker.py +113 -0
  189. novel_downloader/web/services/cred_models.py +35 -0
  190. novel_downloader/web/services/task_manager.py +264 -0
  191. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  192. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  193. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  194. novel_downloader/core/downloaders/biquge.py +0 -29
  195. novel_downloader/core/downloaders/esjzone.py +0 -29
  196. novel_downloader/core/downloaders/linovelib.py +0 -29
  197. novel_downloader/core/downloaders/sfacg.py +0 -29
  198. novel_downloader/core/downloaders/yamibo.py +0 -29
  199. novel_downloader/core/exporters/biquge.py +0 -22
  200. novel_downloader/core/exporters/esjzone.py +0 -22
  201. novel_downloader/core/exporters/qianbi.py +0 -22
  202. novel_downloader/core/exporters/sfacg.py +0 -22
  203. novel_downloader/core/exporters/yamibo.py +0 -22
  204. novel_downloader/core/fetchers/base/__init__.py +0 -14
  205. novel_downloader/core/fetchers/base/browser.py +0 -422
  206. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  207. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  208. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  209. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  210. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  211. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  212. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  213. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  214. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  215. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  216. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  217. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  218. novel_downloader/core/parsers/biquge.py +0 -139
  219. novel_downloader/models/chapter.py +0 -25
  220. novel_downloader/models/types.py +0 -13
  221. novel_downloader/tui/__init__.py +0 -7
  222. novel_downloader/tui/app.py +0 -32
  223. novel_downloader/tui/main.py +0 -17
  224. novel_downloader/tui/screens/__init__.py +0 -14
  225. novel_downloader/tui/screens/home.py +0 -198
  226. novel_downloader/tui/screens/login.py +0 -74
  227. novel_downloader/tui/styles/home_layout.tcss +0 -79
  228. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  229. novel_downloader/utils/cache.py +0 -24
  230. novel_downloader/utils/fontocr/__init__.py +0 -22
  231. novel_downloader/utils/fontocr/hash_store.py +0 -280
  232. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  233. novel_downloader/utils/fontocr/model_loader.py +0 -69
  234. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  235. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  236. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  237. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  238. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  239. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  240. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  241. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.jpxs123
4
+ -------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["jpxs123"],
)
class Jpxs123Parser(BaseParser):
    """
    Parser for 精品小说网 book pages.
    """

    BASE_URL = "https://www.jpxs123.com"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract metadata and the chapter list from a book info page.

        :param html_list: [info page HTML, optional download page HTML].
        :return: Parsed book info, or ``None`` when no HTML was supplied.
        """
        if not html_list:
            return None

        # Parse the main info page
        tree = html.fromstring(html_list[0])

        book_name = self._first_str(tree.xpath('//div[@class="infos"]/h1/text()'))

        # Tags: the second breadcrumb (e.g., "同人小说")
        tag = self._first_str(
            tree.xpath('//div[contains(@class,"menNav")]/a[2]/text()')
        )
        tags = [tag] if tag else []

        author = self._first_str(tree.xpath('//div[@class="date"]/span[1]//a/text()'))
        update_time = self._first_str(
            tree.xpath('//div[@class="date"]/span[2]/text()'), replaces=[("时间:", "")]
        )

        # Cover paths on the site are usually relative; only prefix those.
        cover_rel = self._first_str(tree.xpath('//div[@class="pic"]/img/@src'))
        cover_url = (
            f"{self.BASE_URL}{cover_rel}"
            if cover_rel and not cover_rel.startswith("http")
            else cover_rel
        )

        # Summary from the <p> inside infos
        paras = tree.xpath('//div[@class="infos"]/p//text()')
        summary = "\n".join(p.strip() for p in paras if p.strip())

        # Chapters from the book_list.
        # Chapter URL shape: /{category}/{bookId}/{chapterId}.html
        chapters: list[ChapterInfoDict] = []
        for a in tree.xpath('//div[contains(@class,"book_list")]//li/a'):
            url = a.get("href", "").strip()
            if not url:
                # Anchor without a target would yield an empty chapterId; skip it.
                continue
            title = a.text_content().strip()
            cid = url.split("/")[-1].split(".")[0]
            chapters.append({"title": title, "url": url, "chapterId": cid})

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        # Parse the download page (second HTML), when present.
        download_url = ""
        if len(html_list) > 1 and html_list[1]:
            dtree = html.fromstring(html_list[1])
            a = dtree.xpath('//a[@id="dowloadnUrl"]')
            if a:
                # The real target may live in a non-standard "link" attribute.
                link = a[0].get("link") or a[0].get("href") or ""
                download_url = self._fix_download_link(link)

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {"download_url": download_url},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract the title and cleaned text of a single chapter.

        :param html_list: [chapter page HTML].
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter data, or ``None`` when no usable content was found.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        raw_title = self._first_str(
            tree.xpath('//div[contains(@class,"read_chapterName")]//h1/text()')
        )

        # The breadcrumb's last link is the book name; strip it from the title.
        crumbs = tree.xpath('//div[contains(@class,"readTop")]//a/text()')
        book_name = crumbs[-1].strip() if crumbs else ""
        title = raw_title.replace(book_name, "").strip()

        texts = [
            txt
            for p in tree.xpath('//div[contains(@class,"read_chapterDetail")]/p')
            if (txt := p.text_content().strip())
        ]

        content = "\n".join(texts)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "jpxs123"},
        }

    @classmethod
    def _fix_download_link(cls, link: str) -> str:
        """
        Normalize the obfuscated download link found on the download page.

        The site hides the real path behind an ``xs../`` prefix which maps to
        ``/e/DownSys/``. Already-absolute links are returned unchanged so the
        base URL is never double-prefixed.
        """
        if link.startswith("http"):
            return link
        true_link = link.replace("xs../", "/e/DownSys/")
        return f"{cls.BASE_URL}{true_link}"
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.lewenn
4
+ ------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["lewenn", "lewen"],
)
class LewennParser(BaseParser):
    """
    Parser for 乐文小说网 book pages.
    """

    BASE_URL = "https://www.lewenn.net"

    # Substrings marking ad / navigation lines to drop from chapter text
    # (consumed by the base class's `_is_ad_line` helper).
    ADS: set[str] = {
        "app2",
        "read2",
        "chaptererror",
        "记住乐文小说网",
        "lewenn.net",
    }

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract metadata and the chapter list from a book info page.

        :param html_list: [info page HTML].
        :return: Parsed book info, or ``None`` when no HTML was supplied.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # --- Metadata ---
        book_name = self._first_str(tree.xpath('//div[@id="info"]/h1/text()'))
        author = self._first_str(
            tree.xpath('//div[@id="info"]/p[1]/text()'),
            replaces=[(chr(0xA0), ""), ("作者:", "")],
        )
        serial_status = self._first_str(
            tree.xpath('//div[@id="info"]/p[2]/text()'),
            replaces=[(chr(0xA0), ""), ("状态:", "")],
        )
        update_time = self._first_str(
            tree.xpath('//div[@id="info"]/p[3]/text()'),
            replaces=[("最后更新:", "")],
        )

        cover_src = self._first_str(tree.xpath('//div[@id="sidebar"]//img/@src'))
        cover_url = (
            cover_src if cover_src.startswith("http") else f"{self.BASE_URL}{cover_src}"
        )

        summary_lines = tree.xpath('//div[@id="intro"]/p//text()')
        summary = "\n".join(line.strip() for line in summary_lines).strip()

        # --- Volumes & Chapters ---
        # The catalog lists "latest chapters" first; only the <dd> siblings of
        # the <dt> containing "正文" form the real, ordered chapter list.
        chapters: list[ChapterInfoDict] = []
        for dt in tree.xpath('//div[@class="listmain"]/dl/dt'):
            if "正文" not in dt.text_content():
                continue
            sib = dt.getnext()
            while sib is not None and sib.tag == "dd":
                anchors = sib.xpath(".//a")
                if not anchors:
                    # Malformed <dd> without a link; skip it rather than crash.
                    sib = sib.getnext()
                    continue
                a = anchors[0]
                chap_title = a.text_content().strip()
                href = a.get("href", "")
                url = href if href.startswith("http") else f"{self.BASE_URL}{href}"
                # '/lw12345/678.html' -> '678'.
                # NOTE: str.rstrip(".html") would strip any trailing run of the
                # characters '.', 'h', 't', 'm', 'l' — not the suffix — so we
                # split on '/' and '.' instead.
                chap_id = url.split("/")[-1].split(".")[0]
                chapters.append(
                    {"title": chap_title, "url": url, "chapterId": chap_id}
                )
                sib = sib.getnext()
            break

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "serial_status": serial_status,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract the title and cleaned text of a single chapter.

        :param html_list: [chapter page HTML].
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter data, or ``None`` when no usable content was found.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        title = self._first_str(tree.xpath('//div[@class="content"]/h1/text()'))

        nodes = tree.xpath('//div[@id="content" and contains(@class,"showtxt")]')
        if not nodes:
            return None
        content_div = nodes[0]

        # Keep non-empty, non-ad lines; drop non-breaking spaces.
        lines: list[str] = []
        for raw in content_div.xpath(".//text()"):
            ln = raw.strip()
            if not ln or self._is_ad_line(ln):
                continue
            lines.append(ln.replace(chr(0xA0), ""))

        content = "\n".join(lines)
        if not content.strip():
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "lewenn"},
        }
@@ -7,23 +7,28 @@ novel_downloader.core.parsers.linovelib
7
7
 
8
8
  import json
9
9
  from itertools import islice
10
- from pathlib import PurePosixPath
11
10
  from typing import Any
12
11
 
13
12
  from lxml import html
14
13
 
15
14
  from novel_downloader.core.parsers.base import BaseParser
16
15
  from novel_downloader.core.parsers.registry import register_parser
17
- from novel_downloader.models import ChapterDict
16
+ from novel_downloader.models import (
17
+ BookInfoDict,
18
+ ChapterDict,
19
+ ChapterInfoDict,
20
+ VolumeInfoDict,
21
+ )
18
22
  from novel_downloader.utils.constants import LINOVELIB_FONT_MAP_PATH
19
23
 
20
24
 
21
25
  @register_parser(
22
26
  site_keys=["linovelib"],
23
- backends=["session", "browser"],
24
27
  )
25
28
  class LinovelibParser(BaseParser):
26
- """ """
29
+ """
30
+ Parser for 哔哩轻小说 book pages.
31
+ """
27
32
 
28
33
  # Book info XPaths
29
34
  _BOOK_NAME_XPATH = '//div[@class="book-info"]/h1[@class="book-name"]/text()'
@@ -51,68 +56,69 @@ class LinovelibParser(BaseParser):
51
56
  self,
52
57
  html_list: list[str],
53
58
  **kwargs: Any,
54
- ) -> dict[str, Any]:
55
- """
56
- Parse a book info page and extract metadata and chapter structure.
57
-
58
- :param html_list: Raw HTML of the book info page.
59
- :return: Parsed metadata and chapter structure as a dictionary.
60
- """
59
+ ) -> BookInfoDict | None:
61
60
  if not html_list:
62
- return {}
63
- info_tree = html.fromstring(html_list[0])
64
- result: dict[str, Any] = {}
65
-
66
- result["book_name"] = self._safe_xpath(info_tree, self._BOOK_NAME_XPATH)
67
- result["author"] = self._safe_xpath(info_tree, self._AUTHOR_XPATH)
68
- result["cover_url"] = self._safe_xpath(info_tree, self._COVER_URL_XPATH)
69
- result["update_time"] = self._safe_xpath(
70
- info_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
61
+ return None
62
+ tree = html.fromstring(html_list[0])
63
+
64
+ book_name = self._first_str(tree.xpath(self._BOOK_NAME_XPATH))
65
+ author = self._first_str(tree.xpath(self._AUTHOR_XPATH))
66
+ cover_url = self._first_str(tree.xpath(self._COVER_URL_XPATH))
67
+ update_time = self._first_str(
68
+ tree.xpath(self._UPDATE_TIME_XPATH), replaces=[("最后更新:", "")]
71
69
  )
72
- result["serial_status"] = self._safe_xpath(info_tree, self._SERIAL_STATUS_XPATH)
73
- result["word_count"] = self._safe_xpath(
74
- info_tree, self._WORD_COUNT_XPATH, replace=("字数:", "")
70
+ serial_status = self._first_str(tree.xpath(self._SERIAL_STATUS_XPATH))
71
+ word_count = self._first_str(
72
+ tree.xpath(self._WORD_COUNT_XPATH), replaces=[("字数:", "")]
75
73
  )
76
74
 
77
- result["summary"] = self._extract_intro(info_tree, self._SUMMARY_XPATH)
75
+ summary = self._extract_intro(tree, self._SUMMARY_XPATH)
78
76
 
79
77
  vol_pages = html_list[1:]
80
- volumes: list[dict[str, Any]] = []
78
+ volumes: list[VolumeInfoDict] = []
81
79
  for vol_page in vol_pages:
82
80
  vol_tree = html.fromstring(vol_page)
83
- volume_cover = self._safe_xpath(vol_tree, self._COVER_URL_XPATH)
84
- volume_name = self._safe_xpath(vol_tree, self._BOOK_NAME_XPATH)
85
- update_time = self._safe_xpath(
86
- vol_tree, self._UPDATE_TIME_XPATH, replace=("最后更新:", "")
81
+ volume_cover = self._first_str(vol_tree.xpath(self._COVER_URL_XPATH))
82
+ volume_name = self._first_str(vol_tree.xpath(self._BOOK_NAME_XPATH))
83
+ vol_update_time = self._first_str(
84
+ vol_tree.xpath(self._UPDATE_TIME_XPATH), replaces=[("最后更新:", "")]
87
85
  )
88
- word_count = self._safe_xpath(
89
- vol_tree, self._WORD_COUNT_XPATH, replace=("字数:", "")
86
+ vol_word_count = self._first_str(
87
+ vol_tree.xpath(self._WORD_COUNT_XPATH), replaces=[("字数:", "")]
90
88
  )
91
89
  volume_intro = self._extract_intro(vol_tree, self._SUMMARY_XPATH)
92
90
 
93
- chapters = []
91
+ chapters: list[ChapterInfoDict] = []
94
92
  chapter_elements = vol_tree.xpath(self._CHAPTERS_XPATH)
95
93
  for a in chapter_elements:
96
94
  title = a.text.strip()
97
95
  url = a.attrib.get("href", "").strip()
98
- chap_path = PurePosixPath(url.rstrip("/"))
99
- chapters.append(
100
- {"title": title, "url": url, "chapterId": chap_path.stem}
101
- )
96
+ # '/novel/4668/276082.html' -> '276082'
97
+ cid = url.split("/")[-1].split(".")[0]
98
+ chapters.append({"title": title, "url": url, "chapterId": cid})
102
99
 
103
100
  volumes.append(
104
101
  {
105
102
  "volume_name": volume_name,
106
103
  "volume_cover": volume_cover,
107
- "update_time": update_time,
108
- "word_count": word_count,
104
+ "update_time": vol_update_time,
105
+ "word_count": vol_word_count,
109
106
  "volume_intro": volume_intro,
110
107
  "chapters": chapters,
111
108
  }
112
109
  )
113
- result["volumes"] = volumes
114
110
 
115
- return result
111
+ return {
112
+ "book_name": book_name,
113
+ "author": author,
114
+ "cover_url": cover_url,
115
+ "serial_status": serial_status,
116
+ "word_count": word_count,
117
+ "summary": summary,
118
+ "update_time": update_time,
119
+ "volumes": volumes,
120
+ "extra": {},
121
+ }
116
122
 
117
123
  def parse_chapter(
118
124
  self,
@@ -120,13 +126,6 @@ class LinovelibParser(BaseParser):
120
126
  chapter_id: str,
121
127
  **kwargs: Any,
122
128
  ) -> ChapterDict | None:
123
- """
124
- Parse chapter pages and extract clean text or simplified HTML.
125
-
126
- :param html_list: Raw HTML of the chapter page.
127
- :param chapter_id: Identifier of the chapter being parsed.
128
- :return: Cleaned chapter content as plain text or minimal HTML.
129
- """
130
129
  if not html_list:
131
130
  return None
132
131
  title_text: str = ""
@@ -170,25 +169,10 @@ class LinovelibParser(BaseParser):
170
169
  return {
171
170
  "id": chapter_id,
172
171
  "title": title_text,
173
- "content": "\n\n".join(contents),
172
+ "content": "\n".join(contents),
174
173
  "extra": {"site": "linovelib"},
175
174
  }
176
175
 
177
- def _safe_xpath(
178
- self,
179
- tree: html.HtmlElement,
180
- path: str,
181
- replace: tuple[str, str] | None = None,
182
- ) -> str:
183
- result = tree.xpath(path)
184
- if not result:
185
- return ""
186
- value: str = result[0].strip()
187
- if replace:
188
- old, new = replace
189
- value = value.replace(old, new)
190
- return value
191
-
192
176
  @staticmethod
193
177
  def _extract_intro(tree: html.HtmlElement, xpath: str) -> str:
194
178
  paragraphs = tree.xpath(xpath.replace("//text()", ""))
@@ -197,7 +181,7 @@ class LinovelibParser(BaseParser):
197
181
  text_segments = p.xpath(".//text()")
198
182
  cleaned = [seg.strip() for seg in text_segments if seg.strip()]
199
183
  lines.append("\n".join(cleaned))
200
- return "\n\n".join(lines)
184
+ return "\n".join(lines)
201
185
 
202
186
  @staticmethod
203
187
  def _is_encrypted(html: str) -> bool:
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.piaotia
4
+ -------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
+ @register_parser(
24
+ site_keys=["piaotia"],
25
+ )
26
+ class PiaotiaParser(BaseParser):
27
+ """
28
+ Parser for 飘天文学网 book pages.
29
+ """
30
+
31
+ _RE_DEVICE_DIV = re.compile(
32
+ r'<div\s+id=[\'"“”]?device[\'"“”]?[^>]*>',
33
+ flags=re.IGNORECASE,
34
+ )
35
+
36
+ def parse_book_info(
37
+ self,
38
+ html_list: list[str],
39
+ **kwargs: Any,
40
+ ) -> BookInfoDict | None:
41
+ if len(html_list) < 2:
42
+ return None
43
+
44
+ # Parse trees
45
+ info_tree = html.fromstring(html_list[0])
46
+ catalog_tree = html.fromstring(html_list[1])
47
+
48
+ book_name = self._first_str(info_tree.xpath("//span[@style]//h1/text()"))
49
+ author = self._first_str(
50
+ info_tree.xpath(
51
+ '//td[contains(text(),"作") and contains(text(),"者")]/text()'
52
+ ),
53
+ replaces=[(chr(0xA0), ""), (" ", ""), ("作者:", "")],
54
+ )
55
+
56
+ # Category as tag
57
+ category = self._first_str(
58
+ info_tree.xpath(
59
+ '//td[contains(text(),"类") and contains(text(),"别")]/text()'
60
+ ),
61
+ replaces=[(chr(0xA0), ""), (" ", ""), ("类别:", "")],
62
+ )
63
+ tags = [category] if category else []
64
+
65
+ word_count = self._first_str(
66
+ info_tree.xpath('//td[contains(text(),"全文长度")]/text()'),
67
+ replaces=[(chr(0xA0), ""), (" ", ""), ("全文长度:", "")],
68
+ )
69
+
70
+ update_time = self._first_str(
71
+ info_tree.xpath('//td[contains(text(),"最后更新")]/text()'),
72
+ replaces=[(chr(0xA0), ""), (" ", ""), ("最后更新:", "")],
73
+ )
74
+
75
+ serial_status = self._first_str(
76
+ info_tree.xpath('//td[contains(text(),"文章状态")]/text()'),
77
+ replaces=[(chr(0xA0), ""), (" ", ""), ("文章状态:", "")],
78
+ )
79
+
80
+ cover_url = self._first_str(info_tree.xpath('//td[@width="80%"]//img/@src'))
81
+
82
+ # Summary
83
+ summary_divs = info_tree.xpath('//td[@width="80%"]/div')
84
+ if summary_divs:
85
+ raw = str(summary_divs[0].text_content())
86
+ summary = raw.split("内容简介:")[-1].strip()
87
+ else:
88
+ summary = ""
89
+
90
+ # Chapters (single volume)
91
+ chapters: list[ChapterInfoDict] = []
92
+ for a in catalog_tree.xpath('//div[@class="centent"]//ul/li/a'):
93
+ title = (a.text or "").strip()
94
+ url = a.get("href", "").strip()
95
+ chapter_id = url.split(".")[0]
96
+ chapters.append({"title": title, "url": url, "chapterId": chapter_id})
97
+
98
+ # Single volume
99
+ volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
100
+
101
+ return {
102
+ "book_name": book_name,
103
+ "author": author,
104
+ "cover_url": cover_url,
105
+ "update_time": update_time,
106
+ "summary": summary,
107
+ "volumes": volumes,
108
+ "tags": tags,
109
+ "word_count": word_count,
110
+ "serial_status": serial_status,
111
+ "extra": {},
112
+ }
113
+
114
+ def parse_chapter(
115
+ self,
116
+ html_list: list[str],
117
+ chapter_id: str,
118
+ **kwargs: Any,
119
+ ) -> ChapterDict | None:
120
+ """
121
+ Parse chapter page and extract the content of one chapter.
122
+
123
+ p.s. 结构好混乱:
124
+ 1. `<head>` 没有对应的 `</head>`, 同理 `</body>` 没有对应的 `<body>`
125
+ 2. 部分 html 通过 js 直接写入, 例如:
126
+ `document.write("<div id=\"main\" class=\"colors1 sidebar\">");`
127
+ 3. 部分 div 的 id 或 style 属性周围的引号是非标准的波浪引号, 例如:
128
+ `<div id=”device” style=”background-color...”>`,
129
+ 并也没有对应的 `</div>`
130
+
131
+ :param html_list: The HTML list of the chapter pages.
132
+ :param chapter_id: Identifier of the chapter being parsed.
133
+ :return: The chapter's data.
134
+ """
135
+ if not html_list:
136
+ return None
137
+
138
+ raw = self._RE_DEVICE_DIV.sub("", html_list[0])
139
+ raw = raw.replace(
140
+ '<script language="javascript">GetMode();</script>',
141
+ '<div id="main" class="colors1 sidebar">',
142
+ ).replace(
143
+ '<script language="javascript">GetFont();</script>',
144
+ '<div id="content">',
145
+ )
146
+
147
+ doc = html.fromstring(raw)
148
+ container = doc.xpath('//div[@id="content"]')
149
+ root = container[0] if container else doc
150
+
151
+ # Title comes straight from the <h1>
152
+ title = ""
153
+ h1 = root.find(".//h1")
154
+ if h1 is not None:
155
+ full = h1.text_content().strip()
156
+ a_txt = h1.xpath("./a/text()")
157
+ title = full.replace(a_txt[0].strip(), "").strip() if a_txt else full
158
+
159
+ # Walk the “script‑tables” -> <br> siblings for the body
160
+ table = root.xpath('.//table[@align="center" and @border]')
161
+ if not table:
162
+ return None
163
+ node = table[0].getnext()
164
+
165
+ lines: list[str] = []
166
+ while node is not None:
167
+ # stop at the next table or any bottom‑link nav div
168
+ if (node.tag == "table" and node.get("border")) or (
169
+ node.tag == "div" and node.get("class", "").endswith("link")
170
+ ):
171
+ break
172
+
173
+ if node.tag == "br":
174
+ txt = (node.tail or "").replace("\xa0", " ").strip()
175
+ if txt:
176
+ lines.append(txt)
177
+
178
+ node = node.getnext()
179
+
180
+ content = "\n".join(lines).strip()
181
+ if not content:
182
+ return None
183
+
184
+ return {
185
+ "id": chapter_id,
186
+ "title": title,
187
+ "content": content,
188
+ "extra": {"site": "piaotia"},
189
+ }