novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +77 -64
  6. novel_downloader/cli/export.py +16 -20
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +65 -105
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +1 -0
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +14 -9
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +17 -11
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +61 -66
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  100. novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
  101. novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
  102. novel_downloader/core/parsers/qidian/main_parser.py +11 -38
  103. novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
  104. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  105. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  106. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  107. novel_downloader/core/parsers/quanben5.py +103 -0
  108. novel_downloader/core/parsers/registry.py +5 -16
  109. novel_downloader/core/parsers/sfacg.py +38 -45
  110. novel_downloader/core/parsers/shencou.py +215 -0
  111. novel_downloader/core/parsers/shuhaige.py +111 -0
  112. novel_downloader/core/parsers/tongrenquan.py +116 -0
  113. novel_downloader/core/parsers/ttkan.py +132 -0
  114. novel_downloader/core/parsers/wanbengo.py +191 -0
  115. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  116. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  117. novel_downloader/core/parsers/xs63b.py +161 -0
  118. novel_downloader/core/parsers/xshbook.py +134 -0
  119. novel_downloader/core/parsers/yamibo.py +87 -131
  120. novel_downloader/core/parsers/yibige.py +166 -0
  121. novel_downloader/core/searchers/__init__.py +34 -3
  122. novel_downloader/core/searchers/aaatxt.py +107 -0
  123. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  124. novel_downloader/core/searchers/base.py +112 -36
  125. novel_downloader/core/searchers/dxmwx.py +105 -0
  126. novel_downloader/core/searchers/eightnovel.py +84 -0
  127. novel_downloader/core/searchers/esjzone.py +43 -25
  128. novel_downloader/core/searchers/hetushu.py +92 -0
  129. novel_downloader/core/searchers/i25zw.py +93 -0
  130. novel_downloader/core/searchers/ixdzs8.py +107 -0
  131. novel_downloader/core/searchers/jpxs123.py +107 -0
  132. novel_downloader/core/searchers/piaotia.py +100 -0
  133. novel_downloader/core/searchers/qbtr.py +106 -0
  134. novel_downloader/core/searchers/qianbi.py +74 -40
  135. novel_downloader/core/searchers/quanben5.py +144 -0
  136. novel_downloader/core/searchers/registry.py +24 -8
  137. novel_downloader/core/searchers/shuhaige.py +124 -0
  138. novel_downloader/core/searchers/tongrenquan.py +110 -0
  139. novel_downloader/core/searchers/ttkan.py +92 -0
  140. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  141. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  142. novel_downloader/core/searchers/xs63b.py +104 -0
  143. novel_downloader/locales/en.json +31 -82
  144. novel_downloader/locales/zh.json +32 -83
  145. novel_downloader/models/__init__.py +21 -22
  146. novel_downloader/models/book.py +44 -0
  147. novel_downloader/models/config.py +4 -37
  148. novel_downloader/models/login.py +1 -1
  149. novel_downloader/models/search.py +5 -0
  150. novel_downloader/resources/config/settings.toml +8 -70
  151. novel_downloader/resources/json/xiguashuwu.json +718 -0
  152. novel_downloader/utils/__init__.py +13 -22
  153. novel_downloader/utils/chapter_storage.py +3 -2
  154. novel_downloader/utils/constants.py +4 -29
  155. novel_downloader/utils/cookies.py +6 -18
  156. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  157. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  158. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  159. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  160. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  161. novel_downloader/utils/epub/__init__.py +1 -1
  162. novel_downloader/utils/epub/constants.py +57 -16
  163. novel_downloader/utils/epub/documents.py +88 -194
  164. novel_downloader/utils/epub/models.py +0 -14
  165. novel_downloader/utils/epub/utils.py +63 -96
  166. novel_downloader/utils/file_utils/__init__.py +2 -23
  167. novel_downloader/utils/file_utils/io.py +3 -113
  168. novel_downloader/utils/file_utils/sanitize.py +0 -4
  169. novel_downloader/utils/fontocr.py +207 -0
  170. novel_downloader/utils/logger.py +8 -16
  171. novel_downloader/utils/network.py +2 -2
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/time_utils/__init__.py +5 -11
  176. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  177. novel_downloader/utils/time_utils/sleep_utils.py +4 -8
  178. novel_downloader/web/__init__.py +13 -0
  179. novel_downloader/web/components/__init__.py +11 -0
  180. novel_downloader/web/components/navigation.py +35 -0
  181. novel_downloader/web/main.py +66 -0
  182. novel_downloader/web/pages/__init__.py +17 -0
  183. novel_downloader/web/pages/download.py +78 -0
  184. novel_downloader/web/pages/progress.py +147 -0
  185. novel_downloader/web/pages/search.py +329 -0
  186. novel_downloader/web/services/__init__.py +17 -0
  187. novel_downloader/web/services/client_dialog.py +164 -0
  188. novel_downloader/web/services/cred_broker.py +113 -0
  189. novel_downloader/web/services/cred_models.py +35 -0
  190. novel_downloader/web/services/task_manager.py +264 -0
  191. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  192. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  193. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  194. novel_downloader/core/downloaders/biquge.py +0 -29
  195. novel_downloader/core/downloaders/esjzone.py +0 -29
  196. novel_downloader/core/downloaders/linovelib.py +0 -29
  197. novel_downloader/core/downloaders/sfacg.py +0 -29
  198. novel_downloader/core/downloaders/yamibo.py +0 -29
  199. novel_downloader/core/exporters/biquge.py +0 -22
  200. novel_downloader/core/exporters/esjzone.py +0 -22
  201. novel_downloader/core/exporters/qianbi.py +0 -22
  202. novel_downloader/core/exporters/sfacg.py +0 -22
  203. novel_downloader/core/exporters/yamibo.py +0 -22
  204. novel_downloader/core/fetchers/base/__init__.py +0 -14
  205. novel_downloader/core/fetchers/base/browser.py +0 -422
  206. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  207. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  208. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  209. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  210. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  211. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  212. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  213. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  214. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  215. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  216. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  217. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  218. novel_downloader/core/parsers/biquge.py +0 -139
  219. novel_downloader/models/chapter.py +0 -25
  220. novel_downloader/models/types.py +0 -13
  221. novel_downloader/tui/__init__.py +0 -7
  222. novel_downloader/tui/app.py +0 -32
  223. novel_downloader/tui/main.py +0 -17
  224. novel_downloader/tui/screens/__init__.py +0 -14
  225. novel_downloader/tui/screens/home.py +0 -198
  226. novel_downloader/tui/screens/login.py +0 -74
  227. novel_downloader/tui/styles/home_layout.tcss +0 -79
  228. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  229. novel_downloader/utils/cache.py +0 -24
  230. novel_downloader/utils/fontocr/__init__.py +0 -22
  231. novel_downloader/utils/fontocr/hash_store.py +0 -280
  232. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  233. novel_downloader/utils/fontocr/model_loader.py +0 -69
  234. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  235. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  236. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  237. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  238. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  239. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  240. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  241. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -3,35 +3,72 @@
3
3
  novel_downloader.core.parsers
4
4
  -----------------------------
5
5
 
6
- This package defines all site-specific parsing modules
7
- for the novel_downloader framework.
8
-
9
- Modules:
10
- - biquge (笔趣阁)
11
- - esjzone (ESJ Zone)
12
- - linovelib (哔哩轻小说)
13
- - qianbi (铅笔小说)
14
- - qidian (起点中文网)
15
- - sfacg (SF轻小说)
16
- - yamibo (百合会)
6
+ Parser implementations for extracting book metadata and
7
+ chapter content from various sources
17
8
  """
18
9
 
19
10
  __all__ = [
20
11
  "get_parser",
12
+ "AaatxtParser",
21
13
  "BiqugeParser",
14
+ "BiquyueduParser",
15
+ "DxmwxParser",
16
+ "EightnovelParser",
22
17
  "EsjzoneParser",
18
+ "GuidayeParser",
19
+ "HetushuParser",
20
+ "I25zwParser",
21
+ "Ixdzs8Parser",
22
+ "Jpxs123Parser",
23
+ "LewennParser",
23
24
  "LinovelibParser",
25
+ "PiaotiaParser",
26
+ "QbtrParser",
24
27
  "QianbiParser",
25
28
  "QidianParser",
29
+ "Quanben5Parser",
26
30
  "SfacgParser",
31
+ "ShencouParser",
32
+ "ShuhaigeParser",
33
+ "TongrenquanParser",
34
+ "TtkanParser",
35
+ "WanbengoParser",
36
+ "XiaoshuowuParser",
37
+ "XiguashuwuParser",
38
+ "Xs63bParser",
39
+ "XshbookParser",
27
40
  "YamiboParser",
41
+ "YibigeParser",
28
42
  ]
29
43
 
30
- from .biquge import BiqugeParser
44
+ from .aaatxt import AaatxtParser
45
+ from .b520 import BiqugeParser
46
+ from .biquyuedu import BiquyueduParser
47
+ from .dxmwx import DxmwxParser
48
+ from .eightnovel import EightnovelParser
31
49
  from .esjzone import EsjzoneParser
50
+ from .guidaye import GuidayeParser
51
+ from .hetushu import HetushuParser
52
+ from .i25zw import I25zwParser
53
+ from .ixdzs8 import Ixdzs8Parser
54
+ from .jpxs123 import Jpxs123Parser
55
+ from .lewenn import LewennParser
32
56
  from .linovelib import LinovelibParser
57
+ from .piaotia import PiaotiaParser
58
+ from .qbtr import QbtrParser
33
59
  from .qianbi import QianbiParser
34
60
  from .qidian import QidianParser
61
+ from .quanben5 import Quanben5Parser
35
62
  from .registry import get_parser
36
63
  from .sfacg import SfacgParser
64
+ from .shencou import ShencouParser
65
+ from .shuhaige import ShuhaigeParser
66
+ from .tongrenquan import TongrenquanParser
67
+ from .ttkan import TtkanParser
68
+ from .wanbengo import WanbengoParser
69
+ from .xiaoshuowu import XiaoshuowuParser
70
+ from .xiguashuwu import XiguashuwuParser
71
+ from .xs63b import Xs63bParser
72
+ from .xshbook import XshbookParser
37
73
  from .yamibo import YamiboParser
74
+ from .yibige import YibigeParser
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.aaatxt
4
+ ------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["aaatxt"],
)
class AaatxtParser(BaseParser):
    """
    Parser for 3A电子书 (aaatxt) book-info and chapter pages.
    """

    # Boilerplate / reader-instruction fragments stripped from chapter text.
    ADS: set[str] = {
        "按键盘上方向键",
        "未阅读完",
        "加入书签",
        "已便下次继续阅读",
        "更多原创手机电子书",
        "免费TXT小说下载",
    }

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract book metadata and the chapter list from a book page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: Book info dict, or ``None`` when no HTML was supplied.
        """
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])

        book_name = self._first_str(doc.xpath("//div[@class='xiazai']/h1/text()"))
        author = self._first_str(doc.xpath("//span[@id='author']/a/text()"))
        cover_url = self._first_str(
            doc.xpath("//div[@id='txtbook']//div[@class='fm']//img/@src")
        )
        update_time = self._first_str(
            doc.xpath("//div[@id='txtbook']//li[contains(text(), '上传日期')]/text()"),
            replaces=[("上传日期:", "")],
        )

        genre = self._first_str(
            doc.xpath("//div[@id='submenu']/h2/a[@class='lan']/text()")
        )
        tags = [genre] if genre else []

        summary_nodes = doc.xpath("//div[@id='jj']//p")
        summary = summary_nodes[0].text_content().strip() if summary_nodes else ""

        download_url = self._first_str(
            doc.xpath("//div[@id='down']//li[@class='bd']//a/@href")
        )

        # Single flat volume built from the chapter list block.
        chapters: list[ChapterInfoDict] = []
        for anchor in doc.xpath("//div[@id='ml']//ol/li/a"):
            href = anchor.get("href", "").strip()
            chapters.append(
                {
                    "title": anchor.text_content().strip(),
                    "url": href,
                    # chapter id is the html file name without its extension
                    "chapterId": href.split("/")[-1].replace(".html", ""),
                }
            )

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {"download_url": download_url},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract the title and body text of a single chapter page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict, or ``None`` when no usable content is found.
        """
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])

        raw_title = self._first_str(doc.xpath("//div[@id='content']//h1/text()"))
        # Page heading looks like "书名-章节名"; keep the part after the dash.
        title = raw_title.split("-", 1)[-1].strip()

        lines: list[str] = []
        for raw in doc.xpath("//div[@class='chapter']//text()"):
            stripped = raw.strip()
            # Skip blank nodes and known ad/instruction fragments.
            if not stripped or self._is_ad_line(raw):
                continue
            lines.append(stripped)

        content = "\n".join(lines)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "aaatxt"},
        }
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.b520
4
+ ----------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["biquge", "bqg", "b520"],
)
class BiqugeParser(BaseParser):
    """
    Parser for 笔趣阁 book-info and chapter pages.
    """

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract book metadata and the chapter list from a book page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: Book info dict, or ``None`` when no HTML was supplied.
        """
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])

        book_name = self._first_str(doc.xpath('//div[@id="info"]/h1/text()'))
        author = self._first_str(
            doc.xpath('//div[@id="info"]/p[1]/text()'),
            replaces=[("\xa0", ""), ("作者:", "")],
        )
        cover_url = self._first_str(doc.xpath('//div[@id="fmimg"]/img/@src'))
        update_time = self._first_str(
            doc.xpath('//div[@id="info"]/p[3]/text()'),
            replaces=[("最后更新:", "")],
        )

        intro_nodes = doc.xpath('//div[@id="intro"]')
        summary = "".join(intro_nodes[0].itertext()).strip() if intro_nodes else ""

        # The second breadcrumb link carries the category name.
        book_type = self._first_str(doc.xpath('//div[@class="con_top"]/a[2]/text()'))
        tags = [book_type] if book_type else []

        # Chapter anchors after the "正文" section heading.
        chapters: list[ChapterInfoDict] = []
        anchor_xpath = (
            '//div[@id="list"]/dl/dt[contains(., "正文")]/following-sibling::dd/a'
        )
        for anchor in doc.xpath(anchor_xpath):
            raw_href = anchor.get("href") or ""
            chapters.append(
                {
                    "title": (anchor.text or "").strip(),
                    "url": raw_href.strip(),
                    # chapter id = last path segment without its extension
                    "chapterId": raw_href.rsplit("/", 1)[-1].split(".", 1)[0],
                }
            )

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract the title and body text of a single chapter page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict, or ``None`` when no usable content is found.
        """
        if not html_list:
            return None
        doc = html.fromstring(html_list[0])

        title = self._first_str(doc.xpath('//div[@class="bookname"]/h1/text()'))
        if not title:
            # Synthesize a title so the chapter remains usable downstream.
            title = f"第 {chapter_id} 章"

        containers = doc.xpath('//div[@id="content"]')
        if not containers:
            return None
        paragraphs = [
            "".join(p.itertext()).strip() for p in containers[0].xpath(".//p")
        ]
        # Drop a trailing paragraph carrying the shuhaige domain
        # (appears to be an injected credit line — see the literal check).
        if paragraphs and "www.shuhaige.net" in paragraphs[-1]:
            paragraphs.pop()

        content = "\n".join(paragraphs)
        if not content.strip():
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "biquge"},
        }
@@ -3,22 +3,17 @@
3
3
  novel_downloader.core.parsers.base
4
4
  ----------------------------------
5
5
 
6
- This module defines the BaseParser abstract class, which implements the
7
- ParserProtocol interface and provides a structured foundation for
8
- site-specific parsers.
9
-
10
- BaseParser manages internal parser state and enforces
11
- a standard parsing interface for:
12
- - Book info pages (e.g. metadata, chapter list)
13
- - Chapter pages (e.g. textual content)
6
+ Abstract base class providing common behavior for site-specific parsers.
14
7
  """
15
8
 
16
9
  import abc
10
+ import re
11
+ from collections.abc import Iterable
17
12
  from pathlib import Path
18
13
  from typing import Any
19
14
 
20
15
  from novel_downloader.core.interfaces import ParserProtocol
21
- from novel_downloader.models import ChapterDict, ParserConfig
16
+ from novel_downloader.models import BookInfoDict, ChapterDict, ParserConfig
22
17
 
23
18
 
24
19
  class BaseParser(ParserProtocol, abc.ABC):
@@ -32,6 +27,10 @@ class BaseParser(ParserProtocol, abc.ABC):
32
27
  Subclasses must implement actual parsing logic for specific sites.
33
28
  """
34
29
 
30
+ ADS: set[str] = set()
31
+
32
+ _SPACE_RE = re.compile(r"\s+")
33
+
35
34
  def __init__(
36
35
  self,
37
36
  config: ParserConfig,
@@ -44,15 +43,19 @@ class BaseParser(ParserProtocol, abc.ABC):
44
43
  self._config = config
45
44
  self._book_id: str | None = None
46
45
 
46
+ self._decode_font: bool = config.decode_font
47
+ self._use_truncation = config.use_truncation
47
48
  self._base_cache_dir = Path(config.cache_dir)
48
49
  self._cache_dir = self._base_cache_dir
49
50
 
51
+ self._ad_pattern = self._compile_ads_pattern()
52
+
50
53
  @abc.abstractmethod
51
54
  def parse_book_info(
52
55
  self,
53
56
  html_list: list[str],
54
57
  **kwargs: Any,
55
- ) -> dict[str, Any]:
58
+ ) -> BookInfoDict | None:
56
59
  """
57
60
  Parse and return a dictionary of book information from the raw HTML.
58
61
 
@@ -69,11 +72,11 @@ class BaseParser(ParserProtocol, abc.ABC):
69
72
  **kwargs: Any,
70
73
  ) -> ChapterDict | None:
71
74
  """
72
- Parse and return the text content of one chapter.
75
+ Parse chapter page and extract the content of one chapter.
73
76
 
74
77
  :param html_list: The HTML list of the chapter pages.
75
78
  :param chapter_id: Identifier of the chapter being parsed.
76
- :return: The chapter's text.
79
+ :return: The chapter's data.
77
80
  """
78
81
  ...
79
82
 
@@ -104,3 +107,51 @@ class BaseParser(ParserProtocol, abc.ABC):
104
107
  book-related folders or states.
105
108
  """
106
109
  pass
110
+
111
+ def _compile_ads_pattern(self) -> re.Pattern[str] | None:
112
+ """
113
+ Compile a regex pattern from the ADS list, or return None if no ADS.
114
+ """
115
+ if not self.ADS:
116
+ return None
117
+
118
+ return re.compile("|".join(map(re.escape, self.ADS)))
119
+
120
+ def _is_ad_line(self, line: str) -> bool:
121
+ """
122
+ Check if a line contains any ad text.
123
+
124
+ :param line: Single text line.
125
+ :return: True if line matches ad pattern, else False.
126
+ """
127
+ return bool(self._ad_pattern and self._ad_pattern.search(line))
128
+
129
+ def _filter_ads(self, lines: Iterable[str]) -> list[str]:
130
+ """
131
+ Filter out lines containing any ad text defined in ADS.
132
+
133
+ :param lines: Iterable of text lines (e.g. chapter content).
134
+ :return: List of lines with ads removed.
135
+ """
136
+ if not self._ad_pattern:
137
+ return list(lines)
138
+ return [line for line in lines if not self._ad_pattern.search(line)]
139
+
140
+ @classmethod
141
+ def _norm_space(cls, s: str, c: str = " ") -> str:
142
+ """
143
+ collapse any run of whitespace (incl. newlines, full-width spaces)
144
+
145
+ :param s: Input string to normalize.
146
+ :param c: Replacement character to use for collapsed whitespace.
147
+ """
148
+ return cls._SPACE_RE.sub(c, s).strip()
149
+
150
+ @staticmethod
151
+ def _first_str(xs: list[str], replaces: list[tuple[str, str]] | None = None) -> str:
152
+ replaces = replaces or []
153
+ value: str = xs[0].strip() if xs else ""
154
+ for replace in replaces:
155
+ old, new = replace
156
+ value = value.replace(old, new)
157
+ return value
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.biquyuedu
4
+ ---------------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import etree, html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
@register_parser(
    site_keys=["biquyuedu"],
)
class BiquyueduParser(BaseParser):
    """
    Parser for 精彩小说 (biquyuedu) book-info and chapter pages.
    """

    # Boilerplate fragments filtered out of chapter text.
    ADS: set[str] = {
        "笔趣阁",
        "请记住本书首发域名",
        "www.biquyuedu.com",
    }

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Extract book metadata and the chapter list from a book page.

        :param html_list: Raw HTML pages; only the first entry is used.
        :return: Book info dict, or ``None`` when no HTML was supplied.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # --- Metadata ---
        book_name = self._first_str(tree.xpath("//div[@class='info']/h1/text()"))
        author = self._first_str(
            tree.xpath(
                "//div[@class='info']//div[@class='small'][1]//span[1]//a/text()"
            )
        )
        cover_url = self._first_str(
            tree.xpath("//div[@class='info']//div[@class='cover']//img/@src")
        )
        update_time = self._first_str(
            tree.xpath("//div[@class='info']//div[@class='small'][2]//span[1]/text()"),
            replaces=[("更新时间:", "")],
        )

        # Second breadcrumb entry carries the category name.
        crumbs = tree.xpath("//div[@class='path']//div[@class='p']/a/text()")
        book_type = self._first_str(crumbs[1:2])
        tags = [book_type] if book_type else []

        intro_text = tree.xpath(
            "string(//div[@class='info']//div[@class='intro'])"
        ).strip()
        # The intro block embeds a "简介:" prefix and trailing "作者:" noise.
        summary = intro_text.replace("简介:", "", 1).split("作者:", 1)[0].strip()

        # --- Chapters: dd anchors under the dt that mentions '全文' ---
        chapters: list[ChapterInfoDict] = [
            {
                "title": (a.get("title") or a.text_content() or "").strip(),
                "url": (a.get("href") or "").strip(),
                "chapterId": (a.get("href") or "").rsplit("/", 1)[-1].split(".", 1)[0],
            }
            for a in tree.xpath(
                "//div[@class='listmain']//dl/dd[preceding-sibling::dt[1][contains(text(),'全文')]]/a"
            )
        ]

        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Extract the title and body text of a single chapter page.

        Fix: text nodes consisting only of NBSP (U+00A0) pass XPath's
        ``normalize-space()`` filter (XPath whitespace is limited to
        space/tab/CR/LF), then clean down to an empty string and used to
        emit spurious blank lines; such lines are now dropped.

        :param html_list: Raw HTML pages; only the first entry is used.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict, or ``None`` when no usable content is found.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        title = self._first_str(tree.xpath("//div[@class='content']/h1/text()"))

        content_nodes = tree.xpath("//div[@id='content']")
        if not content_nodes:
            return None
        content_div = content_nodes[0]

        # Remove inline <script> blocks before collecting text.
        etree.strip_elements(content_div, "script", with_tail=False)
        raw_texts = content_div.xpath(".//text()[normalize-space()]")

        paragraphs: list[str] = []
        for txt in raw_texts:
            if self._is_ad_line(txt):
                continue
            cleaned = txt.replace("\xa0", "").strip()
            # NBSP-only nodes clean down to "": skip to avoid blank lines.
            if cleaned:
                paragraphs.append(cleaned)

        content = "\n".join(paragraphs)
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "biquyuedu"},
        }