novel-downloader 1.5.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +79 -66
  6. novel_downloader/cli/export.py +17 -21
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +206 -209
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +5 -5
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +17 -12
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +20 -14
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +56 -64
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +6 -19
  79. novel_downloader/core/interfaces/parser.py +7 -8
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +64 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +64 -69
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/main_parser.py +756 -48
  100. novel_downloader/core/parsers/qidian/utils/__init__.py +3 -21
  101. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  102. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  103. novel_downloader/core/parsers/quanben5.py +103 -0
  104. novel_downloader/core/parsers/registry.py +5 -16
  105. novel_downloader/core/parsers/sfacg.py +38 -45
  106. novel_downloader/core/parsers/shencou.py +215 -0
  107. novel_downloader/core/parsers/shuhaige.py +111 -0
  108. novel_downloader/core/parsers/tongrenquan.py +116 -0
  109. novel_downloader/core/parsers/ttkan.py +132 -0
  110. novel_downloader/core/parsers/wanbengo.py +191 -0
  111. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  112. novel_downloader/core/parsers/xiguashuwu.py +429 -0
  113. novel_downloader/core/parsers/xs63b.py +161 -0
  114. novel_downloader/core/parsers/xshbook.py +134 -0
  115. novel_downloader/core/parsers/yamibo.py +87 -131
  116. novel_downloader/core/parsers/yibige.py +166 -0
  117. novel_downloader/core/searchers/__init__.py +34 -3
  118. novel_downloader/core/searchers/aaatxt.py +107 -0
  119. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  120. novel_downloader/core/searchers/base.py +112 -36
  121. novel_downloader/core/searchers/dxmwx.py +105 -0
  122. novel_downloader/core/searchers/eightnovel.py +84 -0
  123. novel_downloader/core/searchers/esjzone.py +43 -25
  124. novel_downloader/core/searchers/hetushu.py +92 -0
  125. novel_downloader/core/searchers/i25zw.py +93 -0
  126. novel_downloader/core/searchers/ixdzs8.py +107 -0
  127. novel_downloader/core/searchers/jpxs123.py +107 -0
  128. novel_downloader/core/searchers/piaotia.py +100 -0
  129. novel_downloader/core/searchers/qbtr.py +106 -0
  130. novel_downloader/core/searchers/qianbi.py +74 -40
  131. novel_downloader/core/searchers/quanben5.py +144 -0
  132. novel_downloader/core/searchers/registry.py +24 -8
  133. novel_downloader/core/searchers/shuhaige.py +124 -0
  134. novel_downloader/core/searchers/tongrenquan.py +110 -0
  135. novel_downloader/core/searchers/ttkan.py +92 -0
  136. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  137. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  138. novel_downloader/core/searchers/xs63b.py +104 -0
  139. novel_downloader/locales/en.json +34 -85
  140. novel_downloader/locales/zh.json +35 -86
  141. novel_downloader/models/__init__.py +21 -22
  142. novel_downloader/models/book.py +44 -0
  143. novel_downloader/models/config.py +4 -37
  144. novel_downloader/models/login.py +1 -1
  145. novel_downloader/models/search.py +5 -0
  146. novel_downloader/resources/config/settings.toml +8 -70
  147. novel_downloader/resources/json/xiguashuwu.json +718 -0
  148. novel_downloader/utils/__init__.py +13 -24
  149. novel_downloader/utils/chapter_storage.py +5 -5
  150. novel_downloader/utils/constants.py +4 -31
  151. novel_downloader/utils/cookies.py +38 -35
  152. novel_downloader/utils/crypto_utils/__init__.py +7 -0
  153. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  154. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  155. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  156. novel_downloader/utils/crypto_utils/rc4.py +54 -0
  157. novel_downloader/utils/epub/__init__.py +3 -4
  158. novel_downloader/utils/epub/builder.py +6 -6
  159. novel_downloader/utils/epub/constants.py +62 -21
  160. novel_downloader/utils/epub/documents.py +95 -201
  161. novel_downloader/utils/epub/models.py +8 -22
  162. novel_downloader/utils/epub/utils.py +73 -106
  163. novel_downloader/utils/file_utils/__init__.py +2 -23
  164. novel_downloader/utils/file_utils/io.py +53 -188
  165. novel_downloader/utils/file_utils/normalize.py +1 -7
  166. novel_downloader/utils/file_utils/sanitize.py +4 -15
  167. novel_downloader/utils/fontocr/__init__.py +5 -14
  168. novel_downloader/utils/fontocr/core.py +216 -0
  169. novel_downloader/utils/fontocr/loader.py +50 -0
  170. novel_downloader/utils/logger.py +81 -65
  171. novel_downloader/utils/network.py +17 -41
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  176. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  177. novel_downloader/utils/time_utils/__init__.py +5 -11
  178. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  179. novel_downloader/utils/time_utils/sleep_utils.py +55 -49
  180. novel_downloader/web/__init__.py +13 -0
  181. novel_downloader/web/components/__init__.py +11 -0
  182. novel_downloader/web/components/navigation.py +35 -0
  183. novel_downloader/web/main.py +66 -0
  184. novel_downloader/web/pages/__init__.py +17 -0
  185. novel_downloader/web/pages/download.py +78 -0
  186. novel_downloader/web/pages/progress.py +147 -0
  187. novel_downloader/web/pages/search.py +329 -0
  188. novel_downloader/web/services/__init__.py +17 -0
  189. novel_downloader/web/services/client_dialog.py +164 -0
  190. novel_downloader/web/services/cred_broker.py +113 -0
  191. novel_downloader/web/services/cred_models.py +35 -0
  192. novel_downloader/web/services/task_manager.py +264 -0
  193. novel_downloader-2.0.1.dist-info/METADATA +172 -0
  194. novel_downloader-2.0.1.dist-info/RECORD +206 -0
  195. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +1 -1
  196. novel_downloader/core/downloaders/biquge.py +0 -29
  197. novel_downloader/core/downloaders/esjzone.py +0 -29
  198. novel_downloader/core/downloaders/linovelib.py +0 -29
  199. novel_downloader/core/downloaders/sfacg.py +0 -29
  200. novel_downloader/core/downloaders/yamibo.py +0 -29
  201. novel_downloader/core/exporters/biquge.py +0 -22
  202. novel_downloader/core/exporters/esjzone.py +0 -22
  203. novel_downloader/core/exporters/qianbi.py +0 -22
  204. novel_downloader/core/exporters/sfacg.py +0 -22
  205. novel_downloader/core/exporters/yamibo.py +0 -22
  206. novel_downloader/core/fetchers/base/__init__.py +0 -14
  207. novel_downloader/core/fetchers/base/browser.py +0 -422
  208. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  209. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  210. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  211. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  212. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  213. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  214. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  215. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  216. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  217. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  218. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  219. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  220. novel_downloader/core/parsers/biquge.py +0 -139
  221. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -90
  222. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -528
  223. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -157
  224. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  225. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -114
  226. novel_downloader/models/chapter.py +0 -25
  227. novel_downloader/models/types.py +0 -13
  228. novel_downloader/tui/__init__.py +0 -7
  229. novel_downloader/tui/app.py +0 -32
  230. novel_downloader/tui/main.py +0 -17
  231. novel_downloader/tui/screens/__init__.py +0 -14
  232. novel_downloader/tui/screens/home.py +0 -198
  233. novel_downloader/tui/screens/login.py +0 -74
  234. novel_downloader/tui/styles/home_layout.tcss +0 -79
  235. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  236. novel_downloader/utils/cache.py +0 -24
  237. novel_downloader/utils/crypto_utils.py +0 -71
  238. novel_downloader/utils/fontocr/hash_store.py +0 -280
  239. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  240. novel_downloader/utils/fontocr/model_loader.py +0 -69
  241. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  242. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  243. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  244. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  245. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  246. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  247. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  248. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -12,26 +12,20 @@ from lxml import html
12
12
 
13
13
  from novel_downloader.core.parsers.base import BaseParser
14
14
  from novel_downloader.core.parsers.registry import register_parser
15
- from novel_downloader.models import ChapterDict
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ VolumeInfoDict,
19
+ )
16
20
 
17
21
 
18
22
  @register_parser(
19
23
  site_keys=["esjzone"],
20
- backends=["session", "browser"],
21
24
  )
22
25
  class EsjzoneParser(BaseParser):
23
- """ """
24
-
25
- # Book info XPaths
26
- _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
27
- _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
28
- _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
29
- _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
30
- _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
31
- _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
32
- _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
33
- _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
34
- _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
26
+ """
27
+ Parser for esjzone book pages.
28
+ """
35
29
 
36
30
  # Chapter XPaths
37
31
  _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
@@ -40,14 +34,13 @@ class EsjzoneParser(BaseParser):
40
34
  '//i[contains(@class, "icon-clock")]/following-sibling::text()',
41
35
  '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
42
36
  ]
43
-
44
37
  _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()' # noqa: E501
45
38
 
46
39
  def parse_book_info(
47
40
  self,
48
41
  html_list: list[str],
49
42
  **kwargs: Any,
50
- ) -> dict[str, Any]:
43
+ ) -> BookInfoDict | None:
51
44
  """
52
45
  Parse a book info page and extract metadata and chapter structure.
53
46
 
@@ -58,27 +51,40 @@ class EsjzoneParser(BaseParser):
58
51
  :return: Parsed metadata and chapter structure as a dictionary.
59
52
  """
60
53
  if not html_list or self._is_forum_page(html_list):
61
- return {}
54
+ return None
55
+
62
56
  tree = html.fromstring(html_list[0])
63
- result: dict[str, Any] = {}
64
-
65
- result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)
66
- result["author"] = self._get_text(tree, self._AUTHOR_XPATH)
67
- result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
68
- result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
69
- result["word_count"] = self._get_text(
70
- tree, self._WORD_COUNT_XPATH, clean_comma=True
57
+
58
+ # --- Basic metadata ---
59
+ book_name = self._first_str(
60
+ tree.xpath('//h2[contains(@class,"text-normal")]/text()')
71
61
  )
72
- result["type"] = self._get_text(tree, self._TYPE_XPATH)
73
- result["alt_name"] = self._get_text(tree, self._ALT_NAME_XPATH)
74
- result["web_url"] = self._get_text(tree, self._WEB_URL_XPATH)
75
- # result["summary"] = self._get_text(tree, self._SUMMARY_XPATH, join=True)
62
+ author = self._first_str(tree.xpath('//li[strong[text()="作者:"]]/a/text()'))
63
+ cover_url = self._first_str(
64
+ tree.xpath('//div[contains(@class,"product-gallery")]//img/@src')
65
+ )
66
+ update_time = self._first_str(
67
+ tree.xpath('//li[strong[text()="更新日期:"]]/text()')
68
+ ) # noqa: E501
69
+ word_count = self._first_str(
70
+ tree.xpath('//span[@id="txt"]/text()'), replaces=[(",", "")]
71
+ )
72
+ book_type = self._first_str(tree.xpath('//li[strong[text()="類型:"]]/text()'))
73
+ alt_name = self._first_str(
74
+ tree.xpath('//li[strong[text()="其他書名:"]]/text()')
75
+ ) # noqa: E501
76
+ web_url = self._first_str(tree.xpath('//li[strong[text()="Web生肉:"]]/a/@href'))
77
+
78
+ # Summary paragraphs
76
79
  paras = tree.xpath('//div[@class="description"]/p')
77
80
  texts = [p.xpath("string()").strip() for p in paras]
78
- result["summary"] = "\n".join(texts).strip()
81
+ summary = "\n".join(t for t in texts if t)
79
82
 
80
- volumes: list[dict[str, Any]] = []
81
- current_vol: dict[str, Any] = {}
83
+ current_vol: VolumeInfoDict = {
84
+ "volume_name": "單卷",
85
+ "chapters": [],
86
+ }
87
+ volumes: list[VolumeInfoDict] = [current_vol]
82
88
 
83
89
  def _is_garbage_title(name: str) -> bool:
84
90
  stripped = name.strip()
@@ -89,25 +95,18 @@ class EsjzoneParser(BaseParser):
89
95
  if _is_garbage_title(name):
90
96
  return
91
97
  name = name.strip() or "未命名卷"
92
- if name == "未命名卷" and current_vol is not None:
98
+ if current_vol and current_vol["volume_name"] == name:
93
99
  return
94
100
  current_vol = {"volume_name": name, "chapters": []}
95
101
  volumes.append(current_vol)
96
102
 
97
- _start_volume("單卷")
98
-
99
- # nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
100
- # '//div[@id="chapterList"]/*[not(self::details)]'
101
- # )
102
103
  nodes = tree.xpath('//div[@id="chapterList"]/*')
103
-
104
104
  for node in nodes:
105
105
  tag = node.tag.lower()
106
106
 
107
107
  if tag == "details":
108
108
  # ---- DETAILS-based layout ----
109
- summary = node.find("summary")
110
- vol_name = summary.text if summary is not None else "未命名卷"
109
+ vol_name = node.xpath("string(./summary)").strip() or "未命名卷"
111
110
  _start_volume(vol_name)
112
111
 
113
112
  # all chapters inside this details
@@ -116,7 +115,11 @@ class EsjzoneParser(BaseParser):
116
115
  href = a.get("href", "")
117
116
  chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
118
117
  current_vol["chapters"].append(
119
- {"title": title, "url": href, "chapterId": chap_id}
118
+ {
119
+ "title": title,
120
+ "url": href,
121
+ "chapterId": chap_id,
122
+ }
120
123
  )
121
124
 
122
125
  elif (
@@ -125,9 +128,9 @@ class EsjzoneParser(BaseParser):
125
128
  or tag == "summary"
126
129
  ):
127
130
  # Handle possible volume title markers:
128
- # - <h2>: standard volume header
129
- # - <p class="non">: alternative volume header style
130
- # - <summary>: fallback for stray <summary> tags outside <details>
131
+ # * <h2>: standard volume header
132
+ # * <p class="non">: alternative volume header style
133
+ # * <summary>: fallback for stray <summary> tags outside <details>
131
134
  _start_volume(node.xpath("string()"))
132
135
 
133
136
  elif tag == "a":
@@ -139,9 +142,21 @@ class EsjzoneParser(BaseParser):
139
142
  {"title": title, "url": href, "chapterId": chap_id}
140
143
  )
141
144
  volumes = [vol for vol in volumes if vol["chapters"]]
142
- result["volumes"] = volumes
143
145
 
144
- return result
146
+ return {
147
+ "book_name": book_name,
148
+ "author": author,
149
+ "cover_url": cover_url,
150
+ "update_time": update_time,
151
+ "summary": summary,
152
+ "tags": [book_type],
153
+ "word_count": word_count,
154
+ "volumes": volumes,
155
+ "extra": {
156
+ "alt_name": alt_name,
157
+ "web_url": web_url,
158
+ },
159
+ }
145
160
 
146
161
  def parse_chapter(
147
162
  self,
@@ -149,16 +164,9 @@ class EsjzoneParser(BaseParser):
149
164
  chapter_id: str,
150
165
  **kwargs: Any,
151
166
  ) -> ChapterDict | None:
152
- """
153
- Parse a single chapter page and extract clean text or simplified HTML.
154
-
155
- :param html_list: Raw HTML of the chapter page.
156
- :param chapter_id: Identifier of the chapter being parsed.
157
- :return: Cleaned chapter content as plain text or minimal HTML.
158
- """
159
167
  if not html_list or self._is_forum_page(html_list):
160
168
  return None
161
- tree = html.fromstring(html_list[0], parser=None)
169
+ tree = html.fromstring(html_list[0])
162
170
 
163
171
  content_lines: list[str] = []
164
172
  content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
@@ -178,7 +186,7 @@ class EsjzoneParser(BaseParser):
178
186
  content_lines.append(f'<img src="{src}" />')
179
187
 
180
188
  content = (
181
- "\n\n".join(content_lines).strip()
189
+ "\n".join(content_lines).strip()
182
190
  if content_lines
183
191
  else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
184
192
  )
@@ -216,16 +224,3 @@ class EsjzoneParser(BaseParser):
216
224
  breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
217
225
  breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
218
226
  return breadcrumb == ["Home", "論壇"]
219
-
220
- @staticmethod
221
- def _get_text(
222
- tree: html.HtmlElement,
223
- xpath: str,
224
- join: bool = False,
225
- clean_comma: bool = False,
226
- ) -> str:
227
- data = tree.xpath(xpath)
228
- if not data:
229
- return ""
230
- text = "\n".join(data) if join else data[0].strip()
231
- return text.replace(",", "") if clean_comma else text
@@ -0,0 +1,128 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.guidaye
4
+ -------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from datetime import datetime
10
+ from typing import Any
11
+
12
+ from lxml import html
13
+
14
+ from novel_downloader.core.parsers.base import BaseParser
15
+ from novel_downloader.core.parsers.registry import register_parser
16
+ from novel_downloader.models import (
17
+ BookInfoDict,
18
+ ChapterDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
@register_parser(
    site_keys=["guidaye"],
)
class GuidayeParser(BaseParser):
    """
    Parser for 名著阅读 (b.guidaye.com) book pages.
    """

    BASE_URL = "https://b.guidaye.com"

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse a book info page into metadata plus volume/chapter structure.

        :param html_list: Raw HTML of the book info page (first entry used).
        :return: Parsed book info, or ``None`` if no HTML was provided.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # --- Book metadata ---
        book_name = self._first_str(tree.xpath('//h1[@class="page-title"]/a/text()'))
        author = self._first_str(
            tree.xpath('//div[@id="category-description-author"]/a/text()')
        )
        # NOTE(review): assumes the cover src is site-relative -- confirm.
        cover_url = self.BASE_URL + self._first_str(
            tree.xpath('//div[@id="category-description-image"]//img/@src')
        )

        # Summary text, minus the leading "内容简介:" label (removed once).
        summary = (
            tree.xpath('string(//div[@id="category-description-text"])')
            .replace("内容简介:", "", 1)
            .strip()
        )

        # --- Volumes & chapters: <h3> starts a volume, <li> holds a chapter ---
        volumes: list[VolumeInfoDict] = []
        curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}

        for elem in tree.xpath('//div[@class="entry-content"]/ul/*'):
            tag = elem.tag.lower()
            if tag == "h3":
                # Flush the previous volume before starting a new one.
                if curr_vol["chapters"]:
                    volumes.append(curr_vol)
                curr_vol = {
                    "volume_name": elem.text_content().strip(),
                    "chapters": [],
                }
            elif tag == "li":
                links = elem.xpath(".//a")
                if not links:
                    # Defensive: skip list items without a chapter link.
                    continue
                link = links[0]
                href = link.get("href", "").strip()
                title = link.get("title", "").strip()
                # Numeric chapter id from '/<cid>.html'
                cid_match = re.search(r"/(\d+)\.html$", href)
                chapter_id = cid_match.group(1) if cid_match else ""
                curr_vol["chapters"].append(
                    {"title": title, "url": href, "chapterId": chapter_id}
                )

        # Append the last volume if it has any chapters.
        if curr_vol["chapters"]:
            volumes.append(curr_vol)

        # "最近更新" date from the share block; fall back to today's date.
        # (char class accepts both ASCII and fullwidth colons)
        share_text = tree.xpath('string(//div[@id="category-description-share"])')
        m = re.search(r"最近更新[:：]\s*([\d-]+)", share_text)
        update_time = m.group(1) if m else datetime.now().strftime("%Y-%m-%d")

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into title plus cleaned plain-text content.

        :param html_list: Raw HTML of the chapter page (first entry used).
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict, or ``None`` when no usable text is found.
        """
        if not html_list:
            return None
        tree = html.fromstring(html_list[0])

        # Title from entry-title
        title = self._first_str(tree.xpath('//h1[@class="entry-title"]/text()'))

        # Flatten entry-content and normalize non-breaking spaces.
        full_text = tree.xpath('string(//div[@class="entry-content"])')
        full_text = full_text.replace("\u00A0", " ")

        # Split into lines and drop empties.
        lines = [ln.strip() for ln in full_text.splitlines() if ln.strip()]
        if not lines:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": "\n".join(lines),
            "extra": {"site": "guidaye"},
        }
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.hetushu
4
+ -------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from datetime import datetime
10
+ from typing import Any
11
+
12
+ from lxml import html
13
+
14
+ from novel_downloader.core.parsers.base import BaseParser
15
+ from novel_downloader.core.parsers.registry import register_parser
16
+ from novel_downloader.models import (
17
+ BookInfoDict,
18
+ ChapterDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
+ @register_parser(
24
+ site_keys=["hetushu"],
25
+ )
26
+ class HetushuParser(BaseParser):
27
+ """
28
+ Parser for 和图书 book pages.
29
+ """
30
+
31
+ BASE_URL = "https://www.hetushu.com"
32
+
33
+ def parse_book_info(
34
+ self,
35
+ html_list: list[str],
36
+ **kwargs: Any,
37
+ ) -> BookInfoDict | None:
38
+ if not html_list:
39
+ return None
40
+
41
+ tree = html.fromstring(html_list[0])
42
+
43
+ # --- Metadata ---
44
+ book_name = self._first_str(
45
+ tree.xpath('//div[contains(@class,"book_info")]/h2/text()')
46
+ )
47
+ author = self._first_str(
48
+ tree.xpath(
49
+ '//div[contains(@class,"book_info")]/div[contains(.,"作者")]/a/text()'
50
+ )
51
+ )
52
+ cover_url = self.BASE_URL + self._first_str(
53
+ tree.xpath('//div[contains(@class,"book_info")]//img/@src')
54
+ )
55
+
56
+ cls_attr = self._first_str(
57
+ tree.xpath('//div[contains(@class,"book_info")]/@class')
58
+ )
59
+ serial_status = "已完结" if "finish" in cls_attr else "连载中"
60
+
61
+ tags = [
62
+ a.strip()
63
+ for a in tree.xpath('//dl[@class="tag"]//dd/a/text()')
64
+ if a.strip()
65
+ ]
66
+
67
+ paras = tree.xpath('//div[@class="intro"]/p/text()')
68
+ summary = "\n".join(p.strip() for p in paras if p.strip())
69
+
70
+ # --- Chapter volumes & listings ---
71
+ volumes: list[VolumeInfoDict] = []
72
+ curr_vol: VolumeInfoDict = {"volume_name": "未命名卷", "chapters": []}
73
+
74
+ for elem in tree.xpath('//dl[@id="dir"]/*'):
75
+ if elem.tag == "dt":
76
+ # Start a new volume
77
+ if curr_vol["chapters"]:
78
+ volumes.append(curr_vol)
79
+ curr_vol = {
80
+ "volume_name": elem.text_content().strip(),
81
+ "chapters": [],
82
+ }
83
+ elif elem.tag == "dd":
84
+ link = elem.xpath(".//a")[0]
85
+ href = link.get("href", "").strip()
86
+ title = link.get("title", "").strip()
87
+ # Extract numeric chapterId from the URL
88
+ m = re.search(r"/book/\d+/(?P<id>\d+)\.html", href)
89
+ chapter_id = m.group("id") if m else ""
90
+ curr_vol["chapters"].append(
91
+ {"title": title, "url": href, "chapterId": chapter_id}
92
+ )
93
+
94
+ # Append the last volume if it has any chapters
95
+ if curr_vol["chapters"]:
96
+ volumes.append(curr_vol)
97
+
98
+ update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
99
+
100
+ return {
101
+ "book_name": book_name,
102
+ "author": author,
103
+ "cover_url": cover_url,
104
+ "update_time": update_time,
105
+ "serial_status": serial_status,
106
+ "tags": tags,
107
+ "summary": summary,
108
+ "volumes": volumes,
109
+ "extra": {},
110
+ }
111
+
112
+ def parse_chapter(
113
+ self,
114
+ html_list: list[str],
115
+ chapter_id: str,
116
+ **kwargs: Any,
117
+ ) -> ChapterDict | None:
118
+ if not html_list:
119
+ return None
120
+
121
+ tree = html.fromstring(html_list[0])
122
+
123
+ title = self._first_str(
124
+ tree.xpath('//div[@id="content"]//h2[@class="h2"]/text()')
125
+ )
126
+
127
+ paras = tree.xpath('//div[@id="content"]/div[not(@class)]/text()')
128
+ paragraph_texts = [p.strip() for p in paras if p.strip()]
129
+
130
+ content = "\n".join(paragraph_texts)
131
+ if not content.strip():
132
+ return None
133
+
134
+ return {
135
+ "id": chapter_id,
136
+ "title": title,
137
+ "content": content,
138
+ "extra": {"site": "hetushu"},
139
+ }
@@ -0,0 +1,137 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.i25zw
4
+ -----------------------------------
5
+
6
+ """
7
+
8
+ from typing import Any
9
+
10
+ from lxml import html
11
+
12
+ from novel_downloader.core.parsers.base import BaseParser
13
+ from novel_downloader.core.parsers.registry import register_parser
14
+ from novel_downloader.models import (
15
+ BookInfoDict,
16
+ ChapterDict,
17
+ ChapterInfoDict,
18
+ VolumeInfoDict,
19
+ )
20
+
21
+
22
+ @register_parser(
23
+ site_keys=["i25zw"],
24
+ )
25
+ class I25zwParser(BaseParser):
26
+ """
27
+ Parser for 25中文网 book-info pages.
28
+ """
29
+
30
+ def parse_book_info(
31
+ self,
32
+ html_list: list[str],
33
+ **kwargs: Any,
34
+ ) -> BookInfoDict | None:
35
+ if len(html_list) < 2:
36
+ return None
37
+
38
+ info_tree = html.fromstring(html_list[0])
39
+ catalog_tree = html.fromstring(html_list[1])
40
+
41
+ # Metadata extraction
42
+ book_name = self._first_str(info_tree.xpath("//h1[@class='f21h']/text()"))
43
+ author = self._first_str(info_tree.xpath("//h1[@class='f21h']/em/a/text()"))
44
+ cover_url = self._first_str(info_tree.xpath("//div[@class='pic']/img/@src"))
45
+
46
+ # Tags, status, word count, update time
47
+ tag = self._first_str(
48
+ info_tree.xpath("//b[contains(text(),'小说分类')]/parent::td/text()")
49
+ )
50
+ serial_status = self._first_str(
51
+ info_tree.xpath("//b[contains(text(),'小说状态')]/parent::td/text()")
52
+ )
53
+ word_count = self._first_str(
54
+ info_tree.xpath("//b[contains(text(),'全文字数')]/parent::td/text()")
55
+ )
56
+ raw_update = self._first_str(
57
+ info_tree.xpath("//b[contains(text(),'更新时间')]/parent::td/text()")
58
+ )
59
+ update_time = raw_update.strip("()")
60
+
61
+ # Summary from styled intro div
62
+ full_intro = info_tree.xpath("string(//div[@class='intro'][@style])").strip()
63
+ summary = full_intro.replace(f"关于{book_name}:", "", 1).strip()
64
+
65
+ # Chapter list extraction
66
+ dl = catalog_tree.xpath("//div[@id='list']/dl")[0]
67
+ # Full-text section dd's
68
+ dds = dl.xpath("./dd[preceding-sibling::dt[1][contains(., '正文')]]/a")
69
+ if not dds:
70
+ # Fallback to second <dt>'s following <dd>
71
+ dds = dl.xpath("./dt[2]/following-sibling::dd/a")
72
+
73
+ chapters: list[ChapterInfoDict] = []
74
+ for a in dds:
75
+ url = a.get("href", "").strip()
76
+ title = a.text_content().strip()
77
+ # '/311006/252845677.html' -> '252845677'
78
+ chapter_id = url.split("/")[-1].split(".")[0]
79
+ chapters.append(
80
+ {
81
+ "title": title,
82
+ "url": url,
83
+ "chapterId": chapter_id,
84
+ }
85
+ )
86
+ volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]
87
+
88
+ return {
89
+ "book_name": book_name,
90
+ "author": author,
91
+ "cover_url": cover_url,
92
+ "update_time": update_time,
93
+ "word_count": word_count,
94
+ "serial_status": serial_status,
95
+ "tags": [tag] if tag else [],
96
+ "summary": summary,
97
+ "volumes": volumes,
98
+ "extra": {},
99
+ }
100
+
101
+ def parse_chapter(
102
+ self,
103
+ html_list: list[str],
104
+ chapter_id: str,
105
+ **kwargs: Any,
106
+ ) -> ChapterDict | None:
107
+ if not html_list:
108
+ return None
109
+
110
+ tree = html.fromstring(html_list[0])
111
+
112
+ title_text = self._first_str(
113
+ tree.xpath("//div[@class='zhangjieming']/h1/text()")
114
+ )
115
+
116
+ content_divs = tree.xpath("//div[@id='content']")
117
+ if not content_divs:
118
+ return None
119
+ content_div = content_divs[0]
120
+
121
+ # Only select direct <p> children to avoid nav links
122
+ paragraphs = []
123
+ for p in content_div.xpath("./p"):
124
+ text = p.text_content().strip()
125
+ if text:
126
+ paragraphs.append(text)
127
+
128
+ content_text = "\n".join(paragraphs)
129
+ if not content_text.strip():
130
+ return None
131
+
132
+ return {
133
+ "id": chapter_id,
134
+ "title": title_text,
135
+ "content": content_text,
136
+ "extra": {"site": "i25zw"},
137
+ }