novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (241) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +1 -3
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +26 -21
  5. novel_downloader/cli/download.py +77 -64
  6. novel_downloader/cli/export.py +16 -20
  7. novel_downloader/cli/main.py +1 -1
  8. novel_downloader/cli/search.py +62 -65
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +8 -5
  11. novel_downloader/config/adapter.py +65 -105
  12. novel_downloader/config/{loader.py → file_io.py} +53 -26
  13. novel_downloader/core/__init__.py +1 -0
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +3 -24
  21. novel_downloader/core/downloaders/base.py +49 -23
  22. novel_downloader/core/downloaders/common.py +191 -137
  23. novel_downloader/core/downloaders/qianbi.py +187 -146
  24. novel_downloader/core/downloaders/qidian.py +187 -141
  25. novel_downloader/core/downloaders/registry.py +4 -2
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +3 -20
  28. novel_downloader/core/exporters/base.py +33 -37
  29. novel_downloader/core/exporters/common/__init__.py +1 -2
  30. novel_downloader/core/exporters/common/epub.py +15 -10
  31. novel_downloader/core/exporters/common/main_exporter.py +19 -12
  32. novel_downloader/core/exporters/common/txt.py +14 -9
  33. novel_downloader/core/exporters/epub_util.py +59 -29
  34. novel_downloader/core/exporters/linovelib/__init__.py +1 -0
  35. novel_downloader/core/exporters/linovelib/epub.py +23 -25
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
  37. novel_downloader/core/exporters/linovelib/txt.py +17 -11
  38. novel_downloader/core/exporters/qidian.py +2 -8
  39. novel_downloader/core/exporters/registry.py +4 -2
  40. novel_downloader/core/exporters/txt_util.py +7 -7
  41. novel_downloader/core/fetchers/__init__.py +54 -48
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
  45. novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/lewenn.py +83 -0
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +5 -16
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/shuhaige.py +84 -0
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/wanbengo.py +83 -0
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +1 -9
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +9 -1
  81. novel_downloader/core/parsers/__init__.py +49 -12
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/esjzone.py +61 -66
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/linovelib.py +48 -64
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/qianbi.py +48 -50
  99. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  100. novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
  101. novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
  102. novel_downloader/core/parsers/qidian/main_parser.py +11 -38
  103. novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
  104. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
  105. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  106. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  107. novel_downloader/core/parsers/quanben5.py +103 -0
  108. novel_downloader/core/parsers/registry.py +5 -16
  109. novel_downloader/core/parsers/sfacg.py +38 -45
  110. novel_downloader/core/parsers/shencou.py +215 -0
  111. novel_downloader/core/parsers/shuhaige.py +111 -0
  112. novel_downloader/core/parsers/tongrenquan.py +116 -0
  113. novel_downloader/core/parsers/ttkan.py +132 -0
  114. novel_downloader/core/parsers/wanbengo.py +191 -0
  115. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  116. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  117. novel_downloader/core/parsers/xs63b.py +161 -0
  118. novel_downloader/core/parsers/xshbook.py +134 -0
  119. novel_downloader/core/parsers/yamibo.py +87 -131
  120. novel_downloader/core/parsers/yibige.py +166 -0
  121. novel_downloader/core/searchers/__init__.py +34 -3
  122. novel_downloader/core/searchers/aaatxt.py +107 -0
  123. novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
  124. novel_downloader/core/searchers/base.py +112 -36
  125. novel_downloader/core/searchers/dxmwx.py +105 -0
  126. novel_downloader/core/searchers/eightnovel.py +84 -0
  127. novel_downloader/core/searchers/esjzone.py +43 -25
  128. novel_downloader/core/searchers/hetushu.py +92 -0
  129. novel_downloader/core/searchers/i25zw.py +93 -0
  130. novel_downloader/core/searchers/ixdzs8.py +107 -0
  131. novel_downloader/core/searchers/jpxs123.py +107 -0
  132. novel_downloader/core/searchers/piaotia.py +100 -0
  133. novel_downloader/core/searchers/qbtr.py +106 -0
  134. novel_downloader/core/searchers/qianbi.py +74 -40
  135. novel_downloader/core/searchers/quanben5.py +144 -0
  136. novel_downloader/core/searchers/registry.py +24 -8
  137. novel_downloader/core/searchers/shuhaige.py +124 -0
  138. novel_downloader/core/searchers/tongrenquan.py +110 -0
  139. novel_downloader/core/searchers/ttkan.py +92 -0
  140. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  141. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  142. novel_downloader/core/searchers/xs63b.py +104 -0
  143. novel_downloader/locales/en.json +31 -82
  144. novel_downloader/locales/zh.json +32 -83
  145. novel_downloader/models/__init__.py +21 -22
  146. novel_downloader/models/book.py +44 -0
  147. novel_downloader/models/config.py +4 -37
  148. novel_downloader/models/login.py +1 -1
  149. novel_downloader/models/search.py +5 -0
  150. novel_downloader/resources/config/settings.toml +8 -70
  151. novel_downloader/resources/json/xiguashuwu.json +718 -0
  152. novel_downloader/utils/__init__.py +13 -22
  153. novel_downloader/utils/chapter_storage.py +3 -2
  154. novel_downloader/utils/constants.py +4 -29
  155. novel_downloader/utils/cookies.py +6 -18
  156. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  157. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  158. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  159. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  160. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  161. novel_downloader/utils/epub/__init__.py +1 -1
  162. novel_downloader/utils/epub/constants.py +57 -16
  163. novel_downloader/utils/epub/documents.py +88 -194
  164. novel_downloader/utils/epub/models.py +0 -14
  165. novel_downloader/utils/epub/utils.py +63 -96
  166. novel_downloader/utils/file_utils/__init__.py +2 -23
  167. novel_downloader/utils/file_utils/io.py +3 -113
  168. novel_downloader/utils/file_utils/sanitize.py +0 -4
  169. novel_downloader/utils/fontocr.py +207 -0
  170. novel_downloader/utils/logger.py +8 -16
  171. novel_downloader/utils/network.py +2 -2
  172. novel_downloader/utils/state.py +4 -90
  173. novel_downloader/utils/text_utils/__init__.py +1 -7
  174. novel_downloader/utils/text_utils/diff_display.py +5 -7
  175. novel_downloader/utils/time_utils/__init__.py +5 -11
  176. novel_downloader/utils/time_utils/datetime_utils.py +20 -29
  177. novel_downloader/utils/time_utils/sleep_utils.py +4 -8
  178. novel_downloader/web/__init__.py +13 -0
  179. novel_downloader/web/components/__init__.py +11 -0
  180. novel_downloader/web/components/navigation.py +35 -0
  181. novel_downloader/web/main.py +66 -0
  182. novel_downloader/web/pages/__init__.py +17 -0
  183. novel_downloader/web/pages/download.py +78 -0
  184. novel_downloader/web/pages/progress.py +147 -0
  185. novel_downloader/web/pages/search.py +329 -0
  186. novel_downloader/web/services/__init__.py +17 -0
  187. novel_downloader/web/services/client_dialog.py +164 -0
  188. novel_downloader/web/services/cred_broker.py +113 -0
  189. novel_downloader/web/services/cred_models.py +35 -0
  190. novel_downloader/web/services/task_manager.py +264 -0
  191. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  192. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  193. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  194. novel_downloader/core/downloaders/biquge.py +0 -29
  195. novel_downloader/core/downloaders/esjzone.py +0 -29
  196. novel_downloader/core/downloaders/linovelib.py +0 -29
  197. novel_downloader/core/downloaders/sfacg.py +0 -29
  198. novel_downloader/core/downloaders/yamibo.py +0 -29
  199. novel_downloader/core/exporters/biquge.py +0 -22
  200. novel_downloader/core/exporters/esjzone.py +0 -22
  201. novel_downloader/core/exporters/qianbi.py +0 -22
  202. novel_downloader/core/exporters/sfacg.py +0 -22
  203. novel_downloader/core/exporters/yamibo.py +0 -22
  204. novel_downloader/core/fetchers/base/__init__.py +0 -14
  205. novel_downloader/core/fetchers/base/browser.py +0 -422
  206. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  207. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  208. novel_downloader/core/fetchers/esjzone/browser.py +0 -209
  209. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  210. novel_downloader/core/fetchers/linovelib/browser.py +0 -198
  211. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  212. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  213. novel_downloader/core/fetchers/qidian/browser.py +0 -326
  214. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  215. novel_downloader/core/fetchers/sfacg/browser.py +0 -194
  216. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  217. novel_downloader/core/fetchers/yamibo/browser.py +0 -234
  218. novel_downloader/core/parsers/biquge.py +0 -139
  219. novel_downloader/models/chapter.py +0 -25
  220. novel_downloader/models/types.py +0 -13
  221. novel_downloader/tui/__init__.py +0 -7
  222. novel_downloader/tui/app.py +0 -32
  223. novel_downloader/tui/main.py +0 -17
  224. novel_downloader/tui/screens/__init__.py +0 -14
  225. novel_downloader/tui/screens/home.py +0 -198
  226. novel_downloader/tui/screens/login.py +0 -74
  227. novel_downloader/tui/styles/home_layout.tcss +0 -79
  228. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  229. novel_downloader/utils/cache.py +0 -24
  230. novel_downloader/utils/fontocr/__init__.py +0 -22
  231. novel_downloader/utils/fontocr/hash_store.py +0 -280
  232. novel_downloader/utils/fontocr/hash_utils.py +0 -103
  233. novel_downloader/utils/fontocr/model_loader.py +0 -69
  234. novel_downloader/utils/fontocr/ocr_v1.py +0 -315
  235. novel_downloader/utils/fontocr/ocr_v2.py +0 -764
  236. novel_downloader/utils/fontocr/ocr_v3.py +0 -744
  237. novel_downloader-1.5.0.dist-info/METADATA +0 -196
  238. novel_downloader-1.5.0.dist-info/RECORD +0 -164
  239. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  240. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  241. {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,162 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.dxmwx
4
+ -----------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from datetime import datetime
10
+ from typing import Any
11
+
12
+ from lxml import html
13
+
14
+ from novel_downloader.core.parsers.base import BaseParser
15
+ from novel_downloader.core.parsers.registry import register_parser
16
+ from novel_downloader.models import (
17
+ BookInfoDict,
18
+ ChapterDict,
19
+ ChapterInfoDict,
20
+ VolumeInfoDict,
21
+ )
22
+
23
+
24
@register_parser(
    site_keys=["dxmwx"],
)
class DxmwxParser(BaseParser):
    """
    Parser for 大熊猫文学网 (dxmwx.org) book pages.

    ``parse_book_info`` expects two HTML documents: the book-info page and
    the catalog (chapter list) page, in that order. ``parse_chapter``
    expects the chapter page HTML.
    """

    # Pre-compiled patterns shared by all instances.
    _RE_DATE = re.compile(r"\d{4}-\d{2}-\d{2}")
    _RE_SPACES = re.compile(r"[ \t\u3000]+")
    _RE_NEWLINES = re.compile(r"\n{2,}")
    _RE_TITLE_WS = re.compile(r"\s+")
    # Leading full-/half-width colon left over after the intro label.
    _RE_SUMMARY_PREFIX = re.compile(r"^\s*[::]\s*")

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse book metadata and chapter structure.

        :param html_list: ``[info_page_html, catalog_page_html]``.
        :return: Parsed book info, or ``None`` when both pages are not given.
        """
        if len(html_list) < 2:
            return None

        info_tree = html.fromstring(html_list[0])
        catalog_tree = html.fromstring(html_list[1])

        book_name = self._first_str(
            info_tree.xpath("//span[contains(@style,'font-size: 24px')]/text()")
        )
        author = self._first_str(
            info_tree.xpath(
                "//div[contains(@style,'height: 28px') and contains(., '著')]//a/text()"
            )
        )
        tags = [
            t.strip()
            for t in info_tree.xpath("//span[@class='typebut']//a/text()")
            if t.strip()
        ]

        # Only prefix the site origin when a cover path was actually found;
        # otherwise leave the URL empty instead of emitting the bare domain.
        cover_src = self._first_str(
            info_tree.xpath("//img[@class='imgwidth']/@src")
        )
        cover_url = "https://www.dxmwx.org" + cover_src if cover_src else ""

        raw_update = self._first_str(
            info_tree.xpath(
                "normalize-space(string(//span[starts-with(normalize-space(.), '更新时间:')]))"  # noqa: E501
            )
        )
        raw_update = raw_update.replace("更新时间:", "").strip()
        update_time = self._normalize_update_date(raw_update)

        # The summary lives in the first padded min-height <div>.
        nodes = info_tree.xpath(
            "//div[contains(@style,'min-height') and "
            "contains(@style,'padding-left') and contains(@style,'padding-right')][1]"
        )
        summary = ""
        if nodes:
            texts = [
                t.replace("\xa0", " ").strip() for t in nodes[0].xpath(".//text()")
            ]
            lines = [t for t in texts if t]
            summary = "\n".join(lines)
            summary = self._RE_SUMMARY_PREFIX.sub("", summary)
            summary = self._clean_spaces(summary)

        chapters: list[ChapterInfoDict] = []
        for a in catalog_tree.xpath(
            "//div[contains(@style,'height:40px') and contains(@style,'border-bottom')]//a"  # noqa: E501
        ):
            href = a.get("href") or ""
            title = (a.text_content() or "").strip()
            if not href or not title:
                continue
            # "/read/57215_50197663.html" -> "50197663"
            chap_id = href.split("read/", 1)[-1].split(".html", 1)[0].split("_")[-1]
            chapters.append({"title": title, "url": href, "chapterId": chap_id})
        # The site has no volume structure; wrap everything in one volume.
        volumes: list[VolumeInfoDict] = [{"volume_name": "正文", "chapters": chapters}]

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter page into a title plus cleaned paragraph text.

        :param html_list: ``[chapter_page_html]``.
        :param chapter_id: Identifier used in the output and fallback title.
        :return: Chapter dict, or ``None`` when no usable content was found.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        title = self._first_str(tree.xpath("//h1[@id='ChapterTitle']/text()"))
        title = self._RE_TITLE_WS.sub(" ", title).strip()
        if not title:
            title = f"第 {chapter_id} 章"

        paragraphs: list[str] = []
        for p in tree.xpath("//div[@id='Lab_Contents']//p"):
            text = self._clean_spaces(p.text_content())
            if not text:
                continue
            # Drop in-page audiobook ads and the site watermark line.
            if "点这里听书" in text or "大熊猫文学" in text:
                continue
            paragraphs.append(text)

        content = "\n".join(paragraphs).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "dxmwx"},
        }

    @classmethod
    def _clean_spaces(cls, s: str) -> str:
        """Collapse runs of spaces/tabs and blank lines; strip the result."""
        s = s.replace("\xa0", " ")
        s = cls._RE_SPACES.sub(" ", s)
        s = cls._RE_NEWLINES.sub("\n", s)
        return s.strip()

    @classmethod
    def _normalize_update_date(cls, raw: str) -> str:
        """
        Return a ``YYYY-MM-DD`` string extracted from *raw*, falling back to
        today's date when *raw* is empty or contains no recognizable date.
        """
        if not raw:
            return datetime.now().strftime("%Y-%m-%d")
        m = cls._RE_DATE.search(raw)
        if m:
            return m.group(0)
        return datetime.now().strftime("%Y-%m-%d")
@@ -0,0 +1,224 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.parsers.eightnovel
4
+ ----------------------------------------
5
+
6
+ """
7
+
8
+ import re
9
+ from typing import Any
10
+
11
+ from lxml import html
12
+
13
+ from novel_downloader.core.parsers.base import BaseParser
14
+ from novel_downloader.core.parsers.registry import register_parser
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ ChapterInfoDict,
19
+ VolumeInfoDict,
20
+ )
21
+
22
+
23
@register_parser(
    site_keys=["eightnovel", "8novel"],
)
class EightnovelParser(BaseParser):
    """
    Parser for 无限轻小说 (8novel.com) book pages.

    ``parse_book_info`` expects the book page HTML; ``parse_chapter``
    expects ``[book_page_html, chapter_fragment_html]`` because chapter
    titles are only available in JS arrays on the book page.
    """

    BASE_URL = "https://www.8novel.com"
    # Captures the quoted payload of JS expressions like "a,b,c".split(",").
    _SPLIT_STR_PATTERN = re.compile(
        r'["\']([^"\']+)["\']\s*\.split\s*\(\s*["\']\s*,\s*["\']\s*\)', re.DOTALL
    )
    _RE_AUTHOR = re.compile(r"作者[::]?\s*")
    _RE_UPDATE = re.compile(r"更新[::]?\s*")

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        """
        Parse book metadata and the volume/chapter tree.

        :param html_list: ``[book_page_html]``.
        :return: Parsed book info, or ``None`` when no HTML was given.
        """
        if not html_list:
            return None

        tree = html.fromstring(html_list[0])

        # --- Basic metadata ---
        book_name = self._first_str(tree.xpath("//li[contains(@class,'h2')]/text()"))

        author_raw = self._first_str(
            tree.xpath("//span[contains(@class,'item-info-author')]/text()")
        )
        author = self._RE_AUTHOR.sub("", author_raw)

        cover_url = self.BASE_URL + self._first_str(
            tree.xpath("//div[contains(@class,'item-cover')]//img/@src")
        )

        update_raw = self._first_str(
            tree.xpath("//span[contains(@class,'item-info-date')]/text()")
        )
        update_time = self._RE_UPDATE.sub("", update_raw)

        counts = tree.xpath(
            "//li[@class='small text-gray']//span[contains(@class,'item-info-num')]/text()"  # noqa: E501
        )
        # Second counter is the word count, displayed in units of 10k chars.
        word_count = counts[1].strip() + "萬字" if len(counts) >= 2 else ""

        tags = tree.xpath("//meta[@property='og:novel:category']/@content")

        # --- Summary ---
        summary_nodes = tree.xpath(
            "//li[contains(@class,'full_text') and contains(@class,'mt-2')]"
        )
        if summary_nodes:
            texts = [t.strip() for t in summary_nodes[0].itertext()]
            summary = "\n".join(line for line in texts if line)
        else:
            summary = ""

        # --- Chapters / Volumes ---
        volumes: list[VolumeInfoDict] = []
        for vol_div in tree.xpath("//div[contains(@class,'folder') and @pid]"):
            # Volume title; text after "/" is chapter-count noise.
            h3 = vol_div.xpath(".//div[contains(@class,'vol-title')]//h3")
            vol_name = (
                h3[0].text_content().split("/")[0].strip() if h3 else "Unnamed Volume"
            )

            # Chapters
            chapters: list[ChapterInfoDict] = []
            for a in vol_div.xpath(
                ".//a[contains(@class,'episode_li') and contains(@class,'d-block')]"
            ):
                title = (a.text_content() or "").strip()
                href = a.get("href") or ""
                if not href or not title:
                    continue
                url = href if href.startswith("http") else self.BASE_URL + href
                chapter_id = href.split("?")[-1]  # "/read/3355/?270015" -> "270015"
                chapters.append({"title": title, "url": url, "chapterId": chapter_id})

            volumes.append({"volume_name": vol_name, "chapters": chapters})

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "tags": tags,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        """
        Parse a chapter fragment into text/image segments.

        :param html_list: ``[book_page_html, chapter_fragment_html]``.
        :param chapter_id: Chapter identifier; used to look up the title.
        :return: Chapter dict, or ``None`` when no usable content was found.
        """
        if len(html_list) < 2:
            return None

        # Title lookup is best-effort: the JS arrays may be absent/malformed.
        try:
            id_title_map = self._build_id_title_map(html_list[0])
            title = id_title_map.get(chapter_id) or ""
        except Exception:
            title = ""

        wrapper = html.fromstring(f"<div>{html_list[1]}</div>")

        segments: list[str] = []

        self._append_segment(segments, wrapper.text)

        for node in wrapper:
            tag = node.tag.lower() if isinstance(node.tag, str) else ""

            # A picture-gallery block
            if tag == "div" and "content-pics" in (node.get("class") or ""):
                for img in node.xpath(".//img"):
                    src = img.get("src")
                    if not src:
                        # <img> without src would crash startswith() below
                        continue
                    full = src if not src.startswith("/") else self.BASE_URL + src
                    segments.append(f'<img src="{full}" />')
                self._append_segment(segments, node.tail)

            # Standalone <img>
            elif tag == "img":
                src = node.get("src")
                if not src:
                    continue
                full = src if not src.startswith("/") else self.BASE_URL + src
                segments.append(f'<img src="{full}" />')
                self._append_segment(segments, node.tail)

            # Line break -> text in .tail is next paragraph
            elif tag == "br":
                self._append_segment(segments, node.tail)

            # Any other element -> get its text content
            else:
                self._append_segment(segments, node.text_content())
                self._append_segment(segments, node.tail)

        # Remove final ad line if present ("8novel" spelled with lookalikes)
        if segments and segments[-1] and segments[-1][0] in ("8", "⑧", "⒏"):
            segments.pop()

        content = "\n".join(segments).strip()
        if not content:
            return None

        return {
            "id": chapter_id,
            "title": title,
            "content": content,
            "extra": {"site": "eightnovel"},
        }

    @staticmethod
    def _append_segment(segments: list[str], text: str | None) -> None:
        """
        Strip *text* and append it to *segments* when non-empty.
        """
        if not text:
            return
        cleaned = text.strip()
        if cleaned:
            segments.append(cleaned)

    @classmethod
    def _build_id_title_map(cls, html_str: str) -> dict[str, str]:
        """
        Extract two comma-split JS lists from *html_str* and pair them up:

        - a numeric list of chapter IDs (one element longer than the titles)
        - a list of chapter titles

        :raises ValueError: if both lists cannot be located, or their
            lengths do not differ by exactly one.
        """
        id_list = None
        title_list = None

        for content in cls._SPLIT_STR_PATTERN.findall(html_str):
            items = [s.strip() for s in content.split(",")]
            if items == [""]:
                # skip bids=""
                continue
            if all(item.isdigit() for item in items):
                id_list = items
            else:
                title_list = items

            if id_list and title_list:
                break

        if not id_list or not title_list:
            raise ValueError("Could not locate both ID and title lists")
        if len(id_list) != len(title_list) + 1:
            raise ValueError(
                "ID list must be exactly one element longer than title list"
            )

        # The trailing ID has no matching title; drop it before pairing.
        return dict(zip(id_list[:-1], title_list, strict=False))
@@ -12,26 +12,20 @@ from lxml import html
12
12
 
13
13
  from novel_downloader.core.parsers.base import BaseParser
14
14
  from novel_downloader.core.parsers.registry import register_parser
15
- from novel_downloader.models import ChapterDict
15
+ from novel_downloader.models import (
16
+ BookInfoDict,
17
+ ChapterDict,
18
+ VolumeInfoDict,
19
+ )
16
20
 
17
21
 
18
22
  @register_parser(
19
23
  site_keys=["esjzone"],
20
- backends=["session", "browser"],
21
24
  )
22
25
  class EsjzoneParser(BaseParser):
23
- """ """
24
-
25
- # Book info XPaths
26
- _BOOK_NAME_XPATH = '//h2[contains(@class, "text-normal")]/text()'
27
- _AUTHOR_XPATH = '//li[strong[text()="作者:"]]/a/text()'
28
- _COVER_URL_XPATH = '//div[contains(@class,"product-gallery")]//img/@src'
29
- _UPDATE_TIME_XPATH = '//li[strong[text()="更新日期:"]]/text()'
30
- _WORD_COUNT_XPATH = '//span[@id="txt"]/text()'
31
- _TYPE_XPATH = '//li[strong[text()="類型:"]]/text()'
32
- _ALT_NAME_XPATH = '//li[strong[text()="其他書名:"]]/text()'
33
- _WEB_URL_XPATH = '//li[strong[text()="Web生肉:"]]/a/@href'
34
- _SUMMARY_XPATH = '//div[@class="description"]/p//text()'
26
+ """
27
+ Parser for esjzone book pages.
28
+ """
35
29
 
36
30
  # Chapter XPaths
37
31
  _CHAPTER_TEXT_XPATH = 'string(//div[contains(@class, "forum-content")])'
@@ -40,14 +34,13 @@ class EsjzoneParser(BaseParser):
40
34
  '//i[contains(@class, "icon-clock")]/following-sibling::text()',
41
35
  '//i[contains(@class, "icon-pen-tool")]/following-sibling::text()',
42
36
  ]
43
-
44
37
  _CHECK_FORUM_XPATH = '//div[@class="page-title"]//ul[@class="breadcrumbs"]/li[not(@class="slash")]//text()' # noqa: E501
45
38
 
46
39
  def parse_book_info(
47
40
  self,
48
41
  html_list: list[str],
49
42
  **kwargs: Any,
50
- ) -> dict[str, Any]:
43
+ ) -> BookInfoDict | None:
51
44
  """
52
45
  Parse a book info page and extract metadata and chapter structure.
53
46
 
@@ -58,27 +51,40 @@ class EsjzoneParser(BaseParser):
58
51
  :return: Parsed metadata and chapter structure as a dictionary.
59
52
  """
60
53
  if not html_list or self._is_forum_page(html_list):
61
- return {}
54
+ return None
55
+
62
56
  tree = html.fromstring(html_list[0])
63
- result: dict[str, Any] = {}
64
-
65
- result["book_name"] = self._get_text(tree, self._BOOK_NAME_XPATH)
66
- result["author"] = self._get_text(tree, self._AUTHOR_XPATH)
67
- result["cover_url"] = self._get_text(tree, self._COVER_URL_XPATH)
68
- result["update_time"] = self._get_text(tree, self._UPDATE_TIME_XPATH)
69
- result["word_count"] = self._get_text(
70
- tree, self._WORD_COUNT_XPATH, clean_comma=True
57
+
58
+ # --- Basic metadata ---
59
+ book_name = self._first_str(
60
+ tree.xpath('//h2[contains(@class,"text-normal")]/text()')
71
61
  )
72
- result["type"] = self._get_text(tree, self._TYPE_XPATH)
73
- result["alt_name"] = self._get_text(tree, self._ALT_NAME_XPATH)
74
- result["web_url"] = self._get_text(tree, self._WEB_URL_XPATH)
75
- # result["summary"] = self._get_text(tree, self._SUMMARY_XPATH, join=True)
62
+ author = self._first_str(tree.xpath('//li[strong[text()="作者:"]]/a/text()'))
63
+ cover_url = self._first_str(
64
+ tree.xpath('//div[contains(@class,"product-gallery")]//img/@src')
65
+ )
66
+ update_time = self._first_str(
67
+ tree.xpath('//li[strong[text()="更新日期:"]]/text()')
68
+ ) # noqa: E501
69
+ word_count = self._first_str(
70
+ tree.xpath('//span[@id="txt"]/text()'), replaces=[(",", "")]
71
+ )
72
+ book_type = self._first_str(tree.xpath('//li[strong[text()="類型:"]]/text()'))
73
+ alt_name = self._first_str(
74
+ tree.xpath('//li[strong[text()="其他書名:"]]/text()')
75
+ ) # noqa: E501
76
+ web_url = self._first_str(tree.xpath('//li[strong[text()="Web生肉:"]]/a/@href'))
77
+
78
+ # Summary paragraphs
76
79
  paras = tree.xpath('//div[@class="description"]/p')
77
80
  texts = [p.xpath("string()").strip() for p in paras]
78
- result["summary"] = "\n".join(texts).strip()
81
+ summary = "\n".join(t for t in texts if t)
79
82
 
80
- volumes: list[dict[str, Any]] = []
81
- current_vol: dict[str, Any] = {}
83
+ current_vol: VolumeInfoDict = {
84
+ "volume_name": "單卷",
85
+ "chapters": [],
86
+ }
87
+ volumes: list[VolumeInfoDict] = [current_vol]
82
88
 
83
89
  def _is_garbage_title(name: str) -> bool:
84
90
  stripped = name.strip()
@@ -89,25 +95,18 @@ class EsjzoneParser(BaseParser):
89
95
  if _is_garbage_title(name):
90
96
  return
91
97
  name = name.strip() or "未命名卷"
92
- if name == "未命名卷" and current_vol is not None:
98
+ if current_vol and current_vol["volume_name"] == name:
93
99
  return
94
100
  current_vol = {"volume_name": name, "chapters": []}
95
101
  volumes.append(current_vol)
96
102
 
97
- _start_volume("單卷")
98
-
99
- # nodes = tree.xpath('//div[@id="chapterList"]/details') + tree.xpath(
100
- # '//div[@id="chapterList"]/*[not(self::details)]'
101
- # )
102
103
  nodes = tree.xpath('//div[@id="chapterList"]/*')
103
-
104
104
  for node in nodes:
105
105
  tag = node.tag.lower()
106
106
 
107
107
  if tag == "details":
108
108
  # ---- DETAILS-based layout ----
109
- summary = node.find("summary")
110
- vol_name = summary.text if summary is not None else "未命名卷"
109
+ vol_name = node.xpath("string(./summary)").strip() or "未命名卷"
111
110
  _start_volume(vol_name)
112
111
 
113
112
  # all chapters inside this details
@@ -116,7 +115,11 @@ class EsjzoneParser(BaseParser):
116
115
  href = a.get("href", "")
117
116
  chap_id = href.rstrip("/").split("/")[-1].split(".", 1)[0]
118
117
  current_vol["chapters"].append(
119
- {"title": title, "url": href, "chapterId": chap_id}
118
+ {
119
+ "title": title,
120
+ "url": href,
121
+ "chapterId": chap_id,
122
+ }
120
123
  )
121
124
 
122
125
  elif (
@@ -139,9 +142,21 @@ class EsjzoneParser(BaseParser):
139
142
  {"title": title, "url": href, "chapterId": chap_id}
140
143
  )
141
144
  volumes = [vol for vol in volumes if vol["chapters"]]
142
- result["volumes"] = volumes
143
145
 
144
- return result
146
+ return {
147
+ "book_name": book_name,
148
+ "author": author,
149
+ "cover_url": cover_url,
150
+ "update_time": update_time,
151
+ "summary": summary,
152
+ "tags": [book_type],
153
+ "word_count": word_count,
154
+ "volumes": volumes,
155
+ "extra": {
156
+ "alt_name": alt_name,
157
+ "web_url": web_url,
158
+ },
159
+ }
145
160
 
146
161
  def parse_chapter(
147
162
  self,
@@ -149,16 +164,9 @@ class EsjzoneParser(BaseParser):
149
164
  chapter_id: str,
150
165
  **kwargs: Any,
151
166
  ) -> ChapterDict | None:
152
- """
153
- Parse a single chapter page and extract clean text or simplified HTML.
154
-
155
- :param html_list: Raw HTML of the chapter page.
156
- :param chapter_id: Identifier of the chapter being parsed.
157
- :return: Cleaned chapter content as plain text or minimal HTML.
158
- """
159
167
  if not html_list or self._is_forum_page(html_list):
160
168
  return None
161
- tree = html.fromstring(html_list[0], parser=None)
169
+ tree = html.fromstring(html_list[0])
162
170
 
163
171
  content_lines: list[str] = []
164
172
  content_nodes = tree.xpath(self._CHAPTER_CONTENT_NODES_XPATH)
@@ -178,7 +186,7 @@ class EsjzoneParser(BaseParser):
178
186
  content_lines.append(f'<img src="{src}" />')
179
187
 
180
188
  content = (
181
- "\n\n".join(content_lines).strip()
189
+ "\n".join(content_lines).strip()
182
190
  if content_lines
183
191
  else tree.xpath(self._CHAPTER_TEXT_XPATH).strip()
184
192
  )
@@ -216,16 +224,3 @@ class EsjzoneParser(BaseParser):
216
224
  breadcrumb: list[str] = tree.xpath(self._CHECK_FORUM_XPATH)
217
225
  breadcrumb = [s.strip() for s in breadcrumb if s.strip()]
218
226
  return breadcrumb == ["Home", "論壇"]
219
-
220
- @staticmethod
221
- def _get_text(
222
- tree: html.HtmlElement,
223
- xpath: str,
224
- join: bool = False,
225
- clean_comma: bool = False,
226
- ) -> str:
227
- data = tree.xpath(xpath)
228
- if not data:
229
- return ""
230
- text = "\n".join(data) if join else data[0].strip()
231
- return text.replace(",", "") if clean_comma else text