novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -4
  3. novel_downloader/cli/clean.py +21 -88
  4. novel_downloader/cli/config.py +27 -104
  5. novel_downloader/cli/download.py +78 -66
  6. novel_downloader/cli/export.py +20 -21
  7. novel_downloader/cli/main.py +3 -1
  8. novel_downloader/cli/search.py +120 -0
  9. novel_downloader/cli/ui.py +156 -0
  10. novel_downloader/config/__init__.py +10 -14
  11. novel_downloader/config/adapter.py +195 -99
  12. novel_downloader/config/{loader.py → file_io.py} +53 -27
  13. novel_downloader/core/__init__.py +14 -13
  14. novel_downloader/core/archived/deqixs/fetcher.py +115 -0
  15. novel_downloader/core/archived/deqixs/parser.py +132 -0
  16. novel_downloader/core/archived/deqixs/searcher.py +89 -0
  17. novel_downloader/core/archived/qidian/searcher.py +79 -0
  18. novel_downloader/core/archived/wanbengo/searcher.py +98 -0
  19. novel_downloader/core/archived/xshbook/searcher.py +93 -0
  20. novel_downloader/core/downloaders/__init__.py +8 -30
  21. novel_downloader/core/downloaders/base.py +182 -30
  22. novel_downloader/core/downloaders/common.py +217 -384
  23. novel_downloader/core/downloaders/qianbi.py +332 -4
  24. novel_downloader/core/downloaders/qidian.py +250 -290
  25. novel_downloader/core/downloaders/registry.py +69 -0
  26. novel_downloader/core/downloaders/signals.py +46 -0
  27. novel_downloader/core/exporters/__init__.py +8 -26
  28. novel_downloader/core/exporters/base.py +107 -31
  29. novel_downloader/core/exporters/common/__init__.py +3 -4
  30. novel_downloader/core/exporters/common/epub.py +92 -171
  31. novel_downloader/core/exporters/common/main_exporter.py +14 -67
  32. novel_downloader/core/exporters/common/txt.py +90 -86
  33. novel_downloader/core/exporters/epub_util.py +184 -1327
  34. novel_downloader/core/exporters/linovelib/__init__.py +3 -2
  35. novel_downloader/core/exporters/linovelib/epub.py +165 -222
  36. novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
  37. novel_downloader/core/exporters/linovelib/txt.py +76 -66
  38. novel_downloader/core/exporters/qidian.py +15 -11
  39. novel_downloader/core/exporters/registry.py +55 -0
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/fetchers/__init__.py +57 -56
  42. novel_downloader/core/fetchers/aaatxt.py +83 -0
  43. novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
  44. novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
  45. novel_downloader/core/fetchers/biquyuedu.py +83 -0
  46. novel_downloader/core/fetchers/dxmwx.py +110 -0
  47. novel_downloader/core/fetchers/eightnovel.py +139 -0
  48. novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
  49. novel_downloader/core/fetchers/guidaye.py +85 -0
  50. novel_downloader/core/fetchers/hetushu.py +92 -0
  51. novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
  52. novel_downloader/core/fetchers/ixdzs8.py +113 -0
  53. novel_downloader/core/fetchers/jpxs123.py +101 -0
  54. novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
  55. novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
  56. novel_downloader/core/fetchers/piaotia.py +105 -0
  57. novel_downloader/core/fetchers/qbtr.py +101 -0
  58. novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
  59. novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
  60. novel_downloader/core/fetchers/quanben5.py +92 -0
  61. novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
  62. novel_downloader/core/fetchers/registry.py +60 -0
  63. novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
  64. novel_downloader/core/fetchers/shencou.py +106 -0
  65. novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
  66. novel_downloader/core/fetchers/tongrenquan.py +84 -0
  67. novel_downloader/core/fetchers/ttkan.py +95 -0
  68. novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
  69. novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
  70. novel_downloader/core/fetchers/xiguashuwu.py +177 -0
  71. novel_downloader/core/fetchers/xs63b.py +171 -0
  72. novel_downloader/core/fetchers/xshbook.py +85 -0
  73. novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
  74. novel_downloader/core/fetchers/yibige.py +114 -0
  75. novel_downloader/core/interfaces/__init__.py +8 -14
  76. novel_downloader/core/interfaces/downloader.py +6 -2
  77. novel_downloader/core/interfaces/exporter.py +7 -7
  78. novel_downloader/core/interfaces/fetcher.py +4 -17
  79. novel_downloader/core/interfaces/parser.py +5 -6
  80. novel_downloader/core/interfaces/searcher.py +26 -0
  81. novel_downloader/core/parsers/__init__.py +58 -22
  82. novel_downloader/core/parsers/aaatxt.py +132 -0
  83. novel_downloader/core/parsers/b520.py +116 -0
  84. novel_downloader/core/parsers/base.py +63 -12
  85. novel_downloader/core/parsers/biquyuedu.py +133 -0
  86. novel_downloader/core/parsers/dxmwx.py +162 -0
  87. novel_downloader/core/parsers/eightnovel.py +224 -0
  88. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
  89. novel_downloader/core/parsers/guidaye.py +128 -0
  90. novel_downloader/core/parsers/hetushu.py +139 -0
  91. novel_downloader/core/parsers/i25zw.py +137 -0
  92. novel_downloader/core/parsers/ixdzs8.py +186 -0
  93. novel_downloader/core/parsers/jpxs123.py +137 -0
  94. novel_downloader/core/parsers/lewenn.py +142 -0
  95. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
  96. novel_downloader/core/parsers/piaotia.py +189 -0
  97. novel_downloader/core/parsers/qbtr.py +136 -0
  98. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
  99. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  100. novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
  101. novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
  102. novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
  103. novel_downloader/core/parsers/qidian/main_parser.py +19 -57
  104. novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
  105. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
  106. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
  107. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
  108. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  109. novel_downloader/core/parsers/quanben5.py +103 -0
  110. novel_downloader/core/parsers/registry.py +57 -0
  111. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
  112. novel_downloader/core/parsers/shencou.py +215 -0
  113. novel_downloader/core/parsers/shuhaige.py +111 -0
  114. novel_downloader/core/parsers/tongrenquan.py +116 -0
  115. novel_downloader/core/parsers/ttkan.py +132 -0
  116. novel_downloader/core/parsers/wanbengo.py +191 -0
  117. novel_downloader/core/parsers/xiaoshuowu.py +173 -0
  118. novel_downloader/core/parsers/xiguashuwu.py +435 -0
  119. novel_downloader/core/parsers/xs63b.py +161 -0
  120. novel_downloader/core/parsers/xshbook.py +134 -0
  121. novel_downloader/core/parsers/yamibo.py +155 -0
  122. novel_downloader/core/parsers/yibige.py +166 -0
  123. novel_downloader/core/searchers/__init__.py +51 -0
  124. novel_downloader/core/searchers/aaatxt.py +107 -0
  125. novel_downloader/core/searchers/b520.py +84 -0
  126. novel_downloader/core/searchers/base.py +168 -0
  127. novel_downloader/core/searchers/dxmwx.py +105 -0
  128. novel_downloader/core/searchers/eightnovel.py +84 -0
  129. novel_downloader/core/searchers/esjzone.py +102 -0
  130. novel_downloader/core/searchers/hetushu.py +92 -0
  131. novel_downloader/core/searchers/i25zw.py +93 -0
  132. novel_downloader/core/searchers/ixdzs8.py +107 -0
  133. novel_downloader/core/searchers/jpxs123.py +107 -0
  134. novel_downloader/core/searchers/piaotia.py +100 -0
  135. novel_downloader/core/searchers/qbtr.py +106 -0
  136. novel_downloader/core/searchers/qianbi.py +165 -0
  137. novel_downloader/core/searchers/quanben5.py +144 -0
  138. novel_downloader/core/searchers/registry.py +79 -0
  139. novel_downloader/core/searchers/shuhaige.py +124 -0
  140. novel_downloader/core/searchers/tongrenquan.py +110 -0
  141. novel_downloader/core/searchers/ttkan.py +92 -0
  142. novel_downloader/core/searchers/xiaoshuowu.py +122 -0
  143. novel_downloader/core/searchers/xiguashuwu.py +95 -0
  144. novel_downloader/core/searchers/xs63b.py +104 -0
  145. novel_downloader/locales/en.json +36 -79
  146. novel_downloader/locales/zh.json +37 -80
  147. novel_downloader/models/__init__.py +23 -50
  148. novel_downloader/models/book.py +44 -0
  149. novel_downloader/models/config.py +16 -43
  150. novel_downloader/models/login.py +1 -1
  151. novel_downloader/models/search.py +21 -0
  152. novel_downloader/resources/config/settings.toml +39 -74
  153. novel_downloader/resources/css_styles/intro.css +83 -0
  154. novel_downloader/resources/css_styles/main.css +30 -89
  155. novel_downloader/resources/json/xiguashuwu.json +718 -0
  156. novel_downloader/utils/__init__.py +43 -0
  157. novel_downloader/utils/chapter_storage.py +247 -226
  158. novel_downloader/utils/constants.py +5 -50
  159. novel_downloader/utils/cookies.py +6 -18
  160. novel_downloader/utils/crypto_utils/__init__.py +13 -0
  161. novel_downloader/utils/crypto_utils/aes_util.py +90 -0
  162. novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
  163. novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
  164. novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
  165. novel_downloader/utils/epub/__init__.py +34 -0
  166. novel_downloader/utils/epub/builder.py +377 -0
  167. novel_downloader/utils/epub/constants.py +118 -0
  168. novel_downloader/utils/epub/documents.py +297 -0
  169. novel_downloader/utils/epub/models.py +120 -0
  170. novel_downloader/utils/epub/utils.py +179 -0
  171. novel_downloader/utils/file_utils/__init__.py +5 -30
  172. novel_downloader/utils/file_utils/io.py +9 -150
  173. novel_downloader/utils/file_utils/normalize.py +2 -2
  174. novel_downloader/utils/file_utils/sanitize.py +2 -7
  175. novel_downloader/utils/fontocr.py +207 -0
  176. novel_downloader/utils/i18n.py +2 -0
  177. novel_downloader/utils/logger.py +10 -16
  178. novel_downloader/utils/network.py +111 -252
  179. novel_downloader/utils/state.py +5 -90
  180. novel_downloader/utils/text_utils/__init__.py +16 -21
  181. novel_downloader/utils/text_utils/diff_display.py +6 -9
  182. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  183. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  184. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  185. novel_downloader/utils/time_utils/__init__.py +6 -12
  186. novel_downloader/utils/time_utils/datetime_utils.py +23 -33
  187. novel_downloader/utils/time_utils/sleep_utils.py +5 -10
  188. novel_downloader/web/__init__.py +13 -0
  189. novel_downloader/web/components/__init__.py +11 -0
  190. novel_downloader/web/components/navigation.py +35 -0
  191. novel_downloader/web/main.py +66 -0
  192. novel_downloader/web/pages/__init__.py +17 -0
  193. novel_downloader/web/pages/download.py +78 -0
  194. novel_downloader/web/pages/progress.py +147 -0
  195. novel_downloader/web/pages/search.py +329 -0
  196. novel_downloader/web/services/__init__.py +17 -0
  197. novel_downloader/web/services/client_dialog.py +164 -0
  198. novel_downloader/web/services/cred_broker.py +113 -0
  199. novel_downloader/web/services/cred_models.py +35 -0
  200. novel_downloader/web/services/task_manager.py +264 -0
  201. novel_downloader-2.0.0.dist-info/METADATA +171 -0
  202. novel_downloader-2.0.0.dist-info/RECORD +210 -0
  203. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
  204. novel_downloader/config/site_rules.py +0 -94
  205. novel_downloader/core/downloaders/biquge.py +0 -25
  206. novel_downloader/core/downloaders/esjzone.py +0 -25
  207. novel_downloader/core/downloaders/linovelib.py +0 -25
  208. novel_downloader/core/downloaders/sfacg.py +0 -25
  209. novel_downloader/core/downloaders/yamibo.py +0 -25
  210. novel_downloader/core/exporters/biquge.py +0 -25
  211. novel_downloader/core/exporters/esjzone.py +0 -25
  212. novel_downloader/core/exporters/qianbi.py +0 -25
  213. novel_downloader/core/exporters/sfacg.py +0 -25
  214. novel_downloader/core/exporters/yamibo.py +0 -25
  215. novel_downloader/core/factory/__init__.py +0 -20
  216. novel_downloader/core/factory/downloader.py +0 -73
  217. novel_downloader/core/factory/exporter.py +0 -58
  218. novel_downloader/core/factory/fetcher.py +0 -96
  219. novel_downloader/core/factory/parser.py +0 -86
  220. novel_downloader/core/fetchers/base/__init__.py +0 -14
  221. novel_downloader/core/fetchers/base/browser.py +0 -403
  222. novel_downloader/core/fetchers/biquge/__init__.py +0 -14
  223. novel_downloader/core/fetchers/common/__init__.py +0 -14
  224. novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
  225. novel_downloader/core/fetchers/esjzone/browser.py +0 -204
  226. novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
  227. novel_downloader/core/fetchers/linovelib/browser.py +0 -193
  228. novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
  229. novel_downloader/core/fetchers/qidian/__init__.py +0 -14
  230. novel_downloader/core/fetchers/qidian/browser.py +0 -318
  231. novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
  232. novel_downloader/core/fetchers/sfacg/browser.py +0 -189
  233. novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
  234. novel_downloader/core/fetchers/yamibo/browser.py +0 -229
  235. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  236. novel_downloader/core/parsers/biquge/main_parser.py +0 -134
  237. novel_downloader/core/parsers/common/__init__.py +0 -13
  238. novel_downloader/core/parsers/common/helper.py +0 -323
  239. novel_downloader/core/parsers/common/main_parser.py +0 -106
  240. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  241. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  242. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  243. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  244. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  245. novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
  246. novel_downloader/models/browser.py +0 -21
  247. novel_downloader/models/chapter.py +0 -25
  248. novel_downloader/models/site_rules.py +0 -99
  249. novel_downloader/models/tasks.py +0 -33
  250. novel_downloader/models/types.py +0 -15
  251. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  252. novel_downloader/resources/json/replace_word_map.json +0 -4
  253. novel_downloader/resources/text/blacklist.txt +0 -22
  254. novel_downloader/tui/__init__.py +0 -7
  255. novel_downloader/tui/app.py +0 -32
  256. novel_downloader/tui/main.py +0 -17
  257. novel_downloader/tui/screens/__init__.py +0 -14
  258. novel_downloader/tui/screens/home.py +0 -198
  259. novel_downloader/tui/screens/login.py +0 -74
  260. novel_downloader/tui/styles/home_layout.tcss +0 -79
  261. novel_downloader/tui/widgets/richlog_handler.py +0 -24
  262. novel_downloader/utils/cache.py +0 -24
  263. novel_downloader/utils/fontocr/__init__.py +0 -22
  264. novel_downloader/utils/fontocr/model_loader.py +0 -69
  265. novel_downloader/utils/fontocr/ocr_v1.py +0 -303
  266. novel_downloader/utils/fontocr/ocr_v2.py +0 -752
  267. novel_downloader/utils/hash_store.py +0 -279
  268. novel_downloader/utils/hash_utils.py +0 -103
  269. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  270. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  271. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  272. novel_downloader-1.4.5.dist-info/METADATA +0 -196
  273. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  274. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
  275. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
  276. {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -3,33 +3,33 @@
3
3
  novel_downloader.core.downloaders.common
4
4
  ----------------------------------------
5
5
 
6
+ Concrete downloader implementation with a generic async pipeline for common novel sites
6
7
  """
7
8
 
8
9
  import asyncio
9
- import json
10
10
  from collections.abc import Awaitable, Callable
11
- from contextlib import suppress
12
- from typing import Any, cast
11
+ from pathlib import Path
12
+ from typing import Any
13
13
 
14
14
  from novel_downloader.core.downloaders.base import BaseDownloader
15
+ from novel_downloader.core.downloaders.signals import (
16
+ STOP,
17
+ Progress,
18
+ StopToken,
19
+ )
15
20
  from novel_downloader.models import (
16
21
  BookConfig,
17
22
  ChapterDict,
18
- CidTask,
19
- HtmlTask,
20
- RestoreTask,
21
23
  )
22
- from novel_downloader.utils.chapter_storage import ChapterStorage
23
- from novel_downloader.utils.file_utils import save_as_json, save_as_txt
24
- from novel_downloader.utils.time_utils import (
25
- async_sleep_with_random_delay,
26
- calculate_time_difference,
24
+ from novel_downloader.utils import (
25
+ ChapterStorage,
26
+ async_jitter_sleep,
27
27
  )
28
28
 
29
29
 
30
30
  class CommonDownloader(BaseDownloader):
31
31
  """
32
- Specialized Async downloader for common novels.
32
+ Specialized Async downloader for "common" novel sites.
33
33
  """
34
34
 
35
35
  async def _download_one(
@@ -37,412 +37,245 @@ class CommonDownloader(BaseDownloader):
37
37
  book: BookConfig,
38
38
  *,
39
39
  progress_hook: Callable[[int, int], Awaitable[None]] | None = None,
40
+ cancel_event: asyncio.Event | None = None,
40
41
  **kwargs: Any,
41
42
  ) -> None:
42
43
  """
43
- The full download logic for a single book.
44
+ Sentinel-based pipeline with graceful cancellation:
45
+
46
+ Producer -> ChapterWorkers -> StorageWorker.
44
47
 
45
- :param book: BookConfig with at least 'book_id'.
48
+ On cancel: stop producing, workers finish at most one chapter,
49
+ storage drains, flushes, and exits.
46
50
  """
47
51
  TAG = "[Downloader]"
48
- book_id = book["book_id"]
52
+
53
+ book_id = self._normalize_book_id(book["book_id"])
49
54
  start_id = book.get("start_id")
50
55
  end_id = book.get("end_id")
51
56
  ignore_set = set(book.get("ignore_ids", []))
52
57
 
53
- raw_base = self.raw_data_dir / book_id
54
- cache_base = self.cache_dir / book_id
55
- info_path = raw_base / "book_info.json"
56
- chapters_html_dir = cache_base / "html"
57
-
58
+ raw_base = self._raw_data_dir / book_id
58
59
  raw_base.mkdir(parents=True, exist_ok=True)
59
- if self.save_html:
60
- chapters_html_dir.mkdir(parents=True, exist_ok=True)
61
- normal_cs = ChapterStorage(
60
+ html_dir = self._debug_dir / book_id / "html"
61
+
62
+ chapter_storage = ChapterStorage(
62
63
  raw_base=raw_base,
63
- namespace="chapters",
64
- backend_type=self._config.storage_backend,
65
- batch_size=self._config.storage_batch_size,
64
+ priorities=self.PRIORITIES_MAP,
66
65
  )
66
+ chapter_storage.connect()
67
67
 
68
- # load or fetch book_info
69
- book_info: dict[str, Any]
70
- re_fetch = True
71
- old_data: dict[str, Any] = {}
68
+ def cancelled() -> bool:
69
+ return bool(cancel_event and cancel_event.is_set())
72
70
 
73
- if info_path.exists():
74
- try:
75
- old_data = json.loads(info_path.read_text("utf-8"))
76
- days, *_ = calculate_time_difference(
77
- old_data.get("update_time", ""), "UTC+8"
78
- )
79
- re_fetch = days > 1
80
- except Exception:
81
- re_fetch = True
82
-
83
- if re_fetch:
84
- info_html = await self.fetcher.get_book_info(book_id)
85
- if self.save_html:
86
- for i, html in enumerate(info_html):
87
- save_as_txt(html, chapters_html_dir / f"info_{i}.html")
88
- book_info = self.parser.parse_book_info(info_html)
89
-
90
- if book_info.get("book_name") != "未找到书名":
91
- save_as_json(book_info, info_path)
92
- else:
93
- self.logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
94
- book_info = old_data or {"book_name": "未找到书名"}
95
- else:
96
- book_info = old_data
97
-
98
- vols = book_info.get("volumes", [])
99
- total_chapters = 0
100
- for vol in vols:
101
- total_chapters += len(vol.get("chapters", []))
102
- if total_chapters == 0:
103
- self.logger.warning("%s 书籍没有章节可下载: book_id=%s", TAG, book_id)
104
- return
105
-
106
- completed_count = 0
107
-
108
- # setup queue, semaphore
109
- semaphore = asyncio.Semaphore(self.download_workers)
110
- cid_queue: asyncio.Queue[CidTask] = asyncio.Queue()
111
- restore_queue: asyncio.Queue[RestoreTask] = asyncio.Queue()
112
- html_queue: asyncio.Queue[HtmlTask] = asyncio.Queue()
113
- save_queue: asyncio.Queue[ChapterDict] = asyncio.Queue()
114
- pending_restore: dict[str, RestoreTask] = {}
115
-
116
- def update_book_info(
117
- vol_idx: int,
118
- chap_idx: int,
119
- cid: str,
120
- ) -> None:
121
- try:
122
- book_info["volumes"][vol_idx]["chapters"][chap_idx]["chapterId"] = cid
123
- except (IndexError, KeyError, TypeError) as e:
124
- self.logger.info(
125
- "[update_book_info] Failed to update vol=%s, chap=%s: %s",
126
- vol_idx,
127
- chap_idx,
128
- e,
129
- )
71
+ try:
72
+ # --- metadata ---
73
+ book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
74
+ if not book_info:
75
+ return
130
76
 
131
- async def fetcher_worker(
132
- book_id: str,
133
- cid_queue: asyncio.Queue[CidTask],
134
- html_queue: asyncio.Queue[HtmlTask],
135
- restore_queue: asyncio.Queue[RestoreTask],
136
- retry_times: int,
137
- semaphore: asyncio.Semaphore,
138
- ) -> None:
139
- while True:
140
- task = await cid_queue.get()
141
- cid = task.cid
142
- if not cid and task.prev_cid:
143
- await restore_queue.put(
144
- RestoreTask(
145
- vol_idx=task.vol_idx,
146
- chap_idx=task.chap_idx,
147
- prev_cid=task.prev_cid,
148
- )
149
- )
150
- cid_queue.task_done()
151
- continue
77
+ vols = book_info["volumes"]
78
+ total_chapters = sum(len(v["chapters"]) for v in vols)
79
+ if total_chapters == 0:
80
+ self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
81
+ return
152
82
 
153
- if not cid:
154
- self.logger.warning("[Fetcher] Skipped empty cid task: %s", task)
155
- cid_queue.task_done()
156
- continue
83
+ progress = Progress(total_chapters, progress_hook)
157
84
 
158
- if cid in ignore_set:
159
- cid_queue.task_done()
160
- continue
85
+ # --- queues & batching ---
86
+ cid_q: asyncio.Queue[str | StopToken] = asyncio.Queue()
87
+ save_q: asyncio.Queue[ChapterDict | StopToken] = asyncio.Queue()
88
+ batch: list[ChapterDict] = []
161
89
 
90
+ async def flush_batch() -> None:
91
+ if not batch:
92
+ return
162
93
  try:
163
- async with semaphore:
164
- html_list = await self.fetcher.get_book_chapter(book_id, cid)
165
- await html_queue.put(
166
- HtmlTask(
167
- cid=cid,
168
- retry=task.retry,
169
- html_list=html_list,
170
- vol_idx=task.vol_idx,
171
- chap_idx=task.chap_idx,
172
- )
94
+ chapter_storage.upsert_chapters(batch, self.DEFAULT_SOURCE_ID)
95
+ except Exception as e:
96
+ self.logger.error(
97
+ "[Storage] batch upsert failed (size=%d): %s",
98
+ len(batch),
99
+ e,
100
+ exc_info=True,
173
101
  )
174
- self.logger.info("[Fetcher] Downloaded chapter %s", cid)
175
- await async_sleep_with_random_delay(
102
+ else:
103
+ await progress.bump(len(batch))
104
+ finally:
105
+ batch.clear()
106
+
107
+ # --- stage: storage worker ---
108
+ async def storage_worker() -> None:
109
+ """
110
+ Consumes parsed chapters, writes in batches.
111
+
112
+ Terminates after receiving STOP from each chapter worker.
113
+
114
+ On cancel: keeps consuming (to avoid blocking producers),
115
+ flushes, and exits once all STOPs are seen.
116
+ """
117
+ stop_count = 0
118
+ while True:
119
+ item = await save_q.get()
120
+ if isinstance(item, StopToken):
121
+ stop_count += 1
122
+ if stop_count == self.workers:
123
+ # All chapter workers have exited.
124
+ await flush_batch()
125
+ return
126
+ # else keep waiting for remaining STOPs
127
+ continue
128
+
129
+ # Normal chapter
130
+ batch.append(item)
131
+ if len(batch) >= self.storage_batch_size:
132
+ await flush_batch()
133
+
134
+ if cancelled():
135
+ # Drain whatever is already in the queue
136
+ try:
137
+ while True:
138
+ nxt = save_q.get_nowait()
139
+ if isinstance(nxt, StopToken):
140
+ stop_count += 1
141
+ else:
142
+ batch.append(nxt)
143
+ except asyncio.QueueEmpty:
144
+ pass
145
+ # Final flush of everything
146
+ await flush_batch()
147
+ # Wait for remaining STOPs so chapter workers can finish.
148
+ while stop_count < self.workers:
149
+ nxt = await save_q.get()
150
+ if isinstance(nxt, StopToken):
151
+ stop_count += 1
152
+ return
153
+
154
+ # --- stage: chapter worker ---
155
+ sem = asyncio.Semaphore(self.workers)
156
+
157
+ async def chapter_worker() -> None:
158
+ """
159
+ Fetch + parse with retry, then enqueue to save_q.
160
+
161
+ Exits on STOP, or early if cancel is set before starting a new fetch.
162
+ """
163
+ while True:
164
+ cid = await cid_q.get()
165
+ if isinstance(cid, StopToken):
166
+ # Propagate one STOP to storage and exit.
167
+ await save_q.put(STOP)
168
+ return
169
+
170
+ if not cid or cid in ignore_set:
171
+ # Ignore silently and continue.
172
+ continue
173
+
174
+ # If cancelled, don't start a new network call; let storage finish.
175
+ if cancelled():
176
+ await save_q.put(STOP)
177
+ return
178
+
179
+ async with sem:
180
+ chap = await self._process_chapter(book_id, cid, html_dir)
181
+ if chap:
182
+ await save_q.put(chap)
183
+
184
+ # polite pacing
185
+ await async_jitter_sleep(
176
186
  self.request_interval,
177
187
  mul_spread=1.1,
178
188
  max_sleep=self.request_interval + 2,
179
189
  )
180
190
 
181
- except Exception as e:
182
- if task.retry < retry_times:
183
- await cid_queue.put(
184
- CidTask(
185
- prev_cid=task.prev_cid,
186
- cid=cid,
187
- retry=task.retry + 1,
188
- vol_idx=task.vol_idx,
189
- chap_idx=task.chap_idx,
190
- )
191
- )
192
- self.logger.info(
193
- "[Fetcher] Re-queued chapter %s for retry #%d: %s",
194
- cid,
195
- task.retry + 1,
196
- e,
197
- )
198
- backoff = self.backoff_factor * (2**task.retry)
199
- await async_sleep_with_random_delay(
200
- base=backoff,
201
- mul_spread=1.2,
202
- max_sleep=backoff + 3,
203
- )
204
- else:
205
- self.logger.warning(
206
- "[Fetcher] Max retries reached for chapter %s: %s",
207
- cid,
208
- e,
209
- )
191
+ # --- stage: producer ---
192
+ async def producer() -> None:
193
+ """
194
+ Enqueue chapter IDs (respecting start/end/skip_existing).
210
195
 
211
- finally:
212
- cid_queue.task_done()
213
-
214
- async def parser_worker(
215
- worker_id: int,
216
- cid_queue: asyncio.Queue[CidTask],
217
- html_queue: asyncio.Queue[HtmlTask],
218
- save_queue: asyncio.Queue[ChapterDict],
219
- retry_times: int,
220
- ) -> None:
221
- while True:
222
- task = await html_queue.get()
196
+ Always sends STOP x workers at the end (even if cancelled early),
197
+ so chapter workers can exit deterministically.
198
+ """
223
199
  try:
224
- chap_json = await asyncio.to_thread(
225
- self.parser.parse_chapter,
226
- task.html_list,
227
- task.cid,
228
- )
229
- if chap_json:
230
- await save_queue.put(chap_json)
231
- self.logger.info(
232
- "[Parser-%d] saved chapter %s",
233
- worker_id,
234
- task.cid,
235
- )
236
- else:
237
- raise ValueError("Empty parse result")
238
- except Exception as e:
239
- if task.retry < retry_times:
240
- await cid_queue.put(
241
- CidTask(
242
- prev_cid=None,
243
- cid=task.cid,
244
- retry=task.retry + 1,
245
- vol_idx=task.vol_idx,
246
- chap_idx=task.chap_idx,
247
- )
248
- )
249
- self.logger.info(
250
- "[Parser-%d] Re-queued cid %s for retry #%d: %s",
251
- worker_id,
252
- task.cid,
253
- task.retry + 1,
254
- e,
255
- )
256
- else:
257
- self.logger.warning(
258
- "[Parser-%d] Max retries reached for cid %s: %s",
259
- worker_id,
260
- task.cid,
261
- e,
262
- )
200
+ async for cid in self._chapter_ids(vols, start_id, end_id):
201
+ if cancelled():
202
+ break
203
+ if self.skip_existing and chapter_storage.exists(cid):
204
+ # Count as completed but don't enqueue.
205
+ await progress.bump(1)
206
+ else:
207
+ await cid_q.put(cid)
263
208
  finally:
264
- html_queue.task_done()
265
-
266
- async def storage_worker(
267
- cs: ChapterStorage,
268
- save_queue: asyncio.Queue[ChapterDict],
269
- restore_queue: asyncio.Queue[RestoreTask],
270
- cid_queue: asyncio.Queue[CidTask],
271
- ) -> None:
272
- nonlocal completed_count
273
- while True:
274
- save_task = asyncio.create_task(save_queue.get())
275
- restore_task = asyncio.create_task(restore_queue.get())
276
-
277
- done, pending = await asyncio.wait(
278
- [save_task, restore_task],
279
- return_when=asyncio.FIRST_COMPLETED,
209
+ for _ in range(self.workers):
210
+ await cid_q.put(STOP)
211
+
212
+ # --- run the pipeline ---
213
+ async with asyncio.TaskGroup() as tg:
214
+ tg.create_task(storage_worker())
215
+ for _ in range(self.workers):
216
+ tg.create_task(chapter_worker())
217
+ tg.create_task(producer())
218
+
219
+ # --- done ---
220
+ if cancelled():
221
+ self.logger.info(
222
+ "%s Novel '%s' cancelled: flushed %d/%d chapters.",
223
+ TAG,
224
+ book_info.get("book_name", "unknown"),
225
+ progress.done,
226
+ progress.total,
227
+ )
228
+ else:
229
+ self.logger.info(
230
+ "%s Novel '%s' download completed.",
231
+ TAG,
232
+ book_info.get("book_name", "unknown"),
280
233
  )
281
234
 
282
- for task in pending:
283
- task.cancel()
284
- with suppress(asyncio.CancelledError):
285
- await task
235
+ finally:
236
+ chapter_storage.close()
286
237
 
287
- for task in done:
288
- item = task.result()
238
+ async def _process_chapter(
239
+ self,
240
+ book_id: str,
241
+ cid: str,
242
+ html_dir: Path,
243
+ ) -> ChapterDict | None:
244
+ """
245
+ Fetches, saves raw HTML, parses a single chapter,
246
+ retrying up to self.retry_times.
289
247
 
290
- if isinstance(item, dict): # from save_queue
291
- try:
292
- cs.save(cast(ChapterDict, item))
293
- completed_count += 1
294
- if progress_hook:
295
- await progress_hook(completed_count, total_chapters)
296
-
297
- curr_cid = item["id"]
298
- if curr_cid in pending_restore:
299
- rt = pending_restore.pop(curr_cid)
300
- next_cid = item.get("extra", {}).get("next_chapter_id")
301
- if next_cid:
302
- update_book_info(
303
- vol_idx=rt.vol_idx,
304
- chap_idx=rt.chap_idx,
305
- cid=next_cid,
306
- )
307
- await cid_queue.put(
308
- CidTask(
309
- prev_cid=rt.prev_cid,
310
- cid=next_cid,
311
- vol_idx=rt.vol_idx,
312
- chap_idx=rt.chap_idx,
313
- )
314
- )
315
- else:
316
- self.logger.warning(
317
- "[storage_worker] No next_cid found for %r",
318
- rt,
319
- )
320
- except Exception as e:
321
- self.logger.error("[storage_worker] Failed to save: %s", e)
322
- finally:
323
- save_queue.task_done()
324
-
325
- elif isinstance(item, RestoreTask): # from restore_queue
326
- prev_json = cs.get(item.prev_cid)
327
- next_cid = (
328
- prev_json.get("extra", {}).get("next_chapter_id")
329
- if prev_json
330
- else None
331
- )
332
- if next_cid:
333
- update_book_info(
334
- vol_idx=item.vol_idx,
335
- chap_idx=item.chap_idx,
336
- cid=next_cid,
337
- )
338
- await cid_queue.put(
339
- CidTask(
340
- prev_cid=item.prev_cid,
341
- cid=next_cid,
342
- vol_idx=item.vol_idx,
343
- chap_idx=item.chap_idx,
344
- )
345
- )
346
- else:
347
- pending_restore[item.prev_cid] = item
348
- restore_queue.task_done()
349
-
350
- fetcher_tasks = [
351
- asyncio.create_task(
352
- fetcher_worker(
353
- book_id,
354
- cid_queue,
355
- html_queue,
356
- restore_queue,
357
- self.retry_times,
358
- semaphore,
359
- )
360
- )
361
- for _ in range(self.download_workers)
362
- ]
363
-
364
- parser_tasks = [
365
- asyncio.create_task(
366
- parser_worker(
367
- i,
368
- cid_queue,
369
- html_queue,
370
- save_queue,
371
- self.retry_times,
248
+ :return: ChapterDict on success, or None on failure.
249
+ """
250
+ for attempt in range(self.retry_times + 1):
251
+ try:
252
+ html_list = await self.fetcher.get_book_chapter(book_id, cid)
253
+ self._save_html_pages(html_dir, cid, html_list)
254
+ chap = await asyncio.to_thread(
255
+ self.parser.parse_chapter, html_list, cid
372
256
  )
373
- )
374
- for i in range(self.parser_workers)
375
- ]
376
-
377
- storage_task = asyncio.create_task(
378
- storage_worker(
379
- cs=normal_cs,
380
- save_queue=save_queue,
381
- restore_queue=restore_queue,
382
- cid_queue=cid_queue,
383
- )
384
- )
385
-
386
- found_start = start_id is None
387
- stop_early = False
388
- last_cid: str | None = None
389
-
390
- for vol_idx, vol in enumerate(vols):
391
- chapters = vol.get("chapters", [])
392
- for chap_idx, chap in enumerate(chapters):
393
- if stop_early:
394
- break
395
-
396
- cid = chap.get("chapterId")
397
-
398
- # Skip until reaching start_id
399
- if not found_start:
400
- if cid == start_id:
401
- found_start = True
402
- else:
403
- completed_count += 1
404
- last_cid = cid
405
- continue
406
-
407
- # Stop when reaching end_id
408
- if end_id is not None and cid == end_id:
409
- stop_early = True
410
-
411
- if cid and normal_cs.exists(cid) and self.skip_existing:
412
- completed_count += 1
413
- last_cid = cid
414
- continue
415
-
416
- await cid_queue.put(
417
- CidTask(
418
- vol_idx=vol_idx,
419
- chap_idx=chap_idx,
420
- cid=cid,
421
- prev_cid=last_cid,
257
+ if not chap:
258
+ raise ValueError("Empty parse result")
259
+ return chap
260
+ except Exception as e:
261
+ if attempt < self.retry_times:
262
+ self.logger.info(
263
+ "[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
422
264
  )
423
- )
424
-
425
- last_cid = cid
426
-
427
- if stop_early:
428
- break
429
-
430
- await restore_queue.join()
431
- await cid_queue.join()
432
- await html_queue.join()
433
- await save_queue.join()
434
-
435
- for task in fetcher_tasks + parser_tasks + [storage_task]:
436
- task.cancel()
437
- with suppress(asyncio.CancelledError):
438
- await task
265
+ backoff = self.backoff_factor * (2**attempt)
266
+ await async_jitter_sleep(
267
+ base=backoff, mul_spread=1.2, max_sleep=backoff + 3
268
+ )
269
+ else:
270
+ self.logger.warning("[ChapterWorker] Failed %s: %s", cid, e)
271
+ return None
439
272
 
440
- normal_cs.close()
441
- save_as_json(book_info, info_path)
273
+ @staticmethod
274
+ def _normalize_book_id(book_id: str) -> str:
275
+ """
276
+ Normalize a book identifier.
442
277
 
443
- self.logger.info(
444
- "%s Novel '%s' download completed.",
445
- TAG,
446
- book_info.get("book_name", "unknown"),
447
- )
448
- return
278
+ Subclasses may override this method to transform the book ID
279
+ into their preferred format.
280
+ """
281
+ return book_id.replace("/", "-")