novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165):
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -2
  3. novel_downloader/cli/config.py +1 -83
  4. novel_downloader/cli/download.py +4 -5
  5. novel_downloader/cli/export.py +4 -1
  6. novel_downloader/cli/main.py +2 -0
  7. novel_downloader/cli/search.py +123 -0
  8. novel_downloader/config/__init__.py +3 -10
  9. novel_downloader/config/adapter.py +190 -54
  10. novel_downloader/config/loader.py +2 -3
  11. novel_downloader/core/__init__.py +13 -13
  12. novel_downloader/core/downloaders/__init__.py +10 -11
  13. novel_downloader/core/downloaders/base.py +152 -26
  14. novel_downloader/core/downloaders/biquge.py +5 -1
  15. novel_downloader/core/downloaders/common.py +157 -378
  16. novel_downloader/core/downloaders/esjzone.py +5 -1
  17. novel_downloader/core/downloaders/linovelib.py +5 -1
  18. novel_downloader/core/downloaders/qianbi.py +291 -4
  19. novel_downloader/core/downloaders/qidian.py +199 -285
  20. novel_downloader/core/downloaders/registry.py +67 -0
  21. novel_downloader/core/downloaders/sfacg.py +5 -1
  22. novel_downloader/core/downloaders/yamibo.py +5 -1
  23. novel_downloader/core/exporters/__init__.py +10 -11
  24. novel_downloader/core/exporters/base.py +87 -7
  25. novel_downloader/core/exporters/biquge.py +5 -8
  26. novel_downloader/core/exporters/common/__init__.py +2 -2
  27. novel_downloader/core/exporters/common/epub.py +82 -166
  28. novel_downloader/core/exporters/common/main_exporter.py +0 -60
  29. novel_downloader/core/exporters/common/txt.py +82 -83
  30. novel_downloader/core/exporters/epub_util.py +157 -1330
  31. novel_downloader/core/exporters/esjzone.py +5 -8
  32. novel_downloader/core/exporters/linovelib/__init__.py +2 -2
  33. novel_downloader/core/exporters/linovelib/epub.py +157 -212
  34. novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
  35. novel_downloader/core/exporters/linovelib/txt.py +67 -63
  36. novel_downloader/core/exporters/qianbi.py +5 -8
  37. novel_downloader/core/exporters/qidian.py +14 -4
  38. novel_downloader/core/exporters/registry.py +53 -0
  39. novel_downloader/core/exporters/sfacg.py +5 -8
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/exporters/yamibo.py +5 -8
  42. novel_downloader/core/fetchers/__init__.py +19 -24
  43. novel_downloader/core/fetchers/base/__init__.py +3 -3
  44. novel_downloader/core/fetchers/base/browser.py +23 -4
  45. novel_downloader/core/fetchers/base/session.py +30 -5
  46. novel_downloader/core/fetchers/biquge/__init__.py +3 -3
  47. novel_downloader/core/fetchers/biquge/browser.py +5 -0
  48. novel_downloader/core/fetchers/biquge/session.py +6 -1
  49. novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
  50. novel_downloader/core/fetchers/esjzone/browser.py +5 -0
  51. novel_downloader/core/fetchers/esjzone/session.py +6 -1
  52. novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
  53. novel_downloader/core/fetchers/linovelib/browser.py +6 -1
  54. novel_downloader/core/fetchers/linovelib/session.py +6 -1
  55. novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
  56. novel_downloader/core/fetchers/qianbi/browser.py +5 -0
  57. novel_downloader/core/fetchers/qianbi/session.py +5 -0
  58. novel_downloader/core/fetchers/qidian/__init__.py +3 -3
  59. novel_downloader/core/fetchers/qidian/browser.py +12 -4
  60. novel_downloader/core/fetchers/qidian/session.py +11 -3
  61. novel_downloader/core/fetchers/registry.py +71 -0
  62. novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
  63. novel_downloader/core/fetchers/sfacg/browser.py +5 -0
  64. novel_downloader/core/fetchers/sfacg/session.py +5 -0
  65. novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
  66. novel_downloader/core/fetchers/yamibo/browser.py +5 -0
  67. novel_downloader/core/fetchers/yamibo/session.py +6 -1
  68. novel_downloader/core/interfaces/__init__.py +7 -5
  69. novel_downloader/core/interfaces/searcher.py +18 -0
  70. novel_downloader/core/parsers/__init__.py +10 -11
  71. novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
  72. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
  73. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
  74. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
  75. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  76. novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
  77. novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
  78. novel_downloader/core/parsers/qidian/main_parser.py +10 -21
  79. novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
  80. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
  81. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  82. novel_downloader/core/parsers/registry.py +68 -0
  83. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
  84. novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
  85. novel_downloader/core/searchers/__init__.py +20 -0
  86. novel_downloader/core/searchers/base.py +92 -0
  87. novel_downloader/core/searchers/biquge.py +83 -0
  88. novel_downloader/core/searchers/esjzone.py +84 -0
  89. novel_downloader/core/searchers/qianbi.py +131 -0
  90. novel_downloader/core/searchers/qidian.py +87 -0
  91. novel_downloader/core/searchers/registry.py +63 -0
  92. novel_downloader/locales/en.json +12 -4
  93. novel_downloader/locales/zh.json +12 -4
  94. novel_downloader/models/__init__.py +4 -30
  95. novel_downloader/models/config.py +12 -6
  96. novel_downloader/models/search.py +16 -0
  97. novel_downloader/models/types.py +0 -2
  98. novel_downloader/resources/config/settings.toml +31 -4
  99. novel_downloader/resources/css_styles/intro.css +83 -0
  100. novel_downloader/resources/css_styles/main.css +30 -89
  101. novel_downloader/utils/__init__.py +52 -0
  102. novel_downloader/utils/chapter_storage.py +244 -224
  103. novel_downloader/utils/constants.py +1 -21
  104. novel_downloader/utils/epub/__init__.py +34 -0
  105. novel_downloader/utils/epub/builder.py +377 -0
  106. novel_downloader/utils/epub/constants.py +77 -0
  107. novel_downloader/utils/epub/documents.py +403 -0
  108. novel_downloader/utils/epub/models.py +134 -0
  109. novel_downloader/utils/epub/utils.py +212 -0
  110. novel_downloader/utils/file_utils/__init__.py +10 -14
  111. novel_downloader/utils/file_utils/io.py +20 -51
  112. novel_downloader/utils/file_utils/normalize.py +2 -2
  113. novel_downloader/utils/file_utils/sanitize.py +2 -3
  114. novel_downloader/utils/fontocr/__init__.py +5 -5
  115. novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
  116. novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
  117. novel_downloader/utils/fontocr/ocr_v1.py +13 -1
  118. novel_downloader/utils/fontocr/ocr_v2.py +13 -1
  119. novel_downloader/utils/fontocr/ocr_v3.py +744 -0
  120. novel_downloader/utils/i18n.py +2 -0
  121. novel_downloader/utils/logger.py +2 -0
  122. novel_downloader/utils/network.py +110 -251
  123. novel_downloader/utils/state.py +1 -0
  124. novel_downloader/utils/text_utils/__init__.py +18 -17
  125. novel_downloader/utils/text_utils/diff_display.py +4 -5
  126. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  127. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  128. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  129. novel_downloader/utils/time_utils/__init__.py +3 -3
  130. novel_downloader/utils/time_utils/datetime_utils.py +4 -5
  131. novel_downloader/utils/time_utils/sleep_utils.py +2 -3
  132. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
  133. novel_downloader-1.5.0.dist-info/RECORD +164 -0
  134. novel_downloader/config/site_rules.py +0 -94
  135. novel_downloader/core/factory/__init__.py +0 -20
  136. novel_downloader/core/factory/downloader.py +0 -73
  137. novel_downloader/core/factory/exporter.py +0 -58
  138. novel_downloader/core/factory/fetcher.py +0 -96
  139. novel_downloader/core/factory/parser.py +0 -86
  140. novel_downloader/core/fetchers/common/__init__.py +0 -14
  141. novel_downloader/core/fetchers/common/browser.py +0 -79
  142. novel_downloader/core/fetchers/common/session.py +0 -79
  143. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  144. novel_downloader/core/parsers/common/__init__.py +0 -13
  145. novel_downloader/core/parsers/common/helper.py +0 -323
  146. novel_downloader/core/parsers/common/main_parser.py +0 -106
  147. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  148. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  149. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  150. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  151. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  152. novel_downloader/models/browser.py +0 -21
  153. novel_downloader/models/site_rules.py +0 -99
  154. novel_downloader/models/tasks.py +0 -33
  155. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  156. novel_downloader/resources/json/replace_word_map.json +0 -4
  157. novel_downloader/resources/text/blacklist.txt +0 -22
  158. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  159. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  160. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  161. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  162. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
  163. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
  164. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
  165. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.biquge
6
6
  """
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
+ from novel_downloader.core.downloaders.registry import register_downloader
9
10
  from novel_downloader.core.interfaces import (
10
11
  FetcherProtocol,
11
12
  ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
13
14
  from novel_downloader.models import DownloaderConfig
14
15
 
15
16
 
17
+ @register_downloader(site_keys=["biquge", "bqg"])
16
18
  class BiqugeDownloader(CommonDownloader):
17
- """"""
19
+ """
20
+ Downloader for biquge (笔趣阁) novels.
21
+ """
18
22
 
19
23
  def __init__(
20
24
  self,
@@ -6,30 +6,25 @@ novel_downloader.core.downloaders.common
6
6
  """
7
7
 
8
8
  import asyncio
9
- import json
10
- from collections.abc import Awaitable, Callable
11
- from contextlib import suppress
12
- from typing import Any, cast
9
+ from collections.abc import AsyncIterator, Awaitable, Callable
10
+ from contextlib import asynccontextmanager
11
+ from pathlib import Path
12
+ from typing import Any
13
13
 
14
14
  from novel_downloader.core.downloaders.base import BaseDownloader
15
15
  from novel_downloader.models import (
16
16
  BookConfig,
17
17
  ChapterDict,
18
- CidTask,
19
- HtmlTask,
20
- RestoreTask,
21
18
  )
22
- from novel_downloader.utils.chapter_storage import ChapterStorage
23
- from novel_downloader.utils.file_utils import save_as_json, save_as_txt
24
- from novel_downloader.utils.time_utils import (
19
+ from novel_downloader.utils import (
20
+ ChapterStorage,
25
21
  async_sleep_with_random_delay,
26
- calculate_time_difference,
27
22
  )
28
23
 
29
24
 
30
25
  class CommonDownloader(BaseDownloader):
31
26
  """
32
- Specialized Async downloader for common novels.
27
+ Specialized Async downloader for "common" novel sites.
33
28
  """
34
29
 
35
30
  async def _download_one(
@@ -50,399 +45,183 @@ class CommonDownloader(BaseDownloader):
50
45
  end_id = book.get("end_id")
51
46
  ignore_set = set(book.get("ignore_ids", []))
52
47
 
53
- raw_base = self.raw_data_dir / book_id
54
- cache_base = self.cache_dir / book_id
55
- info_path = raw_base / "book_info.json"
56
- chapters_html_dir = cache_base / "html"
57
-
48
+ # prepare storage & dirs
49
+ raw_base = self._raw_data_dir / book_id
58
50
  raw_base.mkdir(parents=True, exist_ok=True)
59
- if self.save_html:
60
- chapters_html_dir.mkdir(parents=True, exist_ok=True)
61
- normal_cs = ChapterStorage(
51
+ html_dir = self._debug_dir / book_id / "html"
52
+ chapter_storage = ChapterStorage(
62
53
  raw_base=raw_base,
63
- namespace="chapters",
64
- backend_type=self._config.storage_backend,
65
- batch_size=self._config.storage_batch_size,
54
+ priorities=self._priorities,
66
55
  )
56
+ chapter_storage.connect()
67
57
 
68
- # load or fetch book_info
69
- book_info: dict[str, Any]
70
- re_fetch = True
71
- old_data: dict[str, Any] = {}
72
-
73
- if info_path.exists():
74
- try:
75
- old_data = json.loads(info_path.read_text("utf-8"))
76
- days, *_ = calculate_time_difference(
77
- old_data.get("update_time", ""), "UTC+8"
78
- )
79
- re_fetch = days > 1
80
- except Exception:
81
- re_fetch = True
82
-
83
- if re_fetch:
84
- info_html = await self.fetcher.get_book_info(book_id)
85
- if self.save_html:
86
- for i, html in enumerate(info_html):
87
- save_as_txt(html, chapters_html_dir / f"info_{i}.html")
88
- book_info = self.parser.parse_book_info(info_html)
89
-
90
- if book_info.get("book_name") != "未找到书名":
91
- save_as_json(book_info, info_path)
92
- else:
93
- self.logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
94
- book_info = old_data or {"book_name": "未找到书名"}
95
- else:
96
- book_info = old_data
97
-
58
+ # load or fetch metadata
59
+ book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
98
60
  vols = book_info.get("volumes", [])
99
- total_chapters = 0
100
- for vol in vols:
101
- total_chapters += len(vol.get("chapters", []))
61
+ total_chapters = sum(len(v.get("chapters", [])) for v in vols)
102
62
  if total_chapters == 0:
103
- self.logger.warning("%s 书籍没有章节可下载: book_id=%s", TAG, book_id)
63
+ self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
104
64
  return
105
65
 
106
- completed_count = 0
66
+ # concurrency primitives
67
+ sem = asyncio.Semaphore(self.workers)
68
+ cid_q: asyncio.Queue[str | None] = asyncio.Queue()
69
+ save_q: asyncio.Queue[ChapterDict | None] = asyncio.Queue()
70
+ batch: list[ChapterDict] = []
71
+ completed = 0
107
72
 
108
- # setup queue, semaphore
109
- semaphore = asyncio.Semaphore(self.download_workers)
110
- cid_queue: asyncio.Queue[CidTask] = asyncio.Queue()
111
- restore_queue: asyncio.Queue[RestoreTask] = asyncio.Queue()
112
- html_queue: asyncio.Queue[HtmlTask] = asyncio.Queue()
113
- save_queue: asyncio.Queue[ChapterDict] = asyncio.Queue()
114
- pending_restore: dict[str, RestoreTask] = {}
73
+ async def _flush_batch() -> None:
74
+ nonlocal batch, completed
75
+ if not batch:
76
+ return
115
77
 
116
- def update_book_info(
117
- vol_idx: int,
118
- chap_idx: int,
119
- cid: str,
120
- ) -> None:
121
78
  try:
122
- book_info["volumes"][vol_idx]["chapters"][chap_idx]["chapterId"] = cid
123
- except (IndexError, KeyError, TypeError) as e:
124
- self.logger.info(
125
- "[update_book_info] Failed to update vol=%s, chap=%s: %s",
126
- vol_idx,
127
- chap_idx,
79
+ chapter_storage.upsert_chapters(batch, self.DEFAULT_SOURCE_ID)
80
+ except Exception as e:
81
+ self.logger.error(
82
+ "[Storage] batch upsert failed (size=%d): %s",
83
+ len(batch),
128
84
  e,
85
+ exc_info=True,
129
86
  )
87
+ else:
88
+ completed += len(batch)
89
+ if progress_hook:
90
+ await progress_hook(completed, total_chapters)
91
+ finally:
92
+ batch.clear()
130
93
 
131
- async def fetcher_worker(
132
- book_id: str,
133
- cid_queue: asyncio.Queue[CidTask],
134
- html_queue: asyncio.Queue[HtmlTask],
135
- restore_queue: asyncio.Queue[RestoreTask],
136
- retry_times: int,
137
- semaphore: asyncio.Semaphore,
138
- ) -> None:
94
+ async def storage_worker(q: asyncio.Queue[ChapterDict | None]) -> None:
139
95
  while True:
140
- task = await cid_queue.get()
141
- cid = task.cid
142
- if not cid and task.prev_cid:
143
- await restore_queue.put(
144
- RestoreTask(
145
- vol_idx=task.vol_idx,
146
- chap_idx=task.chap_idx,
147
- prev_cid=task.prev_cid,
148
- )
149
- )
150
- cid_queue.task_done()
151
- continue
152
-
153
- if not cid:
154
- self.logger.warning("[Fetcher] Skipped empty cid task: %s", task)
155
- cid_queue.task_done()
156
- continue
157
-
158
- if cid in ignore_set:
159
- cid_queue.task_done()
160
- continue
161
-
162
- try:
163
- async with semaphore:
164
- html_list = await self.fetcher.get_book_chapter(book_id, cid)
165
- await html_queue.put(
166
- HtmlTask(
167
- cid=cid,
168
- retry=task.retry,
169
- html_list=html_list,
170
- vol_idx=task.vol_idx,
171
- chap_idx=task.chap_idx,
172
- )
173
- )
174
- self.logger.info("[Fetcher] Downloaded chapter %s", cid)
175
- await async_sleep_with_random_delay(
176
- self.request_interval,
177
- mul_spread=1.1,
178
- max_sleep=self.request_interval + 2,
179
- )
180
-
181
- except Exception as e:
182
- if task.retry < retry_times:
183
- await cid_queue.put(
184
- CidTask(
185
- prev_cid=task.prev_cid,
186
- cid=cid,
187
- retry=task.retry + 1,
188
- vol_idx=task.vol_idx,
189
- chap_idx=task.chap_idx,
190
- )
191
- )
192
- self.logger.info(
193
- "[Fetcher] Re-queued chapter %s for retry #%d: %s",
194
- cid,
195
- task.retry + 1,
196
- e,
197
- )
198
- backoff = self.backoff_factor * (2**task.retry)
199
- await async_sleep_with_random_delay(
200
- base=backoff,
201
- mul_spread=1.2,
202
- max_sleep=backoff + 3,
203
- )
204
- else:
205
- self.logger.warning(
206
- "[Fetcher] Max retries reached for chapter %s: %s",
207
- cid,
208
- e,
96
+ item = await q.get()
97
+ q.task_done()
98
+ if item is None:
99
+ # final flush before exit
100
+ if batch:
101
+ await _flush_batch()
102
+ break
103
+ batch.append(item)
104
+ if len(batch) >= self.storage_batch_size:
105
+ await _flush_batch()
106
+
107
+ async def producer() -> None:
108
+ nonlocal completed
109
+ async for cid in self._chapter_ids(vols, start_id, end_id):
110
+ if self.skip_existing and chapter_storage.exists(cid):
111
+ completed += 1
112
+ if progress_hook:
113
+ await progress_hook(completed, total_chapters)
114
+ else:
115
+ await cid_q.put(cid)
116
+
117
+ @asynccontextmanager
118
+ async def task_group_ctx() -> AsyncIterator[asyncio.TaskGroup]:
119
+ async with asyncio.TaskGroup() as tg:
120
+ # start chapter workers
121
+ for _ in range(self.workers):
122
+ tg.create_task(
123
+ self._chapter_worker(
124
+ book_id,
125
+ ignore_set,
126
+ cid_q,
127
+ save_q,
128
+ sem,
209
129
  )
210
-
211
- finally:
212
- cid_queue.task_done()
213
-
214
- async def parser_worker(
215
- worker_id: int,
216
- cid_queue: asyncio.Queue[CidTask],
217
- html_queue: asyncio.Queue[HtmlTask],
218
- save_queue: asyncio.Queue[ChapterDict],
219
- retry_times: int,
220
- ) -> None:
221
- while True:
222
- task = await html_queue.get()
223
- try:
224
- chap_json = await asyncio.to_thread(
225
- self.parser.parse_chapter,
226
- task.html_list,
227
- task.cid,
228
130
  )
229
- if chap_json:
230
- await save_queue.put(chap_json)
231
- self.logger.info(
232
- "[Parser-%d] saved chapter %s",
233
- worker_id,
234
- task.cid,
235
- )
236
- else:
237
- raise ValueError("Empty parse result")
238
- except Exception as e:
239
- if task.retry < retry_times:
240
- await cid_queue.put(
241
- CidTask(
242
- prev_cid=None,
243
- cid=task.cid,
244
- retry=task.retry + 1,
245
- vol_idx=task.vol_idx,
246
- chap_idx=task.chap_idx,
247
- )
248
- )
249
- self.logger.info(
250
- "[Parser-%d] Re-queued cid %s for retry #%d: %s",
251
- worker_id,
252
- task.cid,
253
- task.retry + 1,
254
- e,
255
- )
256
- else:
257
- self.logger.warning(
258
- "[Parser-%d] Max retries reached for cid %s: %s",
259
- worker_id,
260
- task.cid,
261
- e,
262
- )
263
- finally:
264
- html_queue.task_done()
265
-
266
- async def storage_worker(
267
- cs: ChapterStorage,
268
- save_queue: asyncio.Queue[ChapterDict],
269
- restore_queue: asyncio.Queue[RestoreTask],
270
- cid_queue: asyncio.Queue[CidTask],
271
- ) -> None:
272
- nonlocal completed_count
273
- while True:
274
- save_task = asyncio.create_task(save_queue.get())
275
- restore_task = asyncio.create_task(restore_queue.get())
276
-
277
- done, pending = await asyncio.wait(
278
- [save_task, restore_task],
279
- return_when=asyncio.FIRST_COMPLETED,
280
- )
131
+ # start storage worker
132
+ tg.create_task(storage_worker(save_q))
133
+ yield tg
281
134
 
282
- for task in pending:
283
- task.cancel()
284
- with suppress(asyncio.CancelledError):
285
- await task
135
+ # run producer + workers
136
+ async with task_group_ctx():
137
+ # produce all CidTask
138
+ await producer()
286
139
 
287
- for task in done:
288
- item = task.result()
140
+ # signal chapter workers to exit
141
+ for _ in range(self.workers):
142
+ await cid_q.put(None)
143
+ await cid_q.join()
289
144
 
290
- if isinstance(item, dict): # from save_queue
291
- try:
292
- cs.save(cast(ChapterDict, item))
293
- completed_count += 1
294
- if progress_hook:
295
- await progress_hook(completed_count, total_chapters)
145
+ # signal storage worker to exit
146
+ await save_q.put(None)
147
+ await save_q.join()
296
148
 
297
- curr_cid = item["id"]
298
- if curr_cid in pending_restore:
299
- rt = pending_restore.pop(curr_cid)
300
- next_cid = item.get("extra", {}).get("next_chapter_id")
301
- if next_cid:
302
- update_book_info(
303
- vol_idx=rt.vol_idx,
304
- chap_idx=rt.chap_idx,
305
- cid=next_cid,
306
- )
307
- await cid_queue.put(
308
- CidTask(
309
- prev_cid=rt.prev_cid,
310
- cid=next_cid,
311
- vol_idx=rt.vol_idx,
312
- chap_idx=rt.chap_idx,
313
- )
314
- )
315
- else:
316
- self.logger.warning(
317
- "[storage_worker] No next_cid found for %r",
318
- rt,
319
- )
320
- except Exception as e:
321
- self.logger.error("[storage_worker] Failed to save: %s", e)
322
- finally:
323
- save_queue.task_done()
149
+ # final flush to catch any remaining items
150
+ await _flush_batch()
324
151
 
325
- elif isinstance(item, RestoreTask): # from restore_queue
326
- prev_json = cs.get(item.prev_cid)
327
- next_cid = (
328
- prev_json.get("extra", {}).get("next_chapter_id")
329
- if prev_json
330
- else None
331
- )
332
- if next_cid:
333
- update_book_info(
334
- vol_idx=item.vol_idx,
335
- chap_idx=item.chap_idx,
336
- cid=next_cid,
337
- )
338
- await cid_queue.put(
339
- CidTask(
340
- prev_cid=item.prev_cid,
341
- cid=next_cid,
342
- vol_idx=item.vol_idx,
343
- chap_idx=item.chap_idx,
344
- )
345
- )
346
- else:
347
- pending_restore[item.prev_cid] = item
348
- restore_queue.task_done()
349
-
350
- fetcher_tasks = [
351
- asyncio.create_task(
352
- fetcher_worker(
353
- book_id,
354
- cid_queue,
355
- html_queue,
356
- restore_queue,
357
- self.retry_times,
358
- semaphore,
359
- )
360
- )
361
- for _ in range(self.download_workers)
362
- ]
363
-
364
- parser_tasks = [
365
- asyncio.create_task(
366
- parser_worker(
367
- i,
368
- cid_queue,
369
- html_queue,
370
- save_queue,
371
- self.retry_times,
372
- )
373
- )
374
- for i in range(self.parser_workers)
375
- ]
376
-
377
- storage_task = asyncio.create_task(
378
- storage_worker(
379
- cs=normal_cs,
380
- save_queue=save_queue,
381
- restore_queue=restore_queue,
382
- cid_queue=cid_queue,
383
- )
152
+ chapter_storage.close()
153
+ self.logger.info(
154
+ "%s Novel '%s' download completed.",
155
+ TAG,
156
+ book_info.get("book_name", "unknown"),
384
157
  )
385
158
 
386
- found_start = start_id is None
387
- stop_early = False
388
- last_cid: str | None = None
389
-
390
- for vol_idx, vol in enumerate(vols):
391
- chapters = vol.get("chapters", [])
392
- for chap_idx, chap in enumerate(chapters):
393
- if stop_early:
394
- break
159
+ async def _chapter_worker(
160
+ self,
161
+ book_id: str,
162
+ ignore_set: set[str],
163
+ cid_q: asyncio.Queue[str | None],
164
+ save_q: asyncio.Queue[ChapterDict | None],
165
+ sem: asyncio.Semaphore,
166
+ ) -> None:
167
+ """
168
+ Worker that processes one chapter at a time:
169
+ fetch + parse with retry, then enqueue to save_q.
170
+ """
171
+ html_dir = self._debug_dir / book_id / "html"
172
+ while True:
173
+ cid = await cid_q.get()
174
+ if cid is None:
175
+ cid_q.task_done()
176
+ break
177
+ if not cid or cid in ignore_set:
178
+ cid_q.task_done()
179
+ continue
395
180
 
396
- cid = chap.get("chapterId")
181
+ async with sem:
182
+ chap = await self._process_chapter(book_id, cid, html_dir)
397
183
 
398
- # Skip until reaching start_id
399
- if not found_start:
400
- if cid == start_id:
401
- found_start = True
402
- else:
403
- completed_count += 1
404
- last_cid = cid
405
- continue
184
+ if chap:
185
+ await save_q.put(chap)
406
186
 
407
- # Stop when reaching end_id
408
- if end_id is not None and cid == end_id:
409
- stop_early = True
187
+ cid_q.task_done()
188
+ await async_sleep_with_random_delay(
189
+ self.request_interval,
190
+ mul_spread=1.1,
191
+ max_sleep=self.request_interval + 2,
192
+ )
410
193
 
411
- if cid and normal_cs.exists(cid) and self.skip_existing:
412
- completed_count += 1
413
- last_cid = cid
414
- continue
194
+ async def _process_chapter(
195
+ self,
196
+ book_id: str,
197
+ cid: str,
198
+ html_dir: Path,
199
+ ) -> ChapterDict | None:
200
+ """
201
+ Fetches, saves raw HTML, parses a single chapter,
202
+ retrying up to self.retry_times.
415
203
 
416
- await cid_queue.put(
417
- CidTask(
418
- vol_idx=vol_idx,
419
- chap_idx=chap_idx,
420
- cid=cid,
421
- prev_cid=last_cid,
422
- )
204
+ :return: ChapterDict on success, or None on failure.
205
+ """
206
+ for attempt in range(self.retry_times + 1):
207
+ try:
208
+ html_list = await self.fetcher.get_book_chapter(book_id, cid)
209
+ self._save_html_pages(html_dir, cid, html_list)
210
+ chap = await asyncio.to_thread(
211
+ self.parser.parse_chapter, html_list, cid
423
212
  )
424
-
425
- last_cid = cid
426
-
427
- if stop_early:
428
- break
429
-
430
- await restore_queue.join()
431
- await cid_queue.join()
432
- await html_queue.join()
433
- await save_queue.join()
434
-
435
- for task in fetcher_tasks + parser_tasks + [storage_task]:
436
- task.cancel()
437
- with suppress(asyncio.CancelledError):
438
- await task
439
-
440
- normal_cs.close()
441
- save_as_json(book_info, info_path)
442
-
443
- self.logger.info(
444
- "%s Novel '%s' download completed.",
445
- TAG,
446
- book_info.get("book_name", "unknown"),
447
- )
448
- return
213
+ if not chap:
214
+ raise ValueError("Empty parse result")
215
+ return chap
216
+ except Exception as e:
217
+ if attempt < self.retry_times:
218
+ self.logger.info(
219
+ "[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
220
+ )
221
+ backoff = self.backoff_factor * (2**attempt)
222
+ await async_sleep_with_random_delay(
223
+ base=backoff, mul_spread=1.2, max_sleep=backoff + 3
224
+ )
225
+ else:
226
+ self.logger.warning("[ChapterWorker] Failed %s: %s", cid, e)
227
+ return None
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.esjzone
6
6
  """
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
+ from novel_downloader.core.downloaders.registry import register_downloader
9
10
  from novel_downloader.core.interfaces import (
10
11
  FetcherProtocol,
11
12
  ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
13
14
  from novel_downloader.models import DownloaderConfig
14
15
 
15
16
 
17
+ @register_downloader(site_keys=["esjzone"])
16
18
  class EsjzoneDownloader(CommonDownloader):
17
- """"""
19
+ """
20
+ Downloader for ESJ Zone novels.
21
+ """
18
22
 
19
23
  def __init__(
20
24
  self,
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.linovelib
6
6
  """
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
+ from novel_downloader.core.downloaders.registry import register_downloader
9
10
  from novel_downloader.core.interfaces import (
10
11
  FetcherProtocol,
11
12
  ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
13
14
  from novel_downloader.models import DownloaderConfig
14
15
 
15
16
 
17
+ @register_downloader(site_keys=["linovelib"])
16
18
  class LinovelibDownloader(CommonDownloader):
17
- """"""
19
+ """
20
+ Downloader for Linovelib (哔哩轻小说) novels.
21
+ """
18
22
 
19
23
  def __init__(
20
24
  self,