novel-downloader 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -2
  3. novel_downloader/cli/config.py +1 -83
  4. novel_downloader/cli/download.py +4 -5
  5. novel_downloader/cli/export.py +4 -1
  6. novel_downloader/cli/main.py +2 -0
  7. novel_downloader/cli/search.py +123 -0
  8. novel_downloader/config/__init__.py +3 -10
  9. novel_downloader/config/adapter.py +190 -54
  10. novel_downloader/config/loader.py +2 -3
  11. novel_downloader/core/__init__.py +13 -13
  12. novel_downloader/core/downloaders/__init__.py +10 -11
  13. novel_downloader/core/downloaders/base.py +152 -26
  14. novel_downloader/core/downloaders/biquge.py +5 -1
  15. novel_downloader/core/downloaders/common.py +157 -378
  16. novel_downloader/core/downloaders/esjzone.py +5 -1
  17. novel_downloader/core/downloaders/linovelib.py +5 -1
  18. novel_downloader/core/downloaders/qianbi.py +291 -4
  19. novel_downloader/core/downloaders/qidian.py +199 -285
  20. novel_downloader/core/downloaders/registry.py +67 -0
  21. novel_downloader/core/downloaders/sfacg.py +5 -1
  22. novel_downloader/core/downloaders/yamibo.py +5 -1
  23. novel_downloader/core/exporters/__init__.py +10 -11
  24. novel_downloader/core/exporters/base.py +87 -7
  25. novel_downloader/core/exporters/biquge.py +5 -8
  26. novel_downloader/core/exporters/common/__init__.py +2 -2
  27. novel_downloader/core/exporters/common/epub.py +82 -166
  28. novel_downloader/core/exporters/common/main_exporter.py +0 -60
  29. novel_downloader/core/exporters/common/txt.py +82 -83
  30. novel_downloader/core/exporters/epub_util.py +157 -1330
  31. novel_downloader/core/exporters/esjzone.py +5 -8
  32. novel_downloader/core/exporters/linovelib/__init__.py +2 -2
  33. novel_downloader/core/exporters/linovelib/epub.py +157 -212
  34. novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
  35. novel_downloader/core/exporters/linovelib/txt.py +67 -63
  36. novel_downloader/core/exporters/qianbi.py +5 -8
  37. novel_downloader/core/exporters/qidian.py +14 -4
  38. novel_downloader/core/exporters/registry.py +53 -0
  39. novel_downloader/core/exporters/sfacg.py +5 -8
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/exporters/yamibo.py +5 -8
  42. novel_downloader/core/fetchers/__init__.py +19 -24
  43. novel_downloader/core/fetchers/base/__init__.py +3 -3
  44. novel_downloader/core/fetchers/base/browser.py +23 -4
  45. novel_downloader/core/fetchers/base/session.py +30 -5
  46. novel_downloader/core/fetchers/biquge/__init__.py +3 -3
  47. novel_downloader/core/fetchers/biquge/browser.py +5 -0
  48. novel_downloader/core/fetchers/biquge/session.py +6 -1
  49. novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
  50. novel_downloader/core/fetchers/esjzone/browser.py +5 -0
  51. novel_downloader/core/fetchers/esjzone/session.py +6 -1
  52. novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
  53. novel_downloader/core/fetchers/linovelib/browser.py +6 -1
  54. novel_downloader/core/fetchers/linovelib/session.py +6 -1
  55. novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
  56. novel_downloader/core/fetchers/qianbi/browser.py +5 -0
  57. novel_downloader/core/fetchers/qianbi/session.py +5 -0
  58. novel_downloader/core/fetchers/qidian/__init__.py +3 -3
  59. novel_downloader/core/fetchers/qidian/browser.py +12 -4
  60. novel_downloader/core/fetchers/qidian/session.py +11 -3
  61. novel_downloader/core/fetchers/registry.py +71 -0
  62. novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
  63. novel_downloader/core/fetchers/sfacg/browser.py +5 -0
  64. novel_downloader/core/fetchers/sfacg/session.py +5 -0
  65. novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
  66. novel_downloader/core/fetchers/yamibo/browser.py +5 -0
  67. novel_downloader/core/fetchers/yamibo/session.py +6 -1
  68. novel_downloader/core/interfaces/__init__.py +7 -5
  69. novel_downloader/core/interfaces/searcher.py +18 -0
  70. novel_downloader/core/parsers/__init__.py +10 -11
  71. novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
  72. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
  73. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
  74. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
  75. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  76. novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
  77. novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
  78. novel_downloader/core/parsers/qidian/main_parser.py +10 -21
  79. novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
  80. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
  81. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  82. novel_downloader/core/parsers/registry.py +68 -0
  83. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
  84. novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
  85. novel_downloader/core/searchers/__init__.py +20 -0
  86. novel_downloader/core/searchers/base.py +92 -0
  87. novel_downloader/core/searchers/biquge.py +83 -0
  88. novel_downloader/core/searchers/esjzone.py +84 -0
  89. novel_downloader/core/searchers/qianbi.py +131 -0
  90. novel_downloader/core/searchers/qidian.py +87 -0
  91. novel_downloader/core/searchers/registry.py +63 -0
  92. novel_downloader/locales/en.json +12 -4
  93. novel_downloader/locales/zh.json +12 -4
  94. novel_downloader/models/__init__.py +4 -30
  95. novel_downloader/models/config.py +12 -6
  96. novel_downloader/models/search.py +16 -0
  97. novel_downloader/models/types.py +0 -2
  98. novel_downloader/resources/config/settings.toml +31 -4
  99. novel_downloader/resources/css_styles/intro.css +83 -0
  100. novel_downloader/resources/css_styles/main.css +30 -89
  101. novel_downloader/utils/__init__.py +52 -0
  102. novel_downloader/utils/chapter_storage.py +244 -224
  103. novel_downloader/utils/constants.py +1 -21
  104. novel_downloader/utils/epub/__init__.py +34 -0
  105. novel_downloader/utils/epub/builder.py +377 -0
  106. novel_downloader/utils/epub/constants.py +77 -0
  107. novel_downloader/utils/epub/documents.py +403 -0
  108. novel_downloader/utils/epub/models.py +134 -0
  109. novel_downloader/utils/epub/utils.py +212 -0
  110. novel_downloader/utils/file_utils/__init__.py +10 -14
  111. novel_downloader/utils/file_utils/io.py +20 -51
  112. novel_downloader/utils/file_utils/normalize.py +2 -2
  113. novel_downloader/utils/file_utils/sanitize.py +2 -3
  114. novel_downloader/utils/fontocr/__init__.py +5 -5
  115. novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
  116. novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
  117. novel_downloader/utils/fontocr/ocr_v1.py +13 -1
  118. novel_downloader/utils/fontocr/ocr_v2.py +13 -1
  119. novel_downloader/utils/fontocr/ocr_v3.py +744 -0
  120. novel_downloader/utils/i18n.py +2 -0
  121. novel_downloader/utils/logger.py +2 -0
  122. novel_downloader/utils/network.py +110 -251
  123. novel_downloader/utils/state.py +1 -0
  124. novel_downloader/utils/text_utils/__init__.py +18 -17
  125. novel_downloader/utils/text_utils/diff_display.py +4 -5
  126. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  127. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  128. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  129. novel_downloader/utils/time_utils/__init__.py +3 -3
  130. novel_downloader/utils/time_utils/datetime_utils.py +4 -5
  131. novel_downloader/utils/time_utils/sleep_utils.py +2 -3
  132. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
  133. novel_downloader-1.5.0.dist-info/RECORD +164 -0
  134. novel_downloader/config/site_rules.py +0 -94
  135. novel_downloader/core/factory/__init__.py +0 -20
  136. novel_downloader/core/factory/downloader.py +0 -73
  137. novel_downloader/core/factory/exporter.py +0 -58
  138. novel_downloader/core/factory/fetcher.py +0 -96
  139. novel_downloader/core/factory/parser.py +0 -86
  140. novel_downloader/core/fetchers/common/__init__.py +0 -14
  141. novel_downloader/core/fetchers/common/browser.py +0 -79
  142. novel_downloader/core/fetchers/common/session.py +0 -79
  143. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  144. novel_downloader/core/parsers/common/__init__.py +0 -13
  145. novel_downloader/core/parsers/common/helper.py +0 -323
  146. novel_downloader/core/parsers/common/main_parser.py +0 -106
  147. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  148. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  149. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  150. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  151. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  152. novel_downloader/models/browser.py +0 -21
  153. novel_downloader/models/site_rules.py +0 -99
  154. novel_downloader/models/tasks.py +0 -33
  155. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  156. novel_downloader/resources/json/replace_word_map.json +0 -4
  157. novel_downloader/resources/text/blacklist.txt +0 -22
  158. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  159. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  160. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  161. novel_downloader-1.4.4.dist-info/RECORD +0 -165
  162. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
  163. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
  164. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
  165. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -6,12 +6,13 @@ novel_downloader.core.downloaders.qidian
6
6
  """
7
7
 
8
8
  import asyncio
9
- import json
10
- from collections.abc import Awaitable, Callable
11
- from contextlib import suppress
12
- from typing import Any, cast
9
+ from collections.abc import AsyncIterator, Awaitable, Callable
10
+ from contextlib import asynccontextmanager
11
+ from pathlib import Path
12
+ from typing import Any
13
13
 
14
14
  from novel_downloader.core.downloaders.base import BaseDownloader
15
+ from novel_downloader.core.downloaders.registry import register_downloader
15
16
  from novel_downloader.core.interfaces import (
16
17
  FetcherProtocol,
17
18
  ParserProtocol,
@@ -19,23 +20,30 @@ from novel_downloader.core.interfaces import (
19
20
  from novel_downloader.models import (
20
21
  BookConfig,
21
22
  ChapterDict,
22
- CidTask,
23
23
  DownloaderConfig,
24
- HtmlTask,
25
24
  )
26
- from novel_downloader.utils.chapter_storage import ChapterStorage
27
- from novel_downloader.utils.file_utils import save_as_json, save_as_txt
28
- from novel_downloader.utils.time_utils import (
25
+ from novel_downloader.utils import (
26
+ ChapterStorage,
29
27
  async_sleep_with_random_delay,
30
- calculate_time_difference,
31
28
  )
32
29
 
33
30
 
31
+ @register_downloader(site_keys=["qidian", "qd"])
34
32
  class QidianDownloader(BaseDownloader):
35
33
  """
36
- Specialized downloader for Qidian novels.
34
+ Specialized downloader for Qidian (起点) novels.
35
+
36
+ Processes each chapter in a single worker that
37
+ handles fetch -> parse -> enqueue storage.
37
38
  """
38
39
 
40
+ DEFAULT_SOURCE_ID = 0
41
+ ENCRYPTED_SOURCE_ID = 1
42
+ PRIORITIES_MAP = {
43
+ DEFAULT_SOURCE_ID: 0,
44
+ ENCRYPTED_SOURCE_ID: 1,
45
+ }
46
+
39
47
  def __init__(
40
48
  self,
41
49
  fetcher: FetcherProtocol,
@@ -43,7 +51,7 @@ class QidianDownloader(BaseDownloader):
43
51
  config: DownloaderConfig,
44
52
  ):
45
53
  config.request_interval = max(1.0, config.request_interval)
46
- super().__init__(fetcher, parser, config, "qidian")
54
+ super().__init__(fetcher, parser, config, "qidian", self.PRIORITIES_MAP)
47
55
 
48
56
  async def _download_one(
49
57
  self,
@@ -63,301 +71,123 @@ class QidianDownloader(BaseDownloader):
63
71
  end_id = book.get("end_id")
64
72
  ignore_set = set(book.get("ignore_ids", []))
65
73
 
66
- raw_base = self.raw_data_dir / book_id
67
- cache_base = self.cache_dir / book_id
68
- info_path = raw_base / "book_info.json"
69
- chapters_html_dir = cache_base / "html"
70
-
74
+ raw_base = self._raw_data_dir / book_id
71
75
  raw_base.mkdir(parents=True, exist_ok=True)
72
- if self.save_html:
73
- chapters_html_dir.mkdir(parents=True, exist_ok=True)
74
- normal_cs = ChapterStorage(
75
- raw_base=raw_base,
76
- namespace="chapters",
77
- backend_type=self._config.storage_backend,
78
- batch_size=self._config.storage_batch_size,
79
- )
80
- encrypted_cs = ChapterStorage(
76
+ html_dir = self._debug_dir / book_id / "html"
77
+ chapter_storage = ChapterStorage(
81
78
  raw_base=raw_base,
82
- namespace="encrypted_chapters",
83
- backend_type=self._config.storage_backend,
84
- batch_size=self._config.storage_batch_size,
79
+ priorities=self._priorities,
85
80
  )
81
+ chapter_storage.connect()
86
82
 
87
- # load or fetch book_info
88
- book_info: dict[str, Any]
89
- re_fetch = True
90
- old_data: dict[str, Any] = {}
91
-
92
- if info_path.exists():
93
- try:
94
- old_data = json.loads(info_path.read_text("utf-8"))
95
- days, *_ = calculate_time_difference(
96
- old_data.get("update_time", ""), "UTC+8"
97
- )
98
- re_fetch = days > 1
99
- except Exception:
100
- re_fetch = True
101
-
102
- if re_fetch:
103
- info_html = await self.fetcher.get_book_info(book_id)
104
- if self.save_html:
105
- for i, html in enumerate(info_html):
106
- save_as_txt(html, chapters_html_dir / f"info_{i}.html")
107
- book_info = self.parser.parse_book_info(info_html)
108
-
109
- if book_info.get("book_name") != "未找到书名":
110
- save_as_json(book_info, info_path)
111
- else:
112
- self.logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
113
- book_info = old_data or {"book_name": "未找到书名"}
114
- else:
115
- book_info = old_data
116
-
83
+ # load or fetch metadata
84
+ book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
117
85
  vols = book_info.get("volumes", [])
118
- total_chapters = 0
119
- for vol in vols:
120
- total_chapters += len(vol.get("chapters", []))
86
+ total_chapters = sum(len(v.get("chapters", [])) for v in vols)
121
87
  if total_chapters == 0:
122
- self.logger.warning("%s 书籍没有章节可下载: book_id=%s", TAG, book_id)
88
+ self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
123
89
  return
124
90
 
125
- completed_count = 0
126
-
127
- # setup queue
128
- cid_queue: asyncio.Queue[CidTask] = asyncio.Queue()
129
- html_queue: asyncio.Queue[HtmlTask] = asyncio.Queue()
130
- save_queue: asyncio.Queue[ChapterDict] = asyncio.Queue()
131
-
132
- async def fetcher_worker(
133
- book_id: str,
134
- cid_queue: asyncio.Queue[CidTask],
135
- html_queue: asyncio.Queue[HtmlTask],
136
- retry_times: int,
137
- ) -> None:
138
- while True:
139
- task = await cid_queue.get()
140
- cid = task.cid
141
- if not cid:
142
- self.logger.warning("[Fetcher] Skipped empty cid task: %s", task)
143
- cid_queue.task_done()
144
- continue
145
-
146
- if cid in ignore_set:
147
- cid_queue.task_done()
148
- continue
149
-
150
- try:
151
- html_list = await self.fetcher.get_book_chapter(book_id, cid)
152
- await html_queue.put(
153
- HtmlTask(cid=cid, retry=task.retry, html_list=html_list)
154
- )
155
- self.logger.info("[Fetcher] Downloaded chapter %s", cid)
156
- await async_sleep_with_random_delay(
157
- self.request_interval,
158
- mul_spread=1.1,
159
- max_sleep=self.request_interval + 2,
160
- )
91
+ # concurrency primitives
92
+ sem = asyncio.Semaphore(self.workers)
93
+ cid_q: asyncio.Queue[str | None] = asyncio.Queue()
94
+ save_q: asyncio.Queue[ChapterDict | None] = asyncio.Queue()
95
+ default_batch: list[ChapterDict] = []
96
+ encrypted_batch: list[ChapterDict] = []
97
+ completed = 0
98
+
99
+ def _select(batch_item: ChapterDict) -> tuple[list[ChapterDict], int]:
100
+ if batch_item.get("extra", {}).get("encrypted", False):
101
+ return encrypted_batch, self.ENCRYPTED_SOURCE_ID
102
+ return default_batch, self.DEFAULT_SOURCE_ID
103
+
104
+ async def _flush(batch: list[ChapterDict], src: int) -> None:
105
+ nonlocal completed
106
+ if not batch:
107
+ return
108
+ try:
109
+ chapter_storage.upsert_chapters(batch, src)
110
+ except Exception as e:
111
+ self.logger.error(
112
+ "[Storage] batch upsert failed (size=%d, source=%d): %s",
113
+ len(batch),
114
+ src,
115
+ e,
116
+ exc_info=True,
117
+ )
118
+ else:
119
+ completed += len(batch)
120
+ if progress_hook:
121
+ await progress_hook(completed, total_chapters)
122
+ finally:
123
+ batch.clear()
161
124
 
162
- except Exception as e:
163
- if task.retry < retry_times:
164
- await cid_queue.put(
165
- CidTask(
166
- prev_cid=task.prev_cid,
167
- cid=cid,
168
- retry=task.retry + 1,
169
- )
170
- )
171
- self.logger.info(
172
- "[Fetcher] Re-queued chapter %s for retry #%d: %s",
173
- cid,
174
- task.retry + 1,
175
- e,
176
- )
177
- backoff = self.backoff_factor * (2**task.retry)
178
- await async_sleep_with_random_delay(
179
- base=backoff,
180
- mul_spread=1.2,
181
- max_sleep=backoff + 3,
182
- )
183
- else:
184
- self.logger.warning(
185
- "[Fetcher] Max retries reached for chapter %s: %s",
186
- cid,
187
- e,
188
- )
189
-
190
- finally:
191
- cid_queue.task_done()
192
-
193
- async def parser_worker(
194
- cid_queue: asyncio.Queue[CidTask],
195
- html_queue: asyncio.Queue[HtmlTask],
196
- save_queue: asyncio.Queue[ChapterDict],
197
- retry_times: int,
198
- ) -> None:
199
- while True:
200
- task = await html_queue.get()
201
- skip_retry = False
202
- try:
203
- chap_json: ChapterDict | None = None
204
- if self.check_restricted(task.html_list):
205
- self.logger.info(
206
- "[Parser] Skipped restricted page for cid %s", task.cid
207
- )
208
- skip_retry = True
209
- raise ValueError("Restricted content detected")
210
-
211
- is_encrypted = self.check_encrypted(task.html_list)
212
- chap_json = await asyncio.to_thread(
213
- self.parser.parse_chapter,
214
- task.html_list,
215
- task.cid,
216
- )
217
- if is_encrypted:
218
- skip_retry = True
219
- if self.save_html:
220
- folder = chapters_html_dir / (
221
- "html_encrypted" if is_encrypted else "html_plain"
222
- )
223
- html_path = folder / f"{task.cid}.html"
224
- save_as_txt(task.html_list[0], html_path, on_exist="skip")
225
- self.logger.debug(
226
- "%s Saved raw HTML for chapter %s to %s",
227
- TAG,
228
- task.cid,
229
- html_path,
230
- )
231
- if chap_json:
232
- await save_queue.put(chap_json)
233
- self.logger.info(
234
- "[Parser] saved chapter %s",
235
- task.cid,
236
- )
237
- else:
238
- raise ValueError("Empty parse result")
239
- except Exception as e:
240
- if not skip_retry and task.retry < retry_times:
241
- await cid_queue.put(
242
- CidTask(prev_cid=None, cid=task.cid, retry=task.retry + 1)
243
- )
244
- self.logger.info(
245
- "[Parser] Re-queued cid %s for retry #%d: %s",
246
- task.cid,
247
- task.retry + 1,
248
- e,
249
- )
250
- elif not skip_retry:
251
- self.logger.warning(
252
- "[Parser] Max retries reached for cid %s: %s",
253
- task.cid,
254
- e,
255
- )
256
- finally:
257
- html_queue.task_done()
258
-
259
- async def storage_worker(
260
- normal_cs: ChapterStorage,
261
- encrypted_cs: ChapterStorage,
262
- save_queue: asyncio.Queue[ChapterDict],
263
- ) -> None:
264
- nonlocal completed_count
125
+ async def storage_worker(q: asyncio.Queue[ChapterDict | None]) -> None:
265
126
  while True:
266
- item = await save_queue.get()
267
- try:
268
- is_encrypted = item.get("extra", {}).get("encrypted", False)
269
- cs = encrypted_cs if is_encrypted else normal_cs
270
- cs.save(cast(ChapterDict, item))
271
- completed_count += 1
272
- if progress_hook:
273
- await progress_hook(completed_count, total_chapters)
274
- except Exception as e:
275
- self.logger.error("[storage_worker] Failed to save: %s", e)
276
- finally:
277
- save_queue.task_done()
278
-
279
- fetcher_task = asyncio.create_task(
280
- fetcher_worker(
281
- book_id,
282
- cid_queue,
283
- html_queue,
284
- self.retry_times,
285
- )
286
- )
287
-
288
- parser_task = asyncio.create_task(
289
- parser_worker(
290
- cid_queue,
291
- html_queue,
292
- save_queue,
293
- self.retry_times,
294
- )
295
- )
296
-
297
- storage_task = asyncio.create_task(
298
- storage_worker(
299
- normal_cs=normal_cs,
300
- encrypted_cs=encrypted_cs,
301
- save_queue=save_queue,
302
- )
303
- )
304
-
305
- found_start = start_id is None
306
- stop_early = False
307
-
308
- for vol in book_info.get("volumes", []):
309
- chapters = vol.get("chapters", [])
310
- for chap in chapters:
311
- if stop_early:
127
+ chap = await q.get()
128
+ q.task_done()
129
+ if chap is None:
130
+ # final flush before exit
131
+ await _flush(default_batch, self.DEFAULT_SOURCE_ID)
132
+ await _flush(encrypted_batch, self.ENCRYPTED_SOURCE_ID)
312
133
  break
134
+ batch, src = _select(chap)
135
+ batch.append(chap)
136
+ if len(batch) >= self.storage_batch_size:
137
+ await _flush(batch, src)
138
+
139
+ async def producer() -> None:
140
+ nonlocal completed
141
+ async for cid in self._chapter_ids(vols, start_id, end_id):
142
+ if self.skip_existing and chapter_storage.exists(
143
+ cid, self.DEFAULT_SOURCE_ID
144
+ ):
145
+ completed += 1
146
+ if progress_hook:
147
+ await progress_hook(completed, total_chapters)
148
+ else:
149
+ await cid_q.put(cid)
150
+
151
+ @asynccontextmanager
152
+ async def task_group_ctx() -> AsyncIterator[None]:
153
+ async with asyncio.TaskGroup() as tg:
154
+ tg.create_task(
155
+ self._chapter_worker(
156
+ book_id,
157
+ ignore_set,
158
+ cid_q,
159
+ save_q,
160
+ sem,
161
+ )
162
+ )
163
+ tg.create_task(storage_worker(save_q))
164
+ yield
313
165
 
314
- cid = chap.get("chapterId")
315
- if not cid:
316
- continue
317
-
318
- if not found_start:
319
- if cid == start_id:
320
- found_start = True
321
- else:
322
- completed_count += 1
323
- continue
324
-
325
- if end_id is not None and cid == end_id:
326
- stop_early = True
327
-
328
- if cid in ignore_set:
329
- continue
330
-
331
- if normal_cs.exists(cid) and self.skip_existing:
332
- completed_count += 1
333
- continue
334
-
335
- await cid_queue.put(CidTask(cid=cid, prev_cid=None))
336
-
337
- if stop_early:
338
- break
166
+ # run producer + workers, send None sentinels to shut down loops
167
+ async with task_group_ctx():
168
+ await producer()
339
169
 
340
- await cid_queue.join()
341
- await html_queue.join()
342
- await save_queue.join()
170
+ # signal fetcher to exit
171
+ await cid_q.put(None)
172
+ await cid_q.join()
343
173
 
344
- for task in [fetcher_task, parser_task, storage_task]:
345
- task.cancel()
346
- with suppress(asyncio.CancelledError):
347
- await task
174
+ # signal storage to exit
175
+ await save_q.put(None)
176
+ await save_q.join()
348
177
 
349
- normal_cs.close()
350
- encrypted_cs.close()
178
+ # final flush for both batches
179
+ await _flush(default_batch, self.DEFAULT_SOURCE_ID)
180
+ await _flush(encrypted_batch, self.ENCRYPTED_SOURCE_ID)
351
181
 
182
+ chapter_storage.close()
352
183
  self.logger.info(
353
184
  "%s Novel '%s' download completed.",
354
185
  TAG,
355
186
  book_info.get("book_name", "unknown"),
356
187
  )
357
- return
358
188
 
359
189
  @staticmethod
360
- def check_restricted(html_list: list[str]) -> bool:
190
+ def _check_restricted(html_list: list[str]) -> bool:
361
191
  """
362
192
  Return True if page content indicates access restriction
363
193
  (e.g. not subscribed/purchased).
@@ -370,7 +200,91 @@ class QidianDownloader(BaseDownloader):
370
200
  return any(m in html_list[0] for m in markers)
371
201
 
372
202
  @staticmethod
373
- def check_encrypted(html_list: list[str]) -> bool:
203
+ def _check_encrypted(html_list: list[str]) -> bool:
374
204
  if not html_list:
375
205
  return True
376
206
  return '"cES":2' in html_list[0]
207
+
208
+ async def _chapter_worker(
209
+ self,
210
+ book_id: str,
211
+ ignore_set: set[str],
212
+ cid_q: asyncio.Queue[str | None],
213
+ save_q: asyncio.Queue[ChapterDict | None],
214
+ sem: asyncio.Semaphore,
215
+ ) -> None:
216
+ """
217
+ Worker that processes one chapter at a time:
218
+ fetch + parse with retry, then enqueue to save_q.
219
+ """
220
+ html_dir = self._debug_dir / book_id / "html"
221
+ while True:
222
+ cid = await cid_q.get()
223
+ if cid is None:
224
+ cid_q.task_done()
225
+ break
226
+ if not cid or cid in ignore_set:
227
+ cid_q.task_done()
228
+ continue
229
+
230
+ async with sem:
231
+ chap = await self._process_chapter(book_id, cid, html_dir)
232
+ if chap:
233
+ await save_q.put(chap)
234
+
235
+ cid_q.task_done()
236
+ await async_sleep_with_random_delay(
237
+ self.request_interval,
238
+ mul_spread=1.1,
239
+ max_sleep=self.request_interval + 2,
240
+ )
241
+
242
+ async def _process_chapter(
243
+ self,
244
+ book_id: str,
245
+ cid: str,
246
+ html_dir: Path,
247
+ ) -> ChapterDict | None:
248
+ """
249
+ Fetch, debug-save, parse a single chapter with retries.
250
+ Returns ChapterDict or None on failure.
251
+ """
252
+ for attempt in range(self.retry_times + 1):
253
+ try:
254
+ html_list = await self.fetcher.get_book_chapter(book_id, cid)
255
+ if self._check_restricted(html_list):
256
+ self.logger.info(
257
+ "[ChapterWorker] Restricted content detected: %s", cid
258
+ )
259
+ return None
260
+ encrypted = self._check_encrypted(html_list)
261
+
262
+ folder = "html_encrypted" if encrypted else "html_plain"
263
+ self._save_html_pages(html_dir / folder, cid, html_list)
264
+
265
+ chap = await asyncio.to_thread(
266
+ self.parser.parse_chapter, html_list, cid
267
+ )
268
+ if encrypted and not chap:
269
+ self.logger.info(
270
+ "[ChapterWorker] Fail for encrypted chapter: %s", cid
271
+ )
272
+ return None
273
+ if not chap:
274
+ raise ValueError("Empty parse result")
275
+ return chap
276
+
277
+ except Exception as e:
278
+ if attempt < self.retry_times:
279
+ self.logger.info(
280
+ "[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
281
+ )
282
+ backoff = self.backoff_factor * (2**attempt)
283
+ await async_sleep_with_random_delay(
284
+ base=backoff,
285
+ mul_spread=1.2,
286
+ max_sleep=backoff + 3,
287
+ )
288
+ else:
289
+ self.logger.warning("[ChapterWorker] Failed %s: %s", cid, e)
290
+ return None
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.core.downloaders.registry
4
+ ------------------------------------------
5
+
6
+ """
7
+
8
+ __all__ = ["register_downloader", "get_downloader"]
9
+
10
+ from collections.abc import Callable, Sequence
11
+ from typing import TypeVar
12
+
13
+ from novel_downloader.core.interfaces import (
14
+ DownloaderProtocol,
15
+ FetcherProtocol,
16
+ ParserProtocol,
17
+ )
18
+ from novel_downloader.models import DownloaderConfig
19
+
20
+ DownloaderBuilder = Callable[
21
+ [FetcherProtocol, ParserProtocol, DownloaderConfig],
22
+ DownloaderProtocol,
23
+ ]
24
+ D = TypeVar("D", bound=DownloaderProtocol)
25
+ _DOWNLOADER_MAP: dict[str, DownloaderBuilder] = {}
26
+
27
+
28
+ def register_downloader(
29
+ site_keys: Sequence[str],
30
+ ) -> Callable[[type[D]], type[D]]:
31
+ """
32
+ Decorator to register a downloader class under given keys.
33
+
34
+ :param site_keys: Sequence of site identifiers
35
+ :return: A class decorator that populates _DOWNLOADER_MAP.
36
+ """
37
+
38
+ def decorator(cls: type[D]) -> type[D]:
39
+ for key in site_keys:
40
+ _DOWNLOADER_MAP[key.lower()] = cls
41
+ return cls
42
+
43
+ return decorator
44
+
45
+
46
+ def get_downloader(
47
+ fetcher: FetcherProtocol,
48
+ parser: ParserProtocol,
49
+ site: str,
50
+ config: DownloaderConfig,
51
+ ) -> DownloaderProtocol:
52
+ """
53
+ Returns an DownloaderProtocol for the given site.
54
+
55
+ :param fetcher: Fetcher implementation
56
+ :param parser: Parser implementation
57
+ :param site: Site name (e.g., 'qidian')
58
+ :param config: Downloader configuration
59
+
60
+ :return: An instance of a downloader class
61
+ """
62
+ site_key = site.lower()
63
+ try:
64
+ downloader_cls = _DOWNLOADER_MAP[site_key]
65
+ except KeyError as err:
66
+ raise ValueError(f"Unsupported site: {site}") from err
67
+ return downloader_cls(fetcher, parser, config)
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.sfacg
6
6
  """
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
+ from novel_downloader.core.downloaders.registry import register_downloader
9
10
  from novel_downloader.core.interfaces import (
10
11
  FetcherProtocol,
11
12
  ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
13
14
  from novel_downloader.models import DownloaderConfig
14
15
 
15
16
 
17
+ @register_downloader(site_keys=["sfacg"])
16
18
  class SfacgDownloader(CommonDownloader):
17
- """"""
19
+ """
20
+ Downloader for sfacg (SF 轻小说) novels.
21
+ """
18
22
 
19
23
  def __init__(
20
24
  self,
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.yamibo
6
6
  """
7
7
 
8
8
  from novel_downloader.core.downloaders.common import CommonDownloader
9
+ from novel_downloader.core.downloaders.registry import register_downloader
9
10
  from novel_downloader.core.interfaces import (
10
11
  FetcherProtocol,
11
12
  ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
13
14
  from novel_downloader.models import DownloaderConfig
14
15
 
15
16
 
17
+ @register_downloader(site_keys=["yamibo"])
16
18
  class YamiboDownloader(CommonDownloader):
17
- """"""
19
+ """
20
+ Downloader for yamibo (百合会) novels.
21
+ """
18
22
 
19
23
  def __init__(
20
24
  self,