novel-downloader 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +14 -11
- novel_downloader/cli/export.py +19 -19
- novel_downloader/cli/ui.py +35 -8
- novel_downloader/config/adapter.py +216 -153
- novel_downloader/core/__init__.py +5 -6
- novel_downloader/core/archived/deqixs/fetcher.py +1 -28
- novel_downloader/core/downloaders/__init__.py +2 -0
- novel_downloader/core/downloaders/base.py +34 -85
- novel_downloader/core/downloaders/common.py +147 -171
- novel_downloader/core/downloaders/qianbi.py +30 -64
- novel_downloader/core/downloaders/qidian.py +157 -184
- novel_downloader/core/downloaders/qqbook.py +292 -0
- novel_downloader/core/downloaders/registry.py +2 -2
- novel_downloader/core/exporters/__init__.py +2 -0
- novel_downloader/core/exporters/base.py +37 -59
- novel_downloader/core/exporters/common.py +620 -0
- novel_downloader/core/exporters/linovelib.py +47 -0
- novel_downloader/core/exporters/qidian.py +41 -12
- novel_downloader/core/exporters/qqbook.py +28 -0
- novel_downloader/core/exporters/registry.py +2 -2
- novel_downloader/core/fetchers/__init__.py +4 -2
- novel_downloader/core/fetchers/aaatxt.py +2 -22
- novel_downloader/core/fetchers/b520.py +3 -23
- novel_downloader/core/fetchers/base.py +80 -105
- novel_downloader/core/fetchers/biquyuedu.py +2 -22
- novel_downloader/core/fetchers/dxmwx.py +10 -22
- novel_downloader/core/fetchers/esjzone.py +6 -29
- novel_downloader/core/fetchers/guidaye.py +2 -22
- novel_downloader/core/fetchers/hetushu.py +9 -29
- novel_downloader/core/fetchers/i25zw.py +2 -16
- novel_downloader/core/fetchers/ixdzs8.py +2 -16
- novel_downloader/core/fetchers/jpxs123.py +2 -16
- novel_downloader/core/fetchers/lewenn.py +2 -22
- novel_downloader/core/fetchers/linovelib.py +4 -20
- novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
- novel_downloader/core/fetchers/piaotia.py +2 -16
- novel_downloader/core/fetchers/qbtr.py +2 -16
- novel_downloader/core/fetchers/qianbi.py +1 -20
- novel_downloader/core/fetchers/qidian.py +27 -68
- novel_downloader/core/fetchers/qqbook.py +177 -0
- novel_downloader/core/fetchers/quanben5.py +9 -29
- novel_downloader/core/fetchers/rate_limiter.py +22 -53
- novel_downloader/core/fetchers/sfacg.py +3 -16
- novel_downloader/core/fetchers/shencou.py +2 -16
- novel_downloader/core/fetchers/shuhaige.py +2 -22
- novel_downloader/core/fetchers/tongrenquan.py +2 -22
- novel_downloader/core/fetchers/ttkan.py +3 -14
- novel_downloader/core/fetchers/wanbengo.py +2 -22
- novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
- novel_downloader/core/fetchers/xiguashuwu.py +4 -20
- novel_downloader/core/fetchers/xs63b.py +3 -15
- novel_downloader/core/fetchers/xshbook.py +2 -22
- novel_downloader/core/fetchers/yamibo.py +4 -28
- novel_downloader/core/fetchers/yibige.py +13 -26
- novel_downloader/core/interfaces/exporter.py +19 -7
- novel_downloader/core/interfaces/fetcher.py +23 -49
- novel_downloader/core/interfaces/parser.py +2 -2
- novel_downloader/core/parsers/__init__.py +4 -2
- novel_downloader/core/parsers/b520.py +2 -2
- novel_downloader/core/parsers/base.py +5 -39
- novel_downloader/core/parsers/esjzone.py +3 -3
- novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +7 -7
- novel_downloader/core/parsers/qidian.py +717 -0
- novel_downloader/core/parsers/qqbook.py +709 -0
- novel_downloader/core/parsers/xiguashuwu.py +8 -15
- novel_downloader/core/searchers/__init__.py +2 -2
- novel_downloader/core/searchers/b520.py +1 -1
- novel_downloader/core/searchers/base.py +2 -2
- novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/locales/en.json +3 -3
- novel_downloader/locales/zh.json +3 -3
- novel_downloader/models/__init__.py +2 -0
- novel_downloader/models/book.py +1 -0
- novel_downloader/models/config.py +12 -0
- novel_downloader/resources/config/settings.toml +23 -5
- novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
- novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
- novel_downloader/utils/__init__.py +0 -2
- novel_downloader/utils/chapter_storage.py +2 -3
- novel_downloader/utils/constants.py +7 -3
- novel_downloader/utils/cookies.py +32 -17
- novel_downloader/utils/crypto_utils/__init__.py +0 -6
- novel_downloader/utils/crypto_utils/aes_util.py +1 -1
- novel_downloader/utils/crypto_utils/rc4.py +40 -50
- novel_downloader/utils/epub/__init__.py +2 -3
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +1 -6
- novel_downloader/utils/epub/documents.py +7 -7
- novel_downloader/utils/epub/models.py +8 -8
- novel_downloader/utils/epub/utils.py +10 -10
- novel_downloader/utils/file_utils/io.py +48 -73
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -11
- novel_downloader/utils/fontocr/__init__.py +13 -0
- novel_downloader/utils/{fontocr.py → fontocr/core.py} +72 -61
- novel_downloader/utils/fontocr/loader.py +52 -0
- novel_downloader/utils/logger.py +80 -56
- novel_downloader/utils/network.py +16 -40
- novel_downloader/utils/node_decryptor/__init__.py +13 -0
- novel_downloader/utils/node_decryptor/decryptor.py +342 -0
- novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/sleep_utils.py +53 -43
- novel_downloader/web/main.py +1 -1
- novel_downloader/web/pages/download.py +1 -1
- novel_downloader/web/pages/search.py +4 -4
- novel_downloader/web/services/task_manager.py +2 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +5 -1
- novel_downloader-2.0.2.dist-info/RECORD +203 -0
- novel_downloader/core/exporters/common/__init__.py +0 -11
- novel_downloader/core/exporters/common/epub.py +0 -198
- novel_downloader/core/exporters/common/main_exporter.py +0 -64
- novel_downloader/core/exporters/common/txt.py +0 -146
- novel_downloader/core/exporters/epub_util.py +0 -215
- novel_downloader/core/exporters/linovelib/__init__.py +0 -11
- novel_downloader/core/exporters/linovelib/epub.py +0 -349
- novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
- novel_downloader/core/exporters/linovelib/txt.py +0 -139
- novel_downloader/core/exporters/txt_util.py +0 -67
- novel_downloader/core/parsers/qidian/__init__.py +0 -10
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/main_parser.py +0 -101
- novel_downloader/core/parsers/qidian/utils/__init__.py +0 -30
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
- novel_downloader-2.0.0.dist-info/RECORD +0 -210
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
@@ -10,15 +10,11 @@ import abc
|
|
10
10
|
import asyncio
|
11
11
|
import json
|
12
12
|
import logging
|
13
|
-
from collections.abc import
|
13
|
+
from collections.abc import Awaitable, Callable, Sequence
|
14
14
|
from pathlib import Path
|
15
|
-
from typing import Any, cast
|
15
|
+
from typing import Any, ClassVar, cast
|
16
16
|
|
17
|
-
from novel_downloader.core.interfaces import
|
18
|
-
DownloaderProtocol,
|
19
|
-
FetcherProtocol,
|
20
|
-
ParserProtocol,
|
21
|
-
)
|
17
|
+
from novel_downloader.core.interfaces import FetcherProtocol, ParserProtocol
|
22
18
|
from novel_downloader.models import (
|
23
19
|
BookConfig,
|
24
20
|
BookInfoDict,
|
@@ -28,7 +24,7 @@ from novel_downloader.models import (
|
|
28
24
|
from novel_downloader.utils import time_diff
|
29
25
|
|
30
26
|
|
31
|
-
class BaseDownloader(
|
27
|
+
class BaseDownloader(abc.ABC):
|
32
28
|
"""
|
33
29
|
Abstract base class for novel downloaders.
|
34
30
|
|
@@ -39,8 +35,8 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
39
35
|
a single book, using the provided fetcher and parser components.
|
40
36
|
"""
|
41
37
|
|
42
|
-
DEFAULT_SOURCE_ID = 0
|
43
|
-
PRIORITIES_MAP = {
|
38
|
+
DEFAULT_SOURCE_ID: ClassVar[int] = 0
|
39
|
+
PRIORITIES_MAP: ClassVar[dict[int, int]] = {
|
44
40
|
DEFAULT_SOURCE_ID: 0,
|
45
41
|
}
|
46
42
|
|
@@ -61,15 +57,23 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
61
57
|
"""
|
62
58
|
self._fetcher = fetcher
|
63
59
|
self._parser = parser
|
64
|
-
self._config = config
|
65
60
|
self._site = site
|
66
61
|
|
62
|
+
self._save_html = config.save_html
|
63
|
+
self._skip_existing = config.skip_existing
|
64
|
+
self._login_required = config.login_required
|
65
|
+
self._request_interval = config.request_interval
|
66
|
+
self._retry_times = config.retry_times
|
67
|
+
self._backoff_factor = config.backoff_factor
|
68
|
+
self._workers = config.workers
|
69
|
+
self._storage_batch_size = max(1, config.storage_batch_size)
|
70
|
+
|
67
71
|
self._raw_data_dir = Path(config.raw_data_dir) / site
|
68
72
|
self._raw_data_dir.mkdir(parents=True, exist_ok=True)
|
69
73
|
self._debug_dir = Path.cwd() / "debug" / site
|
70
74
|
self._debug_dir.mkdir(parents=True, exist_ok=True)
|
71
75
|
|
72
|
-
self.logger = logging.getLogger(f"{self.__class__.__name__}")
|
76
|
+
self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
|
73
77
|
|
74
78
|
async def download_many(
|
75
79
|
self,
|
@@ -87,7 +91,7 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
87
91
|
args: completed_count, total_count.
|
88
92
|
:param cancel_event: Optional asyncio.Event to allow cancellation.
|
89
93
|
"""
|
90
|
-
if not
|
94
|
+
if not self._check_login():
|
91
95
|
book_ids = [b["book_id"] for b in books]
|
92
96
|
self.logger.warning(
|
93
97
|
"[%s] login failed, skipping download of books: %s",
|
@@ -116,8 +120,6 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
116
120
|
except Exception as e:
|
117
121
|
self._handle_download_exception(book, e)
|
118
122
|
|
119
|
-
await self._finalize()
|
120
|
-
|
121
123
|
async def download(
|
122
124
|
self,
|
123
125
|
book: BookConfig,
|
@@ -134,7 +136,7 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
134
136
|
args: completed_count, total_count.
|
135
137
|
:param cancel_event: Optional asyncio.Event to allow cancellation.
|
136
138
|
"""
|
137
|
-
if not
|
139
|
+
if not self._check_login():
|
138
140
|
self.logger.warning(
|
139
141
|
"[%s] login failed, skipping download of book: %s (%s-%s)",
|
140
142
|
self._site,
|
@@ -142,6 +144,7 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
142
144
|
book.get("start_id", "-"),
|
143
145
|
book.get("end_id", "-"),
|
144
146
|
)
|
147
|
+
return
|
145
148
|
|
146
149
|
# if already cancelled before starting
|
147
150
|
if cancel_event and cancel_event.is_set():
|
@@ -162,8 +165,6 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
162
165
|
except Exception as e:
|
163
166
|
self._handle_download_exception(book, e)
|
164
167
|
|
165
|
-
await self._finalize()
|
166
|
-
|
167
168
|
async def load_book_info(
|
168
169
|
self,
|
169
170
|
book_id: str,
|
@@ -200,23 +201,6 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
200
201
|
"""
|
201
202
|
...
|
202
203
|
|
203
|
-
async def _prepare(self) -> None:
|
204
|
-
"""
|
205
|
-
Optional hook called before downloading.
|
206
|
-
|
207
|
-
Subclasses can override this method to perform pre-download setup.
|
208
|
-
"""
|
209
|
-
return
|
210
|
-
|
211
|
-
async def _finalize(self) -> None:
|
212
|
-
"""
|
213
|
-
Optional hook called after downloading is complete.
|
214
|
-
|
215
|
-
Subclasses can override this method to perform post-download tasks,
|
216
|
-
such as saving state or releasing resources.
|
217
|
-
"""
|
218
|
-
return
|
219
|
-
|
220
204
|
def _load_book_info(
|
221
205
|
self,
|
222
206
|
book_id: str,
|
@@ -283,25 +267,22 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
283
267
|
:param filename: used as filename prefix
|
284
268
|
:param html_list: list of HTML strings to save
|
285
269
|
"""
|
286
|
-
if not self.
|
270
|
+
if not self._save_html:
|
287
271
|
return
|
288
|
-
|
289
272
|
html_dir.mkdir(parents=True, exist_ok=True)
|
290
273
|
for i, html in enumerate(html_list):
|
291
|
-
|
292
|
-
file_path.write_text(html, encoding="utf-8")
|
274
|
+
(html_dir / f"{filename}_{i}.html").write_text(html, encoding="utf-8")
|
293
275
|
|
294
276
|
@staticmethod
|
295
|
-
|
296
|
-
|
277
|
+
def _planned_chapter_ids(
|
278
|
+
vols: list[VolumeInfoDict],
|
297
279
|
start_id: str | None,
|
298
280
|
end_id: str | None,
|
299
|
-
|
300
|
-
|
301
|
-
Yield each chapterId in order, respecting start/end bounds.
|
302
|
-
"""
|
281
|
+
ignore: set[str],
|
282
|
+
) -> list[str]:
|
303
283
|
seen_start = start_id is None
|
304
|
-
|
284
|
+
out: list[str] = []
|
285
|
+
for vol in vols:
|
305
286
|
for chap in vol["chapters"]:
|
306
287
|
cid = chap.get("chapterId")
|
307
288
|
if not cid:
|
@@ -311,9 +292,11 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
311
292
|
seen_start = True
|
312
293
|
else:
|
313
294
|
continue
|
314
|
-
|
295
|
+
if cid not in ignore:
|
296
|
+
out.append(cid)
|
315
297
|
if end_id is not None and cid == end_id:
|
316
|
-
return
|
298
|
+
return out
|
299
|
+
return out
|
317
300
|
|
318
301
|
@property
|
319
302
|
def fetcher(self) -> FetcherProtocol:
|
@@ -323,38 +306,6 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
323
306
|
def parser(self) -> ParserProtocol:
|
324
307
|
return self._parser
|
325
308
|
|
326
|
-
@property
|
327
|
-
def save_html(self) -> bool:
|
328
|
-
return self._config.save_html
|
329
|
-
|
330
|
-
@property
|
331
|
-
def skip_existing(self) -> bool:
|
332
|
-
return self._config.skip_existing
|
333
|
-
|
334
|
-
@property
|
335
|
-
def login_required(self) -> bool:
|
336
|
-
return self._config.login_required
|
337
|
-
|
338
|
-
@property
|
339
|
-
def request_interval(self) -> float:
|
340
|
-
return self._config.request_interval
|
341
|
-
|
342
|
-
@property
|
343
|
-
def retry_times(self) -> int:
|
344
|
-
return self._config.retry_times
|
345
|
-
|
346
|
-
@property
|
347
|
-
def backoff_factor(self) -> float:
|
348
|
-
return self._config.backoff_factor
|
349
|
-
|
350
|
-
@property
|
351
|
-
def workers(self) -> int:
|
352
|
-
return self._config.workers
|
353
|
-
|
354
|
-
@property
|
355
|
-
def storage_batch_size(self) -> int:
|
356
|
-
return max(1, self._config.storage_batch_size)
|
357
|
-
|
358
309
|
def _handle_download_exception(self, book: BookConfig, error: Exception) -> None:
|
359
310
|
"""
|
360
311
|
Handle download errors in a consistent way.
|
@@ -373,10 +324,8 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
373
324
|
error,
|
374
325
|
)
|
375
326
|
|
376
|
-
|
327
|
+
def _check_login(self) -> bool:
|
377
328
|
"""
|
378
|
-
|
329
|
+
Check login if needed.
|
379
330
|
"""
|
380
|
-
|
381
|
-
|
382
|
-
return self.fetcher.is_logged_in if self.login_required else True
|
331
|
+
return self.fetcher.is_logged_in if self._login_required else True
|
@@ -12,19 +12,9 @@ from pathlib import Path
|
|
12
12
|
from typing import Any
|
13
13
|
|
14
14
|
from novel_downloader.core.downloaders.base import BaseDownloader
|
15
|
-
from novel_downloader.core.downloaders.signals import
|
16
|
-
|
17
|
-
|
18
|
-
StopToken,
|
19
|
-
)
|
20
|
-
from novel_downloader.models import (
|
21
|
-
BookConfig,
|
22
|
-
ChapterDict,
|
23
|
-
)
|
24
|
-
from novel_downloader.utils import (
|
25
|
-
ChapterStorage,
|
26
|
-
async_jitter_sleep,
|
27
|
-
)
|
15
|
+
from novel_downloader.core.downloaders.signals import STOP, Progress, StopToken
|
16
|
+
from novel_downloader.models import BookConfig, ChapterDict
|
17
|
+
from novel_downloader.utils import ChapterStorage, async_jitter_sleep
|
28
18
|
|
29
19
|
|
30
20
|
class CommonDownloader(BaseDownloader):
|
@@ -41,7 +31,7 @@ class CommonDownloader(BaseDownloader):
|
|
41
31
|
**kwargs: Any,
|
42
32
|
) -> None:
|
43
33
|
"""
|
44
|
-
Sentinel-based pipeline with
|
34
|
+
Sentinel-based pipeline with cancellation:
|
45
35
|
|
46
36
|
Producer -> ChapterWorkers -> StorageWorker.
|
47
37
|
|
@@ -59,181 +49,167 @@ class CommonDownloader(BaseDownloader):
|
|
59
49
|
raw_base.mkdir(parents=True, exist_ok=True)
|
60
50
|
html_dir = self._debug_dir / book_id / "html"
|
61
51
|
|
62
|
-
chapter_storage = ChapterStorage(
|
63
|
-
raw_base=raw_base,
|
64
|
-
priorities=self.PRIORITIES_MAP,
|
65
|
-
)
|
66
|
-
chapter_storage.connect()
|
67
|
-
|
68
52
|
def cancelled() -> bool:
|
69
53
|
return bool(cancel_event and cancel_event.is_set())
|
70
54
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
return
|
76
|
-
|
77
|
-
vols = book_info["volumes"]
|
78
|
-
total_chapters = sum(len(v["chapters"]) for v in vols)
|
79
|
-
if total_chapters == 0:
|
80
|
-
self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
|
81
|
-
return
|
55
|
+
# --- metadata ---
|
56
|
+
book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
|
57
|
+
if not book_info:
|
58
|
+
return
|
82
59
|
|
83
|
-
|
60
|
+
vols = book_info["volumes"]
|
61
|
+
plan = self._planned_chapter_ids(vols, start_id, end_id, ignore_set)
|
62
|
+
if not plan:
|
63
|
+
self.logger.info("%s nothing to do after filtering: %s", TAG, book_id)
|
64
|
+
return
|
84
65
|
|
85
|
-
|
86
|
-
cid_q: asyncio.Queue[str | StopToken] = asyncio.Queue()
|
87
|
-
save_q: asyncio.Queue[ChapterDict | StopToken] = asyncio.Queue()
|
88
|
-
batch: list[ChapterDict] = []
|
66
|
+
progress = Progress(total=len(plan), hook=progress_hook)
|
89
67
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
self.logger.error(
|
97
|
-
"[Storage] batch upsert failed (size=%d): %s",
|
98
|
-
len(batch),
|
99
|
-
e,
|
100
|
-
exc_info=True,
|
101
|
-
)
|
102
|
-
else:
|
103
|
-
await progress.bump(len(batch))
|
104
|
-
finally:
|
105
|
-
batch.clear()
|
106
|
-
|
107
|
-
# --- stage: storage worker ---
|
108
|
-
async def storage_worker() -> None:
|
109
|
-
"""
|
110
|
-
Consumes parsed chapters, writes in batches.
|
111
|
-
|
112
|
-
Terminates after receiving STOP from each chapter worker.
|
113
|
-
|
114
|
-
On cancel: keeps consuming (to avoid blocking producers),
|
115
|
-
flushes, and exits once all STOPs are seen.
|
116
|
-
"""
|
117
|
-
stop_count = 0
|
118
|
-
while True:
|
119
|
-
item = await save_q.get()
|
120
|
-
if isinstance(item, StopToken):
|
121
|
-
stop_count += 1
|
122
|
-
if stop_count == self.workers:
|
123
|
-
# All chapter workers have exited.
|
124
|
-
await flush_batch()
|
125
|
-
return
|
126
|
-
# else keep waiting for remaining STOPs
|
127
|
-
continue
|
128
|
-
|
129
|
-
# Normal chapter
|
130
|
-
batch.append(item)
|
131
|
-
if len(batch) >= self.storage_batch_size:
|
132
|
-
await flush_batch()
|
68
|
+
# --- queues & batching ---
|
69
|
+
cid_q: asyncio.Queue[str | StopToken] = asyncio.Queue(maxsize=self._workers * 2)
|
70
|
+
save_q: asyncio.Queue[ChapterDict | StopToken] = asyncio.Queue(
|
71
|
+
maxsize=self._workers * 2
|
72
|
+
)
|
73
|
+
batch: list[ChapterDict] = []
|
133
74
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
75
|
+
async def flush_batch() -> None:
|
76
|
+
if not batch:
|
77
|
+
return
|
78
|
+
try:
|
79
|
+
storage.upsert_chapters(batch, self.DEFAULT_SOURCE_ID)
|
80
|
+
except Exception as e:
|
81
|
+
self.logger.error(
|
82
|
+
"[Storage] batch upsert failed (size=%d): %s",
|
83
|
+
len(batch),
|
84
|
+
e,
|
85
|
+
exc_info=True,
|
86
|
+
)
|
87
|
+
else:
|
88
|
+
await progress.bump(len(batch))
|
89
|
+
finally:
|
90
|
+
batch.clear()
|
91
|
+
|
92
|
+
# --- stage: storage worker ---
|
93
|
+
async def storage_worker() -> None:
|
94
|
+
"""
|
95
|
+
Consumes parsed chapters, writes in batches.
|
96
|
+
|
97
|
+
Terminates after receiving STOP from each chapter worker.
|
98
|
+
|
99
|
+
On cancel: keeps consuming (to avoid blocking producers),
|
100
|
+
flushes, and exits once all STOPs are seen.
|
101
|
+
"""
|
102
|
+
stop_count = 0
|
103
|
+
while True:
|
104
|
+
item = await save_q.get()
|
105
|
+
if isinstance(item, StopToken):
|
106
|
+
stop_count += 1
|
107
|
+
if stop_count == self._workers:
|
108
|
+
# All chapter workers have exited.
|
146
109
|
await flush_batch()
|
147
|
-
|
148
|
-
|
149
|
-
|
110
|
+
return
|
111
|
+
# else keep waiting for remaining STOPs
|
112
|
+
continue
|
113
|
+
|
114
|
+
# Normal chapter
|
115
|
+
batch.append(item)
|
116
|
+
if len(batch) >= self._storage_batch_size:
|
117
|
+
await flush_batch()
|
118
|
+
|
119
|
+
if cancelled():
|
120
|
+
# Drain whatever is already in the queue
|
121
|
+
try:
|
122
|
+
while True:
|
123
|
+
nxt = save_q.get_nowait()
|
150
124
|
if isinstance(nxt, StopToken):
|
151
125
|
stop_count += 1
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
126
|
+
else:
|
127
|
+
batch.append(nxt)
|
128
|
+
except asyncio.QueueEmpty:
|
129
|
+
pass
|
130
|
+
# Final flush of everything
|
131
|
+
await flush_batch()
|
132
|
+
# Wait for remaining STOPs so chapter workers can finish.
|
133
|
+
while stop_count < self._workers:
|
134
|
+
nxt = await save_q.get()
|
135
|
+
if isinstance(nxt, StopToken):
|
136
|
+
stop_count += 1
|
137
|
+
return
|
156
138
|
|
157
|
-
|
158
|
-
|
159
|
-
|
139
|
+
# --- stage: chapter worker ---
|
140
|
+
async def chapter_worker() -> None:
|
141
|
+
"""
|
142
|
+
Fetch + parse with retry, then enqueue to save_q.
|
143
|
+
|
144
|
+
Exits on STOP, or early if cancel is set before starting a new fetch.
|
145
|
+
"""
|
146
|
+
while True:
|
147
|
+
cid = await cid_q.get()
|
148
|
+
if isinstance(cid, StopToken):
|
149
|
+
# Propagate one STOP to storage and exit.
|
150
|
+
await save_q.put(STOP)
|
151
|
+
return
|
160
152
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
if isinstance(cid, StopToken):
|
166
|
-
# Propagate one STOP to storage and exit.
|
167
|
-
await save_q.put(STOP)
|
168
|
-
return
|
153
|
+
# If cancelled, don't start a new network call; let storage finish.
|
154
|
+
if cancelled():
|
155
|
+
await save_q.put(STOP)
|
156
|
+
return
|
169
157
|
|
170
|
-
|
171
|
-
|
172
|
-
|
158
|
+
chap = await self._process_chapter(book_id, cid, html_dir)
|
159
|
+
if chap:
|
160
|
+
await save_q.put(chap)
|
173
161
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
162
|
+
# polite pacing
|
163
|
+
await async_jitter_sleep(
|
164
|
+
self._request_interval,
|
165
|
+
mul_spread=1.1,
|
166
|
+
max_sleep=self._request_interval + 2,
|
167
|
+
)
|
178
168
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
169
|
+
# --- stage: producer ---
|
170
|
+
async def producer() -> None:
|
171
|
+
"""
|
172
|
+
Enqueue chapter IDs (respecting start/end/skip_existing).
|
183
173
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
)
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
break
|
203
|
-
if self.skip_existing and chapter_storage.exists(cid):
|
204
|
-
# Count as completed but don't enqueue.
|
205
|
-
await progress.bump(1)
|
206
|
-
else:
|
207
|
-
await cid_q.put(cid)
|
208
|
-
finally:
|
209
|
-
for _ in range(self.workers):
|
210
|
-
await cid_q.put(STOP)
|
211
|
-
|
212
|
-
# --- run the pipeline ---
|
174
|
+
Always sends STOP x workers at the end (even if cancelled early),
|
175
|
+
so chapter workers can exit deterministically.
|
176
|
+
"""
|
177
|
+
try:
|
178
|
+
for cid in plan:
|
179
|
+
if cancelled():
|
180
|
+
break
|
181
|
+
if self._skip_existing and storage.exists(cid):
|
182
|
+
# Count as completed but don't enqueue.
|
183
|
+
await progress.bump(1)
|
184
|
+
else:
|
185
|
+
await cid_q.put(cid)
|
186
|
+
finally:
|
187
|
+
for _ in range(self._workers):
|
188
|
+
await cid_q.put(STOP)
|
189
|
+
|
190
|
+
# --- run the pipeline ---
|
191
|
+
with ChapterStorage(raw_base, priorities=self.PRIORITIES_MAP) as storage:
|
213
192
|
async with asyncio.TaskGroup() as tg:
|
214
193
|
tg.create_task(storage_worker())
|
215
|
-
for _ in range(self.
|
194
|
+
for _ in range(self._workers):
|
216
195
|
tg.create_task(chapter_worker())
|
217
196
|
tg.create_task(producer())
|
218
197
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
finally:
|
236
|
-
chapter_storage.close()
|
198
|
+
# --- done ---
|
199
|
+
if cancelled():
|
200
|
+
self.logger.info(
|
201
|
+
"%s Novel '%s' cancelled: flushed %d/%d chapters.",
|
202
|
+
TAG,
|
203
|
+
book_info.get("book_name", "unknown"),
|
204
|
+
progress.done,
|
205
|
+
progress.total,
|
206
|
+
)
|
207
|
+
else:
|
208
|
+
self.logger.info(
|
209
|
+
"%s Novel '%s' download completed.",
|
210
|
+
TAG,
|
211
|
+
book_info.get("book_name", "unknown"),
|
212
|
+
)
|
237
213
|
|
238
214
|
async def _process_chapter(
|
239
215
|
self,
|
@@ -247,7 +223,7 @@ class CommonDownloader(BaseDownloader):
|
|
247
223
|
|
248
224
|
:return: ChapterDict on success, or None on failure.
|
249
225
|
"""
|
250
|
-
for attempt in range(self.
|
226
|
+
for attempt in range(self._retry_times + 1):
|
251
227
|
try:
|
252
228
|
html_list = await self.fetcher.get_book_chapter(book_id, cid)
|
253
229
|
self._save_html_pages(html_dir, cid, html_list)
|
@@ -258,11 +234,11 @@ class CommonDownloader(BaseDownloader):
|
|
258
234
|
raise ValueError("Empty parse result")
|
259
235
|
return chap
|
260
236
|
except Exception as e:
|
261
|
-
if attempt < self.
|
237
|
+
if attempt < self._retry_times:
|
262
238
|
self.logger.info(
|
263
239
|
"[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
|
264
240
|
)
|
265
|
-
backoff = self.
|
241
|
+
backoff = self._backoff_factor * (2**attempt)
|
266
242
|
await async_jitter_sleep(
|
267
243
|
base=backoff, mul_spread=1.2, max_sleep=backoff + 3
|
268
244
|
)
|