novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -4
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +27 -104
- novel_downloader/cli/download.py +78 -66
- novel_downloader/cli/export.py +20 -21
- novel_downloader/cli/main.py +3 -1
- novel_downloader/cli/search.py +120 -0
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +10 -14
- novel_downloader/config/adapter.py +195 -99
- novel_downloader/config/{loader.py → file_io.py} +53 -27
- novel_downloader/core/__init__.py +14 -13
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/archived/qidian/searcher.py +79 -0
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +8 -30
- novel_downloader/core/downloaders/base.py +182 -30
- novel_downloader/core/downloaders/common.py +217 -384
- novel_downloader/core/downloaders/qianbi.py +332 -4
- novel_downloader/core/downloaders/qidian.py +250 -290
- novel_downloader/core/downloaders/registry.py +69 -0
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +8 -26
- novel_downloader/core/exporters/base.py +107 -31
- novel_downloader/core/exporters/common/__init__.py +3 -4
- novel_downloader/core/exporters/common/epub.py +92 -171
- novel_downloader/core/exporters/common/main_exporter.py +14 -67
- novel_downloader/core/exporters/common/txt.py +90 -86
- novel_downloader/core/exporters/epub_util.py +184 -1327
- novel_downloader/core/exporters/linovelib/__init__.py +3 -2
- novel_downloader/core/exporters/linovelib/epub.py +165 -222
- novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
- novel_downloader/core/exporters/linovelib/txt.py +76 -66
- novel_downloader/core/exporters/qidian.py +15 -11
- novel_downloader/core/exporters/registry.py +55 -0
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/fetchers/__init__.py +57 -56
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
- novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
- novel_downloader/core/fetchers/biquyuedu.py +83 -0
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +60 -0
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +8 -14
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +26 -0
- novel_downloader/core/parsers/__init__.py +58 -22
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
- novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
- novel_downloader/core/parsers/qidian/main_parser.py +19 -57
- novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +57 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +155 -0
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +51 -0
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/b520.py +84 -0
- novel_downloader/core/searchers/base.py +168 -0
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +102 -0
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +165 -0
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +79 -0
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +36 -79
- novel_downloader/locales/zh.json +37 -80
- novel_downloader/models/__init__.py +23 -50
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +16 -43
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +21 -0
- novel_downloader/resources/config/settings.toml +39 -74
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +43 -0
- novel_downloader/utils/chapter_storage.py +247 -226
- novel_downloader/utils/constants.py +5 -50
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +118 -0
- novel_downloader/utils/epub/documents.py +297 -0
- novel_downloader/utils/epub/models.py +120 -0
- novel_downloader/utils/epub/utils.py +179 -0
- novel_downloader/utils/file_utils/__init__.py +5 -30
- novel_downloader/utils/file_utils/io.py +9 -150
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -7
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +10 -16
- novel_downloader/utils/network.py +111 -252
- novel_downloader/utils/state.py +5 -90
- novel_downloader/utils/text_utils/__init__.py +16 -21
- novel_downloader/utils/text_utils/diff_display.py +6 -9
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +6 -12
- novel_downloader/utils/time_utils/datetime_utils.py +23 -33
- novel_downloader/utils/time_utils/sleep_utils.py +5 -10
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/downloaders/biquge.py +0 -25
- novel_downloader/core/downloaders/esjzone.py +0 -25
- novel_downloader/core/downloaders/linovelib.py +0 -25
- novel_downloader/core/downloaders/sfacg.py +0 -25
- novel_downloader/core/downloaders/yamibo.py +0 -25
- novel_downloader/core/exporters/biquge.py +0 -25
- novel_downloader/core/exporters/esjzone.py +0 -25
- novel_downloader/core/exporters/qianbi.py +0 -25
- novel_downloader/core/exporters/sfacg.py +0 -25
- novel_downloader/core/exporters/yamibo.py +0 -25
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -403
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -204
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -193
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -318
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -189
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -229
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/biquge/main_parser.py +0 -134
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/models/types.py +0 -15
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -303
- novel_downloader/utils/fontocr/ocr_v2.py +0 -752
- novel_downloader/utils/hash_store.py +0 -279
- novel_downloader/utils/hash_utils.py +0 -103
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/METADATA +0 -196
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -3,33 +3,33 @@
|
|
3
3
|
novel_downloader.core.downloaders.common
|
4
4
|
----------------------------------------
|
5
5
|
|
6
|
+
Concrete downloader implementation with a generic async pipeline for common novel sites
|
6
7
|
"""
|
7
8
|
|
8
9
|
import asyncio
|
9
|
-
import json
|
10
10
|
from collections.abc import Awaitable, Callable
|
11
|
-
from
|
12
|
-
from typing import Any
|
11
|
+
from pathlib import Path
|
12
|
+
from typing import Any
|
13
13
|
|
14
14
|
from novel_downloader.core.downloaders.base import BaseDownloader
|
15
|
+
from novel_downloader.core.downloaders.signals import (
|
16
|
+
STOP,
|
17
|
+
Progress,
|
18
|
+
StopToken,
|
19
|
+
)
|
15
20
|
from novel_downloader.models import (
|
16
21
|
BookConfig,
|
17
22
|
ChapterDict,
|
18
|
-
CidTask,
|
19
|
-
HtmlTask,
|
20
|
-
RestoreTask,
|
21
23
|
)
|
22
|
-
from novel_downloader.utils
|
23
|
-
|
24
|
-
|
25
|
-
async_sleep_with_random_delay,
|
26
|
-
calculate_time_difference,
|
24
|
+
from novel_downloader.utils import (
|
25
|
+
ChapterStorage,
|
26
|
+
async_jitter_sleep,
|
27
27
|
)
|
28
28
|
|
29
29
|
|
30
30
|
class CommonDownloader(BaseDownloader):
|
31
31
|
"""
|
32
|
-
Specialized Async downloader for common
|
32
|
+
Specialized Async downloader for "common" novel sites.
|
33
33
|
"""
|
34
34
|
|
35
35
|
async def _download_one(
|
@@ -37,412 +37,245 @@ class CommonDownloader(BaseDownloader):
|
|
37
37
|
book: BookConfig,
|
38
38
|
*,
|
39
39
|
progress_hook: Callable[[int, int], Awaitable[None]] | None = None,
|
40
|
+
cancel_event: asyncio.Event | None = None,
|
40
41
|
**kwargs: Any,
|
41
42
|
) -> None:
|
42
43
|
"""
|
43
|
-
|
44
|
+
Sentinel-based pipeline with graceful cancellation:
|
45
|
+
|
46
|
+
Producer -> ChapterWorkers -> StorageWorker.
|
44
47
|
|
45
|
-
|
48
|
+
On cancel: stop producing, workers finish at most one chapter,
|
49
|
+
storage drains, flushes, and exits.
|
46
50
|
"""
|
47
51
|
TAG = "[Downloader]"
|
48
|
-
|
52
|
+
|
53
|
+
book_id = self._normalize_book_id(book["book_id"])
|
49
54
|
start_id = book.get("start_id")
|
50
55
|
end_id = book.get("end_id")
|
51
56
|
ignore_set = set(book.get("ignore_ids", []))
|
52
57
|
|
53
|
-
raw_base = self.
|
54
|
-
cache_base = self.cache_dir / book_id
|
55
|
-
info_path = raw_base / "book_info.json"
|
56
|
-
chapters_html_dir = cache_base / "html"
|
57
|
-
|
58
|
+
raw_base = self._raw_data_dir / book_id
|
58
59
|
raw_base.mkdir(parents=True, exist_ok=True)
|
59
|
-
|
60
|
-
|
61
|
-
|
60
|
+
html_dir = self._debug_dir / book_id / "html"
|
61
|
+
|
62
|
+
chapter_storage = ChapterStorage(
|
62
63
|
raw_base=raw_base,
|
63
|
-
|
64
|
-
backend_type=self._config.storage_backend,
|
65
|
-
batch_size=self._config.storage_batch_size,
|
64
|
+
priorities=self.PRIORITIES_MAP,
|
66
65
|
)
|
66
|
+
chapter_storage.connect()
|
67
67
|
|
68
|
-
|
69
|
-
|
70
|
-
re_fetch = True
|
71
|
-
old_data: dict[str, Any] = {}
|
68
|
+
def cancelled() -> bool:
|
69
|
+
return bool(cancel_event and cancel_event.is_set())
|
72
70
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
)
|
79
|
-
re_fetch = days > 1
|
80
|
-
except Exception:
|
81
|
-
re_fetch = True
|
82
|
-
|
83
|
-
if re_fetch:
|
84
|
-
info_html = await self.fetcher.get_book_info(book_id)
|
85
|
-
if self.save_html:
|
86
|
-
for i, html in enumerate(info_html):
|
87
|
-
save_as_txt(html, chapters_html_dir / f"info_{i}.html")
|
88
|
-
book_info = self.parser.parse_book_info(info_html)
|
89
|
-
|
90
|
-
if book_info.get("book_name") != "未找到书名":
|
91
|
-
save_as_json(book_info, info_path)
|
92
|
-
else:
|
93
|
-
self.logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
|
94
|
-
book_info = old_data or {"book_name": "未找到书名"}
|
95
|
-
else:
|
96
|
-
book_info = old_data
|
97
|
-
|
98
|
-
vols = book_info.get("volumes", [])
|
99
|
-
total_chapters = 0
|
100
|
-
for vol in vols:
|
101
|
-
total_chapters += len(vol.get("chapters", []))
|
102
|
-
if total_chapters == 0:
|
103
|
-
self.logger.warning("%s 书籍没有章节可下载: book_id=%s", TAG, book_id)
|
104
|
-
return
|
105
|
-
|
106
|
-
completed_count = 0
|
107
|
-
|
108
|
-
# setup queue, semaphore
|
109
|
-
semaphore = asyncio.Semaphore(self.download_workers)
|
110
|
-
cid_queue: asyncio.Queue[CidTask] = asyncio.Queue()
|
111
|
-
restore_queue: asyncio.Queue[RestoreTask] = asyncio.Queue()
|
112
|
-
html_queue: asyncio.Queue[HtmlTask] = asyncio.Queue()
|
113
|
-
save_queue: asyncio.Queue[ChapterDict] = asyncio.Queue()
|
114
|
-
pending_restore: dict[str, RestoreTask] = {}
|
115
|
-
|
116
|
-
def update_book_info(
|
117
|
-
vol_idx: int,
|
118
|
-
chap_idx: int,
|
119
|
-
cid: str,
|
120
|
-
) -> None:
|
121
|
-
try:
|
122
|
-
book_info["volumes"][vol_idx]["chapters"][chap_idx]["chapterId"] = cid
|
123
|
-
except (IndexError, KeyError, TypeError) as e:
|
124
|
-
self.logger.info(
|
125
|
-
"[update_book_info] Failed to update vol=%s, chap=%s: %s",
|
126
|
-
vol_idx,
|
127
|
-
chap_idx,
|
128
|
-
e,
|
129
|
-
)
|
71
|
+
try:
|
72
|
+
# --- metadata ---
|
73
|
+
book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
|
74
|
+
if not book_info:
|
75
|
+
return
|
130
76
|
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
retry_times: int,
|
137
|
-
semaphore: asyncio.Semaphore,
|
138
|
-
) -> None:
|
139
|
-
while True:
|
140
|
-
task = await cid_queue.get()
|
141
|
-
cid = task.cid
|
142
|
-
if not cid and task.prev_cid:
|
143
|
-
await restore_queue.put(
|
144
|
-
RestoreTask(
|
145
|
-
vol_idx=task.vol_idx,
|
146
|
-
chap_idx=task.chap_idx,
|
147
|
-
prev_cid=task.prev_cid,
|
148
|
-
)
|
149
|
-
)
|
150
|
-
cid_queue.task_done()
|
151
|
-
continue
|
77
|
+
vols = book_info["volumes"]
|
78
|
+
total_chapters = sum(len(v["chapters"]) for v in vols)
|
79
|
+
if total_chapters == 0:
|
80
|
+
self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
|
81
|
+
return
|
152
82
|
|
153
|
-
|
154
|
-
self.logger.warning("[Fetcher] Skipped empty cid task: %s", task)
|
155
|
-
cid_queue.task_done()
|
156
|
-
continue
|
83
|
+
progress = Progress(total_chapters, progress_hook)
|
157
84
|
|
158
|
-
|
159
|
-
|
160
|
-
|
85
|
+
# --- queues & batching ---
|
86
|
+
cid_q: asyncio.Queue[str | StopToken] = asyncio.Queue()
|
87
|
+
save_q: asyncio.Queue[ChapterDict | StopToken] = asyncio.Queue()
|
88
|
+
batch: list[ChapterDict] = []
|
161
89
|
|
90
|
+
async def flush_batch() -> None:
|
91
|
+
if not batch:
|
92
|
+
return
|
162
93
|
try:
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
vol_idx=task.vol_idx,
|
171
|
-
chap_idx=task.chap_idx,
|
172
|
-
)
|
94
|
+
chapter_storage.upsert_chapters(batch, self.DEFAULT_SOURCE_ID)
|
95
|
+
except Exception as e:
|
96
|
+
self.logger.error(
|
97
|
+
"[Storage] batch upsert failed (size=%d): %s",
|
98
|
+
len(batch),
|
99
|
+
e,
|
100
|
+
exc_info=True,
|
173
101
|
)
|
174
|
-
|
175
|
-
await
|
102
|
+
else:
|
103
|
+
await progress.bump(len(batch))
|
104
|
+
finally:
|
105
|
+
batch.clear()
|
106
|
+
|
107
|
+
# --- stage: storage worker ---
|
108
|
+
async def storage_worker() -> None:
|
109
|
+
"""
|
110
|
+
Consumes parsed chapters, writes in batches.
|
111
|
+
|
112
|
+
Terminates after receiving STOP from each chapter worker.
|
113
|
+
|
114
|
+
On cancel: keeps consuming (to avoid blocking producers),
|
115
|
+
flushes, and exits once all STOPs are seen.
|
116
|
+
"""
|
117
|
+
stop_count = 0
|
118
|
+
while True:
|
119
|
+
item = await save_q.get()
|
120
|
+
if isinstance(item, StopToken):
|
121
|
+
stop_count += 1
|
122
|
+
if stop_count == self.workers:
|
123
|
+
# All chapter workers have exited.
|
124
|
+
await flush_batch()
|
125
|
+
return
|
126
|
+
# else keep waiting for remaining STOPs
|
127
|
+
continue
|
128
|
+
|
129
|
+
# Normal chapter
|
130
|
+
batch.append(item)
|
131
|
+
if len(batch) >= self.storage_batch_size:
|
132
|
+
await flush_batch()
|
133
|
+
|
134
|
+
if cancelled():
|
135
|
+
# Drain whatever is already in the queue
|
136
|
+
try:
|
137
|
+
while True:
|
138
|
+
nxt = save_q.get_nowait()
|
139
|
+
if isinstance(nxt, StopToken):
|
140
|
+
stop_count += 1
|
141
|
+
else:
|
142
|
+
batch.append(nxt)
|
143
|
+
except asyncio.QueueEmpty:
|
144
|
+
pass
|
145
|
+
# Final flush of everything
|
146
|
+
await flush_batch()
|
147
|
+
# Wait for remaining STOPs so chapter workers can finish.
|
148
|
+
while stop_count < self.workers:
|
149
|
+
nxt = await save_q.get()
|
150
|
+
if isinstance(nxt, StopToken):
|
151
|
+
stop_count += 1
|
152
|
+
return
|
153
|
+
|
154
|
+
# --- stage: chapter worker ---
|
155
|
+
sem = asyncio.Semaphore(self.workers)
|
156
|
+
|
157
|
+
async def chapter_worker() -> None:
|
158
|
+
"""
|
159
|
+
Fetch + parse with retry, then enqueue to save_q.
|
160
|
+
|
161
|
+
Exits on STOP, or early if cancel is set before starting a new fetch.
|
162
|
+
"""
|
163
|
+
while True:
|
164
|
+
cid = await cid_q.get()
|
165
|
+
if isinstance(cid, StopToken):
|
166
|
+
# Propagate one STOP to storage and exit.
|
167
|
+
await save_q.put(STOP)
|
168
|
+
return
|
169
|
+
|
170
|
+
if not cid or cid in ignore_set:
|
171
|
+
# Ignore silently and continue.
|
172
|
+
continue
|
173
|
+
|
174
|
+
# If cancelled, don't start a new network call; let storage finish.
|
175
|
+
if cancelled():
|
176
|
+
await save_q.put(STOP)
|
177
|
+
return
|
178
|
+
|
179
|
+
async with sem:
|
180
|
+
chap = await self._process_chapter(book_id, cid, html_dir)
|
181
|
+
if chap:
|
182
|
+
await save_q.put(chap)
|
183
|
+
|
184
|
+
# polite pacing
|
185
|
+
await async_jitter_sleep(
|
176
186
|
self.request_interval,
|
177
187
|
mul_spread=1.1,
|
178
188
|
max_sleep=self.request_interval + 2,
|
179
189
|
)
|
180
190
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
prev_cid=task.prev_cid,
|
186
|
-
cid=cid,
|
187
|
-
retry=task.retry + 1,
|
188
|
-
vol_idx=task.vol_idx,
|
189
|
-
chap_idx=task.chap_idx,
|
190
|
-
)
|
191
|
-
)
|
192
|
-
self.logger.info(
|
193
|
-
"[Fetcher] Re-queued chapter %s for retry #%d: %s",
|
194
|
-
cid,
|
195
|
-
task.retry + 1,
|
196
|
-
e,
|
197
|
-
)
|
198
|
-
backoff = self.backoff_factor * (2**task.retry)
|
199
|
-
await async_sleep_with_random_delay(
|
200
|
-
base=backoff,
|
201
|
-
mul_spread=1.2,
|
202
|
-
max_sleep=backoff + 3,
|
203
|
-
)
|
204
|
-
else:
|
205
|
-
self.logger.warning(
|
206
|
-
"[Fetcher] Max retries reached for chapter %s: %s",
|
207
|
-
cid,
|
208
|
-
e,
|
209
|
-
)
|
191
|
+
# --- stage: producer ---
|
192
|
+
async def producer() -> None:
|
193
|
+
"""
|
194
|
+
Enqueue chapter IDs (respecting start/end/skip_existing).
|
210
195
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
async def parser_worker(
|
215
|
-
worker_id: int,
|
216
|
-
cid_queue: asyncio.Queue[CidTask],
|
217
|
-
html_queue: asyncio.Queue[HtmlTask],
|
218
|
-
save_queue: asyncio.Queue[ChapterDict],
|
219
|
-
retry_times: int,
|
220
|
-
) -> None:
|
221
|
-
while True:
|
222
|
-
task = await html_queue.get()
|
196
|
+
Always sends STOP x workers at the end (even if cancelled early),
|
197
|
+
so chapter workers can exit deterministically.
|
198
|
+
"""
|
223
199
|
try:
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
"[Parser-%d] saved chapter %s",
|
233
|
-
worker_id,
|
234
|
-
task.cid,
|
235
|
-
)
|
236
|
-
else:
|
237
|
-
raise ValueError("Empty parse result")
|
238
|
-
except Exception as e:
|
239
|
-
if task.retry < retry_times:
|
240
|
-
await cid_queue.put(
|
241
|
-
CidTask(
|
242
|
-
prev_cid=None,
|
243
|
-
cid=task.cid,
|
244
|
-
retry=task.retry + 1,
|
245
|
-
vol_idx=task.vol_idx,
|
246
|
-
chap_idx=task.chap_idx,
|
247
|
-
)
|
248
|
-
)
|
249
|
-
self.logger.info(
|
250
|
-
"[Parser-%d] Re-queued cid %s for retry #%d: %s",
|
251
|
-
worker_id,
|
252
|
-
task.cid,
|
253
|
-
task.retry + 1,
|
254
|
-
e,
|
255
|
-
)
|
256
|
-
else:
|
257
|
-
self.logger.warning(
|
258
|
-
"[Parser-%d] Max retries reached for cid %s: %s",
|
259
|
-
worker_id,
|
260
|
-
task.cid,
|
261
|
-
e,
|
262
|
-
)
|
200
|
+
async for cid in self._chapter_ids(vols, start_id, end_id):
|
201
|
+
if cancelled():
|
202
|
+
break
|
203
|
+
if self.skip_existing and chapter_storage.exists(cid):
|
204
|
+
# Count as completed but don't enqueue.
|
205
|
+
await progress.bump(1)
|
206
|
+
else:
|
207
|
+
await cid_q.put(cid)
|
263
208
|
finally:
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
209
|
+
for _ in range(self.workers):
|
210
|
+
await cid_q.put(STOP)
|
211
|
+
|
212
|
+
# --- run the pipeline ---
|
213
|
+
async with asyncio.TaskGroup() as tg:
|
214
|
+
tg.create_task(storage_worker())
|
215
|
+
for _ in range(self.workers):
|
216
|
+
tg.create_task(chapter_worker())
|
217
|
+
tg.create_task(producer())
|
218
|
+
|
219
|
+
# --- done ---
|
220
|
+
if cancelled():
|
221
|
+
self.logger.info(
|
222
|
+
"%s Novel '%s' cancelled: flushed %d/%d chapters.",
|
223
|
+
TAG,
|
224
|
+
book_info.get("book_name", "unknown"),
|
225
|
+
progress.done,
|
226
|
+
progress.total,
|
227
|
+
)
|
228
|
+
else:
|
229
|
+
self.logger.info(
|
230
|
+
"%s Novel '%s' download completed.",
|
231
|
+
TAG,
|
232
|
+
book_info.get("book_name", "unknown"),
|
280
233
|
)
|
281
234
|
|
282
|
-
|
283
|
-
|
284
|
-
with suppress(asyncio.CancelledError):
|
285
|
-
await task
|
235
|
+
finally:
|
236
|
+
chapter_storage.close()
|
286
237
|
|
287
|
-
|
288
|
-
|
238
|
+
async def _process_chapter(
|
239
|
+
self,
|
240
|
+
book_id: str,
|
241
|
+
cid: str,
|
242
|
+
html_dir: Path,
|
243
|
+
) -> ChapterDict | None:
|
244
|
+
"""
|
245
|
+
Fetches, saves raw HTML, parses a single chapter,
|
246
|
+
retrying up to self.retry_times.
|
289
247
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
if curr_cid in pending_restore:
|
299
|
-
rt = pending_restore.pop(curr_cid)
|
300
|
-
next_cid = item.get("extra", {}).get("next_chapter_id")
|
301
|
-
if next_cid:
|
302
|
-
update_book_info(
|
303
|
-
vol_idx=rt.vol_idx,
|
304
|
-
chap_idx=rt.chap_idx,
|
305
|
-
cid=next_cid,
|
306
|
-
)
|
307
|
-
await cid_queue.put(
|
308
|
-
CidTask(
|
309
|
-
prev_cid=rt.prev_cid,
|
310
|
-
cid=next_cid,
|
311
|
-
vol_idx=rt.vol_idx,
|
312
|
-
chap_idx=rt.chap_idx,
|
313
|
-
)
|
314
|
-
)
|
315
|
-
else:
|
316
|
-
self.logger.warning(
|
317
|
-
"[storage_worker] No next_cid found for %r",
|
318
|
-
rt,
|
319
|
-
)
|
320
|
-
except Exception as e:
|
321
|
-
self.logger.error("[storage_worker] Failed to save: %s", e)
|
322
|
-
finally:
|
323
|
-
save_queue.task_done()
|
324
|
-
|
325
|
-
elif isinstance(item, RestoreTask): # from restore_queue
|
326
|
-
prev_json = cs.get(item.prev_cid)
|
327
|
-
next_cid = (
|
328
|
-
prev_json.get("extra", {}).get("next_chapter_id")
|
329
|
-
if prev_json
|
330
|
-
else None
|
331
|
-
)
|
332
|
-
if next_cid:
|
333
|
-
update_book_info(
|
334
|
-
vol_idx=item.vol_idx,
|
335
|
-
chap_idx=item.chap_idx,
|
336
|
-
cid=next_cid,
|
337
|
-
)
|
338
|
-
await cid_queue.put(
|
339
|
-
CidTask(
|
340
|
-
prev_cid=item.prev_cid,
|
341
|
-
cid=next_cid,
|
342
|
-
vol_idx=item.vol_idx,
|
343
|
-
chap_idx=item.chap_idx,
|
344
|
-
)
|
345
|
-
)
|
346
|
-
else:
|
347
|
-
pending_restore[item.prev_cid] = item
|
348
|
-
restore_queue.task_done()
|
349
|
-
|
350
|
-
fetcher_tasks = [
|
351
|
-
asyncio.create_task(
|
352
|
-
fetcher_worker(
|
353
|
-
book_id,
|
354
|
-
cid_queue,
|
355
|
-
html_queue,
|
356
|
-
restore_queue,
|
357
|
-
self.retry_times,
|
358
|
-
semaphore,
|
359
|
-
)
|
360
|
-
)
|
361
|
-
for _ in range(self.download_workers)
|
362
|
-
]
|
363
|
-
|
364
|
-
parser_tasks = [
|
365
|
-
asyncio.create_task(
|
366
|
-
parser_worker(
|
367
|
-
i,
|
368
|
-
cid_queue,
|
369
|
-
html_queue,
|
370
|
-
save_queue,
|
371
|
-
self.retry_times,
|
248
|
+
:return: ChapterDict on success, or None on failure.
|
249
|
+
"""
|
250
|
+
for attempt in range(self.retry_times + 1):
|
251
|
+
try:
|
252
|
+
html_list = await self.fetcher.get_book_chapter(book_id, cid)
|
253
|
+
self._save_html_pages(html_dir, cid, html_list)
|
254
|
+
chap = await asyncio.to_thread(
|
255
|
+
self.parser.parse_chapter, html_list, cid
|
372
256
|
)
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
save_queue=save_queue,
|
381
|
-
restore_queue=restore_queue,
|
382
|
-
cid_queue=cid_queue,
|
383
|
-
)
|
384
|
-
)
|
385
|
-
|
386
|
-
found_start = start_id is None
|
387
|
-
stop_early = False
|
388
|
-
last_cid: str | None = None
|
389
|
-
|
390
|
-
for vol_idx, vol in enumerate(vols):
|
391
|
-
chapters = vol.get("chapters", [])
|
392
|
-
for chap_idx, chap in enumerate(chapters):
|
393
|
-
if stop_early:
|
394
|
-
break
|
395
|
-
|
396
|
-
cid = chap.get("chapterId")
|
397
|
-
|
398
|
-
# Skip until reaching start_id
|
399
|
-
if not found_start:
|
400
|
-
if cid == start_id:
|
401
|
-
found_start = True
|
402
|
-
else:
|
403
|
-
completed_count += 1
|
404
|
-
last_cid = cid
|
405
|
-
continue
|
406
|
-
|
407
|
-
# Stop when reaching end_id
|
408
|
-
if end_id is not None and cid == end_id:
|
409
|
-
stop_early = True
|
410
|
-
|
411
|
-
if cid and normal_cs.exists(cid) and self.skip_existing:
|
412
|
-
completed_count += 1
|
413
|
-
last_cid = cid
|
414
|
-
continue
|
415
|
-
|
416
|
-
await cid_queue.put(
|
417
|
-
CidTask(
|
418
|
-
vol_idx=vol_idx,
|
419
|
-
chap_idx=chap_idx,
|
420
|
-
cid=cid,
|
421
|
-
prev_cid=last_cid,
|
257
|
+
if not chap:
|
258
|
+
raise ValueError("Empty parse result")
|
259
|
+
return chap
|
260
|
+
except Exception as e:
|
261
|
+
if attempt < self.retry_times:
|
262
|
+
self.logger.info(
|
263
|
+
"[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
|
422
264
|
)
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
await restore_queue.join()
|
431
|
-
await cid_queue.join()
|
432
|
-
await html_queue.join()
|
433
|
-
await save_queue.join()
|
434
|
-
|
435
|
-
for task in fetcher_tasks + parser_tasks + [storage_task]:
|
436
|
-
task.cancel()
|
437
|
-
with suppress(asyncio.CancelledError):
|
438
|
-
await task
|
265
|
+
backoff = self.backoff_factor * (2**attempt)
|
266
|
+
await async_jitter_sleep(
|
267
|
+
base=backoff, mul_spread=1.2, max_sleep=backoff + 3
|
268
|
+
)
|
269
|
+
else:
|
270
|
+
self.logger.warning("[ChapterWorker] Failed %s: %s", cid, e)
|
271
|
+
return None
|
439
272
|
|
440
|
-
|
441
|
-
|
273
|
+
@staticmethod
|
274
|
+
def _normalize_book_id(book_id: str) -> str:
|
275
|
+
"""
|
276
|
+
Normalize a book identifier.
|
442
277
|
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
)
|
448
|
-
return
|
278
|
+
Subclasses may override this method to transform the book ID
|
279
|
+
into their preferred format.
|
280
|
+
"""
|
281
|
+
return book_id.replace("/", "-")
|