novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -2
- novel_downloader/cli/config.py +1 -83
- novel_downloader/cli/download.py +4 -5
- novel_downloader/cli/export.py +4 -1
- novel_downloader/cli/main.py +2 -0
- novel_downloader/cli/search.py +123 -0
- novel_downloader/config/__init__.py +3 -10
- novel_downloader/config/adapter.py +190 -54
- novel_downloader/config/loader.py +2 -3
- novel_downloader/core/__init__.py +13 -13
- novel_downloader/core/downloaders/__init__.py +10 -11
- novel_downloader/core/downloaders/base.py +152 -26
- novel_downloader/core/downloaders/biquge.py +5 -1
- novel_downloader/core/downloaders/common.py +157 -378
- novel_downloader/core/downloaders/esjzone.py +5 -1
- novel_downloader/core/downloaders/linovelib.py +5 -1
- novel_downloader/core/downloaders/qianbi.py +291 -4
- novel_downloader/core/downloaders/qidian.py +199 -285
- novel_downloader/core/downloaders/registry.py +67 -0
- novel_downloader/core/downloaders/sfacg.py +5 -1
- novel_downloader/core/downloaders/yamibo.py +5 -1
- novel_downloader/core/exporters/__init__.py +10 -11
- novel_downloader/core/exporters/base.py +87 -7
- novel_downloader/core/exporters/biquge.py +5 -8
- novel_downloader/core/exporters/common/__init__.py +2 -2
- novel_downloader/core/exporters/common/epub.py +82 -166
- novel_downloader/core/exporters/common/main_exporter.py +0 -60
- novel_downloader/core/exporters/common/txt.py +82 -83
- novel_downloader/core/exporters/epub_util.py +157 -1330
- novel_downloader/core/exporters/esjzone.py +5 -8
- novel_downloader/core/exporters/linovelib/__init__.py +2 -2
- novel_downloader/core/exporters/linovelib/epub.py +157 -212
- novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
- novel_downloader/core/exporters/linovelib/txt.py +67 -63
- novel_downloader/core/exporters/qianbi.py +5 -8
- novel_downloader/core/exporters/qidian.py +14 -4
- novel_downloader/core/exporters/registry.py +53 -0
- novel_downloader/core/exporters/sfacg.py +5 -8
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/exporters/yamibo.py +5 -8
- novel_downloader/core/fetchers/__init__.py +19 -24
- novel_downloader/core/fetchers/base/__init__.py +3 -3
- novel_downloader/core/fetchers/base/browser.py +23 -4
- novel_downloader/core/fetchers/base/session.py +30 -5
- novel_downloader/core/fetchers/biquge/__init__.py +3 -3
- novel_downloader/core/fetchers/biquge/browser.py +5 -0
- novel_downloader/core/fetchers/biquge/session.py +6 -1
- novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
- novel_downloader/core/fetchers/esjzone/browser.py +5 -0
- novel_downloader/core/fetchers/esjzone/session.py +6 -1
- novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
- novel_downloader/core/fetchers/linovelib/browser.py +6 -1
- novel_downloader/core/fetchers/linovelib/session.py +6 -1
- novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
- novel_downloader/core/fetchers/qianbi/browser.py +5 -0
- novel_downloader/core/fetchers/qianbi/session.py +5 -0
- novel_downloader/core/fetchers/qidian/__init__.py +3 -3
- novel_downloader/core/fetchers/qidian/browser.py +12 -4
- novel_downloader/core/fetchers/qidian/session.py +11 -3
- novel_downloader/core/fetchers/registry.py +71 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
- novel_downloader/core/fetchers/sfacg/browser.py +5 -0
- novel_downloader/core/fetchers/sfacg/session.py +5 -0
- novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
- novel_downloader/core/fetchers/yamibo/browser.py +5 -0
- novel_downloader/core/fetchers/yamibo/session.py +6 -1
- novel_downloader/core/interfaces/__init__.py +7 -5
- novel_downloader/core/interfaces/searcher.py +18 -0
- novel_downloader/core/parsers/__init__.py +10 -11
- novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
- novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
- novel_downloader/core/parsers/qidian/main_parser.py +10 -21
- novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/registry.py +68 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
- novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
- novel_downloader/core/searchers/__init__.py +20 -0
- novel_downloader/core/searchers/base.py +92 -0
- novel_downloader/core/searchers/biquge.py +83 -0
- novel_downloader/core/searchers/esjzone.py +84 -0
- novel_downloader/core/searchers/qianbi.py +131 -0
- novel_downloader/core/searchers/qidian.py +87 -0
- novel_downloader/core/searchers/registry.py +63 -0
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +12 -4
- novel_downloader/models/__init__.py +4 -30
- novel_downloader/models/config.py +12 -6
- novel_downloader/models/search.py +16 -0
- novel_downloader/models/types.py +0 -2
- novel_downloader/resources/config/settings.toml +31 -4
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/utils/__init__.py +52 -0
- novel_downloader/utils/chapter_storage.py +244 -224
- novel_downloader/utils/constants.py +1 -21
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +77 -0
- novel_downloader/utils/epub/documents.py +403 -0
- novel_downloader/utils/epub/models.py +134 -0
- novel_downloader/utils/epub/utils.py +212 -0
- novel_downloader/utils/file_utils/__init__.py +10 -14
- novel_downloader/utils/file_utils/io.py +20 -51
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -3
- novel_downloader/utils/fontocr/__init__.py +5 -5
- novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
- novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
- novel_downloader/utils/fontocr/ocr_v1.py +13 -1
- novel_downloader/utils/fontocr/ocr_v2.py +13 -1
- novel_downloader/utils/fontocr/ocr_v3.py +744 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +2 -0
- novel_downloader/utils/network.py +110 -251
- novel_downloader/utils/state.py +1 -0
- novel_downloader/utils/text_utils/__init__.py +18 -17
- novel_downloader/utils/text_utils/diff_display.py +4 -5
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +3 -3
- novel_downloader/utils/time_utils/datetime_utils.py +4 -5
- novel_downloader/utils/time_utils/sleep_utils.py +2 -3
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
- novel_downloader-1.5.0.dist-info/RECORD +164 -0
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/common/browser.py +0 -79
- novel_downloader/core/fetchers/common/session.py +0 -79
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
novel_downloader/core/downloaders/qidian.py

@@ -6,12 +6,13 @@ novel_downloader.core.downloaders.qidian
 """
 
 import asyncio
-import
-from
-from
-from typing import Any
+from collections.abc import AsyncIterator, Awaitable, Callable
+from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import Any
 
 from novel_downloader.core.downloaders.base import BaseDownloader
+from novel_downloader.core.downloaders.registry import register_downloader
 from novel_downloader.core.interfaces import (
     FetcherProtocol,
     ParserProtocol,
@@ -19,23 +20,30 @@ from novel_downloader.core.interfaces import (
 from novel_downloader.models import (
     BookConfig,
     ChapterDict,
-    CidTask,
     DownloaderConfig,
-    HtmlTask,
 )
-from novel_downloader.utils
-
-from novel_downloader.utils.time_utils import (
+from novel_downloader.utils import (
+    ChapterStorage,
     async_sleep_with_random_delay,
-    calculate_time_difference,
 )
 
 
+@register_downloader(site_keys=["qidian", "qd"])
 class QidianDownloader(BaseDownloader):
     """
-    Specialized downloader for Qidian novels.
+    Specialized downloader for Qidian (起点) novels.
+
+    Processes each chapter in a single worker that
+    handles fetch -> parse -> enqueue storage.
     """
 
+    DEFAULT_SOURCE_ID = 0
+    ENCRYPTED_SOURCE_ID = 1
+    PRIORITIES_MAP = {
+        DEFAULT_SOURCE_ID: 0,
+        ENCRYPTED_SOURCE_ID: 1,
+    }
+
     def __init__(
         self,
         fetcher: FetcherProtocol,
@@ -43,7 +51,7 @@ class QidianDownloader(BaseDownloader):
         config: DownloaderConfig,
     ):
         config.request_interval = max(1.0, config.request_interval)
-        super().__init__(fetcher, parser, config, "qidian")
+        super().__init__(fetcher, parser, config, "qidian", self.PRIORITIES_MAP)
 
     async def _download_one(
         self,
@@ -63,301 +71,123 @@ class QidianDownloader(BaseDownloader):
         end_id = book.get("end_id")
         ignore_set = set(book.get("ignore_ids", []))
 
-        raw_base = self.
-        cache_base = self.cache_dir / book_id
-        info_path = raw_base / "book_info.json"
-        chapters_html_dir = cache_base / "html"
-
+        raw_base = self._raw_data_dir / book_id
         raw_base.mkdir(parents=True, exist_ok=True)
-
-
-        normal_cs = ChapterStorage(
-            raw_base=raw_base,
-            namespace="chapters",
-            backend_type=self._config.storage_backend,
-            batch_size=self._config.storage_batch_size,
-        )
-        encrypted_cs = ChapterStorage(
+        html_dir = self._debug_dir / book_id / "html"
+        chapter_storage = ChapterStorage(
             raw_base=raw_base,
-
-            backend_type=self._config.storage_backend,
-            batch_size=self._config.storage_batch_size,
+            priorities=self._priorities,
         )
+        chapter_storage.connect()
 
-        # load or fetch
-        book_info
-        re_fetch = True
-        old_data: dict[str, Any] = {}
-
-        if info_path.exists():
-            try:
-                old_data = json.loads(info_path.read_text("utf-8"))
-                days, *_ = calculate_time_difference(
-                    old_data.get("update_time", ""), "UTC+8"
-                )
-                re_fetch = days > 1
-            except Exception:
-                re_fetch = True
-
-        if re_fetch:
-            info_html = await self.fetcher.get_book_info(book_id)
-            if self.save_html:
-                for i, html in enumerate(info_html):
-                    save_as_txt(html, chapters_html_dir / f"info_{i}.html")
-            book_info = self.parser.parse_book_info(info_html)
-
-            if book_info.get("book_name") != "未找到书名":
-                save_as_json(book_info, info_path)
-            else:
-                self.logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
-                book_info = old_data or {"book_name": "未找到书名"}
-        else:
-            book_info = old_data
-
+        # load or fetch metadata
+        book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
         vols = book_info.get("volumes", [])
-        total_chapters =
-        for vol in vols:
-            total_chapters += len(vol.get("chapters", []))
+        total_chapters = sum(len(v.get("chapters", [])) for v in vols)
         if total_chapters == 0:
-            self.logger.warning("%s 书籍没有章节可下载:
+            self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
             return
 
[... 33 removed lines truncated in this view ...]
-                    mul_spread=1.1,
-                    max_sleep=self.request_interval + 2,
-                )
+        # concurrency primitives
+        sem = asyncio.Semaphore(self.workers)
+        cid_q: asyncio.Queue[str | None] = asyncio.Queue()
+        save_q: asyncio.Queue[ChapterDict | None] = asyncio.Queue()
+        default_batch: list[ChapterDict] = []
+        encrypted_batch: list[ChapterDict] = []
+        completed = 0
+
+        def _select(batch_item: ChapterDict) -> tuple[list[ChapterDict], int]:
+            if batch_item.get("extra", {}).get("encrypted", False):
+                return encrypted_batch, self.ENCRYPTED_SOURCE_ID
+            return default_batch, self.DEFAULT_SOURCE_ID
+
+        async def _flush(batch: list[ChapterDict], src: int) -> None:
+            nonlocal completed
+            if not batch:
+                return
+            try:
+                chapter_storage.upsert_chapters(batch, src)
+            except Exception as e:
+                self.logger.error(
+                    "[Storage] batch upsert failed (size=%d, source=%d): %s",
+                    len(batch),
+                    src,
+                    e,
+                    exc_info=True,
+                )
+            else:
+                completed += len(batch)
+                if progress_hook:
+                    await progress_hook(completed, total_chapters)
+            finally:
+                batch.clear()
 
-
-                    if task.retry < retry_times:
-                        await cid_queue.put(
-                            CidTask(
-                                prev_cid=task.prev_cid,
-                                cid=cid,
-                                retry=task.retry + 1,
-                            )
-                        )
-                        self.logger.info(
-                            "[Fetcher] Re-queued chapter %s for retry #%d: %s",
-                            cid,
-                            task.retry + 1,
-                            e,
-                        )
-                        backoff = self.backoff_factor * (2**task.retry)
-                        await async_sleep_with_random_delay(
-                            base=backoff,
-                            mul_spread=1.2,
-                            max_sleep=backoff + 3,
-                        )
-                    else:
-                        self.logger.warning(
-                            "[Fetcher] Max retries reached for chapter %s: %s",
-                            cid,
-                            e,
-                        )
-
-                finally:
-                    cid_queue.task_done()
-
-        async def parser_worker(
-            cid_queue: asyncio.Queue[CidTask],
-            html_queue: asyncio.Queue[HtmlTask],
-            save_queue: asyncio.Queue[ChapterDict],
-            retry_times: int,
-        ) -> None:
-            while True:
-                task = await html_queue.get()
-                skip_retry = False
-                try:
-                    chap_json: ChapterDict | None = None
-                    if self.check_restricted(task.html_list):
-                        self.logger.info(
-                            "[Parser] Skipped restricted page for cid %s", task.cid
-                        )
-                        skip_retry = True
-                        raise ValueError("Restricted content detected")
-
-                    is_encrypted = self.check_encrypted(task.html_list)
-                    chap_json = await asyncio.to_thread(
-                        self.parser.parse_chapter,
-                        task.html_list,
-                        task.cid,
-                    )
-                    if is_encrypted:
-                        skip_retry = True
-                    if self.save_html:
-                        folder = chapters_html_dir / (
-                            "html_encrypted" if is_encrypted else "html_plain"
-                        )
-                        html_path = folder / f"{task.cid}.html"
-                        save_as_txt(task.html_list[0], html_path, on_exist="skip")
-                        self.logger.debug(
-                            "%s Saved raw HTML for chapter %s to %s",
-                            TAG,
-                            task.cid,
-                            html_path,
-                        )
-                    if chap_json:
-                        await save_queue.put(chap_json)
-                        self.logger.info(
-                            "[Parser] saved chapter %s",
-                            task.cid,
-                        )
-                    else:
-                        raise ValueError("Empty parse result")
-                except Exception as e:
-                    if not skip_retry and task.retry < retry_times:
-                        await cid_queue.put(
-                            CidTask(prev_cid=None, cid=task.cid, retry=task.retry + 1)
-                        )
-                        self.logger.info(
-                            "[Parser] Re-queued cid %s for retry #%d: %s",
-                            task.cid,
-                            task.retry + 1,
-                            e,
-                        )
-                    elif not skip_retry:
-                        self.logger.warning(
-                            "[Parser] Max retries reached for cid %s: %s",
-                            task.cid,
-                            e,
-                        )
-                finally:
-                    html_queue.task_done()
-
-        async def storage_worker(
-            normal_cs: ChapterStorage,
-            encrypted_cs: ChapterStorage,
-            save_queue: asyncio.Queue[ChapterDict],
-        ) -> None:
-            nonlocal completed_count
+        async def storage_worker(q: asyncio.Queue[ChapterDict | None]) -> None:
             while True:
[... 6 removed lines truncated in this view ...]
-                    if progress_hook:
-                        await progress_hook(completed_count, total_chapters)
-                except Exception as e:
-                    self.logger.error("[storage_worker] Failed to save: %s", e)
-                finally:
-                    save_queue.task_done()
-
-        fetcher_task = asyncio.create_task(
-            fetcher_worker(
-                book_id,
-                cid_queue,
-                html_queue,
-                self.retry_times,
-            )
-        )
-
-        parser_task = asyncio.create_task(
-            parser_worker(
-                cid_queue,
-                html_queue,
-                save_queue,
-                self.retry_times,
-            )
-        )
-
-        storage_task = asyncio.create_task(
-            storage_worker(
-                normal_cs=normal_cs,
-                encrypted_cs=encrypted_cs,
-                save_queue=save_queue,
-            )
-        )
-
-        found_start = start_id is None
-        stop_early = False
-
-        for vol in book_info.get("volumes", []):
-            chapters = vol.get("chapters", [])
-            for chap in chapters:
-                if stop_early:
+                chap = await q.get()
+                q.task_done()
+                if chap is None:
+                    # final flush before exit
+                    await _flush(default_batch, self.DEFAULT_SOURCE_ID)
+                    await _flush(encrypted_batch, self.ENCRYPTED_SOURCE_ID)
                     break
+                batch, src = _select(chap)
+                batch.append(chap)
+                if len(batch) >= self.storage_batch_size:
+                    await _flush(batch, src)
+
+        async def producer() -> None:
+            nonlocal completed
+            async for cid in self._chapter_ids(vols, start_id, end_id):
+                if self.skip_existing and chapter_storage.exists(
+                    cid, self.DEFAULT_SOURCE_ID
+                ):
+                    completed += 1
+                    if progress_hook:
+                        await progress_hook(completed, total_chapters)
+                else:
+                    await cid_q.put(cid)
+
+        @asynccontextmanager
+        async def task_group_ctx() -> AsyncIterator[None]:
+            async with asyncio.TaskGroup() as tg:
+                tg.create_task(
+                    self._chapter_worker(
+                        book_id,
+                        ignore_set,
+                        cid_q,
+                        save_q,
+                        sem,
+                    )
+                )
+                tg.create_task(storage_worker(save_q))
+                yield
 
[... 4 removed lines truncated in this view ...]
-                if not found_start:
-                    if cid == start_id:
-                        found_start = True
-                    else:
-                        completed_count += 1
-                    continue
-
-                if end_id is not None and cid == end_id:
-                    stop_early = True
-
-                if cid in ignore_set:
-                    continue
-
-                if normal_cs.exists(cid) and self.skip_existing:
-                    completed_count += 1
-                    continue
-
-                await cid_queue.put(CidTask(cid=cid, prev_cid=None))
-
-                if stop_early:
-                    break
+        # run producer + workers, send None sentinels to shut down loops
+        async with task_group_ctx():
+            await producer()
 
[... 3 removed lines truncated in this view ...]
+            # signal fetcher to exit
+            await cid_q.put(None)
+            await cid_q.join()
 
[... 3 removed lines truncated in this view ...]
-            await task
+            # signal storage to exit
+            await save_q.put(None)
+            await save_q.join()
 
[... 2 removed lines truncated in this view ...]
+        # final flush for both batches
+        await _flush(default_batch, self.DEFAULT_SOURCE_ID)
+        await _flush(encrypted_batch, self.ENCRYPTED_SOURCE_ID)
 
+        chapter_storage.close()
         self.logger.info(
             "%s Novel '%s' download completed.",
             TAG,
             book_info.get("book_name", "unknown"),
         )
-        return
 
     @staticmethod
-    def
+    def _check_restricted(html_list: list[str]) -> bool:
         """
         Return True if page content indicates access restriction
         (e.g. not subscribed/purchased).
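The rewritten _download_one above drops the 1.4.x fetcher/parser/storage task trio in favour of two queues that are shut down with None sentinels: a producer feeds chapter ids into cid_q, a chapter worker pushes parsed ChapterDicts into save_q, and a storage worker buffers results per source, flushing a batch once it reaches storage_batch_size and doing a final flush when the sentinel arrives. The standalone sketch below (Python 3.10+) illustrates just that sentinel-plus-batched-flush pattern; BATCH_SIZE, the dict payloads, and the flushed list are illustrative stand-ins, not the package's ChapterStorage API.

import asyncio

BATCH_SIZE = 4  # stand-in for the real storage_batch_size setting


async def storage_worker(q: asyncio.Queue[dict | None], flushed: list[list[dict]]) -> None:
    """Consume items until a None sentinel; write them out in fixed-size batches."""
    batch: list[dict] = []

    def flush() -> None:
        if batch:
            flushed.append(list(batch))  # stand-in for ChapterStorage.upsert_chapters
            batch.clear()

    while True:
        item = await q.get()
        q.task_done()
        if item is None:      # sentinel: flush whatever is left, then exit
            flush()
            break
        batch.append(item)
        if len(batch) >= BATCH_SIZE:
            flush()


async def main() -> None:
    q: asyncio.Queue[dict | None] = asyncio.Queue()
    flushed: list[list[dict]] = []
    worker = asyncio.create_task(storage_worker(q, flushed))

    for i in range(10):        # producer side
        await q.put({"id": i})
    await q.put(None)          # tell the worker to drain and stop
    await q.join()
    await worker
    print([len(b) for b in flushed])  # -> [4, 4, 2]


asyncio.run(main())

Batching the writes and terminating each queue with a sentinel is what lets the TaskGroup opened in task_group_ctx wind down cleanly once the producer, cid_q and save_q have all drained.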
@@ -370,7 +200,91 @@ class QidianDownloader(BaseDownloader):
         return any(m in html_list[0] for m in markers)
 
     @staticmethod
-    def
+    def _check_encrypted(html_list: list[str]) -> bool:
         if not html_list:
             return True
         return '"cES":2' in html_list[0]
+
+    async def _chapter_worker(
+        self,
+        book_id: str,
+        ignore_set: set[str],
+        cid_q: asyncio.Queue[str | None],
+        save_q: asyncio.Queue[ChapterDict | None],
+        sem: asyncio.Semaphore,
+    ) -> None:
+        """
+        Worker that processes one chapter at a time:
+        fetch + parse with retry, then enqueue to save_q.
+        """
+        html_dir = self._debug_dir / book_id / "html"
+        while True:
+            cid = await cid_q.get()
+            if cid is None:
+                cid_q.task_done()
+                break
+            if not cid or cid in ignore_set:
+                cid_q.task_done()
+                continue
+
+            async with sem:
+                chap = await self._process_chapter(book_id, cid, html_dir)
+            if chap:
+                await save_q.put(chap)
+
+            cid_q.task_done()
+            await async_sleep_with_random_delay(
+                self.request_interval,
+                mul_spread=1.1,
+                max_sleep=self.request_interval + 2,
+            )
+
+    async def _process_chapter(
+        self,
+        book_id: str,
+        cid: str,
+        html_dir: Path,
+    ) -> ChapterDict | None:
+        """
+        Fetch, debug-save, parse a single chapter with retries.
+        Returns ChapterDict or None on failure.
+        """
+        for attempt in range(self.retry_times + 1):
+            try:
+                html_list = await self.fetcher.get_book_chapter(book_id, cid)
+                if self._check_restricted(html_list):
+                    self.logger.info(
+                        "[ChapterWorker] Restricted content detected: %s", cid
+                    )
+                    return None
+                encrypted = self._check_encrypted(html_list)
+
+                folder = "html_encrypted" if encrypted else "html_plain"
+                self._save_html_pages(html_dir / folder, cid, html_list)
+
+                chap = await asyncio.to_thread(
+                    self.parser.parse_chapter, html_list, cid
+                )
+                if encrypted and not chap:
+                    self.logger.info(
+                        "[ChapterWorker] Fail for encrypted chapter: %s", cid
+                    )
+                    return None
+                if not chap:
+                    raise ValueError("Empty parse result")
+                return chap
+
+            except Exception as e:
+                if attempt < self.retry_times:
+                    self.logger.info(
+                        "[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
+                    )
+                    backoff = self.backoff_factor * (2**attempt)
+                    await async_sleep_with_random_delay(
+                        base=backoff,
+                        mul_spread=1.2,
+                        max_sleep=backoff + 3,
+                    )
+                else:
+                    self.logger.warning("[ChapterWorker] Failed %s: %s", cid, e)
+        return None
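_process_chapter retries a failed chapter with an exponentially growing delay: backoff = self.backoff_factor * (2**attempt), slept via async_sleep_with_random_delay with max_sleep = backoff + 3. The short tabulation below assumes backoff_factor = 2.0 and retry_times = 3 purely for illustration (the random spread applied by the sleep helper is not modelled here):

backoff_factor = 2.0  # assumed example value
retry_times = 3       # assumed example value

for attempt in range(retry_times):
    backoff = backoff_factor * (2**attempt)
    print(f"attempt {attempt}: base sleep {backoff:.1f}s, capped at {backoff + 3:.1f}s")
# attempt 0: base sleep 2.0s, capped at 5.0s
# attempt 1: base sleep 4.0s, capped at 7.0s
# attempt 2: base sleep 8.0s, capped at 11.0s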
novel_downloader/core/downloaders/registry.py (new file)

@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+"""
+novel_downloader.core.downloaders.registry
+------------------------------------------
+
+"""
+
+__all__ = ["register_downloader", "get_downloader"]
+
+from collections.abc import Callable, Sequence
+from typing import TypeVar
+
+from novel_downloader.core.interfaces import (
+    DownloaderProtocol,
+    FetcherProtocol,
+    ParserProtocol,
+)
+from novel_downloader.models import DownloaderConfig
+
+DownloaderBuilder = Callable[
+    [FetcherProtocol, ParserProtocol, DownloaderConfig],
+    DownloaderProtocol,
+]
+D = TypeVar("D", bound=DownloaderProtocol)
+_DOWNLOADER_MAP: dict[str, DownloaderBuilder] = {}
+
+
+def register_downloader(
+    site_keys: Sequence[str],
+) -> Callable[[type[D]], type[D]]:
+    """
+    Decorator to register a downloader class under given keys.
+
+    :param site_keys: Sequence of site identifiers
+    :return: A class decorator that populates _DOWNLOADER_MAP.
+    """
+
+    def decorator(cls: type[D]) -> type[D]:
+        for key in site_keys:
+            _DOWNLOADER_MAP[key.lower()] = cls
+        return cls
+
+    return decorator
+
+
+def get_downloader(
+    fetcher: FetcherProtocol,
+    parser: ParserProtocol,
+    site: str,
+    config: DownloaderConfig,
+) -> DownloaderProtocol:
+    """
+    Returns an DownloaderProtocol for the given site.
+
+    :param fetcher: Fetcher implementation
+    :param parser: Parser implementation
+    :param site: Site name (e.g., 'qidian')
+    :param config: Downloader configuration
+
+    :return: An instance of a downloader class
+    """
+    site_key = site.lower()
+    try:
+        downloader_cls = _DOWNLOADER_MAP[site_key]
+    except KeyError as err:
+        raise ValueError(f"Unsupported site: {site}") from err
+    return downloader_cls(fetcher, parser, config)
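registry.py replaces the removed core/factory modules: a downloader class registers itself under one or more lowercase site keys when its module is imported, and get_downloader resolves a key back to the class, raising ValueError for unknown sites. The self-contained model below mirrors that decorator-registry mechanism; the names register, get_cls, DemoDownloader and _REGISTRY are illustrative, not the package's.

from collections.abc import Callable, Sequence
from typing import TypeVar

T = TypeVar("T")
_REGISTRY: dict[str, type] = {}


def register(site_keys: Sequence[str]) -> Callable[[type[T]], type[T]]:
    """Class decorator: map every lowercase key to the decorated class."""

    def decorator(cls: type[T]) -> type[T]:
        for key in site_keys:
            _REGISTRY[key.lower()] = cls
        return cls

    return decorator


def get_cls(site: str) -> type:
    """Look a class up by site key, mirroring get_downloader's error handling."""
    try:
        return _REGISTRY[site.lower()]
    except KeyError as err:
        raise ValueError(f"Unsupported site: {site}") from err


@register(site_keys=["demo", "dm"])
class DemoDownloader:
    pass


# Aliases and case-insensitive lookup both resolve to the same class.
assert get_cls("DM") is DemoDownloader

This is the mechanism the @register_downloader(site_keys=[...]) lines added to qidian.py, sfacg.py and yamibo.py in this release rely on.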
novel_downloader/core/downloaders/sfacg.py

@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.sfacg
 """
 
 from novel_downloader.core.downloaders.common import CommonDownloader
+from novel_downloader.core.downloaders.registry import register_downloader
 from novel_downloader.core.interfaces import (
     FetcherProtocol,
     ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
 from novel_downloader.models import DownloaderConfig
 
 
+@register_downloader(site_keys=["sfacg"])
 class SfacgDownloader(CommonDownloader):
-    """
+    """
+    Downloader for sfacg (SF 轻小说) novels.
+    """
 
     def __init__(
         self,
novel_downloader/core/downloaders/yamibo.py

@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.yamibo
 """
 
 from novel_downloader.core.downloaders.common import CommonDownloader
+from novel_downloader.core.downloaders.registry import register_downloader
 from novel_downloader.core.interfaces import (
     FetcherProtocol,
     ParserProtocol,
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
 from novel_downloader.models import DownloaderConfig
 
 
+@register_downloader(site_keys=["yamibo"])
 class YamiboDownloader(CommonDownloader):
-    """
+    """
+    Downloader for yamibo (百合会) novels.
+    """
 
     def __init__(
         self,