novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -2
- novel_downloader/cli/config.py +1 -83
- novel_downloader/cli/download.py +4 -5
- novel_downloader/cli/export.py +4 -1
- novel_downloader/cli/main.py +2 -0
- novel_downloader/cli/search.py +123 -0
- novel_downloader/config/__init__.py +3 -10
- novel_downloader/config/adapter.py +190 -54
- novel_downloader/config/loader.py +2 -3
- novel_downloader/core/__init__.py +13 -13
- novel_downloader/core/downloaders/__init__.py +10 -11
- novel_downloader/core/downloaders/base.py +152 -26
- novel_downloader/core/downloaders/biquge.py +5 -1
- novel_downloader/core/downloaders/common.py +157 -378
- novel_downloader/core/downloaders/esjzone.py +5 -1
- novel_downloader/core/downloaders/linovelib.py +5 -1
- novel_downloader/core/downloaders/qianbi.py +291 -4
- novel_downloader/core/downloaders/qidian.py +199 -285
- novel_downloader/core/downloaders/registry.py +67 -0
- novel_downloader/core/downloaders/sfacg.py +5 -1
- novel_downloader/core/downloaders/yamibo.py +5 -1
- novel_downloader/core/exporters/__init__.py +10 -11
- novel_downloader/core/exporters/base.py +87 -7
- novel_downloader/core/exporters/biquge.py +5 -8
- novel_downloader/core/exporters/common/__init__.py +2 -2
- novel_downloader/core/exporters/common/epub.py +82 -166
- novel_downloader/core/exporters/common/main_exporter.py +0 -60
- novel_downloader/core/exporters/common/txt.py +82 -83
- novel_downloader/core/exporters/epub_util.py +157 -1330
- novel_downloader/core/exporters/esjzone.py +5 -8
- novel_downloader/core/exporters/linovelib/__init__.py +2 -2
- novel_downloader/core/exporters/linovelib/epub.py +157 -212
- novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
- novel_downloader/core/exporters/linovelib/txt.py +67 -63
- novel_downloader/core/exporters/qianbi.py +5 -8
- novel_downloader/core/exporters/qidian.py +14 -4
- novel_downloader/core/exporters/registry.py +53 -0
- novel_downloader/core/exporters/sfacg.py +5 -8
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/exporters/yamibo.py +5 -8
- novel_downloader/core/fetchers/__init__.py +19 -24
- novel_downloader/core/fetchers/base/__init__.py +3 -3
- novel_downloader/core/fetchers/base/browser.py +23 -4
- novel_downloader/core/fetchers/base/session.py +30 -5
- novel_downloader/core/fetchers/biquge/__init__.py +3 -3
- novel_downloader/core/fetchers/biquge/browser.py +5 -0
- novel_downloader/core/fetchers/biquge/session.py +6 -1
- novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
- novel_downloader/core/fetchers/esjzone/browser.py +5 -0
- novel_downloader/core/fetchers/esjzone/session.py +6 -1
- novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
- novel_downloader/core/fetchers/linovelib/browser.py +6 -1
- novel_downloader/core/fetchers/linovelib/session.py +6 -1
- novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
- novel_downloader/core/fetchers/qianbi/browser.py +5 -0
- novel_downloader/core/fetchers/qianbi/session.py +5 -0
- novel_downloader/core/fetchers/qidian/__init__.py +3 -3
- novel_downloader/core/fetchers/qidian/browser.py +12 -4
- novel_downloader/core/fetchers/qidian/session.py +11 -3
- novel_downloader/core/fetchers/registry.py +71 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
- novel_downloader/core/fetchers/sfacg/browser.py +5 -0
- novel_downloader/core/fetchers/sfacg/session.py +5 -0
- novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
- novel_downloader/core/fetchers/yamibo/browser.py +5 -0
- novel_downloader/core/fetchers/yamibo/session.py +6 -1
- novel_downloader/core/interfaces/__init__.py +7 -5
- novel_downloader/core/interfaces/searcher.py +18 -0
- novel_downloader/core/parsers/__init__.py +10 -11
- novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
- novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
- novel_downloader/core/parsers/qidian/main_parser.py +10 -21
- novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/registry.py +68 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
- novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
- novel_downloader/core/searchers/__init__.py +20 -0
- novel_downloader/core/searchers/base.py +92 -0
- novel_downloader/core/searchers/biquge.py +83 -0
- novel_downloader/core/searchers/esjzone.py +84 -0
- novel_downloader/core/searchers/qianbi.py +131 -0
- novel_downloader/core/searchers/qidian.py +87 -0
- novel_downloader/core/searchers/registry.py +63 -0
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +12 -4
- novel_downloader/models/__init__.py +4 -30
- novel_downloader/models/config.py +12 -6
- novel_downloader/models/search.py +16 -0
- novel_downloader/models/types.py +0 -2
- novel_downloader/resources/config/settings.toml +31 -4
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/utils/__init__.py +52 -0
- novel_downloader/utils/chapter_storage.py +244 -224
- novel_downloader/utils/constants.py +1 -21
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +77 -0
- novel_downloader/utils/epub/documents.py +403 -0
- novel_downloader/utils/epub/models.py +134 -0
- novel_downloader/utils/epub/utils.py +212 -0
- novel_downloader/utils/file_utils/__init__.py +10 -14
- novel_downloader/utils/file_utils/io.py +20 -51
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -3
- novel_downloader/utils/fontocr/__init__.py +5 -5
- novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
- novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
- novel_downloader/utils/fontocr/ocr_v1.py +13 -1
- novel_downloader/utils/fontocr/ocr_v2.py +13 -1
- novel_downloader/utils/fontocr/ocr_v3.py +744 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +2 -0
- novel_downloader/utils/network.py +110 -251
- novel_downloader/utils/state.py +1 -0
- novel_downloader/utils/text_utils/__init__.py +18 -17
- novel_downloader/utils/text_utils/diff_display.py +4 -5
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +3 -3
- novel_downloader/utils/time_utils/datetime_utils.py +4 -5
- novel_downloader/utils/time_utils/sleep_utils.py +2 -3
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
- novel_downloader-1.5.0.dist-info/RECORD +164 -0
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/common/browser.py +0 -79
- novel_downloader/core/fetchers/common/session.py +0 -79
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.biquge
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
from novel_downloader.core.downloaders.common import CommonDownloader
|
9
|
+
from novel_downloader.core.downloaders.registry import register_downloader
|
9
10
|
from novel_downloader.core.interfaces import (
|
10
11
|
FetcherProtocol,
|
11
12
|
ParserProtocol,
|
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
|
|
13
14
|
from novel_downloader.models import DownloaderConfig
|
14
15
|
|
15
16
|
|
17
|
+
@register_downloader(site_keys=["biquge", "bqg"])
|
16
18
|
class BiqugeDownloader(CommonDownloader):
|
17
|
-
"""
|
19
|
+
"""
|
20
|
+
Downloader for biquge (笔趣阁) novels.
|
21
|
+
"""
|
18
22
|
|
19
23
|
def __init__(
|
20
24
|
self,
|
@@ -6,30 +6,25 @@ novel_downloader.core.downloaders.common
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
import asyncio
|
9
|
-
import
|
10
|
-
from
|
11
|
-
from
|
12
|
-
from typing import Any
|
9
|
+
from collections.abc import AsyncIterator, Awaitable, Callable
|
10
|
+
from contextlib import asynccontextmanager
|
11
|
+
from pathlib import Path
|
12
|
+
from typing import Any
|
13
13
|
|
14
14
|
from novel_downloader.core.downloaders.base import BaseDownloader
|
15
15
|
from novel_downloader.models import (
|
16
16
|
BookConfig,
|
17
17
|
ChapterDict,
|
18
|
-
CidTask,
|
19
|
-
HtmlTask,
|
20
|
-
RestoreTask,
|
21
18
|
)
|
22
|
-
from novel_downloader.utils
|
23
|
-
|
24
|
-
from novel_downloader.utils.time_utils import (
|
19
|
+
from novel_downloader.utils import (
|
20
|
+
ChapterStorage,
|
25
21
|
async_sleep_with_random_delay,
|
26
|
-
calculate_time_difference,
|
27
22
|
)
|
28
23
|
|
29
24
|
|
30
25
|
class CommonDownloader(BaseDownloader):
|
31
26
|
"""
|
32
|
-
Specialized Async downloader for common
|
27
|
+
Specialized Async downloader for "common" novel sites.
|
33
28
|
"""
|
34
29
|
|
35
30
|
async def _download_one(
|
@@ -50,399 +45,183 @@ class CommonDownloader(BaseDownloader):
|
|
50
45
|
end_id = book.get("end_id")
|
51
46
|
ignore_set = set(book.get("ignore_ids", []))
|
52
47
|
|
53
|
-
|
54
|
-
|
55
|
-
info_path = raw_base / "book_info.json"
|
56
|
-
chapters_html_dir = cache_base / "html"
|
57
|
-
|
48
|
+
# prepare storage & dirs
|
49
|
+
raw_base = self._raw_data_dir / book_id
|
58
50
|
raw_base.mkdir(parents=True, exist_ok=True)
|
59
|
-
|
60
|
-
|
61
|
-
normal_cs = ChapterStorage(
|
51
|
+
html_dir = self._debug_dir / book_id / "html"
|
52
|
+
chapter_storage = ChapterStorage(
|
62
53
|
raw_base=raw_base,
|
63
|
-
|
64
|
-
backend_type=self._config.storage_backend,
|
65
|
-
batch_size=self._config.storage_batch_size,
|
54
|
+
priorities=self._priorities,
|
66
55
|
)
|
56
|
+
chapter_storage.connect()
|
67
57
|
|
68
|
-
# load or fetch
|
69
|
-
book_info
|
70
|
-
re_fetch = True
|
71
|
-
old_data: dict[str, Any] = {}
|
72
|
-
|
73
|
-
if info_path.exists():
|
74
|
-
try:
|
75
|
-
old_data = json.loads(info_path.read_text("utf-8"))
|
76
|
-
days, *_ = calculate_time_difference(
|
77
|
-
old_data.get("update_time", ""), "UTC+8"
|
78
|
-
)
|
79
|
-
re_fetch = days > 1
|
80
|
-
except Exception:
|
81
|
-
re_fetch = True
|
82
|
-
|
83
|
-
if re_fetch:
|
84
|
-
info_html = await self.fetcher.get_book_info(book_id)
|
85
|
-
if self.save_html:
|
86
|
-
for i, html in enumerate(info_html):
|
87
|
-
save_as_txt(html, chapters_html_dir / f"info_{i}.html")
|
88
|
-
book_info = self.parser.parse_book_info(info_html)
|
89
|
-
|
90
|
-
if book_info.get("book_name") != "未找到书名":
|
91
|
-
save_as_json(book_info, info_path)
|
92
|
-
else:
|
93
|
-
self.logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
|
94
|
-
book_info = old_data or {"book_name": "未找到书名"}
|
95
|
-
else:
|
96
|
-
book_info = old_data
|
97
|
-
|
58
|
+
# load or fetch metadata
|
59
|
+
book_info = await self.load_book_info(book_id=book_id, html_dir=html_dir)
|
98
60
|
vols = book_info.get("volumes", [])
|
99
|
-
total_chapters =
|
100
|
-
for vol in vols:
|
101
|
-
total_chapters += len(vol.get("chapters", []))
|
61
|
+
total_chapters = sum(len(v.get("chapters", [])) for v in vols)
|
102
62
|
if total_chapters == 0:
|
103
|
-
self.logger.warning("%s 书籍没有章节可下载:
|
63
|
+
self.logger.warning("%s 书籍没有章节可下载: %s", TAG, book_id)
|
104
64
|
return
|
105
65
|
|
106
|
-
|
66
|
+
# concurrency primitives
|
67
|
+
sem = asyncio.Semaphore(self.workers)
|
68
|
+
cid_q: asyncio.Queue[str | None] = asyncio.Queue()
|
69
|
+
save_q: asyncio.Queue[ChapterDict | None] = asyncio.Queue()
|
70
|
+
batch: list[ChapterDict] = []
|
71
|
+
completed = 0
|
107
72
|
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
html_queue: asyncio.Queue[HtmlTask] = asyncio.Queue()
|
113
|
-
save_queue: asyncio.Queue[ChapterDict] = asyncio.Queue()
|
114
|
-
pending_restore: dict[str, RestoreTask] = {}
|
73
|
+
async def _flush_batch() -> None:
|
74
|
+
nonlocal batch, completed
|
75
|
+
if not batch:
|
76
|
+
return
|
115
77
|
|
116
|
-
def update_book_info(
|
117
|
-
vol_idx: int,
|
118
|
-
chap_idx: int,
|
119
|
-
cid: str,
|
120
|
-
) -> None:
|
121
78
|
try:
|
122
|
-
|
123
|
-
except
|
124
|
-
self.logger.
|
125
|
-
"[
|
126
|
-
|
127
|
-
chap_idx,
|
79
|
+
chapter_storage.upsert_chapters(batch, self.DEFAULT_SOURCE_ID)
|
80
|
+
except Exception as e:
|
81
|
+
self.logger.error(
|
82
|
+
"[Storage] batch upsert failed (size=%d): %s",
|
83
|
+
len(batch),
|
128
84
|
e,
|
85
|
+
exc_info=True,
|
129
86
|
)
|
87
|
+
else:
|
88
|
+
completed += len(batch)
|
89
|
+
if progress_hook:
|
90
|
+
await progress_hook(completed, total_chapters)
|
91
|
+
finally:
|
92
|
+
batch.clear()
|
130
93
|
|
131
|
-
async def
|
132
|
-
book_id: str,
|
133
|
-
cid_queue: asyncio.Queue[CidTask],
|
134
|
-
html_queue: asyncio.Queue[HtmlTask],
|
135
|
-
restore_queue: asyncio.Queue[RestoreTask],
|
136
|
-
retry_times: int,
|
137
|
-
semaphore: asyncio.Semaphore,
|
138
|
-
) -> None:
|
94
|
+
async def storage_worker(q: asyncio.Queue[ChapterDict | None]) -> None:
|
139
95
|
while True:
|
140
|
-
|
141
|
-
|
142
|
-
if
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
)
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
)
|
174
|
-
self.logger.info("[Fetcher] Downloaded chapter %s", cid)
|
175
|
-
await async_sleep_with_random_delay(
|
176
|
-
self.request_interval,
|
177
|
-
mul_spread=1.1,
|
178
|
-
max_sleep=self.request_interval + 2,
|
179
|
-
)
|
180
|
-
|
181
|
-
except Exception as e:
|
182
|
-
if task.retry < retry_times:
|
183
|
-
await cid_queue.put(
|
184
|
-
CidTask(
|
185
|
-
prev_cid=task.prev_cid,
|
186
|
-
cid=cid,
|
187
|
-
retry=task.retry + 1,
|
188
|
-
vol_idx=task.vol_idx,
|
189
|
-
chap_idx=task.chap_idx,
|
190
|
-
)
|
191
|
-
)
|
192
|
-
self.logger.info(
|
193
|
-
"[Fetcher] Re-queued chapter %s for retry #%d: %s",
|
194
|
-
cid,
|
195
|
-
task.retry + 1,
|
196
|
-
e,
|
197
|
-
)
|
198
|
-
backoff = self.backoff_factor * (2**task.retry)
|
199
|
-
await async_sleep_with_random_delay(
|
200
|
-
base=backoff,
|
201
|
-
mul_spread=1.2,
|
202
|
-
max_sleep=backoff + 3,
|
203
|
-
)
|
204
|
-
else:
|
205
|
-
self.logger.warning(
|
206
|
-
"[Fetcher] Max retries reached for chapter %s: %s",
|
207
|
-
cid,
|
208
|
-
e,
|
96
|
+
item = await q.get()
|
97
|
+
q.task_done()
|
98
|
+
if item is None:
|
99
|
+
# final flush before exit
|
100
|
+
if batch:
|
101
|
+
await _flush_batch()
|
102
|
+
break
|
103
|
+
batch.append(item)
|
104
|
+
if len(batch) >= self.storage_batch_size:
|
105
|
+
await _flush_batch()
|
106
|
+
|
107
|
+
async def producer() -> None:
|
108
|
+
nonlocal completed
|
109
|
+
async for cid in self._chapter_ids(vols, start_id, end_id):
|
110
|
+
if self.skip_existing and chapter_storage.exists(cid):
|
111
|
+
completed += 1
|
112
|
+
if progress_hook:
|
113
|
+
await progress_hook(completed, total_chapters)
|
114
|
+
else:
|
115
|
+
await cid_q.put(cid)
|
116
|
+
|
117
|
+
@asynccontextmanager
|
118
|
+
async def task_group_ctx() -> AsyncIterator[asyncio.TaskGroup]:
|
119
|
+
async with asyncio.TaskGroup() as tg:
|
120
|
+
# start chapter workers
|
121
|
+
for _ in range(self.workers):
|
122
|
+
tg.create_task(
|
123
|
+
self._chapter_worker(
|
124
|
+
book_id,
|
125
|
+
ignore_set,
|
126
|
+
cid_q,
|
127
|
+
save_q,
|
128
|
+
sem,
|
209
129
|
)
|
210
|
-
|
211
|
-
finally:
|
212
|
-
cid_queue.task_done()
|
213
|
-
|
214
|
-
async def parser_worker(
|
215
|
-
worker_id: int,
|
216
|
-
cid_queue: asyncio.Queue[CidTask],
|
217
|
-
html_queue: asyncio.Queue[HtmlTask],
|
218
|
-
save_queue: asyncio.Queue[ChapterDict],
|
219
|
-
retry_times: int,
|
220
|
-
) -> None:
|
221
|
-
while True:
|
222
|
-
task = await html_queue.get()
|
223
|
-
try:
|
224
|
-
chap_json = await asyncio.to_thread(
|
225
|
-
self.parser.parse_chapter,
|
226
|
-
task.html_list,
|
227
|
-
task.cid,
|
228
130
|
)
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
"[Parser-%d] saved chapter %s",
|
233
|
-
worker_id,
|
234
|
-
task.cid,
|
235
|
-
)
|
236
|
-
else:
|
237
|
-
raise ValueError("Empty parse result")
|
238
|
-
except Exception as e:
|
239
|
-
if task.retry < retry_times:
|
240
|
-
await cid_queue.put(
|
241
|
-
CidTask(
|
242
|
-
prev_cid=None,
|
243
|
-
cid=task.cid,
|
244
|
-
retry=task.retry + 1,
|
245
|
-
vol_idx=task.vol_idx,
|
246
|
-
chap_idx=task.chap_idx,
|
247
|
-
)
|
248
|
-
)
|
249
|
-
self.logger.info(
|
250
|
-
"[Parser-%d] Re-queued cid %s for retry #%d: %s",
|
251
|
-
worker_id,
|
252
|
-
task.cid,
|
253
|
-
task.retry + 1,
|
254
|
-
e,
|
255
|
-
)
|
256
|
-
else:
|
257
|
-
self.logger.warning(
|
258
|
-
"[Parser-%d] Max retries reached for cid %s: %s",
|
259
|
-
worker_id,
|
260
|
-
task.cid,
|
261
|
-
e,
|
262
|
-
)
|
263
|
-
finally:
|
264
|
-
html_queue.task_done()
|
265
|
-
|
266
|
-
async def storage_worker(
|
267
|
-
cs: ChapterStorage,
|
268
|
-
save_queue: asyncio.Queue[ChapterDict],
|
269
|
-
restore_queue: asyncio.Queue[RestoreTask],
|
270
|
-
cid_queue: asyncio.Queue[CidTask],
|
271
|
-
) -> None:
|
272
|
-
nonlocal completed_count
|
273
|
-
while True:
|
274
|
-
save_task = asyncio.create_task(save_queue.get())
|
275
|
-
restore_task = asyncio.create_task(restore_queue.get())
|
276
|
-
|
277
|
-
done, pending = await asyncio.wait(
|
278
|
-
[save_task, restore_task],
|
279
|
-
return_when=asyncio.FIRST_COMPLETED,
|
280
|
-
)
|
131
|
+
# start storage worker
|
132
|
+
tg.create_task(storage_worker(save_q))
|
133
|
+
yield tg
|
281
134
|
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
135
|
+
# run producer + workers
|
136
|
+
async with task_group_ctx():
|
137
|
+
# produce all CidTask
|
138
|
+
await producer()
|
286
139
|
|
287
|
-
|
288
|
-
|
140
|
+
# signal chapter workers to exit
|
141
|
+
for _ in range(self.workers):
|
142
|
+
await cid_q.put(None)
|
143
|
+
await cid_q.join()
|
289
144
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
completed_count += 1
|
294
|
-
if progress_hook:
|
295
|
-
await progress_hook(completed_count, total_chapters)
|
145
|
+
# signal storage worker to exit
|
146
|
+
await save_q.put(None)
|
147
|
+
await save_q.join()
|
296
148
|
|
297
|
-
|
298
|
-
|
299
|
-
rt = pending_restore.pop(curr_cid)
|
300
|
-
next_cid = item.get("extra", {}).get("next_chapter_id")
|
301
|
-
if next_cid:
|
302
|
-
update_book_info(
|
303
|
-
vol_idx=rt.vol_idx,
|
304
|
-
chap_idx=rt.chap_idx,
|
305
|
-
cid=next_cid,
|
306
|
-
)
|
307
|
-
await cid_queue.put(
|
308
|
-
CidTask(
|
309
|
-
prev_cid=rt.prev_cid,
|
310
|
-
cid=next_cid,
|
311
|
-
vol_idx=rt.vol_idx,
|
312
|
-
chap_idx=rt.chap_idx,
|
313
|
-
)
|
314
|
-
)
|
315
|
-
else:
|
316
|
-
self.logger.warning(
|
317
|
-
"[storage_worker] No next_cid found for %r",
|
318
|
-
rt,
|
319
|
-
)
|
320
|
-
except Exception as e:
|
321
|
-
self.logger.error("[storage_worker] Failed to save: %s", e)
|
322
|
-
finally:
|
323
|
-
save_queue.task_done()
|
149
|
+
# final flush to catch any remaining items
|
150
|
+
await _flush_batch()
|
324
151
|
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
else None
|
331
|
-
)
|
332
|
-
if next_cid:
|
333
|
-
update_book_info(
|
334
|
-
vol_idx=item.vol_idx,
|
335
|
-
chap_idx=item.chap_idx,
|
336
|
-
cid=next_cid,
|
337
|
-
)
|
338
|
-
await cid_queue.put(
|
339
|
-
CidTask(
|
340
|
-
prev_cid=item.prev_cid,
|
341
|
-
cid=next_cid,
|
342
|
-
vol_idx=item.vol_idx,
|
343
|
-
chap_idx=item.chap_idx,
|
344
|
-
)
|
345
|
-
)
|
346
|
-
else:
|
347
|
-
pending_restore[item.prev_cid] = item
|
348
|
-
restore_queue.task_done()
|
349
|
-
|
350
|
-
fetcher_tasks = [
|
351
|
-
asyncio.create_task(
|
352
|
-
fetcher_worker(
|
353
|
-
book_id,
|
354
|
-
cid_queue,
|
355
|
-
html_queue,
|
356
|
-
restore_queue,
|
357
|
-
self.retry_times,
|
358
|
-
semaphore,
|
359
|
-
)
|
360
|
-
)
|
361
|
-
for _ in range(self.download_workers)
|
362
|
-
]
|
363
|
-
|
364
|
-
parser_tasks = [
|
365
|
-
asyncio.create_task(
|
366
|
-
parser_worker(
|
367
|
-
i,
|
368
|
-
cid_queue,
|
369
|
-
html_queue,
|
370
|
-
save_queue,
|
371
|
-
self.retry_times,
|
372
|
-
)
|
373
|
-
)
|
374
|
-
for i in range(self.parser_workers)
|
375
|
-
]
|
376
|
-
|
377
|
-
storage_task = asyncio.create_task(
|
378
|
-
storage_worker(
|
379
|
-
cs=normal_cs,
|
380
|
-
save_queue=save_queue,
|
381
|
-
restore_queue=restore_queue,
|
382
|
-
cid_queue=cid_queue,
|
383
|
-
)
|
152
|
+
chapter_storage.close()
|
153
|
+
self.logger.info(
|
154
|
+
"%s Novel '%s' download completed.",
|
155
|
+
TAG,
|
156
|
+
book_info.get("book_name", "unknown"),
|
384
157
|
)
|
385
158
|
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
159
|
+
async def _chapter_worker(
|
160
|
+
self,
|
161
|
+
book_id: str,
|
162
|
+
ignore_set: set[str],
|
163
|
+
cid_q: asyncio.Queue[str | None],
|
164
|
+
save_q: asyncio.Queue[ChapterDict | None],
|
165
|
+
sem: asyncio.Semaphore,
|
166
|
+
) -> None:
|
167
|
+
"""
|
168
|
+
Worker that processes one chapter at a time:
|
169
|
+
fetch + parse with retry, then enqueue to save_q.
|
170
|
+
"""
|
171
|
+
html_dir = self._debug_dir / book_id / "html"
|
172
|
+
while True:
|
173
|
+
cid = await cid_q.get()
|
174
|
+
if cid is None:
|
175
|
+
cid_q.task_done()
|
176
|
+
break
|
177
|
+
if not cid or cid in ignore_set:
|
178
|
+
cid_q.task_done()
|
179
|
+
continue
|
395
180
|
|
396
|
-
|
181
|
+
async with sem:
|
182
|
+
chap = await self._process_chapter(book_id, cid, html_dir)
|
397
183
|
|
398
|
-
|
399
|
-
|
400
|
-
if cid == start_id:
|
401
|
-
found_start = True
|
402
|
-
else:
|
403
|
-
completed_count += 1
|
404
|
-
last_cid = cid
|
405
|
-
continue
|
184
|
+
if chap:
|
185
|
+
await save_q.put(chap)
|
406
186
|
|
407
|
-
|
408
|
-
|
409
|
-
|
187
|
+
cid_q.task_done()
|
188
|
+
await async_sleep_with_random_delay(
|
189
|
+
self.request_interval,
|
190
|
+
mul_spread=1.1,
|
191
|
+
max_sleep=self.request_interval + 2,
|
192
|
+
)
|
410
193
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
194
|
+
async def _process_chapter(
|
195
|
+
self,
|
196
|
+
book_id: str,
|
197
|
+
cid: str,
|
198
|
+
html_dir: Path,
|
199
|
+
) -> ChapterDict | None:
|
200
|
+
"""
|
201
|
+
Fetches, saves raw HTML, parses a single chapter,
|
202
|
+
retrying up to self.retry_times.
|
415
203
|
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
204
|
+
:return: ChapterDict on success, or None on failure.
|
205
|
+
"""
|
206
|
+
for attempt in range(self.retry_times + 1):
|
207
|
+
try:
|
208
|
+
html_list = await self.fetcher.get_book_chapter(book_id, cid)
|
209
|
+
self._save_html_pages(html_dir, cid, html_list)
|
210
|
+
chap = await asyncio.to_thread(
|
211
|
+
self.parser.parse_chapter, html_list, cid
|
423
212
|
)
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
normal_cs.close()
|
441
|
-
save_as_json(book_info, info_path)
|
442
|
-
|
443
|
-
self.logger.info(
|
444
|
-
"%s Novel '%s' download completed.",
|
445
|
-
TAG,
|
446
|
-
book_info.get("book_name", "unknown"),
|
447
|
-
)
|
448
|
-
return
|
213
|
+
if not chap:
|
214
|
+
raise ValueError("Empty parse result")
|
215
|
+
return chap
|
216
|
+
except Exception as e:
|
217
|
+
if attempt < self.retry_times:
|
218
|
+
self.logger.info(
|
219
|
+
"[ChapterWorker] Retry %s (%s): %s", cid, attempt + 1, e
|
220
|
+
)
|
221
|
+
backoff = self.backoff_factor * (2**attempt)
|
222
|
+
await async_sleep_with_random_delay(
|
223
|
+
base=backoff, mul_spread=1.2, max_sleep=backoff + 3
|
224
|
+
)
|
225
|
+
else:
|
226
|
+
self.logger.warning("[ChapterWorker] Failed %s: %s", cid, e)
|
227
|
+
return None
|
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.esjzone
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
from novel_downloader.core.downloaders.common import CommonDownloader
|
9
|
+
from novel_downloader.core.downloaders.registry import register_downloader
|
9
10
|
from novel_downloader.core.interfaces import (
|
10
11
|
FetcherProtocol,
|
11
12
|
ParserProtocol,
|
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
|
|
13
14
|
from novel_downloader.models import DownloaderConfig
|
14
15
|
|
15
16
|
|
17
|
+
@register_downloader(site_keys=["esjzone"])
|
16
18
|
class EsjzoneDownloader(CommonDownloader):
|
17
|
-
"""
|
19
|
+
"""
|
20
|
+
Downloader for ESJ Zone novels.
|
21
|
+
"""
|
18
22
|
|
19
23
|
def __init__(
|
20
24
|
self,
|
@@ -6,6 +6,7 @@ novel_downloader.core.downloaders.linovelib
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
from novel_downloader.core.downloaders.common import CommonDownloader
|
9
|
+
from novel_downloader.core.downloaders.registry import register_downloader
|
9
10
|
from novel_downloader.core.interfaces import (
|
10
11
|
FetcherProtocol,
|
11
12
|
ParserProtocol,
|
@@ -13,8 +14,11 @@ from novel_downloader.core.interfaces import (
|
|
13
14
|
from novel_downloader.models import DownloaderConfig
|
14
15
|
|
15
16
|
|
17
|
+
@register_downloader(site_keys=["linovelib"])
|
16
18
|
class LinovelibDownloader(CommonDownloader):
|
17
|
-
"""
|
19
|
+
"""
|
20
|
+
Downloader for Linovelib (哔哩轻小说) novels.
|
21
|
+
"""
|
18
22
|
|
19
23
|
def __init__(
|
20
24
|
self,
|