novel-downloader 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -2
- novel_downloader/cli/__init__.py +0 -1
- novel_downloader/cli/clean.py +2 -10
- novel_downloader/cli/download.py +16 -22
- novel_downloader/cli/interactive.py +0 -1
- novel_downloader/cli/main.py +1 -3
- novel_downloader/cli/settings.py +8 -8
- novel_downloader/config/__init__.py +0 -1
- novel_downloader/config/adapter.py +32 -27
- novel_downloader/config/loader.py +116 -108
- novel_downloader/config/models.py +35 -29
- novel_downloader/config/site_rules.py +2 -4
- novel_downloader/core/__init__.py +0 -1
- novel_downloader/core/downloaders/__init__.py +4 -4
- novel_downloader/core/downloaders/base/__init__.py +14 -0
- novel_downloader/core/downloaders/{base_async_downloader.py → base/base_async.py} +49 -53
- novel_downloader/core/downloaders/{base_downloader.py → base/base_sync.py} +64 -43
- novel_downloader/core/downloaders/biquge/__init__.py +12 -0
- novel_downloader/core/downloaders/biquge/biquge_sync.py +25 -0
- novel_downloader/core/downloaders/common/__init__.py +14 -0
- novel_downloader/core/downloaders/{common_asynb_downloader.py → common/common_async.py} +42 -33
- novel_downloader/core/downloaders/{common_downloader.py → common/common_sync.py} +33 -21
- novel_downloader/core/downloaders/qidian/__init__.py +10 -0
- novel_downloader/core/downloaders/{qidian_downloader.py → qidian/qidian_sync.py} +79 -62
- novel_downloader/core/factory/__init__.py +4 -5
- novel_downloader/core/factory/{downloader_factory.py → downloader.py} +25 -26
- novel_downloader/core/factory/{parser_factory.py → parser.py} +12 -14
- novel_downloader/core/factory/{requester_factory.py → requester.py} +29 -16
- novel_downloader/core/factory/{saver_factory.py → saver.py} +4 -9
- novel_downloader/core/interfaces/__init__.py +8 -9
- novel_downloader/core/interfaces/{async_downloader_protocol.py → async_downloader.py} +4 -5
- novel_downloader/core/interfaces/{async_requester_protocol.py → async_requester.py} +23 -12
- novel_downloader/core/interfaces/{parser_protocol.py → parser.py} +11 -6
- novel_downloader/core/interfaces/{saver_protocol.py → saver.py} +2 -3
- novel_downloader/core/interfaces/{downloader_protocol.py → sync_downloader.py} +6 -7
- novel_downloader/core/interfaces/{requester_protocol.py → sync_requester.py} +31 -17
- novel_downloader/core/parsers/__init__.py +5 -4
- novel_downloader/core/parsers/{base_parser.py → base.py} +18 -9
- novel_downloader/core/parsers/biquge/__init__.py +10 -0
- novel_downloader/core/parsers/biquge/main_parser.py +126 -0
- novel_downloader/core/parsers/{common_parser → common}/__init__.py +2 -3
- novel_downloader/core/parsers/{common_parser → common}/helper.py +13 -13
- novel_downloader/core/parsers/{common_parser → common}/main_parser.py +15 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_encrypted.py +40 -48
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_normal.py +17 -21
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/browser/main_parser.py +14 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_encrypted.py +36 -44
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_normal.py +19 -23
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/chapter_router.py +10 -9
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/main_parser.py +14 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/session/node_decryptor.py +7 -10
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/__init__.py +2 -3
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/book_info_parser.py +5 -6
- novel_downloader/core/parsers/{qidian_parser → qidian}/shared/helpers.py +7 -8
- novel_downloader/core/requesters/__init__.py +9 -5
- novel_downloader/core/requesters/base/__init__.py +16 -0
- novel_downloader/core/requesters/{base_async_session.py → base/async_session.py} +177 -73
- novel_downloader/core/requesters/base/browser.py +340 -0
- novel_downloader/core/requesters/base/session.py +364 -0
- novel_downloader/core/requesters/biquge/__init__.py +12 -0
- novel_downloader/core/requesters/biquge/session.py +90 -0
- novel_downloader/core/requesters/{common_requester → common}/__init__.py +4 -5
- novel_downloader/core/requesters/common/async_session.py +96 -0
- novel_downloader/core/requesters/common/session.py +113 -0
- novel_downloader/core/requesters/qidian/__init__.py +21 -0
- novel_downloader/core/requesters/qidian/broswer.py +307 -0
- novel_downloader/core/requesters/qidian/session.py +287 -0
- novel_downloader/core/savers/__init__.py +5 -3
- novel_downloader/core/savers/{base_saver.py → base.py} +12 -13
- novel_downloader/core/savers/biquge.py +25 -0
- novel_downloader/core/savers/{common_saver → common}/__init__.py +2 -3
- novel_downloader/core/savers/{common_saver/common_epub.py → common/epub.py} +23 -51
- novel_downloader/core/savers/{common_saver → common}/main_saver.py +43 -9
- novel_downloader/core/savers/{common_saver/common_txt.py → common/txt.py} +16 -46
- novel_downloader/core/savers/epub_utils/__init__.py +0 -1
- novel_downloader/core/savers/epub_utils/css_builder.py +13 -7
- novel_downloader/core/savers/epub_utils/initializer.py +4 -5
- novel_downloader/core/savers/epub_utils/text_to_html.py +2 -3
- novel_downloader/core/savers/epub_utils/volume_intro.py +1 -3
- novel_downloader/core/savers/{qidian_saver.py → qidian.py} +12 -6
- novel_downloader/locales/en.json +8 -4
- novel_downloader/locales/zh.json +5 -1
- novel_downloader/resources/config/settings.toml +88 -0
- novel_downloader/utils/cache.py +2 -2
- novel_downloader/utils/chapter_storage.py +340 -0
- novel_downloader/utils/constants.py +6 -4
- novel_downloader/utils/crypto_utils.py +3 -3
- novel_downloader/utils/file_utils/__init__.py +0 -1
- novel_downloader/utils/file_utils/io.py +12 -17
- novel_downloader/utils/file_utils/normalize.py +1 -3
- novel_downloader/utils/file_utils/sanitize.py +2 -9
- novel_downloader/utils/fontocr/__init__.py +0 -1
- novel_downloader/utils/fontocr/ocr_v1.py +19 -22
- novel_downloader/utils/fontocr/ocr_v2.py +147 -60
- novel_downloader/utils/hash_store.py +19 -20
- novel_downloader/utils/hash_utils.py +0 -1
- novel_downloader/utils/i18n.py +3 -4
- novel_downloader/utils/logger.py +5 -6
- novel_downloader/utils/model_loader.py +5 -8
- novel_downloader/utils/network.py +9 -10
- novel_downloader/utils/state.py +6 -7
- novel_downloader/utils/text_utils/__init__.py +0 -1
- novel_downloader/utils/text_utils/chapter_formatting.py +2 -7
- novel_downloader/utils/text_utils/diff_display.py +0 -1
- novel_downloader/utils/text_utils/font_mapping.py +1 -4
- novel_downloader/utils/text_utils/text_cleaning.py +0 -1
- novel_downloader/utils/time_utils/__init__.py +0 -1
- novel_downloader/utils/time_utils/datetime_utils.py +8 -10
- novel_downloader/utils/time_utils/sleep_utils.py +1 -3
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/METADATA +14 -17
- novel_downloader-1.3.1.dist-info/RECORD +127 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/WHEEL +1 -1
- novel_downloader/core/requesters/base_browser.py +0 -214
- novel_downloader/core/requesters/base_session.py +0 -246
- novel_downloader/core/requesters/common_requester/common_async_session.py +0 -98
- novel_downloader/core/requesters/common_requester/common_session.py +0 -126
- novel_downloader/core/requesters/qidian_requester/__init__.py +0 -22
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +0 -396
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +0 -202
- novel_downloader/resources/config/settings.yaml +0 -76
- novel_downloader-1.2.2.dist-info/RECORD +0 -115
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.2.dist-info → novel_downloader-1.3.1.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.downloaders.
|
5
|
-
|
3
|
+
novel_downloader.core.downloaders.base.base_sync
|
4
|
+
------------------------------------------------
|
6
5
|
|
7
6
|
Defines the abstract base class `BaseDownloader`, which provides a
|
8
7
|
common interface and reusable logic for all downloader implementations.
|
@@ -11,20 +10,17 @@ common interface and reusable logic for all downloader implementations.
|
|
11
10
|
import abc
|
12
11
|
import logging
|
13
12
|
from pathlib import Path
|
14
|
-
from typing import List
|
15
13
|
|
16
14
|
from novel_downloader.config import DownloaderConfig
|
17
15
|
from novel_downloader.core.interfaces import (
|
18
|
-
DownloaderProtocol,
|
19
16
|
ParserProtocol,
|
20
|
-
RequesterProtocol,
|
21
17
|
SaverProtocol,
|
18
|
+
SyncDownloaderProtocol,
|
19
|
+
SyncRequesterProtocol,
|
22
20
|
)
|
23
21
|
|
24
|
-
logger = logging.getLogger(__name__)
|
25
22
|
|
26
|
-
|
27
|
-
class BaseDownloader(DownloaderProtocol, abc.ABC):
|
23
|
+
class BaseDownloader(SyncDownloaderProtocol, abc.ABC):
|
28
24
|
"""
|
29
25
|
Abstract downloader that defines the initialization interface
|
30
26
|
and the general batch download flow.
|
@@ -34,10 +30,11 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
34
30
|
|
35
31
|
def __init__(
|
36
32
|
self,
|
37
|
-
requester:
|
33
|
+
requester: SyncRequesterProtocol,
|
38
34
|
parser: ParserProtocol,
|
39
35
|
saver: SaverProtocol,
|
40
36
|
config: DownloaderConfig,
|
37
|
+
site: str,
|
41
38
|
):
|
42
39
|
"""
|
43
40
|
Initialize the downloader with its components.
|
@@ -51,23 +48,16 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
51
48
|
self._parser = parser
|
52
49
|
self._saver = saver
|
53
50
|
self._config = config
|
54
|
-
self.
|
55
|
-
|
51
|
+
self._site = site
|
52
|
+
|
53
|
+
self._raw_data_dir = Path(config.raw_data_dir) / site
|
54
|
+
self._cache_dir = Path(config.cache_dir) / site
|
56
55
|
self._raw_data_dir.mkdir(parents=True, exist_ok=True)
|
57
56
|
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
58
57
|
|
59
|
-
|
60
|
-
def download_one(self, book_id: str) -> None:
|
61
|
-
"""
|
62
|
-
The full download logic for a single book.
|
58
|
+
self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
|
63
59
|
|
64
|
-
|
65
|
-
|
66
|
-
:param book_id: The identifier of the book to download.
|
67
|
-
"""
|
68
|
-
...
|
69
|
-
|
70
|
-
def download(self, book_ids: List[str]) -> None:
|
60
|
+
def download(self, book_ids: list[str]) -> None:
|
71
61
|
"""
|
72
62
|
The general batch download process:
|
73
63
|
1. Iterate over all book IDs
|
@@ -75,41 +65,41 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
75
65
|
|
76
66
|
:param book_ids: A list of book identifiers to download.
|
77
67
|
"""
|
68
|
+
self.prepare()
|
69
|
+
|
78
70
|
for idx, book_id in enumerate(book_ids, start=1):
|
71
|
+
self.logger.debug(
|
72
|
+
"[downloader] Starting download for book_id: %s (%s/%s)",
|
73
|
+
book_id,
|
74
|
+
idx,
|
75
|
+
len(book_ids),
|
76
|
+
)
|
79
77
|
try:
|
80
|
-
logger.debug(
|
81
|
-
"[downloader] Starting download for book_id: %s (%s/%s)",
|
82
|
-
book_id,
|
83
|
-
idx,
|
84
|
-
len(book_ids),
|
85
|
-
)
|
86
78
|
self.download_one(book_id)
|
87
79
|
except Exception as e:
|
88
80
|
self._handle_download_exception(book_id, e)
|
89
81
|
|
90
|
-
|
82
|
+
@abc.abstractmethod
|
83
|
+
def download_one(self, book_id: str) -> None:
|
91
84
|
"""
|
92
|
-
|
85
|
+
The full download logic for a single book.
|
93
86
|
|
94
|
-
Subclasses
|
87
|
+
Subclasses must implement this method.
|
95
88
|
|
96
|
-
:param book_id: The
|
89
|
+
:param book_id: The identifier of the book to download.
|
97
90
|
"""
|
98
|
-
|
91
|
+
...
|
99
92
|
|
100
|
-
def
|
93
|
+
def prepare(self) -> None:
|
101
94
|
"""
|
102
|
-
|
103
|
-
|
104
|
-
This method can be overridden or extended to implement retry logic, etc.
|
95
|
+
Optional hook called before downloading each book.
|
105
96
|
|
106
|
-
|
107
|
-
:param error: The exception raised during download.
|
97
|
+
Subclasses can override this method to perform pre-download setup.
|
108
98
|
"""
|
109
|
-
|
99
|
+
return
|
110
100
|
|
111
101
|
@property
|
112
|
-
def requester(self) ->
|
102
|
+
def requester(self) -> SyncRequesterProtocol:
|
113
103
|
"""
|
114
104
|
Access the current requester.
|
115
105
|
|
@@ -162,7 +152,27 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
162
152
|
"""
|
163
153
|
return self._cache_dir
|
164
154
|
|
165
|
-
|
155
|
+
@property
|
156
|
+
def site(self) -> str:
|
157
|
+
return self._site
|
158
|
+
|
159
|
+
@property
|
160
|
+
def save_html(self) -> bool:
|
161
|
+
return self._config.save_html
|
162
|
+
|
163
|
+
@property
|
164
|
+
def skip_existing(self) -> bool:
|
165
|
+
return self._config.skip_existing
|
166
|
+
|
167
|
+
@property
|
168
|
+
def login_required(self) -> bool:
|
169
|
+
return self._config.login_required
|
170
|
+
|
171
|
+
@property
|
172
|
+
def request_interval(self) -> float:
|
173
|
+
return self._config.request_interval
|
174
|
+
|
175
|
+
def set_requester(self, requester: SyncRequesterProtocol) -> None:
|
166
176
|
"""
|
167
177
|
Replace the requester instance with a new one.
|
168
178
|
|
@@ -185,3 +195,14 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
185
195
|
:param saver: The new saver to be used.
|
186
196
|
"""
|
187
197
|
self._saver = saver
|
198
|
+
|
199
|
+
def _handle_download_exception(self, book_id: str, error: Exception) -> None:
|
200
|
+
"""
|
201
|
+
Handle download errors in a consistent way.
|
202
|
+
|
203
|
+
This method can be overridden or extended to implement retry logic, etc.
|
204
|
+
|
205
|
+
:param book_id: The ID of the book that failed.
|
206
|
+
:param error: The exception raised during download.
|
207
|
+
"""
|
208
|
+
self.logger.warning("[downloader] Failed to download %s: %s", book_id, error)
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.downloaders.biquge.biquge_sync
|
4
|
+
----------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
from novel_downloader.config.models import DownloaderConfig
|
9
|
+
from novel_downloader.core.downloaders.common import CommonDownloader
|
10
|
+
from novel_downloader.core.interfaces.parser import ParserProtocol
|
11
|
+
from novel_downloader.core.interfaces.saver import SaverProtocol
|
12
|
+
from novel_downloader.core.interfaces.sync_requester import SyncRequesterProtocol
|
13
|
+
|
14
|
+
|
15
|
+
class BiqugeDownloader(CommonDownloader):
|
16
|
+
""""""
|
17
|
+
|
18
|
+
def __init__(
|
19
|
+
self,
|
20
|
+
requester: SyncRequesterProtocol,
|
21
|
+
parser: ParserProtocol,
|
22
|
+
saver: SaverProtocol,
|
23
|
+
config: DownloaderConfig,
|
24
|
+
):
|
25
|
+
super().__init__(requester, parser, saver, config, "biquge")
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.core.downloaders.common
|
4
|
+
----------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
|
8
|
+
from .common_async import CommonAsyncDownloader
|
9
|
+
from .common_sync import CommonDownloader
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
"CommonAsyncDownloader",
|
13
|
+
"CommonDownloader",
|
14
|
+
]
|
@@ -1,30 +1,28 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.downloaders.
|
5
|
-
|
3
|
+
novel_downloader.core.downloaders.common.common_async
|
4
|
+
-----------------------------------------------------
|
6
5
|
|
7
|
-
This module defines `CommonAsynbDownloader`.
|
8
6
|
"""
|
9
7
|
|
10
8
|
import asyncio
|
11
9
|
import json
|
12
10
|
import logging
|
13
11
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
14
|
-
from typing import Any
|
12
|
+
from typing import Any
|
15
13
|
|
16
14
|
from novel_downloader.config import DownloaderConfig
|
15
|
+
from novel_downloader.core.downloaders.base import BaseAsyncDownloader
|
17
16
|
from novel_downloader.core.interfaces import (
|
18
17
|
AsyncRequesterProtocol,
|
19
18
|
ParserProtocol,
|
20
19
|
SaverProtocol,
|
21
20
|
)
|
21
|
+
from novel_downloader.utils.chapter_storage import ChapterDict, ChapterStorage
|
22
22
|
from novel_downloader.utils.file_utils import save_as_json, save_as_txt
|
23
23
|
from novel_downloader.utils.network import download_image_as_bytes
|
24
24
|
from novel_downloader.utils.time_utils import calculate_time_difference
|
25
25
|
|
26
|
-
from .base_async_downloader import BaseAsyncDownloader
|
27
|
-
|
28
26
|
logger = logging.getLogger(__name__)
|
29
27
|
|
30
28
|
|
@@ -64,19 +62,25 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
64
62
|
assert isinstance(self.requester, AsyncRequesterProtocol)
|
65
63
|
|
66
64
|
TAG = "[AsyncDownloader]"
|
65
|
+
wait_time = self.config.request_interval
|
66
|
+
|
67
67
|
raw_base = self.raw_data_dir / book_id
|
68
68
|
cache_base = self.cache_dir / book_id
|
69
69
|
info_path = raw_base / "book_info.json"
|
70
70
|
chapters_html_dir = cache_base / "html"
|
71
|
-
chapter_dir = raw_base / "chapters"
|
72
71
|
|
73
72
|
raw_base.mkdir(parents=True, exist_ok=True)
|
74
|
-
chapter_dir.mkdir(parents=True, exist_ok=True)
|
75
73
|
if self.save_html:
|
76
74
|
chapters_html_dir.mkdir(parents=True, exist_ok=True)
|
75
|
+
normal_cs = ChapterStorage(
|
76
|
+
raw_base=raw_base,
|
77
|
+
namespace="chapters",
|
78
|
+
backend_type=self._config.storage_backend,
|
79
|
+
batch_size=self._config.storage_batch_size,
|
80
|
+
)
|
77
81
|
|
78
82
|
# load or fetch book_info
|
79
|
-
book_info:
|
83
|
+
book_info: dict[str, Any]
|
80
84
|
re_fetch = True
|
81
85
|
if info_path.exists():
|
82
86
|
try:
|
@@ -89,9 +93,7 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
89
93
|
re_fetch = True
|
90
94
|
|
91
95
|
if re_fetch:
|
92
|
-
info_html = await self.requester.get_book_info(
|
93
|
-
book_id, self.request_interval
|
94
|
-
)
|
96
|
+
info_html = await self.requester.get_book_info(book_id)
|
95
97
|
if self.save_html:
|
96
98
|
save_as_txt(info_html, chapters_html_dir / "info.html")
|
97
99
|
book_info = self.parser.parse_book_info(info_html)
|
@@ -99,6 +101,7 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
99
101
|
save_as_json(book_info, info_path)
|
100
102
|
else:
|
101
103
|
logger.warning("%s 书籍信息未找到, book_id = %s", TAG, book_id)
|
104
|
+
await asyncio.sleep(wait_time)
|
102
105
|
else:
|
103
106
|
book_info = json.loads(info_path.read_text("utf-8"))
|
104
107
|
|
@@ -111,7 +114,8 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
111
114
|
|
112
115
|
# setup queue, semaphore, executor
|
113
116
|
semaphore = asyncio.Semaphore(self.download_workers)
|
114
|
-
queue: asyncio.Queue[
|
117
|
+
queue: asyncio.Queue[tuple[str, str]] = asyncio.Queue()
|
118
|
+
save_queue: asyncio.Queue[ChapterDict] = asyncio.Queue()
|
115
119
|
loop = asyncio.get_running_loop()
|
116
120
|
executor = (
|
117
121
|
ProcessPoolExecutor() if self.use_process_pool else ThreadPoolExecutor()
|
@@ -125,12 +129,7 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
125
129
|
executor, self.parser.parse_chapter, html, cid
|
126
130
|
)
|
127
131
|
if chap_json:
|
128
|
-
await
|
129
|
-
executor,
|
130
|
-
save_as_json,
|
131
|
-
chap_json,
|
132
|
-
chapter_dir / f"{cid}.json",
|
133
|
-
)
|
132
|
+
await save_queue.put(chap_json)
|
134
133
|
logger.info(
|
135
134
|
"%s [Parser-%d] saved chapter %s", TAG, worker_id, cid
|
136
135
|
)
|
@@ -141,27 +140,34 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
141
140
|
finally:
|
142
141
|
queue.task_done()
|
143
142
|
|
144
|
-
async def
|
143
|
+
async def saver_loop(
|
144
|
+
cs: ChapterStorage,
|
145
|
+
queue: asyncio.Queue[ChapterDict],
|
146
|
+
) -> None:
|
147
|
+
while True:
|
148
|
+
data = await queue.get()
|
149
|
+
try:
|
150
|
+
cs.save(data)
|
151
|
+
except Exception as e:
|
152
|
+
logger.error(
|
153
|
+
"[saver] Error saving chapter %s: %s",
|
154
|
+
data.get("id"),
|
155
|
+
e,
|
156
|
+
)
|
157
|
+
finally:
|
158
|
+
queue.task_done()
|
159
|
+
|
160
|
+
async def download_worker(chap: dict[str, Any]) -> None:
|
145
161
|
cid = str(chap.get("chapterId") or "")
|
146
162
|
if not cid:
|
147
163
|
return
|
148
|
-
|
149
|
-
if target.exists() and self.skip_existing:
|
164
|
+
if normal_cs.exists(cid) and self.skip_existing:
|
150
165
|
logger.info("%s skipping existing chapter %s", TAG, cid)
|
151
166
|
return
|
152
167
|
|
153
168
|
try:
|
154
169
|
async with semaphore:
|
155
|
-
html = await self.requester.get_book_chapter(
|
156
|
-
book_id, cid, self.request_interval
|
157
|
-
)
|
158
|
-
if self.save_html:
|
159
|
-
await loop.run_in_executor(
|
160
|
-
executor,
|
161
|
-
save_as_txt,
|
162
|
-
html,
|
163
|
-
chapters_html_dir / f"{cid}.html",
|
164
|
-
)
|
170
|
+
html = await self.requester.get_book_chapter(book_id, cid)
|
165
171
|
await queue.put((cid, html))
|
166
172
|
logger.info("%s downloaded chapter %s", TAG, cid)
|
167
173
|
except Exception as e:
|
@@ -171,6 +177,7 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
171
177
|
parsers = [
|
172
178
|
asyncio.create_task(parser_worker(i)) for i in range(self.parser_workers)
|
173
179
|
]
|
180
|
+
chapter_saver = asyncio.create_task(saver_loop(normal_cs, save_queue))
|
174
181
|
|
175
182
|
# enqueue + run downloads
|
176
183
|
download_tasks = []
|
@@ -180,8 +187,10 @@ class CommonAsyncDownloader(BaseAsyncDownloader):
|
|
180
187
|
|
181
188
|
await asyncio.gather(*download_tasks)
|
182
189
|
await queue.join() # wait until all parsed
|
190
|
+
await save_queue.join()
|
183
191
|
for p in parsers:
|
184
192
|
p.cancel() # stop parser loops
|
193
|
+
chapter_saver.cancel()
|
185
194
|
|
186
195
|
# final save
|
187
196
|
await loop.run_in_executor(executor, self.saver.save, book_id)
|
@@ -1,27 +1,29 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
-
# -*- coding: utf-8 -*-
|
3
2
|
"""
|
4
|
-
novel_downloader.core.downloaders.
|
5
|
-
|
3
|
+
novel_downloader.core.downloaders.common.common_sync
|
4
|
+
----------------------------------------------------
|
6
5
|
|
7
6
|
This module defines `CommonDownloader`.
|
8
7
|
"""
|
9
8
|
|
10
9
|
import json
|
11
10
|
import logging
|
12
|
-
from typing import Any
|
11
|
+
from typing import Any
|
13
12
|
|
14
13
|
from novel_downloader.config import DownloaderConfig
|
14
|
+
from novel_downloader.core.downloaders.base import BaseDownloader
|
15
15
|
from novel_downloader.core.interfaces import (
|
16
16
|
ParserProtocol,
|
17
|
-
RequesterProtocol,
|
18
17
|
SaverProtocol,
|
18
|
+
SyncRequesterProtocol,
|
19
19
|
)
|
20
|
+
from novel_downloader.utils.chapter_storage import ChapterStorage
|
20
21
|
from novel_downloader.utils.file_utils import save_as_json, save_as_txt
|
21
22
|
from novel_downloader.utils.network import download_image_as_bytes
|
22
|
-
from novel_downloader.utils.time_utils import
|
23
|
-
|
24
|
-
|
23
|
+
from novel_downloader.utils.time_utils import (
|
24
|
+
calculate_time_difference,
|
25
|
+
sleep_with_random_delay,
|
26
|
+
)
|
25
27
|
|
26
28
|
logger = logging.getLogger(__name__)
|
27
29
|
|
@@ -33,7 +35,7 @@ class CommonDownloader(BaseDownloader):
|
|
33
35
|
|
34
36
|
def __init__(
|
35
37
|
self,
|
36
|
-
requester:
|
38
|
+
requester: SyncRequesterProtocol,
|
37
39
|
parser: ParserProtocol,
|
38
40
|
saver: SaverProtocol,
|
39
41
|
config: DownloaderConfig,
|
@@ -48,7 +50,7 @@ class CommonDownloader(BaseDownloader):
|
|
48
50
|
:param config: Downloader configuration object.
|
49
51
|
:param site: Identifier for the site the downloader is targeting.
|
50
52
|
"""
|
51
|
-
super().__init__(requester, parser, saver, config)
|
53
|
+
super().__init__(requester, parser, saver, config, site)
|
52
54
|
self._site = site
|
53
55
|
|
54
56
|
def download_one(self, book_id: str) -> None:
|
@@ -60,19 +62,24 @@ class CommonDownloader(BaseDownloader):
|
|
60
62
|
TAG = "[Downloader]"
|
61
63
|
save_html = self.config.save_html
|
62
64
|
skip_existing = self.config.skip_existing
|
63
|
-
site = self.site
|
64
65
|
wait_time = self.config.request_interval
|
65
66
|
|
66
|
-
raw_base = self.raw_data_dir /
|
67
|
-
cache_base = self.cache_dir /
|
67
|
+
raw_base = self.raw_data_dir / book_id
|
68
|
+
cache_base = self.cache_dir / book_id
|
68
69
|
info_path = raw_base / "book_info.json"
|
69
|
-
chapter_dir = raw_base / "chapters"
|
70
70
|
chapters_html_dir = cache_base / "html"
|
71
71
|
|
72
72
|
raw_base.mkdir(parents=True, exist_ok=True)
|
73
|
-
|
73
|
+
if self.save_html:
|
74
|
+
chapters_html_dir.mkdir(parents=True, exist_ok=True)
|
75
|
+
normal_cs = ChapterStorage(
|
76
|
+
raw_base=raw_base,
|
77
|
+
namespace="chapters",
|
78
|
+
backend_type=self._config.storage_backend,
|
79
|
+
batch_size=self._config.storage_batch_size,
|
80
|
+
)
|
74
81
|
|
75
|
-
book_info:
|
82
|
+
book_info: dict[str, Any]
|
76
83
|
|
77
84
|
try:
|
78
85
|
if not info_path.exists():
|
@@ -87,7 +94,7 @@ class CommonDownloader(BaseDownloader):
|
|
87
94
|
if days > 1:
|
88
95
|
raise FileNotFoundError # trigger re-fetch
|
89
96
|
except Exception:
|
90
|
-
info_html = self.requester.get_book_info(book_id
|
97
|
+
info_html = self.requester.get_book_info(book_id)
|
91
98
|
if save_html:
|
92
99
|
info_html_path = chapters_html_dir / "info.html"
|
93
100
|
save_as_txt(info_html, info_html_path)
|
@@ -97,6 +104,7 @@ class CommonDownloader(BaseDownloader):
|
|
97
104
|
and book_info.get("update_time", "") != "未找到更新时间"
|
98
105
|
):
|
99
106
|
save_as_json(book_info, info_path)
|
107
|
+
sleep_with_random_delay(wait_time, mul_spread=1.1, max_sleep=wait_time + 2)
|
100
108
|
|
101
109
|
# download cover
|
102
110
|
cover_url = book_info.get("cover_url", "")
|
@@ -116,8 +124,7 @@ class CommonDownloader(BaseDownloader):
|
|
116
124
|
logger.warning("%s Skipping chapter without chapterId", TAG)
|
117
125
|
continue
|
118
126
|
|
119
|
-
|
120
|
-
if chap_path.exists() and skip_existing:
|
127
|
+
if normal_cs.exists(cid) and skip_existing:
|
121
128
|
logger.debug(
|
122
129
|
"%s Chapter already exists, skipping: %s",
|
123
130
|
TAG,
|
@@ -128,7 +135,7 @@ class CommonDownloader(BaseDownloader):
|
|
128
135
|
chap_title = chap.get("title", "")
|
129
136
|
logger.info("%s Fetching chapter: %s (%s)", TAG, chap_title, cid)
|
130
137
|
try:
|
131
|
-
chap_html = self.requester.get_book_chapter(book_id, cid
|
138
|
+
chap_html = self.requester.get_book_chapter(book_id, cid)
|
132
139
|
|
133
140
|
if save_html:
|
134
141
|
html_path = chapters_html_dir / f"{cid}.html"
|
@@ -141,6 +148,10 @@ class CommonDownloader(BaseDownloader):
|
|
141
148
|
)
|
142
149
|
|
143
150
|
chap_json = self.parser.parse_chapter(chap_html, cid)
|
151
|
+
|
152
|
+
sleep_with_random_delay(
|
153
|
+
wait_time, mul_spread=1.1, max_sleep=wait_time + 2
|
154
|
+
)
|
144
155
|
if not chap_json:
|
145
156
|
logger.warning(
|
146
157
|
"%s Parsed chapter json is empty, skipping: %s (%s)",
|
@@ -159,9 +170,10 @@ class CommonDownloader(BaseDownloader):
|
|
159
170
|
)
|
160
171
|
continue
|
161
172
|
|
162
|
-
|
173
|
+
normal_cs.save(chap_json)
|
163
174
|
logger.info("%s Saved chapter: %s (%s)", TAG, chap_title, cid)
|
164
175
|
|
176
|
+
normal_cs.close()
|
165
177
|
self.saver.save(book_id)
|
166
178
|
|
167
179
|
logger.info(
|