novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -2
- novel_downloader/cli/config.py +1 -83
- novel_downloader/cli/download.py +4 -5
- novel_downloader/cli/export.py +4 -1
- novel_downloader/cli/main.py +2 -0
- novel_downloader/cli/search.py +123 -0
- novel_downloader/config/__init__.py +3 -10
- novel_downloader/config/adapter.py +190 -54
- novel_downloader/config/loader.py +2 -3
- novel_downloader/core/__init__.py +13 -13
- novel_downloader/core/downloaders/__init__.py +10 -11
- novel_downloader/core/downloaders/base.py +152 -26
- novel_downloader/core/downloaders/biquge.py +5 -1
- novel_downloader/core/downloaders/common.py +157 -378
- novel_downloader/core/downloaders/esjzone.py +5 -1
- novel_downloader/core/downloaders/linovelib.py +5 -1
- novel_downloader/core/downloaders/qianbi.py +291 -4
- novel_downloader/core/downloaders/qidian.py +199 -285
- novel_downloader/core/downloaders/registry.py +67 -0
- novel_downloader/core/downloaders/sfacg.py +5 -1
- novel_downloader/core/downloaders/yamibo.py +5 -1
- novel_downloader/core/exporters/__init__.py +10 -11
- novel_downloader/core/exporters/base.py +87 -7
- novel_downloader/core/exporters/biquge.py +5 -8
- novel_downloader/core/exporters/common/__init__.py +2 -2
- novel_downloader/core/exporters/common/epub.py +82 -166
- novel_downloader/core/exporters/common/main_exporter.py +0 -60
- novel_downloader/core/exporters/common/txt.py +82 -83
- novel_downloader/core/exporters/epub_util.py +157 -1330
- novel_downloader/core/exporters/esjzone.py +5 -8
- novel_downloader/core/exporters/linovelib/__init__.py +2 -2
- novel_downloader/core/exporters/linovelib/epub.py +157 -212
- novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
- novel_downloader/core/exporters/linovelib/txt.py +67 -63
- novel_downloader/core/exporters/qianbi.py +5 -8
- novel_downloader/core/exporters/qidian.py +14 -4
- novel_downloader/core/exporters/registry.py +53 -0
- novel_downloader/core/exporters/sfacg.py +5 -8
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/exporters/yamibo.py +5 -8
- novel_downloader/core/fetchers/__init__.py +19 -24
- novel_downloader/core/fetchers/base/__init__.py +3 -3
- novel_downloader/core/fetchers/base/browser.py +23 -4
- novel_downloader/core/fetchers/base/session.py +30 -5
- novel_downloader/core/fetchers/biquge/__init__.py +3 -3
- novel_downloader/core/fetchers/biquge/browser.py +5 -0
- novel_downloader/core/fetchers/biquge/session.py +6 -1
- novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
- novel_downloader/core/fetchers/esjzone/browser.py +5 -0
- novel_downloader/core/fetchers/esjzone/session.py +6 -1
- novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
- novel_downloader/core/fetchers/linovelib/browser.py +6 -1
- novel_downloader/core/fetchers/linovelib/session.py +6 -1
- novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
- novel_downloader/core/fetchers/qianbi/browser.py +5 -0
- novel_downloader/core/fetchers/qianbi/session.py +5 -0
- novel_downloader/core/fetchers/qidian/__init__.py +3 -3
- novel_downloader/core/fetchers/qidian/browser.py +12 -4
- novel_downloader/core/fetchers/qidian/session.py +11 -3
- novel_downloader/core/fetchers/registry.py +71 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
- novel_downloader/core/fetchers/sfacg/browser.py +5 -0
- novel_downloader/core/fetchers/sfacg/session.py +5 -0
- novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
- novel_downloader/core/fetchers/yamibo/browser.py +5 -0
- novel_downloader/core/fetchers/yamibo/session.py +6 -1
- novel_downloader/core/interfaces/__init__.py +7 -5
- novel_downloader/core/interfaces/searcher.py +18 -0
- novel_downloader/core/parsers/__init__.py +10 -11
- novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
- novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
- novel_downloader/core/parsers/qidian/main_parser.py +10 -21
- novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/registry.py +68 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
- novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
- novel_downloader/core/searchers/__init__.py +20 -0
- novel_downloader/core/searchers/base.py +92 -0
- novel_downloader/core/searchers/biquge.py +83 -0
- novel_downloader/core/searchers/esjzone.py +84 -0
- novel_downloader/core/searchers/qianbi.py +131 -0
- novel_downloader/core/searchers/qidian.py +87 -0
- novel_downloader/core/searchers/registry.py +63 -0
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +12 -4
- novel_downloader/models/__init__.py +4 -30
- novel_downloader/models/config.py +12 -6
- novel_downloader/models/search.py +16 -0
- novel_downloader/models/types.py +0 -2
- novel_downloader/resources/config/settings.toml +31 -4
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/utils/__init__.py +52 -0
- novel_downloader/utils/chapter_storage.py +244 -224
- novel_downloader/utils/constants.py +1 -21
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +77 -0
- novel_downloader/utils/epub/documents.py +403 -0
- novel_downloader/utils/epub/models.py +134 -0
- novel_downloader/utils/epub/utils.py +212 -0
- novel_downloader/utils/file_utils/__init__.py +10 -14
- novel_downloader/utils/file_utils/io.py +20 -51
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -3
- novel_downloader/utils/fontocr/__init__.py +5 -5
- novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
- novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
- novel_downloader/utils/fontocr/ocr_v1.py +13 -1
- novel_downloader/utils/fontocr/ocr_v2.py +13 -1
- novel_downloader/utils/fontocr/ocr_v3.py +744 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +2 -0
- novel_downloader/utils/network.py +110 -251
- novel_downloader/utils/state.py +1 -0
- novel_downloader/utils/text_utils/__init__.py +18 -17
- novel_downloader/utils/text_utils/diff_display.py +4 -5
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +3 -3
- novel_downloader/utils/time_utils/datetime_utils.py +4 -5
- novel_downloader/utils/time_utils/sleep_utils.py +2 -3
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
- novel_downloader-1.5.0.dist-info/RECORD +164 -0
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/common/browser.py +0 -79
- novel_downloader/core/fetchers/common/session.py +0 -79
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -7,67 +7,53 @@ Defines ConfigAdapter, which maps a raw configuration dictionary and
|
|
7
7
|
site name into structured dataclass-based config models.
|
8
8
|
"""
|
9
9
|
|
10
|
-
|
10
|
+
import json
|
11
|
+
from typing import Any, cast
|
11
12
|
|
12
13
|
from novel_downloader.models import (
|
13
14
|
BookConfig,
|
14
15
|
DownloaderConfig,
|
15
16
|
ExporterConfig,
|
16
17
|
FetcherConfig,
|
18
|
+
LogLevel,
|
17
19
|
ParserConfig,
|
20
|
+
TextCleanerConfig,
|
18
21
|
)
|
19
|
-
from novel_downloader.utils.constants import SUPPORTED_SITES
|
20
|
-
|
21
|
-
from .site_rules import load_site_rules
|
22
22
|
|
23
23
|
|
24
24
|
class ConfigAdapter:
|
25
25
|
"""
|
26
|
-
Adapter to map a raw
|
26
|
+
Adapter to map a raw configuration dictionary and site name
|
27
|
+
into structured dataclass configuration models.
|
27
28
|
"""
|
28
29
|
|
30
|
+
_ALLOWED_LOG_LEVELS: tuple[LogLevel, ...] = (
|
31
|
+
"DEBUG",
|
32
|
+
"INFO",
|
33
|
+
"WARNING",
|
34
|
+
"ERROR",
|
35
|
+
)
|
36
|
+
|
29
37
|
def __init__(self, config: dict[str, Any], site: str):
|
30
38
|
"""
|
31
|
-
|
32
|
-
|
39
|
+
Initialize the adapter.
|
40
|
+
|
41
|
+
:param config: The fully loaded configuration dictionary.
|
42
|
+
:param site: The current site name (e.g. "qidian").
|
33
43
|
"""
|
34
44
|
self._config = config
|
35
45
|
self._site = site
|
36
46
|
|
37
|
-
|
38
|
-
self._supported_sites = set(site_rules.keys()) | SUPPORTED_SITES
|
39
|
-
|
40
|
-
@property
|
41
|
-
def site(self) -> str:
|
42
|
-
return self._site
|
43
|
-
|
44
|
-
@site.setter
|
45
|
-
def site(self, value: str) -> None:
|
46
|
-
self._site = value
|
47
|
-
|
48
|
-
def _get_site_cfg(self, site: str | None = None) -> dict[str, Any]:
|
49
|
-
"""
|
50
|
-
获取指定站点的配置 (默认为当前适配站点)
|
51
|
-
|
52
|
-
1. 如果有 site-specific 配置, 优先返回它
|
53
|
-
2. 否则, 如果该站点在支持站点中, 尝试返回 'common' 配置
|
54
|
-
3. 否则返回空 dict
|
47
|
+
def get_fetcher_config(self) -> FetcherConfig:
|
55
48
|
"""
|
56
|
-
|
57
|
-
sites_cfg = self._config.get("sites", {}) or {}
|
58
|
-
|
59
|
-
if site in sites_cfg:
|
60
|
-
return sites_cfg[site] or {}
|
61
|
-
|
62
|
-
if site in self._supported_sites:
|
63
|
-
return sites_cfg.get("common", {}) or {}
|
49
|
+
Build a FetcherConfig from the raw configuration.
|
64
50
|
|
65
|
-
|
51
|
+
Reads from:
|
52
|
+
- config["general"] for global defaults (e.g. request_interval)
|
53
|
+
- config["requests"] for HTTP-specific settings (timeouts, retries, etc.)
|
54
|
+
- site-specific overrides under config["sites"][site]
|
66
55
|
|
67
|
-
|
68
|
-
"""
|
69
|
-
从 config["requests"] 中读取通用请求配置
|
70
|
-
返回 FetcherConfig 实例
|
56
|
+
:return: A FetcherConfig instance with all fields populated.
|
71
57
|
"""
|
72
58
|
gen = self._config.get("general", {})
|
73
59
|
req = self._config.get("requests", {})
|
@@ -91,8 +77,15 @@ class ConfigAdapter:
|
|
91
77
|
|
92
78
|
def get_downloader_config(self) -> DownloaderConfig:
|
93
79
|
"""
|
94
|
-
|
95
|
-
|
80
|
+
Build a DownloaderConfig using both general and site-specific settings.
|
81
|
+
|
82
|
+
Reads from:
|
83
|
+
- config["general"] for download directories, worker counts, etc.
|
84
|
+
- config["requests"] for retry and backoff settings
|
85
|
+
- config["general"]["debug"] for debug toggles (e.g. save_html)
|
86
|
+
- config["sites"][site] for login credentials and mode
|
87
|
+
|
88
|
+
:return: A DownloaderConfig instance with all fields populated.
|
96
89
|
"""
|
97
90
|
gen = self._config.get("general", {})
|
98
91
|
req = self._config.get("requests", {})
|
@@ -104,13 +97,11 @@ class ConfigAdapter:
|
|
104
97
|
backoff_factor=req.get("backoff_factor", 2.0),
|
105
98
|
raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
|
106
99
|
cache_dir=gen.get("cache_dir", "./novel_cache"),
|
107
|
-
|
108
|
-
parser_workers=gen.get("parser_workers", 2),
|
100
|
+
workers=gen.get("workers", 2),
|
109
101
|
skip_existing=gen.get("skip_existing", True),
|
110
102
|
login_required=site_cfg.get("login_required", False),
|
111
103
|
save_html=debug.get("save_html", False),
|
112
104
|
mode=site_cfg.get("mode", "session"),
|
113
|
-
storage_backend=gen.get("storage_backend", "json"),
|
114
105
|
storage_batch_size=gen.get("storage_batch_size", 1),
|
115
106
|
username=site_cfg.get("username", ""),
|
116
107
|
password=site_cfg.get("password", ""),
|
@@ -119,8 +110,14 @@ class ConfigAdapter:
|
|
119
110
|
|
120
111
|
def get_parser_config(self) -> ParserConfig:
|
121
112
|
"""
|
122
|
-
|
123
|
-
|
113
|
+
Build a ParserConfig from general, OCR, and site-specific settings.
|
114
|
+
|
115
|
+
Reads from:
|
116
|
+
- config["general"]["cache_dir"] for where to cache intermediate parses
|
117
|
+
- config["general"]["font_ocr"] for font-decoding and OCR options
|
118
|
+
- config["sites"][site] for parsing mode and truncation behavior
|
119
|
+
|
120
|
+
:return: A ParserConfig instance with all fields populated.
|
124
121
|
"""
|
125
122
|
gen = self._config.get("general", {})
|
126
123
|
font_ocr = gen.get("font_ocr", {})
|
@@ -144,20 +141,29 @@ class ConfigAdapter:
|
|
144
141
|
|
145
142
|
def get_exporter_config(self) -> ExporterConfig:
|
146
143
|
"""
|
147
|
-
|
148
|
-
|
144
|
+
Build an ExporterConfig from output and general settings.
|
145
|
+
|
146
|
+
Reads from:
|
147
|
+
- config["general"] for cache and raw data directories
|
148
|
+
- config["output"]["formats"] for which formats to generate
|
149
|
+
- config["output"]["naming"] for filename templates
|
150
|
+
- config["output"]["epub"] for EPUB-specific options
|
151
|
+
- config["sites"][site] for export split mode
|
152
|
+
|
153
|
+
:return: An ExporterConfig instance with all fields populated.
|
149
154
|
"""
|
150
155
|
gen = self._config.get("general", {})
|
151
156
|
out = self._config.get("output", {})
|
157
|
+
cln = self._config.get("cleaner", {})
|
152
158
|
fmt = out.get("formats", {})
|
153
159
|
naming = out.get("naming", {})
|
154
160
|
epub_opts = out.get("epub", {})
|
155
161
|
site_cfg = self._get_site_cfg()
|
162
|
+
cleaner_cfg = self._dict_to_cleaner_cfg(cln)
|
156
163
|
return ExporterConfig(
|
157
164
|
cache_dir=gen.get("cache_dir", "./novel_cache"),
|
158
165
|
raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
|
159
166
|
output_dir=gen.get("output_dir", "./downloads"),
|
160
|
-
storage_backend=gen.get("storage_backend", "json"),
|
161
167
|
clean_text=out.get("clean_text", True),
|
162
168
|
make_txt=fmt.get("make_txt", True),
|
163
169
|
make_epub=fmt.get("make_epub", False),
|
@@ -169,11 +175,20 @@ class ConfigAdapter:
|
|
169
175
|
include_toc=epub_opts.get("include_toc", False),
|
170
176
|
include_picture=epub_opts.get("include_picture", False),
|
171
177
|
split_mode=site_cfg.get("split_mode", "book"),
|
178
|
+
cleaner_cfg=cleaner_cfg,
|
172
179
|
)
|
173
180
|
|
174
181
|
def get_book_ids(self) -> list[BookConfig]:
|
175
182
|
"""
|
176
|
-
|
183
|
+
Extract the list of target books from the site configuration.
|
184
|
+
|
185
|
+
The site config may specify book_ids as:
|
186
|
+
- a single string or integer
|
187
|
+
- a dict with book_id and optional start_id, end_id, ignore_ids
|
188
|
+
- a list of the above types
|
189
|
+
|
190
|
+
:return: A list of BookConfig dicts.
|
191
|
+
:raises ValueError: if the raw book_ids is neither a str/int, dict, nor list.
|
177
192
|
"""
|
178
193
|
site_cfg = self._get_site_cfg()
|
179
194
|
raw = site_cfg.get("book_ids", [])
|
@@ -182,7 +197,7 @@ class ConfigAdapter:
|
|
182
197
|
return [{"book_id": str(raw)}]
|
183
198
|
|
184
199
|
if isinstance(raw, dict):
|
185
|
-
return [self.
|
200
|
+
return [self._dict_to_book_cfg(raw)]
|
186
201
|
|
187
202
|
if not isinstance(raw, list):
|
188
203
|
raise ValueError(
|
@@ -195,17 +210,71 @@ class ConfigAdapter:
|
|
195
210
|
if isinstance(item, str | int):
|
196
211
|
result.append({"book_id": str(item)})
|
197
212
|
elif isinstance(item, dict):
|
198
|
-
result.append(self.
|
213
|
+
result.append(self._dict_to_book_cfg(item))
|
199
214
|
except ValueError:
|
200
215
|
continue
|
201
216
|
|
202
217
|
return result
|
203
218
|
|
219
|
+
def get_log_level(self) -> LogLevel:
|
220
|
+
"""
|
221
|
+
Retrieve the logging level from [general.debug].
|
222
|
+
|
223
|
+
Reads from config["general"]["debug"]["log_level"], defaulting to "INFO"
|
224
|
+
if not set or invalid.
|
225
|
+
|
226
|
+
:return: The configured LogLevel literal ("DEBUG", "INFO", "WARNING", "ERROR").
|
227
|
+
"""
|
228
|
+
debug_cfg = self._config.get("general", {}).get("debug", {})
|
229
|
+
raw = debug_cfg.get("log_level") or "INFO"
|
230
|
+
if raw in self._ALLOWED_LOG_LEVELS:
|
231
|
+
return cast(LogLevel, raw)
|
232
|
+
return "INFO"
|
233
|
+
|
234
|
+
@property
|
235
|
+
def site(self) -> str:
|
236
|
+
"""
|
237
|
+
Get the current site name.
|
238
|
+
"""
|
239
|
+
return self._site
|
240
|
+
|
241
|
+
@site.setter
|
242
|
+
def site(self, value: str) -> None:
|
243
|
+
"""
|
244
|
+
Set a new site name for configuration lookups.
|
245
|
+
|
246
|
+
:param value: The new site key in config["sites"] to use.
|
247
|
+
"""
|
248
|
+
self._site = value
|
249
|
+
|
250
|
+
def _get_site_cfg(self, site: str | None = None) -> dict[str, Any]:
|
251
|
+
"""
|
252
|
+
Retrieve the configuration for a specific site.
|
253
|
+
|
254
|
+
Lookup order:
|
255
|
+
1. If there is a site-specific entry under config["sites"], return that.
|
256
|
+
2. Otherwise, if a "common" entry exists under config["sites"], return that.
|
257
|
+
3. If neither is present, return an empty dict.
|
258
|
+
|
259
|
+
:param site: Optional override of the site name; defaults to self._site.
|
260
|
+
:return: The site-specific or common configuration dict.
|
261
|
+
"""
|
262
|
+
site = site or self._site
|
263
|
+
sites_cfg = self._config.get("sites", {}) or {}
|
264
|
+
|
265
|
+
if site in sites_cfg:
|
266
|
+
return sites_cfg[site] or {}
|
267
|
+
|
268
|
+
return sites_cfg.get("common", {}) or {}
|
269
|
+
|
204
270
|
@staticmethod
|
205
|
-
def
|
271
|
+
def _dict_to_book_cfg(data: dict[str, Any]) -> BookConfig:
|
206
272
|
"""
|
207
|
-
|
208
|
-
|
273
|
+
Convert a dictionary to a BookConfig with normalized types.
|
274
|
+
|
275
|
+
:param data: A dict that must contain at least "book_id".
|
276
|
+
:return: A BookConfig dict with all values cast to strings or lists of strings.
|
277
|
+
:raises ValueError: if the "book_id" field is missing.
|
209
278
|
"""
|
210
279
|
if "book_id" not in data:
|
211
280
|
raise ValueError("Missing required field 'book_id'")
|
@@ -222,3 +291,70 @@ class ConfigAdapter:
|
|
222
291
|
result["ignore_ids"] = [str(x) for x in data["ignore_ids"]]
|
223
292
|
|
224
293
|
return result
|
294
|
+
|
295
|
+
@classmethod
|
296
|
+
def _dict_to_cleaner_cfg(cls, cfg: dict[str, Any]) -> TextCleanerConfig:
|
297
|
+
"""
|
298
|
+
Convert a nested dict of title/content rules into a TextCleanerConfig.
|
299
|
+
|
300
|
+
:param cfg: configuration dictionary
|
301
|
+
:return: fully constructed TextCleanerConfig
|
302
|
+
"""
|
303
|
+
# Title rules
|
304
|
+
title_section = cfg.get("title", {})
|
305
|
+
title_remove = title_section.get("remove_patterns", [])
|
306
|
+
title_repl = title_section.get("replace", {})
|
307
|
+
|
308
|
+
title_ext = title_section.get("external", {})
|
309
|
+
title_ext_en = title_ext.get("enabled", False)
|
310
|
+
title_ext_rm_p = title_ext.get("remove_patterns", "")
|
311
|
+
title_ext_rp_p = title_ext.get("replace", "")
|
312
|
+
if title_ext_en:
|
313
|
+
title_remove_ext = cls._load_str_list(title_ext_rm_p)
|
314
|
+
title_remove += title_remove_ext
|
315
|
+
|
316
|
+
title_repl_ext = cls._load_str_dict(title_ext_rp_p)
|
317
|
+
title_repl = {**title_repl, **title_repl_ext}
|
318
|
+
|
319
|
+
# Content rules
|
320
|
+
content_section = cfg.get("content", {})
|
321
|
+
content_remove = content_section.get("remove_patterns", [])
|
322
|
+
content_repl = content_section.get("replace", {})
|
323
|
+
|
324
|
+
content_ext = content_section.get("external", {})
|
325
|
+
content_ext_en = content_ext.get("enabled", False)
|
326
|
+
content_ext_rm_p = content_ext.get("remove_patterns", "")
|
327
|
+
content_ext_rp_p = content_ext.get("replace", "")
|
328
|
+
|
329
|
+
if content_ext_en:
|
330
|
+
content_remove_ext = cls._load_str_list(content_ext_rm_p)
|
331
|
+
content_remove += content_remove_ext
|
332
|
+
|
333
|
+
content_repl_ext = cls._load_str_dict(content_ext_rp_p)
|
334
|
+
content_repl = {**content_repl, **content_repl_ext}
|
335
|
+
|
336
|
+
return TextCleanerConfig(
|
337
|
+
remove_invisible=cfg.get("remove_invisible", True),
|
338
|
+
title_remove_patterns=title_remove,
|
339
|
+
title_replacements=title_repl,
|
340
|
+
content_remove_patterns=content_remove,
|
341
|
+
content_replacements=content_repl,
|
342
|
+
)
|
343
|
+
|
344
|
+
@staticmethod
|
345
|
+
def _load_str_list(path: str) -> list[str]:
|
346
|
+
try:
|
347
|
+
with open(path, encoding="utf-8") as f:
|
348
|
+
parsed = json.load(f)
|
349
|
+
return cast(list[str], parsed)
|
350
|
+
except Exception:
|
351
|
+
return []
|
352
|
+
|
353
|
+
@staticmethod
|
354
|
+
def _load_str_dict(path: str) -> dict[str, str]:
|
355
|
+
try:
|
356
|
+
with open(path, encoding="utf-8") as f:
|
357
|
+
parsed = json.load(f)
|
358
|
+
return cast(dict[str, str], parsed)
|
359
|
+
except Exception:
|
360
|
+
return {}
|
@@ -7,6 +7,8 @@ Provides functionality to load Toml configuration files into Python
|
|
7
7
|
dictionaries, with robust error handling and fallback support.
|
8
8
|
"""
|
9
9
|
|
10
|
+
__all__ = ["load_config"]
|
11
|
+
|
10
12
|
import json
|
11
13
|
import logging
|
12
14
|
from pathlib import Path
|
@@ -180,6 +182,3 @@ def save_config_file(
|
|
180
182
|
|
181
183
|
logger.info("[config] Configuration successfully saved to JSON: %s", output)
|
182
184
|
return
|
183
|
-
|
184
|
-
|
185
|
-
__all__ = ["load_config"]
|
@@ -14,26 +14,26 @@ downloading and processing online novel content, including:
|
|
14
14
|
- Exporter: Responsible for exporting downloaded data into various output formats.
|
15
15
|
"""
|
16
16
|
|
17
|
-
from .factory import (
|
18
|
-
get_downloader,
|
19
|
-
get_exporter,
|
20
|
-
get_fetcher,
|
21
|
-
get_parser,
|
22
|
-
)
|
23
|
-
from .interfaces import (
|
24
|
-
DownloaderProtocol,
|
25
|
-
ExporterProtocol,
|
26
|
-
FetcherProtocol,
|
27
|
-
ParserProtocol,
|
28
|
-
)
|
29
|
-
|
30
17
|
__all__ = [
|
31
18
|
"get_downloader",
|
32
19
|
"get_exporter",
|
33
20
|
"get_fetcher",
|
34
21
|
"get_parser",
|
22
|
+
"search",
|
35
23
|
"DownloaderProtocol",
|
36
24
|
"ExporterProtocol",
|
37
25
|
"FetcherProtocol",
|
38
26
|
"ParserProtocol",
|
39
27
|
]
|
28
|
+
|
29
|
+
from .downloaders import get_downloader
|
30
|
+
from .exporters import get_exporter
|
31
|
+
from .fetchers import get_fetcher
|
32
|
+
from .interfaces import (
|
33
|
+
DownloaderProtocol,
|
34
|
+
ExporterProtocol,
|
35
|
+
FetcherProtocol,
|
36
|
+
ParserProtocol,
|
37
|
+
)
|
38
|
+
from .parsers import get_parser
|
39
|
+
from .searchers import search
|
@@ -17,19 +17,10 @@ Currently supported platforms:
|
|
17
17
|
- qidian (起点中文网)
|
18
18
|
- sfacg (SF轻小说)
|
19
19
|
- yamibo (百合会)
|
20
|
-
- common (通用架构)
|
21
20
|
"""
|
22
21
|
|
23
|
-
from .biquge import BiqugeDownloader
|
24
|
-
from .common import CommonDownloader
|
25
|
-
from .esjzone import EsjzoneDownloader
|
26
|
-
from .linovelib import LinovelibDownloader
|
27
|
-
from .qianbi import QianbiDownloader
|
28
|
-
from .qidian import QidianDownloader
|
29
|
-
from .sfacg import SfacgDownloader
|
30
|
-
from .yamibo import YamiboDownloader
|
31
|
-
|
32
22
|
__all__ = [
|
23
|
+
"get_downloader",
|
33
24
|
"BiqugeDownloader",
|
34
25
|
"EsjzoneDownloader",
|
35
26
|
"LinovelibDownloader",
|
@@ -37,5 +28,13 @@ __all__ = [
|
|
37
28
|
"QidianDownloader",
|
38
29
|
"SfacgDownloader",
|
39
30
|
"YamiboDownloader",
|
40
|
-
"CommonDownloader",
|
41
31
|
]
|
32
|
+
|
33
|
+
from .biquge import BiqugeDownloader
|
34
|
+
from .esjzone import EsjzoneDownloader
|
35
|
+
from .linovelib import LinovelibDownloader
|
36
|
+
from .qianbi import QianbiDownloader
|
37
|
+
from .qidian import QidianDownloader
|
38
|
+
from .registry import get_downloader
|
39
|
+
from .sfacg import SfacgDownloader
|
40
|
+
from .yamibo import YamiboDownloader
|
@@ -8,8 +8,9 @@ common interface and reusable logic for all downloader implementations.
|
|
8
8
|
"""
|
9
9
|
|
10
10
|
import abc
|
11
|
+
import json
|
11
12
|
import logging
|
12
|
-
from collections.abc import Awaitable, Callable
|
13
|
+
from collections.abc import AsyncIterator, Awaitable, Callable, Sequence
|
13
14
|
from pathlib import Path
|
14
15
|
from typing import Any
|
15
16
|
|
@@ -19,32 +20,54 @@ from novel_downloader.core.interfaces import (
|
|
19
20
|
ParserProtocol,
|
20
21
|
)
|
21
22
|
from novel_downloader.models import BookConfig, DownloaderConfig
|
23
|
+
from novel_downloader.utils import calculate_time_difference
|
22
24
|
|
23
25
|
|
24
26
|
class BaseDownloader(DownloaderProtocol, abc.ABC):
|
25
27
|
"""
|
26
|
-
Abstract
|
27
|
-
and the general batch download flow.
|
28
|
+
Abstract base class for novel downloaders.
|
28
29
|
|
29
|
-
|
30
|
+
Defines the general interface and batch download workflow,
|
31
|
+
while delegating book-specific downloading logic to subclasses.
|
32
|
+
|
33
|
+
Subclasses are required to implement methods for downloading
|
34
|
+
a single book, using the provided fetcher and parser components.
|
30
35
|
"""
|
31
36
|
|
37
|
+
DEFAULT_SOURCE_ID = 0
|
38
|
+
DEFAULT_PRIORITIES_MAP = {
|
39
|
+
DEFAULT_SOURCE_ID: 0,
|
40
|
+
}
|
41
|
+
|
32
42
|
def __init__(
|
33
43
|
self,
|
34
44
|
fetcher: FetcherProtocol,
|
35
45
|
parser: ParserProtocol,
|
36
46
|
config: DownloaderConfig,
|
37
47
|
site: str,
|
48
|
+
priorities: dict[int, int] | None = None,
|
38
49
|
):
|
50
|
+
"""
|
51
|
+
Initialize the downloader for a specific site.
|
52
|
+
|
53
|
+
:param fetcher: Fetcher component for retrieving raw chapter data.
|
54
|
+
:param parser: Parser component for extracting chapter content.
|
55
|
+
:param config: Downloader configuration settings.
|
56
|
+
:param site: Identifier for the target website or source.
|
57
|
+
:param priorities: Mapping of source_id to priority value.
|
58
|
+
Lower numbers indicate higher priority.
|
59
|
+
E.X. {0: 10, 1: 100} means source 0 is preferred.
|
60
|
+
"""
|
39
61
|
self._fetcher = fetcher
|
40
62
|
self._parser = parser
|
41
63
|
self._config = config
|
42
64
|
self._site = site
|
65
|
+
self._priorities = priorities or self.DEFAULT_PRIORITIES_MAP
|
43
66
|
|
44
67
|
self._raw_data_dir = Path(config.raw_data_dir) / site
|
45
|
-
self._cache_dir = Path(config.cache_dir) / site
|
46
68
|
self._raw_data_dir.mkdir(parents=True, exist_ok=True)
|
47
|
-
self.
|
69
|
+
self._debug_dir = Path.cwd() / "debug" / site
|
70
|
+
self._debug_dir.mkdir(parents=True, exist_ok=True)
|
48
71
|
|
49
72
|
self.logger = logging.getLogger(f"{self.__class__.__name__}")
|
50
73
|
|
@@ -117,6 +140,28 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
117
140
|
|
118
141
|
await self._finalize()
|
119
142
|
|
143
|
+
async def load_book_info(
|
144
|
+
self,
|
145
|
+
book_id: str,
|
146
|
+
html_dir: Path,
|
147
|
+
) -> dict[str, Any]:
|
148
|
+
book_info = self._load_book_info(
|
149
|
+
book_id=book_id,
|
150
|
+
max_age_days=1,
|
151
|
+
)
|
152
|
+
if book_info:
|
153
|
+
return book_info
|
154
|
+
|
155
|
+
info_html = await self.fetcher.get_book_info(book_id)
|
156
|
+
self._save_html_pages(html_dir, "info", info_html)
|
157
|
+
book_info = self.parser.parse_book_info(info_html)
|
158
|
+
|
159
|
+
if book_info:
|
160
|
+
self._save_book_info(book_id, book_info)
|
161
|
+
return book_info
|
162
|
+
|
163
|
+
return self._load_book_info(book_id)
|
164
|
+
|
120
165
|
@abc.abstractmethod
|
121
166
|
async def _download_one(
|
122
167
|
self,
|
@@ -147,29 +192,110 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
147
192
|
"""
|
148
193
|
return
|
149
194
|
|
150
|
-
|
151
|
-
|
152
|
-
|
195
|
+
def _load_book_info(
|
196
|
+
self,
|
197
|
+
book_id: str,
|
198
|
+
*,
|
199
|
+
max_age_days: int | None = None,
|
200
|
+
) -> dict[str, Any]:
|
201
|
+
"""
|
202
|
+
Attempt to read and parse the book_info.json for a given book_id.
|
153
203
|
|
154
|
-
|
155
|
-
|
156
|
-
return
|
204
|
+
:param book_id: identifier of the book
|
205
|
+
:param max_age_days: if set, only return if 'update_time' is less
|
206
|
+
:return: dict of book info if is valid JSON, else empty
|
207
|
+
"""
|
208
|
+
info_path = self._raw_data_dir / book_id / "book_info.json"
|
209
|
+
if not info_path.is_file():
|
210
|
+
return {}
|
157
211
|
|
158
|
-
|
159
|
-
|
160
|
-
|
212
|
+
try:
|
213
|
+
data: dict[str, Any] = json.loads(info_path.read_text(encoding="utf-8"))
|
214
|
+
except json.JSONDecodeError:
|
215
|
+
return {}
|
216
|
+
|
217
|
+
if max_age_days is not None:
|
218
|
+
days, *_ = calculate_time_difference(
|
219
|
+
data.get("update_time", ""),
|
220
|
+
"UTC+8",
|
221
|
+
)
|
222
|
+
if days > max_age_days:
|
223
|
+
return {}
|
161
224
|
|
162
|
-
|
163
|
-
|
164
|
-
|
225
|
+
return data
|
226
|
+
|
227
|
+
def _save_book_info(
|
228
|
+
self,
|
229
|
+
book_id: str,
|
230
|
+
book_info: dict[str, Any],
|
231
|
+
) -> None:
|
232
|
+
"""
|
233
|
+
Serialize and save the book_info dict as json.
|
234
|
+
|
235
|
+
:param book_id: identifier of the book
|
236
|
+
:param book_info: dict containing metadata about the book
|
237
|
+
"""
|
238
|
+
target_dir = self._raw_data_dir / book_id
|
239
|
+
target_dir.mkdir(parents=True, exist_ok=True)
|
240
|
+
(target_dir / "book_info.json").write_text(
|
241
|
+
json.dumps(book_info, ensure_ascii=False, indent=2),
|
242
|
+
encoding="utf-8",
|
243
|
+
)
|
244
|
+
|
245
|
+
def _save_html_pages(
|
246
|
+
self,
|
247
|
+
html_dir: Path,
|
248
|
+
filename: str,
|
249
|
+
html_list: Sequence[str],
|
250
|
+
) -> None:
|
251
|
+
"""
|
252
|
+
If save_html is enabled, write each HTML snippet to a file.
|
253
|
+
|
254
|
+
Filenames will be {chap_id}_{index}.html in html_dir.
|
255
|
+
|
256
|
+
:param html_dir: directory in which to write HTML files
|
257
|
+
:param filename: used as filename prefix
|
258
|
+
:param html_list: list of HTML strings to save
|
259
|
+
"""
|
260
|
+
if not self.save_html:
|
261
|
+
return
|
262
|
+
|
263
|
+
html_dir.mkdir(parents=True, exist_ok=True)
|
264
|
+
for i, html in enumerate(html_list):
|
265
|
+
file_path = html_dir / f"{filename}_{i}.html"
|
266
|
+
file_path.write_text(html, encoding="utf-8")
|
267
|
+
|
268
|
+
@staticmethod
|
269
|
+
async def _chapter_ids(
|
270
|
+
volumes: list[dict[str, Any]],
|
271
|
+
start_id: str | None,
|
272
|
+
end_id: str | None,
|
273
|
+
) -> AsyncIterator[str]:
|
274
|
+
"""
|
275
|
+
Yield each chapterId in order, respecting start/end bounds.
|
276
|
+
"""
|
277
|
+
seen_start = start_id is None
|
278
|
+
for vol in volumes:
|
279
|
+
for chap in vol.get("chapters", []):
|
280
|
+
cid = chap.get("chapterId")
|
281
|
+
if not cid:
|
282
|
+
continue
|
283
|
+
if not seen_start:
|
284
|
+
if cid == start_id:
|
285
|
+
seen_start = True
|
286
|
+
else:
|
287
|
+
continue
|
288
|
+
yield cid
|
289
|
+
if end_id is not None and cid == end_id:
|
290
|
+
return
|
165
291
|
|
166
292
|
@property
|
167
|
-
def
|
168
|
-
return self.
|
293
|
+
def fetcher(self) -> FetcherProtocol:
|
294
|
+
return self._fetcher
|
169
295
|
|
170
296
|
@property
|
171
|
-
def
|
172
|
-
return self.
|
297
|
+
def parser(self) -> ParserProtocol:
|
298
|
+
return self._parser
|
173
299
|
|
174
300
|
@property
|
175
301
|
def save_html(self) -> bool:
|
@@ -196,12 +322,12 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
|
|
196
322
|
return self._config.backoff_factor
|
197
323
|
|
198
324
|
@property
|
199
|
-
def
|
200
|
-
return self._config.
|
325
|
+
def workers(self) -> int:
|
326
|
+
return self._config.workers
|
201
327
|
|
202
328
|
@property
|
203
|
-
def
|
204
|
-
return self._config.
|
329
|
+
def storage_batch_size(self) -> int:
|
330
|
+
return max(1, self._config.storage_batch_size)
|
205
331
|
|
206
332
|
def _handle_download_exception(self, book: BookConfig, error: Exception) -> None:
|
207
333
|
"""
|