novel-downloader 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +14 -0
- novel_downloader/cli/__init__.py +14 -0
- novel_downloader/cli/clean.py +134 -0
- novel_downloader/cli/download.py +132 -0
- novel_downloader/cli/interactive.py +67 -0
- novel_downloader/cli/main.py +45 -0
- novel_downloader/cli/settings.py +177 -0
- novel_downloader/config/__init__.py +52 -0
- novel_downloader/config/adapter.py +153 -0
- novel_downloader/config/loader.py +177 -0
- novel_downloader/config/models.py +173 -0
- novel_downloader/config/site_rules.py +97 -0
- novel_downloader/core/__init__.py +25 -0
- novel_downloader/core/downloaders/__init__.py +22 -0
- novel_downloader/core/downloaders/base_async_downloader.py +157 -0
- novel_downloader/core/downloaders/base_downloader.py +187 -0
- novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
- novel_downloader/core/downloaders/common_downloader.py +191 -0
- novel_downloader/core/downloaders/qidian_downloader.py +208 -0
- novel_downloader/core/factory/__init__.py +33 -0
- novel_downloader/core/factory/downloader_factory.py +149 -0
- novel_downloader/core/factory/parser_factory.py +62 -0
- novel_downloader/core/factory/requester_factory.py +106 -0
- novel_downloader/core/factory/saver_factory.py +49 -0
- novel_downloader/core/interfaces/__init__.py +32 -0
- novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
- novel_downloader/core/interfaces/downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/parser_protocol.py +40 -0
- novel_downloader/core/interfaces/requester_protocol.py +65 -0
- novel_downloader/core/interfaces/saver_protocol.py +61 -0
- novel_downloader/core/parsers/__init__.py +28 -0
- novel_downloader/core/parsers/base_parser.py +96 -0
- novel_downloader/core/parsers/common_parser/__init__.py +14 -0
- novel_downloader/core/parsers/common_parser/helper.py +321 -0
- novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
- novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
- novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
- novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
- novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
- novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
- novel_downloader/core/requesters/__init__.py +31 -0
- novel_downloader/core/requesters/base_async_session.py +297 -0
- novel_downloader/core/requesters/base_browser.py +210 -0
- novel_downloader/core/requesters/base_session.py +243 -0
- novel_downloader/core/requesters/common_requester/__init__.py +18 -0
- novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
- novel_downloader/core/requesters/common_requester/common_session.py +126 -0
- novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
- novel_downloader/core/savers/__init__.py +20 -0
- novel_downloader/core/savers/base_saver.py +169 -0
- novel_downloader/core/savers/common_saver/__init__.py +13 -0
- novel_downloader/core/savers/common_saver/common_epub.py +232 -0
- novel_downloader/core/savers/common_saver/common_txt.py +176 -0
- novel_downloader/core/savers/common_saver/main_saver.py +86 -0
- novel_downloader/core/savers/epub_utils/__init__.py +27 -0
- novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
- novel_downloader/core/savers/epub_utils/initializer.py +98 -0
- novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
- novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
- novel_downloader/core/savers/qidian_saver.py +22 -0
- novel_downloader/locales/en.json +91 -0
- novel_downloader/locales/zh.json +91 -0
- novel_downloader/resources/config/rules.toml +196 -0
- novel_downloader/resources/config/settings.yaml +73 -0
- novel_downloader/resources/css_styles/main.css +104 -0
- novel_downloader/resources/css_styles/volume-intro.css +56 -0
- novel_downloader/resources/images/volume_border.png +0 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
- novel_downloader/resources/json/replace_word_map.json +4 -0
- novel_downloader/resources/text/blacklist.txt +22 -0
- novel_downloader/utils/__init__.py +0 -0
- novel_downloader/utils/cache.py +24 -0
- novel_downloader/utils/constants.py +158 -0
- novel_downloader/utils/crypto_utils.py +144 -0
- novel_downloader/utils/file_utils/__init__.py +43 -0
- novel_downloader/utils/file_utils/io.py +252 -0
- novel_downloader/utils/file_utils/normalize.py +68 -0
- novel_downloader/utils/file_utils/sanitize.py +77 -0
- novel_downloader/utils/fontocr/__init__.py +23 -0
- novel_downloader/utils/fontocr/ocr_v1.py +304 -0
- novel_downloader/utils/fontocr/ocr_v2.py +658 -0
- novel_downloader/utils/hash_store.py +288 -0
- novel_downloader/utils/hash_utils.py +103 -0
- novel_downloader/utils/i18n.py +41 -0
- novel_downloader/utils/logger.py +104 -0
- novel_downloader/utils/model_loader.py +72 -0
- novel_downloader/utils/network.py +287 -0
- novel_downloader/utils/state.py +156 -0
- novel_downloader/utils/text_utils/__init__.py +27 -0
- novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
- novel_downloader/utils/text_utils/diff_display.py +75 -0
- novel_downloader/utils/text_utils/font_mapping.py +31 -0
- novel_downloader/utils/text_utils/text_cleaning.py +57 -0
- novel_downloader/utils/time_utils/__init__.py +22 -0
- novel_downloader/utils/time_utils/datetime_utils.py +146 -0
- novel_downloader/utils/time_utils/sleep_utils.py +49 -0
- novel_downloader-1.1.0.dist-info/METADATA +157 -0
- novel_downloader-1.1.0.dist-info/RECORD +115 -0
- novel_downloader-1.1.0.dist-info/WHEEL +5 -0
- novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
- novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
- novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,153 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.config.adapter
|
5
|
+
-------------------------------
|
6
|
+
|
7
|
+
Defines ConfigAdapter, which maps a raw configuration dictionary and
|
8
|
+
site name into structured dataclass-based config models.
|
9
|
+
|
10
|
+
Supported mappings:
|
11
|
+
- requests -> RequesterConfig
|
12
|
+
- general+site -> DownloaderConfig
|
13
|
+
- general+site -> ParserConfig
|
14
|
+
- general+output -> SaverConfig
|
15
|
+
- sites[site] -> book_ids list
|
16
|
+
"""
|
17
|
+
|
18
|
+
from typing import Any, Dict, List
|
19
|
+
|
20
|
+
from .models import (
|
21
|
+
DownloaderConfig,
|
22
|
+
ParserConfig,
|
23
|
+
RequesterConfig,
|
24
|
+
SaverConfig,
|
25
|
+
)
|
26
|
+
|
27
|
+
|
28
|
+
class ConfigAdapter:
    """
    Adapter to map a raw config dict + site name into structured dataclass configs.

    Sections that are missing or explicitly null (for example a bare
    ``general:`` key in a YAML file) are treated as empty mappings, so the
    per-field defaults below apply instead of raising ``AttributeError``.
    """

    def __init__(self, config: Dict[str, Any], site: str):
        """
        :param config: The fully loaded configuration dict.
        :param site: Name of the current site (e.g. "qidian").
        """
        self._config = config
        self._site = site

    def set_site(self, site: str) -> None:
        """
        Switch the site that subsequent lookups are resolved against.

        :param site: Name of the site to adapt to.
        """
        self._site = site

    def _section(self, name: str) -> Dict[str, Any]:
        """Return the top-level section ``name``, or ``{}`` if absent/null."""
        return self._config.get(name) or {}

    def _site_cfg(self) -> Dict[str, Any]:
        """Return ``config["sites"][site]``, or ``{}`` if absent/null."""
        return self._section("sites").get(self._site) or {}

    def get_requester_config(self) -> RequesterConfig:
        """
        Build the requester config from ``config["requests"]`` (general
        request settings, including the DrissionPage options); ``mode`` and
        ``max_rps`` are read from the current site's section.

        :return: A RequesterConfig instance.
        """
        req = self._section("requests")
        site_cfg = self._site_cfg()
        return RequesterConfig(
            wait_time=req.get("wait_time", 5),
            retry_times=req.get("retry_times", 3),
            retry_interval=req.get("retry_interval", 5),
            timeout=req.get("timeout", 30),
            headless=req.get("headless", True),
            user_data_folder=req.get("user_data_folder", "./user_data"),
            profile_name=req.get("profile_name", "Profile_1"),
            auto_close=req.get("auto_close", True),
            disable_images=req.get("disable_images", True),
            mute_audio=req.get("mute_audio", True),
            mode=site_cfg.get("mode", "session"),
            max_rps=site_cfg.get("max_rps", None),
        )

    def get_downloader_config(self) -> DownloaderConfig:
        """
        Build the downloader config from ``config["general"]`` and
        ``config["sites"][site]``.

        :return: A DownloaderConfig instance.
        """
        gen = self._section("general")
        debug = gen.get("debug") or {}
        site_cfg = self._site_cfg()
        return DownloaderConfig(
            request_interval=gen.get("request_interval", 5),
            raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
            cache_dir=gen.get("cache_dir", "./cache"),
            download_workers=gen.get("download_workers", 4),
            parser_workers=gen.get("parser_workers", 4),
            use_process_pool=gen.get("use_process_pool", True),
            skip_existing=gen.get("skip_existing", True),
            login_required=site_cfg.get("login_required", False),
            save_html=debug.get("save_html", False),
            mode=site_cfg.get("mode", "session"),
        )

    def get_parser_config(self) -> ParserConfig:
        """
        Build the parser config from ``config["general"]["cache_dir"]`` and
        ``config["sites"][site]``.

        :return: A ParserConfig instance.
        """
        gen = self._section("general")
        site_cfg = self._site_cfg()
        return ParserConfig(
            cache_dir=gen.get("cache_dir", "./cache"),
            decode_font=site_cfg.get("decode_font", False),
            use_freq=site_cfg.get("use_freq", False),
            use_ocr=site_cfg.get("use_ocr", True),
            use_vec=site_cfg.get("use_vec", False),
            ocr_version=site_cfg.get("ocr_version", "v1.0"),
            save_font_debug=site_cfg.get("save_font_debug", False),
            batch_size=site_cfg.get("batch_size", 32),
            ocr_weight=site_cfg.get("ocr_weight", 0.6),
            vec_weight=site_cfg.get("vec_weight", 0.4),
            mode=site_cfg.get("mode", "session"),
        )

    def get_saver_config(self) -> SaverConfig:
        """
        Build the saver config from ``config["general"]`` and
        ``config["output"]`` (including its ``formats``, ``naming`` and
        ``epub`` sub-sections).

        :return: A SaverConfig instance.
        """
        gen = self._section("general")
        out = self._section("output")
        fmt = out.get("formats") or {}
        naming = out.get("naming") or {}
        epub_opts = out.get("epub") or {}
        return SaverConfig(
            raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
            output_dir=gen.get("output_dir", "./downloads"),
            clean_text=out.get("clean_text", True),
            make_txt=fmt.get("make_txt", True),
            make_epub=fmt.get("make_epub", False),
            make_md=fmt.get("make_md", False),
            make_pdf=fmt.get("make_pdf", False),
            append_timestamp=naming.get("append_timestamp", True),
            filename_template=naming.get("filename_template", "{title}_{author}"),
            include_cover=epub_opts.get("include_cover", True),
            include_toc=epub_opts.get("include_toc", False),
        )

    def get_book_ids(self) -> List[str]:
        """
        Extract the target book list from ``config["sites"][site]["book_ids"]``.

        A single string or integer is wrapped into a one-element list; list
        items are converted with ``str``. A missing or null ``book_ids``
        yields an empty list.

        :return: The book IDs as strings.
        :raises ValueError: If ``book_ids`` is neither a list, a string,
                            an integer, nor null.
        """
        raw_ids = self._site_cfg().get("book_ids")

        # A key left blank in YAML loads as None; treat it like "no books".
        if raw_ids is None:
            return []

        if isinstance(raw_ids, str):
            return [raw_ids]

        if isinstance(raw_ids, int):
            return [str(raw_ids)]

        if not isinstance(raw_ids, list):
            raise ValueError(
                f"book_ids must be a list or string, got {type(raw_ids).__name__}"
            )

        return [str(book_id) for book_id in raw_ids]
|
@@ -0,0 +1,177 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.config.loader
|
5
|
+
--------------------------------
|
6
|
+
|
7
|
+
Provides functionality to load YAML configuration files into Python
|
8
|
+
dictionaries, with robust error handling and fallback support.
|
9
|
+
|
10
|
+
This is typically used to load user-supplied or internal default config files.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import json
|
14
|
+
import logging
|
15
|
+
from importlib.abc import Traversable
|
16
|
+
from importlib.resources import as_file
|
17
|
+
from pathlib import Path
|
18
|
+
from typing import Any, Dict, Optional, Union
|
19
|
+
|
20
|
+
import yaml
|
21
|
+
|
22
|
+
from novel_downloader.utils.cache import cached_load_config
|
23
|
+
from novel_downloader.utils.constants import (
|
24
|
+
BASE_CONFIG_PATH,
|
25
|
+
SETTING_FILE,
|
26
|
+
)
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
def resolve_config_path(
    config_path: Optional[Union[str, Path]]
) -> Optional[Union[Path, Traversable]]:
    """
    Resolve which configuration file to use, in this priority order:

    1. User-specified path (the `config_path` argument).
    2. `./settings.yaml` in the current working directory.
    3. The global settings file (`SETTING_FILE`).
    4. The internal default (`BASE_CONFIG_PATH`).

    A user-specified path that does not point to a file is reported with a
    warning and the search continues with the remaining candidates.

    :param config_path: Optional user-supplied path (``~`` is expanded).
    :return: The first candidate that is an existing file, or None if
             none of them exists.
    """
    # 1. Try the user-provided path
    if config_path:
        path = Path(config_path).expanduser().resolve()
        if path.is_file():
            return path
        logger.warning("[config] Specified config file not found: %s", path)

    # 2. Try ./settings.yaml in the current working directory
    local_path = Path.cwd() / "settings.yaml"
    if local_path.is_file():
        logger.debug("[config] Using local settings.yaml at %s", local_path)
        return local_path

    # 3. Try the globally registered settings file
    if SETTING_FILE.is_file():
        logger.debug("[config] Using global settings file at %s", SETTING_FILE)
        return SETTING_FILE

    # 4. Fallback to the internal default configuration.
    # Check existence like every other candidate so the documented
    # "None if none is found" result can actually occur.
    if BASE_CONFIG_PATH.is_file():
        logger.debug(
            "[config] Falling back to internal base config at %s", BASE_CONFIG_PATH
        )
        return BASE_CONFIG_PATH

    logger.error("[config] Internal base config not found: %s", BASE_CONFIG_PATH)
    return None
|
71
|
+
|
72
|
+
|
73
|
+
@cached_load_config
def load_config(config_path: Optional[Union[str, Path]]) -> Dict[str, Any]:
    """
    Read a configuration file and return its contents as a dict.

    The file is located through ``resolve_config_path``. A ``.json`` suffix
    is parsed as JSON; any other suffix is parsed as YAML. When no file is
    found, the file cannot be read, parsing fails, or the document root is
    not a mapping, the problem is logged and an empty dict is returned.
    An empty document also yields an empty dict.

    :param config_path: Optional path to the configuration file.
    :return: Parsed configuration as a dict (possibly empty).
    """
    resolved = resolve_config_path(config_path)
    if not (resolved and resolved.is_file()):
        logger.warning("[config] No valid config file found, using empty config.")
        return {}

    # as_file yields a real filesystem path even for packaged resources.
    with as_file(resolved) as concrete:
        try:
            raw_text = concrete.read_text(encoding="utf-8")
            suffix = concrete.suffix.lower()
        except Exception as exc:
            logger.error(
                "[config] Failed to read config file '%s': %s", resolved, exc
            )
            return {}

    parsed: Any = None

    if suffix != ".json":
        try:
            parsed = yaml.safe_load(raw_text)
        except yaml.YAMLError as exc:
            logger.error("[config] YAML parse error in '%s': %s", resolved, exc)
            return {}
    else:
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError as exc:
            logger.error("[config] JSON parse error in '%s': %s", resolved, exc)
            return {}

    if parsed is None:
        return {}
    if isinstance(parsed, dict):
        return parsed

    logger.warning(
        "[config] Expected dict in config file '%s', got %s",
        resolved,
        type(parsed).__name__,
    )
    return {}
|
120
|
+
|
121
|
+
|
122
|
+
def save_config_file(
    source_path: Union[str, Path],
    output_path: Union[str, Path] = SETTING_FILE,
) -> None:
    """
    Validate a YAML/JSON config file and re-save its contents as JSON.

    :param source_path: Path to the source config (.yaml, .yml or .json).
    :param output_path: Destination path to save the config (default: SETTING_FILE).
    :raises FileNotFoundError: If the source path is not an existing file.
    :raises ValueError: If the suffix is unsupported, the file cannot be
                        parsed, or its root is not a mapping.
    """
    src = Path(source_path).expanduser().resolve()
    dst = Path(output_path).expanduser().resolve()

    if not src.is_file():
        raise FileNotFoundError(f"Source file not found: {src}")

    suffix = src.suffix.lower()
    if suffix not in {".yaml", ".yml", ".json"}:
        raise ValueError(f"Source file must be .yaml, .yml, or .json: {src}")

    if suffix == ".json":
        logger.debug("[config] Loading JSON for saving: %s", src)
        try:
            with src.open("r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except json.JSONDecodeError as exc:
            logger.error("[config] Invalid JSON format: %s", exc)
            raise ValueError(f"Invalid JSON file: {src}") from exc
    else:
        logger.debug("[config] Loading YAML for conversion: %s", src)
        try:
            with src.open("r", encoding="utf-8") as fh:
                payload = yaml.safe_load(fh)
        except yaml.YAMLError as exc:
            logger.error("[config] Invalid YAML format: %s", exc)
            raise ValueError(f"Invalid YAML file: {src}") from exc

    # An empty document parses to None and is rejected here as well.
    if not isinstance(payload, dict):
        raise ValueError(f"Config root must be a JSON/YAML object: {src}")

    dst.parent.mkdir(parents=True, exist_ok=True)
    try:
        with dst.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, indent=2, ensure_ascii=False)
    except Exception as exc:
        logger.error("[config] Failed to write config JSON '%s': %s", dst, exc)
        raise

    logger.info("[config] Configuration successfully saved to JSON: %s", dst)
|
175
|
+
|
176
|
+
|
177
|
+
__all__ = ["load_config"]
|
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.config.models
|
5
|
+
------------------------------
|
6
|
+
|
7
|
+
Defines structured configuration models using dataclasses for each
|
8
|
+
major component in the novel_downloader pipeline.
|
9
|
+
|
10
|
+
Each config section corresponds to a specific stage of the pipeline:
|
11
|
+
- RequesterConfig: network settings for requests and DrissionPage
|
12
|
+
- DownloaderConfig: chapter download behavior and local raw data paths
|
13
|
+
- ParserConfig: font decoding, cache handling, and debug options
|
14
|
+
- SaverConfig: output formatting, export formats, and filename templates
|
15
|
+
|
16
|
+
These models are used to map loaded YAML or JSON config data into
|
17
|
+
strongly typed Python objects for safer and cleaner access.
|
18
|
+
"""
|
19
|
+
|
20
|
+
from dataclasses import dataclass
|
21
|
+
from typing import Any, Dict, List, Literal, Optional, TypedDict
|
22
|
+
|
23
|
+
|
24
|
+
# === Requesters ===
|
25
|
+
@dataclass
class RequesterConfig:
    """Network settings for requests and DrissionPage."""

    # Request timing / retry settings
    wait_time: int = 5
    retry_times: int = 3
    retry_interval: int = 5
    timeout: int = 30

    # Browser-related options
    headless: bool = True
    user_data_folder: str = ""
    profile_name: str = ""
    auto_close: bool = True
    disable_images: bool = True
    mute_audio: bool = True

    # Per-site behaviour
    mode: str = "session"  # browser / session / async
    max_rps: Optional[float] = None  # Maximum requests per second
|
39
|
+
|
40
|
+
|
41
|
+
# === Downloaders ===
|
42
|
+
@dataclass
class DownloaderConfig:
    """Chapter download behavior and local raw data paths."""

    request_interval: int = 5

    # Local storage locations
    raw_data_dir: str = "./raw_data"
    cache_dir: str = "./novel_cache"

    # Worker settings
    download_workers: int = 4
    parser_workers: int = 4
    use_process_pool: bool = False

    # Behaviour switches
    skip_existing: bool = True
    login_required: bool = False
    save_html: bool = False
    mode: str = "session"  # browser / session / async
|
54
|
+
|
55
|
+
|
56
|
+
# === Parsers ===
|
57
|
+
@dataclass
class ParserConfig:
    """Font decoding, cache handling, and debug options."""

    cache_dir: str = "./novel_cache"

    # Font decoding switches
    decode_font: bool = False
    use_freq: bool = False
    use_ocr: bool = True
    use_vec: bool = False

    # OCR / vector tuning values
    ocr_version: str = "v1.0"
    batch_size: int = 32
    ocr_weight: float = 0.6
    vec_weight: float = 0.4

    save_font_debug: bool = False
    mode: str = "session"  # browser / session
|
70
|
+
|
71
|
+
|
72
|
+
# === Savers ===
|
73
|
+
@dataclass
class SaverConfig:
    """Output formatting, export formats, and filename templates."""

    # Directories
    raw_data_dir: str = "./raw_data"
    output_dir: str = "./downloads"

    clean_text: bool = True

    # Export formats
    make_txt: bool = True
    make_epub: bool = False
    make_md: bool = False
    make_pdf: bool = False

    # File naming
    append_timestamp: bool = True
    filename_template: str = "{title}_{author}"

    # EPUB options
    include_cover: bool = True
    include_toc: bool = False
|
86
|
+
|
87
|
+
|
88
|
+
class RuleStep(TypedDict, total=False):
    """One step of an extraction rule; every key is optional."""

    # --- Operation type --- #
    type: Literal[
        "attr",
        "select_one",
        "select",
        "find",
        "find_all",
        "exclude",
        "regex",
        "text",
        "strip",
        "replace",
        "split",
        "join",
    ]

    # --- BeautifulSoup related --- #
    selector: Optional[str]  # CSS selector, used by select/select_one/exclude
    name: Optional[str]  # tag name, used by find/find_all
    attrs: Optional[Dict[str, Any]]  # attribute filter, used by find/find_all
    limit: Optional[int]  # maximum number of matches for find_all
    attr: Optional[str]  # attribute to read from the element (select/select_one/select_all)

    # --- Regex related --- #
    pattern: Optional[str]  # regular expression
    flags: Optional[int]  # re.I, re.M, etc.
    group: Optional[int]  # which group of the match to take (default 0)
    template: Optional[str]  # custom combination, e.g. "$1$2字"

    # --- Text processing --- #
    chars: Optional[str]  # character set to remove for strip
    old: Optional[str]  # substring to be replaced in replace
    new: Optional[str]  # replacement substring in replace
    count: Optional[int]  # maximum number of replacements in replace
    sep: Optional[str]  # separator for split/join
    index: Optional[int]  # which element to take after split/select_all/select
|
125
|
+
|
126
|
+
|
127
|
+
class FieldRules(TypedDict):
    """Rules for a single field: an ordered list of rule steps."""

    steps: List[RuleStep]
|
129
|
+
|
130
|
+
|
131
|
+
class ChapterFieldRules(TypedDict):
    """A keyed list of rule steps, used as an entry of ``chapter_steps``."""

    key: str
    steps: List[RuleStep]
|
134
|
+
|
135
|
+
|
136
|
+
class VolumesRules(TypedDict, total=False):
    """Rules describing the volume/chapter listing; every key is optional."""

    has_volume: bool  # whether volumes exist; false = not split into volumes
    volume_selector: str  # selector for volume blocks when volumes exist
    chapter_selector: str  # selector for chapter nodes
    volume_name_steps: List[RuleStep]
    chapter_steps: List[ChapterFieldRules]  # steps for extracting chapter info
    volume_mode: str  # Optional: "normal" (default) or "mixed"
    list_selector: str  # Optional: If "mixed" mode, parent container selector
|
144
|
+
|
145
|
+
|
146
|
+
class BookInfoRules(TypedDict, total=False):
    """Per-field rules for a book-info page; every key is optional."""

    book_name: FieldRules
    author: FieldRules
    cover_url: FieldRules
    update_time: FieldRules
    serial_status: FieldRules
    word_count: FieldRules
    summary: FieldRules
    volumes: VolumesRules
|
155
|
+
|
156
|
+
|
157
|
+
class ChapterRules(TypedDict, total=False):
    """Per-field rules for a chapter page; every key is optional."""

    title: FieldRules
    content: FieldRules
|
160
|
+
|
161
|
+
|
162
|
+
class SiteProfile(TypedDict):
    """URL entries of a site: book-info URL and chapter URL."""

    book_info_url: str
    chapter_url: str
|
165
|
+
|
166
|
+
|
167
|
+
class SiteRules(TypedDict):
    """Complete rule set of one site: profile, book-info rules, chapter rules."""

    profile: SiteProfile
    book_info: BookInfoRules
    chapter: ChapterRules
|
171
|
+
|
172
|
+
|
173
|
+
SiteRulesDict = Dict[str, SiteRules]
|
@@ -0,0 +1,97 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.config.site_rules
|
5
|
+
----------------------------------
|
6
|
+
|
7
|
+
Handles loading, saving, and caching of site-specific scraping rules.
|
8
|
+
|
9
|
+
This module provides functionality to:
|
10
|
+
- Load site rules from JSON, YAML, or TOML formats.
|
11
|
+
- Save rules into a standard JSON format.
|
12
|
+
"""
|
13
|
+
|
14
|
+
import json
|
15
|
+
import logging
|
16
|
+
from pathlib import Path
|
17
|
+
from typing import Union
|
18
|
+
|
19
|
+
from novel_downloader.utils.cache import cached_load_config
|
20
|
+
from novel_downloader.utils.constants import SITE_RULES_FILE
|
21
|
+
from novel_downloader.utils.file_utils import save_as_json
|
22
|
+
|
23
|
+
from .models import SiteRulesDict
|
24
|
+
|
25
|
+
logger = logging.getLogger(__name__)
|
26
|
+
|
27
|
+
|
28
|
+
def save_rules_as_json(
    source_path: Union[str, Path], output_path: Union[str, Path] = SITE_RULES_FILE
) -> None:
    """
    Read a rules file (toml, yaml, or json) and write its contents out as JSON.

    :param source_path: Path to the source rules file
                        (supports .toml, .yaml, .yml, .json).
    :param output_path: Path where the JSON output will be saved.
                        Defaults to SITE_RULES_FILE.
    :raises FileNotFoundError: If the source_path does not exist.
    :raises ValueError: If the source file format is not supported.
    :raises Exception: If file loading or saving fails.
    """
    tag = "[Config]"
    src = Path(source_path)
    dst = Path(output_path)

    if not src.exists():
        raise FileNotFoundError(f"Source file {src} not found.")

    fmt = src.suffix.lower()
    logger.debug("%s Loading rules from %s (format: %s)", tag, src, fmt)

    try:
        if fmt == ".json":
            with src.open("r", encoding="utf-8") as fh:
                rules = json.load(fh)
        elif fmt in {".yaml", ".yml"}:
            # Imported lazily so yaml is only needed for YAML sources.
            import yaml

            with src.open("r", encoding="utf-8") as fh:
                rules = yaml.safe_load(fh)
        elif fmt == ".toml":
            # Imported lazily so tomllib is only needed for TOML sources.
            import tomllib

            with src.open("rb") as fh:
                rules = tomllib.load(fh)
        else:
            raise ValueError(f"Unsupported input format: {fmt}")
    except Exception as exc:
        # Covers the unsupported-format error too: log, then propagate.
        logger.error("%s Failed to load rules from %s: %s", tag, src, str(exc))
        raise

    logger.info("%s Saving rules to %s as JSON", tag, dst)
    save_as_json(rules, dst)
|
78
|
+
|
79
|
+
|
80
|
+
@cached_load_config
def load_site_rules(json_path: Union[str, Path] = SITE_RULES_FILE) -> SiteRulesDict:
    """
    Load site scraping rules from a JSON file.

    The function is wrapped by ``cached_load_config``, which presumably
    caches the result for later calls (decorator not shown here).

    :param json_path: Path to the site rules JSON file. Defaults to SITE_RULES_FILE.
    :return: A dictionary containing all site-specific scraping rules,
             or an empty dict when the file does not exist.
    """
    rules_file = Path(json_path)

    if rules_file.exists():
        with rules_file.open("r", encoding="utf-8") as fh:
            loaded: SiteRulesDict = json.load(fh)
        return loaded

    return {}
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core
|
5
|
+
---------------------
|
6
|
+
|
7
|
+
This package serves as the core layer of the novel_downloader system.
|
8
|
+
|
9
|
+
It provides factory methods for constructing key components required for
|
10
|
+
downloading and processing online novel content, including:
|
11
|
+
|
12
|
+
- Downloader: Handles the full download lifecycle of a book or a batch of books.
|
13
|
+
- Parser: Extracts structured data from HTML or SSR content.
|
14
|
+
- Requester: Sends HTTP requests and manages sessions, including login if required.
|
15
|
+
- Saver: Responsible for exporting downloaded data into various output formats.
|
16
|
+
"""
|
17
|
+
|
18
|
+
from .factory import get_downloader, get_parser, get_requester, get_saver
|
19
|
+
|
20
|
+
__all__ = [
|
21
|
+
"get_downloader",
|
22
|
+
"get_parser",
|
23
|
+
"get_requester",
|
24
|
+
"get_saver",
|
25
|
+
]
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core.downloaders
|
5
|
+
---------------------------------
|
6
|
+
|
7
|
+
This subpackage contains concrete downloader implementations for
|
8
|
+
specific novel platforms.
|
9
|
+
|
10
|
+
Each downloader is responsible for orchestrating the full lifecycle
|
11
|
+
of retrieving, parsing, and saving novel content for a given source.
|
12
|
+
"""
|
13
|
+
|
14
|
+
from .common_asynb_downloader import CommonAsyncDownloader
|
15
|
+
from .common_downloader import CommonDownloader
|
16
|
+
from .qidian_downloader import QidianDownloader
|
17
|
+
|
18
|
+
__all__ = [
|
19
|
+
"CommonAsyncDownloader",
|
20
|
+
"CommonDownloader",
|
21
|
+
"QidianDownloader",
|
22
|
+
]
|