novel-downloader 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +98 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +150 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +170 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +20 -0
  15. novel_downloader/core/downloaders/base_downloader.py +187 -0
  16. novel_downloader/core/downloaders/common_downloader.py +192 -0
  17. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  18. novel_downloader/core/factory/__init__.py +21 -0
  19. novel_downloader/core/factory/downloader_factory.py +62 -0
  20. novel_downloader/core/factory/parser_factory.py +62 -0
  21. novel_downloader/core/factory/requester_factory.py +62 -0
  22. novel_downloader/core/factory/saver_factory.py +49 -0
  23. novel_downloader/core/interfaces/__init__.py +28 -0
  24. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  25. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  26. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  27. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  28. novel_downloader/core/parsers/__init__.py +28 -0
  29. novel_downloader/core/parsers/base_parser.py +96 -0
  30. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  31. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  32. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  33. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  34. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  35. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  36. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  37. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  39. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  40. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  41. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  42. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  43. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  44. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  45. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  46. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  47. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  48. novel_downloader/core/requesters/__init__.py +27 -0
  49. novel_downloader/core/requesters/base_browser.py +210 -0
  50. novel_downloader/core/requesters/base_session.py +243 -0
  51. novel_downloader/core/requesters/common_requester/__init__.py +14 -0
  52. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  53. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  54. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  55. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  56. novel_downloader/core/savers/__init__.py +20 -0
  57. novel_downloader/core/savers/base_saver.py +169 -0
  58. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  59. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  60. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  61. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  62. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  63. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  64. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  65. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  66. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  67. novel_downloader/core/savers/qidian_saver.py +22 -0
  68. novel_downloader/locales/en.json +91 -0
  69. novel_downloader/locales/zh.json +91 -0
  70. novel_downloader/resources/config/rules.toml +196 -0
  71. novel_downloader/resources/config/settings.yaml +70 -0
  72. novel_downloader/resources/css_styles/main.css +104 -0
  73. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  74. novel_downloader/resources/images/volume_border.png +0 -0
  75. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  76. novel_downloader/resources/json/replace_word_map.json +4 -0
  77. novel_downloader/resources/text/blacklist.txt +22 -0
  78. novel_downloader/utils/__init__.py +0 -0
  79. novel_downloader/utils/cache.py +24 -0
  80. novel_downloader/utils/constants.py +158 -0
  81. novel_downloader/utils/crypto_utils.py +144 -0
  82. novel_downloader/utils/file_utils/__init__.py +43 -0
  83. novel_downloader/utils/file_utils/io.py +252 -0
  84. novel_downloader/utils/file_utils/normalize.py +68 -0
  85. novel_downloader/utils/file_utils/sanitize.py +77 -0
  86. novel_downloader/utils/fontocr/__init__.py +23 -0
  87. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  88. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  89. novel_downloader/utils/hash_store.py +288 -0
  90. novel_downloader/utils/hash_utils.py +103 -0
  91. novel_downloader/utils/i18n.py +41 -0
  92. novel_downloader/utils/logger.py +104 -0
  93. novel_downloader/utils/model_loader.py +72 -0
  94. novel_downloader/utils/network.py +287 -0
  95. novel_downloader/utils/state.py +156 -0
  96. novel_downloader/utils/text_utils/__init__.py +27 -0
  97. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  98. novel_downloader/utils/text_utils/diff_display.py +75 -0
  99. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  100. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  101. novel_downloader/utils/time_utils/__init__.py +22 -0
  102. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  103. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  104. novel_downloader-1.1.1.dist-info/METADATA +137 -0
  105. novel_downloader-1.1.1.dist-info/RECORD +109 -0
  106. novel_downloader-1.1.1.dist-info/WHEEL +5 -0
  107. novel_downloader-1.1.1.dist-info/entry_points.txt +2 -0
  108. novel_downloader-1.1.1.dist-info/licenses/LICENSE +21 -0
  109. novel_downloader-1.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,150 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.config.adapter
5
+ -------------------------------
6
+
7
+ Defines ConfigAdapter, which maps a raw configuration dictionary and
8
+ site name into structured dataclass-based config models.
9
+
10
+ Supported mappings:
11
+ - requests+site -> RequesterConfig
12
+ - general+site -> DownloaderConfig
13
+ - general+site -> ParserConfig
14
+ - general+output -> SaverConfig
15
+ - sites[site] -> book_ids list
16
+ """
17
+
18
+ from typing import Any, Dict, List
19
+
20
+ from .models import (
21
+ DownloaderConfig,
22
+ ParserConfig,
23
+ RequesterConfig,
24
+ SaverConfig,
25
+ )
26
+
27
+
28
class ConfigAdapter:
    """
    Adapter to map a raw config dict + site name into structured dataclass configs.

    Sections that are missing, or present but empty (``None``, which is what a
    bare ``key:`` yields when loaded from YAML), are treated as empty mappings
    so every getter falls back to its defaults instead of raising
    ``AttributeError``.
    """

    def __init__(self, config: Dict[str, Any], site: str):
        """
        :param config: The fully loaded configuration dict.
        :param site: Name of the current site (e.g. "qidian").
        """
        self._config = config
        self._site = site

    @staticmethod
    def _section(mapping: Dict[str, Any], key: str) -> Dict[str, Any]:
        """
        Return ``mapping[key]``, or an empty dict when the key is absent
        or its value is ``None``.
        """
        value = mapping.get(key)
        return {} if value is None else value

    def _site_section(self) -> Dict[str, Any]:
        """
        Return the ``config["sites"][site]`` mapping for the current site.
        """
        return self._section(self._section(self._config, "sites"), self._site)

    def set_site(self, site: str) -> None:
        """
        Switch the site this adapter reads site-specific settings for.
        """
        self._site = site

    def get_requester_config(self) -> RequesterConfig:
        """
        Build a RequesterConfig from config["requests"], plus the ``mode``
        entry of config["sites"][site].
        """
        req = self._section(self._config, "requests")
        site_cfg = self._site_section()
        return RequesterConfig(
            wait_time=req.get("wait_time", 5),
            retry_times=req.get("retry_times", 3),
            retry_interval=req.get("retry_interval", 5),
            timeout=req.get("timeout", 30),
            headless=req.get("headless", True),
            # NOTE(review): the two fallbacks below differ from the
            # RequesterConfig dataclass defaults (both "") - confirm intent.
            user_data_folder=req.get("user_data_folder", "./user_data"),
            profile_name=req.get("profile_name", "Profile_1"),
            auto_close=req.get("auto_close", True),
            disable_images=req.get("disable_images", True),
            mute_audio=req.get("mute_audio", True),
            mode=site_cfg.get("mode", "session"),
        )

    def get_downloader_config(self) -> DownloaderConfig:
        """
        Build a DownloaderConfig from config["general"] (including its
        "debug" sub-section) and config["sites"][site].
        """
        gen = self._section(self._config, "general")
        debug = self._section(gen, "debug")
        site_cfg = self._site_section()
        return DownloaderConfig(
            request_interval=gen.get("request_interval", 5),
            raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
            # NOTE(review): the DownloaderConfig dataclass default is
            # "./novel_cache" - confirm which fallback is intended.
            cache_dir=gen.get("cache_dir", "./cache"),
            max_threads=gen.get("max_threads", 4),
            skip_existing=gen.get("skip_existing", True),
            login_required=site_cfg.get("login_required", False),
            save_html=debug.get("save_html", False),
            mode=site_cfg.get("mode", "session"),
        )

    def get_parser_config(self) -> ParserConfig:
        """
        Build a ParserConfig from config["general"]["cache_dir"] and
        config["sites"][site].
        """
        gen = self._section(self._config, "general")
        site_cfg = self._site_section()
        return ParserConfig(
            # NOTE(review): the ParserConfig dataclass default is
            # "./novel_cache" - confirm which fallback is intended.
            cache_dir=gen.get("cache_dir", "./cache"),
            decode_font=site_cfg.get("decode_font", False),
            use_freq=site_cfg.get("use_freq", False),
            use_ocr=site_cfg.get("use_ocr", True),
            use_vec=site_cfg.get("use_vec", False),
            ocr_version=site_cfg.get("ocr_version", "v1.0"),
            save_font_debug=site_cfg.get("save_font_debug", False),
            batch_size=site_cfg.get("batch_size", 32),
            ocr_weight=site_cfg.get("ocr_weight", 0.6),
            vec_weight=site_cfg.get("vec_weight", 0.4),
            mode=site_cfg.get("mode", "session"),
        )

    def get_saver_config(self) -> SaverConfig:
        """
        Build a SaverConfig from config["general"] and config["output"]
        (including its "formats", "naming" and "epub" sub-sections).
        """
        gen = self._section(self._config, "general")
        out = self._section(self._config, "output")
        fmt = self._section(out, "formats")
        naming = self._section(out, "naming")
        epub_opts = self._section(out, "epub")
        return SaverConfig(
            raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
            output_dir=gen.get("output_dir", "./downloads"),
            clean_text=out.get("clean_text", True),
            make_txt=fmt.get("make_txt", True),
            make_epub=fmt.get("make_epub", False),
            make_md=fmt.get("make_md", False),
            make_pdf=fmt.get("make_pdf", False),
            append_timestamp=naming.get("append_timestamp", True),
            filename_template=naming.get("filename_template", "{title}_{author}"),
            include_cover=epub_opts.get("include_cover", True),
            include_toc=epub_opts.get("include_toc", False),
        )

    def get_book_ids(self) -> List[str]:
        """
        Extract the target book list from config["sites"][site]["book_ids"].

        A single string or int is wrapped into a one-element list; list
        items are converted with ``str``. A missing or ``None`` entry
        yields an empty list.

        :raises ValueError: If book_ids is neither a list, a string nor an int.
        """
        raw_ids = self._site_section().get("book_ids")

        # "book_ids:" with no value loads as None; treat it like a missing key.
        if raw_ids is None:
            return []

        if isinstance(raw_ids, (str, int)):
            return [str(raw_ids)]

        if not isinstance(raw_ids, list):
            raise ValueError(
                "book_ids must be a list, string or int, "
                f"got {type(raw_ids).__name__}"
            )

        return [str(book_id) for book_id in raw_ids]
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.config.loader
5
+ --------------------------------
6
+
7
+ Provides functionality to load YAML or JSON configuration files into Python
8
+ dictionaries, with robust error handling and fallback support.
9
+
10
+ This is typically used to load user-supplied or internal default config files.
11
+ """
12
+
13
+ import json
14
+ import logging
15
+ from importlib.abc import Traversable
16
+ from importlib.resources import as_file
17
+ from pathlib import Path
18
+ from typing import Any, Dict, Optional, Union
19
+
20
+ import yaml
21
+
22
+ from novel_downloader.utils.cache import cached_load_config
23
+ from novel_downloader.utils.constants import (
24
+ BASE_CONFIG_PATH,
25
+ SETTING_FILE,
26
+ )
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
def resolve_config_path(
    config_path: Optional[Union[str, Path]]
) -> Optional[Union[Path, Traversable]]:
    """
    Resolve which configuration file to use, in this priority order:

    1. User-specified path (the `config_path` argument).
    2. `./settings.yaml` in the current working directory.
    3. The global settings file (`SETTING_FILE`).
    4. The internal default (`BASE_CONFIG_PATH`).

    A user-specified path that does not point to a file is reported with a
    warning and the search continues with the remaining candidates.

    :param config_path: Optional user-supplied path; ``~`` is expanded.
    :return: The first candidate that is an existing file, or None if
             none of them is.
    """
    # 1. Try the user-provided path
    if config_path:
        path = Path(config_path).expanduser().resolve()
        if path.is_file():
            return path
        logger.warning("[config] Specified config file not found: %s", path)

    # 2. Try ./settings.yaml in the current working directory
    local_path = Path.cwd() / "settings.yaml"
    if local_path.is_file():
        logger.debug("[config] Using local settings.yaml at %s", local_path)
        return local_path

    # 3. Try the globally registered settings file
    if SETTING_FILE.is_file():
        logger.debug("[config] Using global settings file at %s", SETTING_FILE)
        return SETTING_FILE

    # 4. Fallback to the internal default configuration. Check it like every
    #    other candidate so the documented None result is actually reachable.
    if BASE_CONFIG_PATH.is_file():
        logger.debug(
            "[config] Falling back to internal base config at %s", BASE_CONFIG_PATH
        )
        return BASE_CONFIG_PATH

    logger.error("[config] Internal base config not found: %s", BASE_CONFIG_PATH)
    return None
71
+
72
+
73
@cached_load_config
def load_config(config_path: Optional[Union[str, Path]]) -> Dict[str, Any]:
    """
    Read a configuration file and return its contents as a dict.

    The file to read is chosen by ``resolve_config_path``. A file whose
    suffix is ``.json`` is parsed as JSON; anything else is parsed as YAML.
    A missing file, read failure, parse error or non-dict root is logged
    and produces an empty dict; an empty document produces an empty dict
    without logging.

    :param config_path: Optional path to the configuration file.
    :return: Parsed configuration as a dict (possibly empty).
    """
    resolved = resolve_config_path(config_path)
    if not (resolved and resolved.is_file()):
        logger.warning("[config] No valid config file found, using empty config.")
        return {}

    # as_file() yields a filesystem path for the resolved resource.
    with as_file(resolved) as fs_path:
        try:
            text = fs_path.read_text(encoding="utf-8")
            suffix = fs_path.suffix.lower()
        except Exception as exc:
            logger.error(
                "[config] Failed to read config file '%s': %s", resolved, exc
            )
            return {}

    parsed: Any = None

    if suffix != ".json":
        try:
            parsed = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            logger.error("[config] YAML parse error in '%s': %s", resolved, exc)
            return {}
    else:
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError as exc:
            logger.error("[config] JSON parse error in '%s': %s", resolved, exc)
            return {}

    if isinstance(parsed, dict):
        return parsed

    # An empty document (None) is silently treated as an empty config;
    # any other non-dict root is reported.
    if parsed is not None:
        logger.warning(
            "[config] Expected dict in config file '%s', got %s",
            resolved,
            type(parsed).__name__,
        )
    return {}
120
+
121
+
122
def save_config_file(
    source_path: Union[str, Path],
    output_path: Union[str, Path] = SETTING_FILE,
) -> None:
    """
    Validate a YAML/JSON config file and store its contents as JSON.

    The source is parsed according to its suffix (.yaml / .yml / .json),
    its root is checked to be a dict, and the data is written as indented
    JSON to ``output_path``; missing parent directories are created.

    :param source_path: The user-provided YAML or JSON file path.
    :param output_path: Destination path to save the config (default: SETTING_FILE).
    :raises FileNotFoundError: If ``source_path`` is not an existing file.
    :raises ValueError: If the suffix is unsupported, the file cannot be
                        parsed, or its root is not a dict.
    """
    src = Path(source_path).expanduser().resolve()
    dest = Path(output_path).expanduser().resolve()

    if not src.is_file():
        raise FileNotFoundError(f"Source file not found: {src}")

    suffix = src.suffix.lower()

    # Reject unknown formats up front so only two parsers remain below.
    if suffix not in {".yaml", ".yml", ".json"}:
        raise ValueError(f"Source file must be .yaml, .yml, or .json: {src}")

    if suffix == ".json":
        logger.debug("[config] Loading JSON for saving: %s", src)
        try:
            with src.open("r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except json.JSONDecodeError as exc:
            logger.error("[config] Invalid JSON format: %s", exc)
            raise ValueError(f"Invalid JSON file: {src}") from exc
    else:
        logger.debug("[config] Loading YAML for conversion: %s", src)
        try:
            with src.open("r", encoding="utf-8") as fh:
                payload = yaml.safe_load(fh)
        except yaml.YAMLError as exc:
            logger.error("[config] Invalid YAML format: %s", exc)
            raise ValueError(f"Invalid YAML file: {src}") from exc

    if not isinstance(payload, dict):
        raise ValueError(f"Config root must be a JSON/YAML object: {src}")

    dest.parent.mkdir(parents=True, exist_ok=True)
    try:
        with dest.open("w", encoding="utf-8") as fh:
            json.dump(payload, fh, indent=2, ensure_ascii=False)
    except Exception as exc:
        # Log for context, then let the caller see the original error.
        logger.error("[config] Failed to write config JSON '%s': %s", dest, exc)
        raise

    logger.info("[config] Configuration successfully saved to JSON: %s", dest)
175
+
176
+
177
+ __all__ = ["load_config"]
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.config.models
5
+ ------------------------------
6
+
7
+ Defines structured configuration models using dataclasses for each
8
+ major component in the novel_downloader pipeline.
9
+
10
+ Each config section corresponds to a specific stage of the pipeline:
11
+ - RequesterConfig: network settings for requests and DrissionPage
12
+ - DownloaderConfig: chapter download behavior and local raw data paths
13
+ - ParserConfig: font decoding, cache handling, and debug options
14
+ - SaverConfig: output formatting, export formats, and filename templates
15
+
16
+ These models are used to map loaded YAML or JSON config data into
17
+ strongly typed Python objects for safer and cleaner access.
18
+ """
19
+
20
+ from dataclasses import dataclass
21
+ from typing import Any, Dict, List, Literal, Optional, TypedDict
22
+
23
+
24
+ # === Requesters ===
25
@dataclass
class RequesterConfig:
    """
    Network settings for requests and DrissionPage.

    NOTE(review): units are not stated in this module; the timing fields
    (wait_time, retry_interval, timeout) are presumably seconds - confirm
    against the requester implementations.
    """

    wait_time: int = 5
    retry_times: int = 3
    retry_interval: int = 5
    timeout: int = 30
    headless: bool = True
    user_data_folder: str = ""
    profile_name: str = ""
    auto_close: bool = True
    disable_images: bool = True
    mute_audio: bool = True
    mode: str = "session"  # browser / session / async
38
+
39
+
40
+ # === Downloaders ===
41
@dataclass
class DownloaderConfig:
    """
    Chapter download behavior and local raw data paths.

    NOTE(review): request_interval is presumably in seconds - confirm
    against the downloader implementations.
    """

    request_interval: int = 5
    raw_data_dir: str = "./raw_data"
    cache_dir: str = "./novel_cache"
    max_threads: int = 4
    skip_existing: bool = True
    login_required: bool = False
    save_html: bool = False
    mode: str = "session"  # browser / session / async
51
+
52
+
53
+ # === Parsers ===
54
@dataclass
class ParserConfig:
    """
    Font decoding, cache handling, and debug options for the parser stage.

    NOTE(review): ocr_weight and vec_weight default to values summing to
    1.0; whether that is a requirement is not visible here - confirm.
    """

    cache_dir: str = "./novel_cache"
    decode_font: bool = False
    use_freq: bool = False
    use_ocr: bool = True
    use_vec: bool = False
    ocr_version: str = "v1.0"
    batch_size: int = 32
    ocr_weight: float = 0.6
    vec_weight: float = 0.4
    save_font_debug: bool = False
    mode: str = "session"  # browser / session
67
+
68
+
69
+ # === Savers ===
70
@dataclass
class SaverConfig:
    """
    Output formatting, export formats, and filename templates.
    """

    raw_data_dir: str = "./raw_data"
    output_dir: str = "./downloads"
    clean_text: bool = True
    # Export format switches.
    make_txt: bool = True
    make_epub: bool = False
    make_md: bool = False
    make_pdf: bool = False
    # Output file naming.
    append_timestamp: bool = True
    filename_template: str = "{title}_{author}"
    # EPUB-specific options (read from the "epub" output section).
    include_cover: bool = True
    include_toc: bool = False
83
+
84
+
85
class RuleStep(TypedDict, total=False):
    """
    One step of a field rule.

    ``type`` selects the operation; the remaining keys are the parameters
    used by specific operation types. All keys are optional (total=False).
    """

    # -- Operation type -- #
    type: Literal[
        "attr",
        "select_one",
        "select",
        "find",
        "find_all",
        "exclude",
        "regex",
        "text",
        "strip",
        "replace",
        "split",
        "join",
    ]

    # -- BeautifulSoup related -- #
    selector: Optional[str]  # CSS selector, used by select/select_one/exclude
    name: Optional[str]  # Tag name, used by find/find_all
    attrs: Optional[Dict[str, Any]]  # Attribute filter, used by find/find_all
    limit: Optional[int]  # Maximum number of matches for find_all
    attr: Optional[str]  # Attribute to read from the element (select/select_one/select_all)

    # -- Regex related -- #
    pattern: Optional[str]  # Regular expression
    flags: Optional[int]  # re.I, re.M, etc.
    group: Optional[int]  # Which group of the match to use (default 0)
    template: Optional[str]  # Custom composition, e.g. "$1$2字"

    # -- Text processing -- #
    chars: Optional[str]  # Character set for strip to remove
    old: Optional[str]  # Substring to be replaced in replace
    new: Optional[str]  # Replacement substring in replace
    count: Optional[int]  # Maximum number of replacements in replace
    sep: Optional[str]  # Separator for split/join
    index: Optional[int]  # Which element to take after split/select_all/select
122
+
123
+
124
class FieldRules(TypedDict):
    """Rule for a single field, expressed as a list of RuleStep entries."""

    steps: List[RuleStep]
126
+
127
+
128
class ChapterFieldRules(TypedDict):
    """A list of RuleStep entries together with a string ``key``."""

    # NOTE(review): presumably the name of the chapter field the steps
    # produce - confirm against the parser that consumes these rules.
    key: str
    steps: List[RuleStep]
131
+
132
+
133
class VolumesRules(TypedDict, total=False):
    """Rules describing volumes and their chapters; all keys are optional."""

    has_volume: bool  # Whether volumes exist; false = not split into volumes
    volume_selector: str  # Selector for the volume block when volumes exist
    chapter_selector: str  # Selector for chapter nodes
    volume_name_steps: List[RuleStep]
    chapter_steps: List[ChapterFieldRules]  # Steps for extracting chapter info
    volume_mode: str  # Optional: "normal" (default) or "mixed"
    list_selector: str  # Optional: If "mixed" mode, parent container selector
141
+
142
+
143
class BookInfoRules(TypedDict, total=False):
    """Per-field rules for book information; all keys are optional."""

    book_name: FieldRules
    author: FieldRules
    cover_url: FieldRules
    update_time: FieldRules
    serial_status: FieldRules
    word_count: FieldRules
    summary: FieldRules
    volumes: VolumesRules
152
+
153
+
154
class ChapterRules(TypedDict, total=False):
    """Per-field rules for a chapter (title and content); keys are optional."""

    title: FieldRules
    content: FieldRules
157
+
158
+
159
class SiteProfile(TypedDict):
    """URL settings for a site."""

    # NOTE(review): presumably URL templates with placeholders filled in by
    # the requester - confirm the placeholder names against the rules file.
    book_info_url: str
    chapter_url: str
162
+
163
+
164
class SiteRules(TypedDict):
    """Complete rule set for one site: profile, book info and chapter rules."""

    profile: SiteProfile
    book_info: BookInfoRules
    chapter: ChapterRules
168
+
169
+
170
# Mapping from a string key (presumably the site name - confirm) to SiteRules.
SiteRulesDict = Dict[str, SiteRules]
@@ -0,0 +1,97 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.config.site_rules
5
+ ----------------------------------
6
+
7
+ Handles loading, saving, and caching of site-specific scraping rules.
8
+
9
+ This module provides functionality to:
10
+ - Load site rules from JSON, YAML, or TOML formats.
11
+ - Save rules into a standard JSON format.
12
+ """
13
+
14
+ import json
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import Union
18
+
19
+ from novel_downloader.utils.cache import cached_load_config
20
+ from novel_downloader.utils.constants import SITE_RULES_FILE
21
+ from novel_downloader.utils.file_utils import save_as_json
22
+
23
+ from .models import SiteRulesDict
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def save_rules_as_json(
    source_path: Union[str, Path], output_path: Union[str, Path] = SITE_RULES_FILE
) -> None:
    """
    Load rules from source_path (toml, yaml, or json) and save as JSON.

    :param source_path: Path to the source rules file
                        (supports .toml, .yaml, .yml, .json).
    :param output_path: Path where the JSON output will be saved.
                        Defaults to SITE_RULES_FILE.
    :raises FileNotFoundError: If the source_path does not exist.
    :raises ValueError: If the source file format is not supported, or the
                        file's root is not a mapping.
    :raises Exception: If file loading or saving fails.
    """
    TAG = "[Config]"
    source_path = Path(source_path)
    output_path = Path(output_path)

    if not source_path.exists():
        raise FileNotFoundError(f"Source file {source_path} not found.")

    suffix = source_path.suffix.lower()

    logger.debug("%s Loading rules from %s (format: %s)", TAG, source_path, suffix)

    try:
        if suffix == ".toml":
            # Imported lazily: tomllib is only in the standard library
            # from Python 3.11 onwards.
            import tomllib

            with source_path.open("rb") as f:
                rules_data = tomllib.load(f)
        elif suffix in {".yaml", ".yml"}:
            import yaml

            with source_path.open("r", encoding="utf-8") as f:
                rules_data = yaml.safe_load(f)
        elif suffix == ".json":
            with source_path.open("r", encoding="utf-8") as f:
                rules_data = json.load(f)
        else:
            raise ValueError(f"Unsupported input format: {suffix}")

    except Exception as e:
        logger.error("%s Failed to load rules from %s: %s", TAG, source_path, e)
        raise

    # An empty YAML file loads as None and JSON/YAML roots may be lists or
    # scalars; none of those is a usable rules mapping, so refuse to save them.
    if not isinstance(rules_data, dict):
        raise ValueError(
            f"Rules root must be a mapping, got {type(rules_data).__name__}: "
            f"{source_path}"
        )

    logger.info("%s Saving rules to %s as JSON", TAG, output_path)

    save_as_json(rules_data, output_path)
78
+
79
+
80
@cached_load_config
def load_site_rules(json_path: Union[str, Path] = SITE_RULES_FILE) -> SiteRulesDict:
    """
    Read the site scraping rules stored in a JSON file.

    Any caching of the result comes from the ``cached_load_config``
    decorator applied to this function.

    :param json_path: Path to the site rules JSON file. Defaults to SITE_RULES_FILE.
    :return: The parsed rules, or an empty dict when the file does not exist.
    """
    rules_file = Path(json_path)

    # No rules file yet: report "no rules" instead of failing.
    if not rules_file.exists():
        no_rules: SiteRulesDict = {}
        return no_rules

    with rules_file.open("r", encoding="utf-8") as fh:
        loaded: SiteRulesDict = json.load(fh)

    return loaded
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core
5
+ ---------------------
6
+
7
+ This package serves as the core layer of the novel_downloader system.
8
+
9
+ It provides factory methods for constructing key components required for
10
+ downloading and processing online novel content, including:
11
+
12
+ - Downloader: Handles the full download lifecycle of a book or a batch of books.
13
+ - Parser: Extracts structured data from HTML or SSR content.
14
+ - Requester: Sends HTTP requests and manages sessions, including login if required.
15
+ - Saver: Responsible for exporting downloaded data into various output formats.
16
+ """
17
+
18
+ from .factory import get_downloader, get_parser, get_requester, get_saver
19
+
20
+ __all__ = [
21
+ "get_downloader",
22
+ "get_parser",
23
+ "get_requester",
24
+ "get_saver",
25
+ ]
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ novel_downloader.core.downloaders
5
+ ---------------------------------
6
+
7
+ This subpackage contains concrete downloader implementations for
8
+ specific novel platforms.
9
+
10
+ Each downloader is responsible for orchestrating the full lifecycle
11
+ of retrieving, parsing, and saving novel content for a given source.
12
+ """
13
+
14
+ from .common_downloader import CommonDownloader
15
+ from .qidian_downloader import QidianDownloader
16
+
17
+ __all__ = [
18
+ "CommonDownloader",
19
+ "QidianDownloader",
20
+ ]