novel-downloader 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -2
  3. novel_downloader/cli/config.py +1 -83
  4. novel_downloader/cli/download.py +4 -5
  5. novel_downloader/cli/export.py +4 -1
  6. novel_downloader/cli/main.py +2 -0
  7. novel_downloader/cli/search.py +123 -0
  8. novel_downloader/config/__init__.py +3 -10
  9. novel_downloader/config/adapter.py +190 -54
  10. novel_downloader/config/loader.py +2 -3
  11. novel_downloader/core/__init__.py +13 -13
  12. novel_downloader/core/downloaders/__init__.py +10 -11
  13. novel_downloader/core/downloaders/base.py +152 -26
  14. novel_downloader/core/downloaders/biquge.py +5 -1
  15. novel_downloader/core/downloaders/common.py +157 -378
  16. novel_downloader/core/downloaders/esjzone.py +5 -1
  17. novel_downloader/core/downloaders/linovelib.py +5 -1
  18. novel_downloader/core/downloaders/qianbi.py +291 -4
  19. novel_downloader/core/downloaders/qidian.py +199 -285
  20. novel_downloader/core/downloaders/registry.py +67 -0
  21. novel_downloader/core/downloaders/sfacg.py +5 -1
  22. novel_downloader/core/downloaders/yamibo.py +5 -1
  23. novel_downloader/core/exporters/__init__.py +10 -11
  24. novel_downloader/core/exporters/base.py +87 -7
  25. novel_downloader/core/exporters/biquge.py +5 -8
  26. novel_downloader/core/exporters/common/__init__.py +2 -2
  27. novel_downloader/core/exporters/common/epub.py +82 -166
  28. novel_downloader/core/exporters/common/main_exporter.py +0 -60
  29. novel_downloader/core/exporters/common/txt.py +82 -83
  30. novel_downloader/core/exporters/epub_util.py +157 -1330
  31. novel_downloader/core/exporters/esjzone.py +5 -8
  32. novel_downloader/core/exporters/linovelib/__init__.py +2 -2
  33. novel_downloader/core/exporters/linovelib/epub.py +157 -212
  34. novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
  35. novel_downloader/core/exporters/linovelib/txt.py +67 -63
  36. novel_downloader/core/exporters/qianbi.py +5 -8
  37. novel_downloader/core/exporters/qidian.py +14 -4
  38. novel_downloader/core/exporters/registry.py +53 -0
  39. novel_downloader/core/exporters/sfacg.py +5 -8
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/exporters/yamibo.py +5 -8
  42. novel_downloader/core/fetchers/__init__.py +19 -24
  43. novel_downloader/core/fetchers/base/__init__.py +3 -3
  44. novel_downloader/core/fetchers/base/browser.py +23 -4
  45. novel_downloader/core/fetchers/base/session.py +30 -5
  46. novel_downloader/core/fetchers/biquge/__init__.py +3 -3
  47. novel_downloader/core/fetchers/biquge/browser.py +5 -0
  48. novel_downloader/core/fetchers/biquge/session.py +6 -1
  49. novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
  50. novel_downloader/core/fetchers/esjzone/browser.py +5 -0
  51. novel_downloader/core/fetchers/esjzone/session.py +6 -1
  52. novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
  53. novel_downloader/core/fetchers/linovelib/browser.py +6 -1
  54. novel_downloader/core/fetchers/linovelib/session.py +6 -1
  55. novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
  56. novel_downloader/core/fetchers/qianbi/browser.py +5 -0
  57. novel_downloader/core/fetchers/qianbi/session.py +5 -0
  58. novel_downloader/core/fetchers/qidian/__init__.py +3 -3
  59. novel_downloader/core/fetchers/qidian/browser.py +12 -4
  60. novel_downloader/core/fetchers/qidian/session.py +11 -3
  61. novel_downloader/core/fetchers/registry.py +71 -0
  62. novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
  63. novel_downloader/core/fetchers/sfacg/browser.py +5 -0
  64. novel_downloader/core/fetchers/sfacg/session.py +5 -0
  65. novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
  66. novel_downloader/core/fetchers/yamibo/browser.py +5 -0
  67. novel_downloader/core/fetchers/yamibo/session.py +6 -1
  68. novel_downloader/core/interfaces/__init__.py +7 -5
  69. novel_downloader/core/interfaces/searcher.py +18 -0
  70. novel_downloader/core/parsers/__init__.py +10 -11
  71. novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
  72. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
  73. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
  74. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
  75. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  76. novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
  77. novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
  78. novel_downloader/core/parsers/qidian/main_parser.py +10 -21
  79. novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
  80. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
  81. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  82. novel_downloader/core/parsers/registry.py +68 -0
  83. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
  84. novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
  85. novel_downloader/core/searchers/__init__.py +20 -0
  86. novel_downloader/core/searchers/base.py +92 -0
  87. novel_downloader/core/searchers/biquge.py +83 -0
  88. novel_downloader/core/searchers/esjzone.py +84 -0
  89. novel_downloader/core/searchers/qianbi.py +131 -0
  90. novel_downloader/core/searchers/qidian.py +87 -0
  91. novel_downloader/core/searchers/registry.py +63 -0
  92. novel_downloader/locales/en.json +12 -4
  93. novel_downloader/locales/zh.json +12 -4
  94. novel_downloader/models/__init__.py +4 -30
  95. novel_downloader/models/config.py +12 -6
  96. novel_downloader/models/search.py +16 -0
  97. novel_downloader/models/types.py +0 -2
  98. novel_downloader/resources/config/settings.toml +31 -4
  99. novel_downloader/resources/css_styles/intro.css +83 -0
  100. novel_downloader/resources/css_styles/main.css +30 -89
  101. novel_downloader/utils/__init__.py +52 -0
  102. novel_downloader/utils/chapter_storage.py +244 -224
  103. novel_downloader/utils/constants.py +1 -21
  104. novel_downloader/utils/epub/__init__.py +34 -0
  105. novel_downloader/utils/epub/builder.py +377 -0
  106. novel_downloader/utils/epub/constants.py +77 -0
  107. novel_downloader/utils/epub/documents.py +403 -0
  108. novel_downloader/utils/epub/models.py +134 -0
  109. novel_downloader/utils/epub/utils.py +212 -0
  110. novel_downloader/utils/file_utils/__init__.py +10 -14
  111. novel_downloader/utils/file_utils/io.py +20 -51
  112. novel_downloader/utils/file_utils/normalize.py +2 -2
  113. novel_downloader/utils/file_utils/sanitize.py +2 -3
  114. novel_downloader/utils/fontocr/__init__.py +5 -5
  115. novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
  116. novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
  117. novel_downloader/utils/fontocr/ocr_v1.py +13 -1
  118. novel_downloader/utils/fontocr/ocr_v2.py +13 -1
  119. novel_downloader/utils/fontocr/ocr_v3.py +744 -0
  120. novel_downloader/utils/i18n.py +2 -0
  121. novel_downloader/utils/logger.py +2 -0
  122. novel_downloader/utils/network.py +110 -251
  123. novel_downloader/utils/state.py +1 -0
  124. novel_downloader/utils/text_utils/__init__.py +18 -17
  125. novel_downloader/utils/text_utils/diff_display.py +4 -5
  126. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  127. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  128. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  129. novel_downloader/utils/time_utils/__init__.py +3 -3
  130. novel_downloader/utils/time_utils/datetime_utils.py +4 -5
  131. novel_downloader/utils/time_utils/sleep_utils.py +2 -3
  132. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
  133. novel_downloader-1.5.0.dist-info/RECORD +164 -0
  134. novel_downloader/config/site_rules.py +0 -94
  135. novel_downloader/core/factory/__init__.py +0 -20
  136. novel_downloader/core/factory/downloader.py +0 -73
  137. novel_downloader/core/factory/exporter.py +0 -58
  138. novel_downloader/core/factory/fetcher.py +0 -96
  139. novel_downloader/core/factory/parser.py +0 -86
  140. novel_downloader/core/fetchers/common/__init__.py +0 -14
  141. novel_downloader/core/fetchers/common/browser.py +0 -79
  142. novel_downloader/core/fetchers/common/session.py +0 -79
  143. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  144. novel_downloader/core/parsers/common/__init__.py +0 -13
  145. novel_downloader/core/parsers/common/helper.py +0 -323
  146. novel_downloader/core/parsers/common/main_parser.py +0 -106
  147. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  148. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  149. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  150. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  151. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  152. novel_downloader/models/browser.py +0 -21
  153. novel_downloader/models/site_rules.py +0 -99
  154. novel_downloader/models/tasks.py +0 -33
  155. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  156. novel_downloader/resources/json/replace_word_map.json +0 -4
  157. novel_downloader/resources/text/blacklist.txt +0 -22
  158. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  159. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  160. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  161. novel_downloader-1.4.4.dist-info/RECORD +0 -165
  162. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
  163. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
  164. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
  165. {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0

novel_downloader/config/adapter.py

@@ -7,67 +7,53 @@ Defines ConfigAdapter, which maps a raw configuration dictionary and
 site name into structured dataclass-based config models.
 """
 
-from typing import Any
+import json
+from typing import Any, cast
 
 from novel_downloader.models import (
     BookConfig,
     DownloaderConfig,
     ExporterConfig,
     FetcherConfig,
+    LogLevel,
     ParserConfig,
+    TextCleanerConfig,
 )
-from novel_downloader.utils.constants import SUPPORTED_SITES
-
-from .site_rules import load_site_rules
 
 
 class ConfigAdapter:
     """
-    Adapter to map a raw config dict + site name into structured dataclass configs.
+    Adapter to map a raw configuration dictionary and site name
+    into structured dataclass configuration models.
     """
 
+    _ALLOWED_LOG_LEVELS: tuple[LogLevel, ...] = (
+        "DEBUG",
+        "INFO",
+        "WARNING",
+        "ERROR",
+    )
+
     def __init__(self, config: dict[str, Any], site: str):
         """
-        :param config: the fully loaded configuration dict
-        :param site: the current site name (e.g. "qidian")
+        Initialize the adapter.
+
+        :param config: The fully loaded configuration dictionary.
+        :param site: The current site name (e.g. "qidian").
         """
         self._config = config
         self._site = site
 
-        site_rules = load_site_rules()  # -> Dict[str, SiteRules]
-        self._supported_sites = set(site_rules.keys()) | SUPPORTED_SITES
-
-    @property
-    def site(self) -> str:
-        return self._site
-
-    @site.setter
-    def site(self, value: str) -> None:
-        self._site = value
-
-    def _get_site_cfg(self, site: str | None = None) -> dict[str, Any]:
-        """
-        Get the configuration for the given site (defaults to the current site).
-
-        1. If a site-specific config exists, return it first.
-        2. Otherwise, if the site is among the supported sites, try the 'common' config.
-        3. Otherwise return an empty dict.
+    def get_fetcher_config(self) -> FetcherConfig:
         """
-        site = site or self._site
-        sites_cfg = self._config.get("sites", {}) or {}
-
-        if site in sites_cfg:
-            return sites_cfg[site] or {}
-
-        if site in self._supported_sites:
-            return sites_cfg.get("common", {}) or {}
+        Build a FetcherConfig from the raw configuration.
 
-        return {}
+        Reads from:
+          - config["general"] for global defaults (e.g. request_interval)
+          - config["requests"] for HTTP-specific settings (timeouts, retries, etc.)
+          - site-specific overrides under config["sites"][site]
 
-    def get_fetcher_config(self) -> FetcherConfig:
-        """
-        Read the general request settings from config["requests"]
-        and return a FetcherConfig instance.
+        :return: A FetcherConfig instance with all fields populated.
         """
         gen = self._config.get("general", {})
         req = self._config.get("requests", {})
@@ -91,8 +77,15 @@ class ConfigAdapter:
 
     def get_downloader_config(self) -> DownloaderConfig:
         """
-        Read downloader-related settings from config["general"] and
-        config["sites"][site], and return a DownloaderConfig instance.
+        Build a DownloaderConfig using both general and site-specific settings.
+
+        Reads from:
+          - config["general"] for download directories, worker counts, etc.
+          - config["requests"] for retry and backoff settings
+          - config["general"]["debug"] for debug toggles (e.g. save_html)
+          - config["sites"][site] for login credentials and mode
+
+        :return: A DownloaderConfig instance with all fields populated.
         """
         gen = self._config.get("general", {})
         req = self._config.get("requests", {})
@@ -104,13 +97,11 @@ class ConfigAdapter:
             backoff_factor=req.get("backoff_factor", 2.0),
             raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
             cache_dir=gen.get("cache_dir", "./novel_cache"),
-            download_workers=gen.get("download_workers", 2),
-            parser_workers=gen.get("parser_workers", 2),
+            workers=gen.get("workers", 2),
             skip_existing=gen.get("skip_existing", True),
             login_required=site_cfg.get("login_required", False),
             save_html=debug.get("save_html", False),
             mode=site_cfg.get("mode", "session"),
-            storage_backend=gen.get("storage_backend", "json"),
             storage_batch_size=gen.get("storage_batch_size", 1),
             username=site_cfg.get("username", ""),
             password=site_cfg.get("password", ""),
@@ -119,8 +110,14 @@ class ConfigAdapter:
 
     def get_parser_config(self) -> ParserConfig:
         """
-        Read parser-related settings from config["general"]["cache_dir"],
-        config["general"]["debug"], and config["sites"][site]; return a ParserConfig instance.
+        Build a ParserConfig from general, OCR, and site-specific settings.
+
+        Reads from:
+          - config["general"]["cache_dir"] for where to cache intermediate parses
+          - config["general"]["font_ocr"] for font-decoding and OCR options
+          - config["sites"][site] for parsing mode and truncation behavior
+
+        :return: A ParserConfig instance with all fields populated.
         """
        gen = self._config.get("general", {})
         font_ocr = gen.get("font_ocr", {})
@@ -144,20 +141,29 @@ class ConfigAdapter:
 
     def get_exporter_config(self) -> ExporterConfig:
         """
-        Read exporter-related settings from config["general"] and config["output"],
-        and return an ExporterConfig instance.
+        Build an ExporterConfig from output and general settings.
+
+        Reads from:
+          - config["general"] for cache and raw data directories
+          - config["output"]["formats"] for which formats to generate
+          - config["output"]["naming"] for filename templates
+          - config["output"]["epub"] for EPUB-specific options
+          - config["sites"][site] for export split mode
+
+        :return: An ExporterConfig instance with all fields populated.
         """
         gen = self._config.get("general", {})
         out = self._config.get("output", {})
+        cln = self._config.get("cleaner", {})
         fmt = out.get("formats", {})
         naming = out.get("naming", {})
         epub_opts = out.get("epub", {})
         site_cfg = self._get_site_cfg()
+        cleaner_cfg = self._dict_to_cleaner_cfg(cln)
         return ExporterConfig(
             cache_dir=gen.get("cache_dir", "./novel_cache"),
             raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
             output_dir=gen.get("output_dir", "./downloads"),
-            storage_backend=gen.get("storage_backend", "json"),
             clean_text=out.get("clean_text", True),
             make_txt=fmt.get("make_txt", True),
             make_epub=fmt.get("make_epub", False),
@@ -169,11 +175,20 @@ class ConfigAdapter:
             include_toc=epub_opts.get("include_toc", False),
             include_picture=epub_opts.get("include_picture", False),
             split_mode=site_cfg.get("split_mode", "book"),
+            cleaner_cfg=cleaner_cfg,
         )
 
     def get_book_ids(self) -> list[BookConfig]:
         """
-        Extract the list of target books from config["sites"][site]["book_ids"].
+        Extract the list of target books from the site configuration.
+
+        The site config may specify book_ids as:
+          - a single string or integer
+          - a dict with book_id and optional start_id, end_id, ignore_ids
+          - a list of the above types
+
+        :return: A list of BookConfig dicts.
+        :raises ValueError: if the raw book_ids is neither a str/int, dict, nor list.
         """
         site_cfg = self._get_site_cfg()
         raw = site_cfg.get("book_ids", [])
@@ -182,7 +197,7 @@ class ConfigAdapter:
             return [{"book_id": str(raw)}]
 
         if isinstance(raw, dict):
-            return [self._dict_to_book_config(raw)]
+            return [self._dict_to_book_cfg(raw)]
 
         if not isinstance(raw, list):
             raise ValueError(
@@ -195,17 +210,71 @@ class ConfigAdapter:
                 if isinstance(item, str | int):
                     result.append({"book_id": str(item)})
                 elif isinstance(item, dict):
-                    result.append(self._dict_to_book_config(item))
+                    result.append(self._dict_to_book_cfg(item))
             except ValueError:
                 continue
 
         return result
 
+    def get_log_level(self) -> LogLevel:
+        """
+        Retrieve the logging level from [general.debug].
+
+        Reads from config["general"]["debug"]["log_level"], defaulting to "INFO"
+        if not set or invalid.
+
+        :return: The configured LogLevel literal ("DEBUG", "INFO", "WARNING", "ERROR").
+        """
+        debug_cfg = self._config.get("general", {}).get("debug", {})
+        raw = debug_cfg.get("log_level") or "INFO"
+        if raw in self._ALLOWED_LOG_LEVELS:
+            return cast(LogLevel, raw)
+        return "INFO"
+
+    @property
+    def site(self) -> str:
+        """
+        Get the current site name.
+        """
+        return self._site
+
+    @site.setter
+    def site(self, value: str) -> None:
+        """
+        Set a new site name for configuration lookups.
+
+        :param value: The new site key in config["sites"] to use.
+        """
+        self._site = value
+
+    def _get_site_cfg(self, site: str | None = None) -> dict[str, Any]:
+        """
+        Retrieve the configuration for a specific site.
+
+        Lookup order:
+          1. If there is a site-specific entry under config["sites"], return that.
+          2. Otherwise, if a "common" entry exists under config["sites"], return that.
+          3. If neither is present, return an empty dict.
+
+        :param site: Optional override of the site name; defaults to self._site.
+        :return: The site-specific or common configuration dict.
+        """
+        site = site or self._site
+        sites_cfg = self._config.get("sites", {}) or {}
+
+        if site in sites_cfg:
+            return sites_cfg[site] or {}
+
+        return sites_cfg.get("common", {}) or {}
+
     @staticmethod
-    def _dict_to_book_config(data: dict[str, Any]) -> BookConfig:
+    def _dict_to_book_cfg(data: dict[str, Any]) -> BookConfig:
         """
-        Converts a dict to BookConfig with type normalization.
-        Raises ValueError if 'book_id' is missing.
+        Convert a dictionary to a BookConfig with normalized types.
+
+        :param data: A dict that must contain at least "book_id".
+        :return: A BookConfig dict with all values cast to strings or lists of strings.
+        :raises ValueError: if the "book_id" field is missing.
         """
         if "book_id" not in data:
             raise ValueError("Missing required field 'book_id'")
@@ -222,3 +291,70 @@ class ConfigAdapter:
             result["ignore_ids"] = [str(x) for x in data["ignore_ids"]]
 
         return result
+
+    @classmethod
+    def _dict_to_cleaner_cfg(cls, cfg: dict[str, Any]) -> TextCleanerConfig:
+        """
+        Convert a nested dict of title/content rules into a TextCleanerConfig.
+
+        :param cfg: configuration dictionary
+        :return: fully constructed TextCleanerConfig
+        """
+        # Title rules
+        title_section = cfg.get("title", {})
+        title_remove = title_section.get("remove_patterns", [])
+        title_repl = title_section.get("replace", {})
+
+        title_ext = title_section.get("external", {})
+        title_ext_en = title_ext.get("enabled", False)
+        title_ext_rm_p = title_ext.get("remove_patterns", "")
+        title_ext_rp_p = title_ext.get("replace", "")
+        if title_ext_en:
+            title_remove_ext = cls._load_str_list(title_ext_rm_p)
+            title_remove += title_remove_ext
+
+            title_repl_ext = cls._load_str_dict(title_ext_rp_p)
+            title_repl = {**title_repl, **title_repl_ext}
+
+        # Content rules
+        content_section = cfg.get("content", {})
+        content_remove = content_section.get("remove_patterns", [])
+        content_repl = content_section.get("replace", {})
+
+        content_ext = content_section.get("external", {})
+        content_ext_en = content_ext.get("enabled", False)
+        content_ext_rm_p = content_ext.get("remove_patterns", "")
+        content_ext_rp_p = content_ext.get("replace", "")
+
+        if content_ext_en:
+            content_remove_ext = cls._load_str_list(content_ext_rm_p)
+            content_remove += content_remove_ext
+
+            content_repl_ext = cls._load_str_dict(content_ext_rp_p)
+            content_repl = {**content_repl, **content_repl_ext}
+
+        return TextCleanerConfig(
+            remove_invisible=cfg.get("remove_invisible", True),
+            title_remove_patterns=title_remove,
+            title_replacements=title_repl,
+            content_remove_patterns=content_remove,
+            content_replacements=content_repl,
+        )
+
+    @staticmethod
+    def _load_str_list(path: str) -> list[str]:
+        try:
+            with open(path, encoding="utf-8") as f:
+                parsed = json.load(f)
+            return cast(list[str], parsed)
+        except Exception:
+            return []
+
+    @staticmethod
+    def _load_str_dict(path: str) -> dict[str, str]:
+        try:
+            with open(path, encoding="utf-8") as f:
+                parsed = json.load(f)
+            return cast(dict[str, str], parsed)
+        except Exception:
+            return {}
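
The new cleaner handling above is easiest to see with a small sketch. The dict layout below is inferred from the .get() calls in _dict_to_cleaner_cfg; the concrete [cleaner] section shipped in resources/config/settings.toml is not part of this diff, so the key values shown are illustrative only.

```python
# Illustrative only: the dict shape consumed by ConfigAdapter._dict_to_cleaner_cfg,
# inferred from the .get() calls in the diff above. Pattern and path values are invented.
from novel_downloader.config.adapter import ConfigAdapter

raw_cleaner_section = {
    "remove_invisible": True,
    "title": {
        "remove_patterns": [r"^\s*\(advertisement\)\s*$"],  # inline regex rules (example)
        "replace": {"：": ":"},                              # inline replacement map (example)
        "external": {
            "enabled": False,                                # when True, merge rules from JSON files
            "remove_patterns": "path/to/title_remove.json",
            "replace": "path/to/title_replace.json",
        },
    },
    "content": {
        "remove_patterns": [],
        "replace": {},
        "external": {"enabled": False, "remove_patterns": "", "replace": ""},
    },
}

# Builds a TextCleanerConfig; external JSON files are read only when "enabled" is True,
# and unreadable files silently fall back to empty rules via _load_str_list / _load_str_dict.
cleaner_cfg = ConfigAdapter._dict_to_cleaner_cfg(raw_cleaner_section)
```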

novel_downloader/config/loader.py

@@ -7,6 +7,8 @@ Provides functionality to load Toml configuration files into Python
 dictionaries, with robust error handling and fallback support.
 """
 
+__all__ = ["load_config"]
+
 import json
 import logging
 from pathlib import Path
@@ -180,6 +182,3 @@ def save_config_file(
 
     logger.info("[config] Configuration successfully saved to JSON: %s", output)
     return
-
-
-__all__ = ["load_config"]

novel_downloader/core/__init__.py

@@ -14,26 +14,26 @@ downloading and processing online novel content, including:
 - Exporter: Responsible for exporting downloaded data into various output formats.
 """
 
-from .factory import (
-    get_downloader,
-    get_exporter,
-    get_fetcher,
-    get_parser,
-)
-from .interfaces import (
-    DownloaderProtocol,
-    ExporterProtocol,
-    FetcherProtocol,
-    ParserProtocol,
-)
-
 __all__ = [
     "get_downloader",
     "get_exporter",
     "get_fetcher",
     "get_parser",
+    "search",
     "DownloaderProtocol",
     "ExporterProtocol",
     "FetcherProtocol",
     "ParserProtocol",
 ]
+
+from .downloaders import get_downloader
+from .exporters import get_exporter
+from .fetchers import get_fetcher
+from .interfaces import (
+    DownloaderProtocol,
+    ExporterProtocol,
+    FetcherProtocol,
+    ParserProtocol,
+)
+from .parsers import get_parser
+from .searchers import search
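
For reference, the factory package that previously backed these helpers (novel_downloader/core/factory/, files 135-139 in the list above) is removed in 1.5.0; each component subpackage now provides its own registry module, and novel_downloader.core re-exports the same names plus the new search helper. A minimal sketch of the import surface, based only on the __all__ list and import lines shown above (call signatures of these helpers are not shown in this diff):

```python
# Public entry point is unchanged; "search" is new in 1.5.0 and is backed by
# novel_downloader.core.searchers.
from novel_downloader.core import (
    get_downloader,
    get_exporter,
    get_fetcher,
    get_parser,
    search,
)

# Equivalent direct imports against the new registry-based layout,
# exactly as re-exported in core/__init__.py above:
# from novel_downloader.core.downloaders import get_downloader
# from novel_downloader.core.exporters import get_exporter
# from novel_downloader.core.fetchers import get_fetcher
# from novel_downloader.core.parsers import get_parser
# from novel_downloader.core.searchers import search
```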

novel_downloader/core/downloaders/__init__.py

@@ -17,19 +17,10 @@ Currently supported platforms:
 - qidian (起点中文网)
 - sfacg (SF轻小说)
 - yamibo (百合会)
-- common (generic architecture)
 """
 
-from .biquge import BiqugeDownloader
-from .common import CommonDownloader
-from .esjzone import EsjzoneDownloader
-from .linovelib import LinovelibDownloader
-from .qianbi import QianbiDownloader
-from .qidian import QidianDownloader
-from .sfacg import SfacgDownloader
-from .yamibo import YamiboDownloader
-
 __all__ = [
+    "get_downloader",
     "BiqugeDownloader",
     "EsjzoneDownloader",
     "LinovelibDownloader",
@@ -37,5 +28,13 @@ __all__ = [
     "QidianDownloader",
     "SfacgDownloader",
     "YamiboDownloader",
-    "CommonDownloader",
 ]
+
+from .biquge import BiqugeDownloader
+from .esjzone import EsjzoneDownloader
+from .linovelib import LinovelibDownloader
+from .qianbi import QianbiDownloader
+from .qidian import QidianDownloader
+from .registry import get_downloader
+from .sfacg import SfacgDownloader
+from .yamibo import YamiboDownloader

novel_downloader/core/downloaders/base.py

@@ -8,8 +8,9 @@ common interface and reusable logic for all downloader implementations.
 """
 
 import abc
+import json
 import logging
-from collections.abc import Awaitable, Callable
+from collections.abc import AsyncIterator, Awaitable, Callable, Sequence
 from pathlib import Path
 from typing import Any
 
@@ -19,32 +20,54 @@ from novel_downloader.core.interfaces import (
     ParserProtocol,
 )
 from novel_downloader.models import BookConfig, DownloaderConfig
+from novel_downloader.utils import calculate_time_difference
 
 
 class BaseDownloader(DownloaderProtocol, abc.ABC):
     """
-    Abstract downloader that defines the initialization interface
-    and the general batch download flow.
+    Abstract base class for novel downloaders.
 
-    Subclasses must implement the logic for downloading a single book.
+    Defines the general interface and batch download workflow,
+    while delegating book-specific downloading logic to subclasses.
+
+    Subclasses are required to implement methods for downloading
+    a single book, using the provided fetcher and parser components.
     """
 
+    DEFAULT_SOURCE_ID = 0
+    DEFAULT_PRIORITIES_MAP = {
+        DEFAULT_SOURCE_ID: 0,
+    }
+
     def __init__(
         self,
         fetcher: FetcherProtocol,
         parser: ParserProtocol,
         config: DownloaderConfig,
         site: str,
+        priorities: dict[int, int] | None = None,
     ):
+        """
+        Initialize the downloader for a specific site.
+
+        :param fetcher: Fetcher component for retrieving raw chapter data.
+        :param parser: Parser component for extracting chapter content.
+        :param config: Downloader configuration settings.
+        :param site: Identifier for the target website or source.
+        :param priorities: Mapping of source_id to priority value.
+            Lower numbers indicate higher priority.
+            E.X. {0: 10, 1: 100} means source 0 is preferred.
+        """
         self._fetcher = fetcher
         self._parser = parser
         self._config = config
         self._site = site
+        self._priorities = priorities or self.DEFAULT_PRIORITIES_MAP
 
         self._raw_data_dir = Path(config.raw_data_dir) / site
-        self._cache_dir = Path(config.cache_dir) / site
         self._raw_data_dir.mkdir(parents=True, exist_ok=True)
-        self._cache_dir.mkdir(parents=True, exist_ok=True)
+        self._debug_dir = Path.cwd() / "debug" / site
+        self._debug_dir.mkdir(parents=True, exist_ok=True)
 
         self.logger = logging.getLogger(f"{self.__class__.__name__}")
 
@@ -117,6 +140,28 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
 
         await self._finalize()
 
+    async def load_book_info(
+        self,
+        book_id: str,
+        html_dir: Path,
+    ) -> dict[str, Any]:
+        book_info = self._load_book_info(
+            book_id=book_id,
+            max_age_days=1,
+        )
+        if book_info:
+            return book_info
+
+        info_html = await self.fetcher.get_book_info(book_id)
+        self._save_html_pages(html_dir, "info", info_html)
+        book_info = self.parser.parse_book_info(info_html)
+
+        if book_info:
+            self._save_book_info(book_id, book_info)
+            return book_info
+
+        return self._load_book_info(book_id)
+
     @abc.abstractmethod
     async def _download_one(
         self,
@@ -147,29 +192,110 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
         """
         return
 
-    @property
-    def fetcher(self) -> FetcherProtocol:
-        return self._fetcher
+    def _load_book_info(
+        self,
+        book_id: str,
+        *,
+        max_age_days: int | None = None,
+    ) -> dict[str, Any]:
+        """
+        Attempt to read and parse the book_info.json for a given book_id.
 
-    @property
-    def parser(self) -> ParserProtocol:
-        return self._parser
+        :param book_id: identifier of the book
+        :param max_age_days: if set, only return if 'update_time' is less
+        :return: dict of book info if is valid JSON, else empty
+        """
+        info_path = self._raw_data_dir / book_id / "book_info.json"
+        if not info_path.is_file():
+            return {}
 
-    @property
-    def config(self) -> DownloaderConfig:
-        return self._config
+        try:
+            data: dict[str, Any] = json.loads(info_path.read_text(encoding="utf-8"))
+        except json.JSONDecodeError:
+            return {}
+
+        if max_age_days is not None:
+            days, *_ = calculate_time_difference(
+                data.get("update_time", ""),
+                "UTC+8",
+            )
+            if days > max_age_days:
+                return {}
 
-    @property
-    def raw_data_dir(self) -> Path:
-        return self._raw_data_dir
+        return data
+
+    def _save_book_info(
+        self,
+        book_id: str,
+        book_info: dict[str, Any],
+    ) -> None:
+        """
+        Serialize and save the book_info dict as json.
+
+        :param book_id: identifier of the book
+        :param book_info: dict containing metadata about the book
+        """
+        target_dir = self._raw_data_dir / book_id
+        target_dir.mkdir(parents=True, exist_ok=True)
+        (target_dir / "book_info.json").write_text(
+            json.dumps(book_info, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    def _save_html_pages(
+        self,
+        html_dir: Path,
+        filename: str,
+        html_list: Sequence[str],
+    ) -> None:
+        """
+        If save_html is enabled, write each HTML snippet to a file.
+
+        Filenames will be {chap_id}_{index}.html in html_dir.
+
+        :param html_dir: directory in which to write HTML files
+        :param filename: used as filename prefix
+        :param html_list: list of HTML strings to save
+        """
+        if not self.save_html:
+            return
+
+        html_dir.mkdir(parents=True, exist_ok=True)
+        for i, html in enumerate(html_list):
+            file_path = html_dir / f"{filename}_{i}.html"
+            file_path.write_text(html, encoding="utf-8")
+
+    @staticmethod
+    async def _chapter_ids(
+        volumes: list[dict[str, Any]],
+        start_id: str | None,
+        end_id: str | None,
+    ) -> AsyncIterator[str]:
+        """
+        Yield each chapterId in order, respecting start/end bounds.
+        """
+        seen_start = start_id is None
+        for vol in volumes:
+            for chap in vol.get("chapters", []):
+                cid = chap.get("chapterId")
+                if not cid:
+                    continue
+                if not seen_start:
+                    if cid == start_id:
+                        seen_start = True
+                    else:
+                        continue
+                yield cid
+                if end_id is not None and cid == end_id:
+                    return
 
     @property
-    def cache_dir(self) -> Path:
-        return self._cache_dir
+    def fetcher(self) -> FetcherProtocol:
+        return self._fetcher
 
     @property
-    def site(self) -> str:
-        return self._site
+    def parser(self) -> ParserProtocol:
+        return self._parser
 
     @property
     def save_html(self) -> bool:
@@ -196,12 +322,12 @@ class BaseDownloader(DownloaderProtocol, abc.ABC):
         return self._config.backoff_factor
 
     @property
-    def parser_workers(self) -> int:
-        return self._config.parser_workers
+    def workers(self) -> int:
+        return self._config.workers
 
     @property
-    def download_workers(self) -> int:
-        return self._config.download_workers
+    def storage_batch_size(self) -> int:
+        return max(1, self._config.storage_batch_size)
 
     def _handle_download_exception(self, book: BookConfig, error: Exception) -> None:
         """