novel-downloader 1.2.1__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/PKG-INFO +1 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/__init__.py +1 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/cli/download.py +2 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/config/adapter.py +29 -4
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/config/models.py +7 -4
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/downloaders/common_downloader.py +1 -2
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/downloaders/qidian_downloader.py +1 -2
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/factory/downloader_factory.py +13 -11
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/async_requester_protocol.py +4 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/requester_protocol.py +4 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/base_parser.py +3 -3
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/common_parser/helper.py +7 -5
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +1 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +3 -3
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +1 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/session/main_parser.py +3 -3
- novel_downloader-1.2.2/novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +151 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/shared/helpers.py +2 -2
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/base_async_session.py +4 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/base_browser.py +9 -5
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/base_session.py +4 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/common_requester/common_session.py +2 -2
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +35 -16
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/qidian_requester/qidian_session.py +3 -3
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/common_saver/common_epub.py +1 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/locales/en.json +4 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/locales/zh.json +4 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/constants.py +2 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/time_utils/datetime_utils.py +1 -1
- novel_downloader-1.2.2/novel_downloader/utils/time_utils/sleep_utils.py +65 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader.egg-info/PKG-INFO +1 -1
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/pyproject.toml +1 -1
- novel_downloader-1.2.1/novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +0 -95
- novel_downloader-1.2.1/novel_downloader/utils/time_utils/sleep_utils.py +0 -49
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/LICENSE +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/README.md +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/cli/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/cli/clean.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/cli/interactive.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/cli/main.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/cli/settings.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/config/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/config/loader.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/config/site_rules.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/downloaders/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/downloaders/base_async_downloader.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/downloaders/base_downloader.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/downloaders/common_asynb_downloader.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/factory/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/factory/parser_factory.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/factory/requester_factory.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/factory/saver_factory.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/async_downloader_protocol.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/downloader_protocol.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/parser_protocol.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/interfaces/saver_protocol.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/common_parser/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/common_parser/main_parser.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/browser/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/session/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/qidian_parser/shared/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/common_requester/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/common_requester/common_async_session.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/qidian_requester/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/base_saver.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/common_saver/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/common_saver/common_txt.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/common_saver/main_saver.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/epub_utils/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/epub_utils/css_builder.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/epub_utils/initializer.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/epub_utils/text_to_html.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/epub_utils/volume_intro.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/savers/qidian_saver.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/config/rules.toml +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/config/settings.yaml +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/css_styles/main.css +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/css_styles/volume-intro.css +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/images/volume_border.png +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/js_scripts/qidian_decrypt_node.js +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/json/replace_word_map.json +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/resources/text/blacklist.txt +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/cache.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/crypto_utils.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/file_utils/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/file_utils/io.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/file_utils/normalize.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/file_utils/sanitize.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/fontocr/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/fontocr/ocr_v1.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/fontocr/ocr_v2.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/hash_store.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/hash_utils.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/i18n.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/logger.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/model_loader.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/network.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/state.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/text_utils/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/text_utils/chapter_formatting.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/text_utils/diff_display.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/text_utils/font_mapping.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/text_utils/text_cleaning.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/utils/time_utils/__init__.py +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader.egg-info/SOURCES.txt +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader.egg-info/dependency_links.txt +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader.egg-info/entry_points.txt +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader.egg-info/requires.txt +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader.egg-info/top_level.txt +0 -0
- {novel_downloader-1.2.1 → novel_downloader-1.2.2}/setup.cfg +0 -0
@@ -57,6 +57,8 @@ def download_cli(ctx: Context, book_ids: List[str], site: str) -> None:
|
|
57
57
|
parser_cfg = adapter.get_parser_config()
|
58
58
|
saver_cfg = adapter.get_saver_config()
|
59
59
|
|
60
|
+
click.echo(t("download_site_mode", mode=downloader_cfg.mode))
|
61
|
+
|
60
62
|
# If no book_ids provided on the command line, try to load them from config
|
61
63
|
if not book_ids:
|
62
64
|
try:
|
@@ -23,6 +23,7 @@ from .models import (
|
|
23
23
|
RequesterConfig,
|
24
24
|
SaverConfig,
|
25
25
|
)
|
26
|
+
from .site_rules import load_site_rules
|
26
27
|
|
27
28
|
|
28
29
|
class ConfigAdapter:
|
@@ -38,19 +39,43 @@ class ConfigAdapter:
|
|
38
39
|
self._config = config
|
39
40
|
self._site = site
|
40
41
|
|
42
|
+
site_rules = load_site_rules() # -> Dict[str, SiteRules]
|
43
|
+
self._supported_sites = set(site_rules.keys())
|
44
|
+
|
41
45
|
def set_site(self, site: str) -> None:
    """
    Switch the site this adapter currently targets.

    :param site: Name of the site to store in ``self._site``.
    """
    self._site = site
|
46
50
|
|
51
|
+
def _get_site_cfg(self) -> Dict[str, Any]:
    """
    Resolve the configuration mapping for the current site.

    Lookup order:

    1. ``self._config["sites"][self._site]`` when that key exists.
    2. ``sites["common"]`` when the site has no entry of its own but is
       listed in ``self._supported_sites``.
    3. An empty dict otherwise.

    A falsy value found in step 1 or 2 (e.g. ``None``) is returned as ``{}``.

    :return: The resolved site configuration; possibly empty.
    """
    all_sites = self._config.get("sites", {}) or {}
    site = self._site

    if site in all_sites:
        # An explicit entry always wins, even when it is empty.
        chosen = all_sites[site]
    elif site in self._supported_sites:
        # Known site without its own section: share the "common" one.
        chosen = all_sites.get("common", {})
    else:
        # Unsupported site: nothing to fall back to.
        return {}
    return chosen or {}
|
71
|
+
|
47
72
|
def get_requester_config(self) -> RequesterConfig:
|
48
73
|
"""
|
49
74
|
从 config["requests"] 中读取通用请求配置 (含 DrissionPage 设置)
|
50
75
|
返回 RequesterConfig 实例
|
51
76
|
"""
|
52
77
|
req = self._config.get("requests", {})
|
53
|
-
site_cfg = self.
|
78
|
+
site_cfg = self._get_site_cfg()
|
54
79
|
return RequesterConfig(
|
55
80
|
wait_time=req.get("wait_time", 5),
|
56
81
|
retry_times=req.get("retry_times", 3),
|
@@ -73,7 +98,7 @@ class ConfigAdapter:
|
|
73
98
|
"""
|
74
99
|
gen = self._config.get("general", {})
|
75
100
|
debug = gen.get("debug", {})
|
76
|
-
site_cfg = self.
|
101
|
+
site_cfg = self._get_site_cfg()
|
77
102
|
return DownloaderConfig(
|
78
103
|
request_interval=gen.get("request_interval", 5),
|
79
104
|
raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
|
@@ -94,7 +119,7 @@ class ConfigAdapter:
|
|
94
119
|
"""
|
95
120
|
gen = self._config.get("general", {})
|
96
121
|
font_ocr = gen.get("font_ocr", {})
|
97
|
-
site_cfg = self.
|
122
|
+
site_cfg = self._get_site_cfg()
|
98
123
|
return ParserConfig(
|
99
124
|
cache_dir=gen.get("cache_dir", "./cache"),
|
100
125
|
decode_font=font_ocr.get("decode_font", False),
|
@@ -139,7 +164,7 @@ class ConfigAdapter:
|
|
139
164
|
"""
|
140
165
|
从 config["sites"][site]["book_ids"] 中提取目标书籍列表
|
141
166
|
"""
|
142
|
-
site_cfg = self.
|
167
|
+
site_cfg = self._get_site_cfg()
|
143
168
|
raw_ids = site_cfg.get("book_ids", [])
|
144
169
|
|
145
170
|
if isinstance(raw_ids, str):
|
@@ -135,16 +135,19 @@ class ChapterFieldRules(TypedDict):
|
|
135
135
|
steps: List[RuleStep]
|
136
136
|
|
137
137
|
|
138
|
-
class
|
139
|
-
has_volume: bool # 是否存在卷,false=未分卷
|
138
|
+
class VolumesRulesOptional(TypedDict, total=False):
|
140
139
|
volume_selector: str # 有卷时选择 volume 块的 selector
|
141
|
-
chapter_selector: str # 选择 chapter 节点的 selector
|
142
140
|
volume_name_steps: List[RuleStep]
|
143
|
-
chapter_steps: List[ChapterFieldRules] # 提取章节信息的步骤列表
|
144
141
|
volume_mode: str # Optional: "normal" (default) or "mixed"
|
145
142
|
list_selector: str # Optional: If "mixed" mode, parent container selector
|
146
143
|
|
147
144
|
|
145
|
+
class VolumesRules(VolumesRulesOptional):
    """
    Volume/chapter extraction rules.

    Declares the keys listed below; further keys are inherited from
    ``VolumesRulesOptional``.
    """

    has_volume: bool  # Whether volumes exist; False = book is not split into volumes
    chapter_selector: str  # Selector that picks the chapter nodes
    chapter_steps: List[ChapterFieldRules]  # Steps used to extract chapter info
|
149
|
+
|
150
|
+
|
148
151
|
class BookInfoRules(TypedDict, total=False):
|
149
152
|
book_name: FieldRules
|
150
153
|
author: FieldRules
|
@@ -67,8 +67,7 @@ class CommonDownloader(BaseDownloader):
|
|
67
67
|
cache_base = self.cache_dir / site / book_id
|
68
68
|
info_path = raw_base / "book_info.json"
|
69
69
|
chapter_dir = raw_base / "chapters"
|
70
|
-
|
71
|
-
chapters_html_dir = cache_base / "html"
|
70
|
+
chapters_html_dir = cache_base / "html"
|
72
71
|
|
73
72
|
raw_base.mkdir(parents=True, exist_ok=True)
|
74
73
|
chapter_dir.mkdir(parents=True, exist_ok=True)
|
@@ -87,8 +87,7 @@ class QidianDownloader(BaseDownloader):
|
|
87
87
|
info_path = raw_base / "book_info.json"
|
88
88
|
chapter_dir = raw_base / "chapters"
|
89
89
|
encrypted_chapter_dir = raw_base / "encrypted_chapters"
|
90
|
-
|
91
|
-
chapters_html_dir = cache_base / "html"
|
90
|
+
chapters_html_dir = cache_base / "html"
|
92
91
|
|
93
92
|
raw_base.mkdir(parents=True, exist_ok=True)
|
94
93
|
chapter_dir.mkdir(parents=True, exist_ok=True)
|
@@ -14,7 +14,7 @@ based on the site name and parser mode specified in the configuration.
|
|
14
14
|
To add support for new sites or modes, extend the `_site_map` accordingly.
|
15
15
|
"""
|
16
16
|
|
17
|
-
from typing import Union
|
17
|
+
from typing import Union, cast
|
18
18
|
|
19
19
|
from novel_downloader.config import DownloaderConfig, load_site_rules
|
20
20
|
from novel_downloader.core.downloaders import (
|
@@ -137,13 +137,15 @@ def get_downloader(
|
|
137
137
|
:raises TypeError: If the provided requester does not match the required protocol
|
138
138
|
for the chosen mode (sync vs async).
|
139
139
|
"""
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
return get_async_downloader(
|
145
|
-
|
146
|
-
if not
|
147
|
-
raise TypeError(
|
148
|
-
|
149
|
-
|
140
|
+
if requester.is_async():
|
141
|
+
if config.mode.lower() != "async":
|
142
|
+
raise TypeError("Requester is async, but config.mode is not 'async'")
|
143
|
+
async_requester = cast(AsyncRequesterProtocol, requester)
|
144
|
+
return get_async_downloader(async_requester, parser, saver, site, config)
|
145
|
+
else:
|
146
|
+
if config.mode.lower() not in ("browser", "session"):
|
147
|
+
raise TypeError(
|
148
|
+
"Requester is sync, but config.mode is not 'browser' or 'session'"
|
149
|
+
)
|
150
|
+
sync_requester = cast(RequesterProtocol, requester)
|
151
|
+
return get_sync_downloader(sync_requester, parser, saver, site, config)
|
@@ -9,7 +9,7 @@ for book info pages, individual chapters, managing request lifecycle,
|
|
9
9
|
and optionally retrieving a user's authenticated bookcase — all in async style.
|
10
10
|
"""
|
11
11
|
|
12
|
-
from typing import Optional, Protocol, runtime_checkable
|
12
|
+
from typing import Literal, Optional, Protocol, runtime_checkable
|
13
13
|
|
14
14
|
|
15
15
|
@runtime_checkable
|
@@ -21,6 +21,9 @@ class AsyncRequesterProtocol(Protocol):
|
|
21
21
|
and manage login/shutdown asynchronously.
|
22
22
|
"""
|
23
23
|
|
24
|
+
def is_async(self) -> Literal[True]:
    """
    Report whether this requester is asynchronous.

    :return: Typed as ``Literal[True]`` for this protocol.
    """
    ...
|
26
|
+
|
24
27
|
async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
|
25
28
|
"""
|
26
29
|
Attempt to log in asynchronously.
|
@@ -9,7 +9,7 @@ for book info pages, individual chapters, managing request lifecycle,
|
|
9
9
|
and optionally retrieving a user's authenticated bookcase.
|
10
10
|
"""
|
11
11
|
|
12
|
-
from typing import Optional, Protocol, runtime_checkable
|
12
|
+
from typing import Literal, Optional, Protocol, runtime_checkable
|
13
13
|
|
14
14
|
|
15
15
|
@runtime_checkable
|
@@ -20,6 +20,9 @@ class RequesterProtocol(Protocol):
|
|
20
20
|
- a specific chapter page.
|
21
21
|
"""
|
22
22
|
|
23
|
+
def is_async(self) -> Literal[False]:
    """
    Report whether this requester is asynchronous.

    :return: Typed as ``Literal[False]`` for this protocol.
    """
    ...
|
25
|
+
|
23
26
|
def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
|
24
27
|
"""
|
25
28
|
Attempt to log in
|
{novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/parsers/base_parser.py
RENAMED
@@ -45,14 +45,14 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
45
45
|
self._base_cache_dir = Path(config.cache_dir)
|
46
46
|
|
47
47
|
@abc.abstractmethod
|
48
|
-
def parse_book_info(self,
|
48
|
+
def parse_book_info(self, html_str: str) -> Dict[str, Any]:
|
49
49
|
"""
|
50
50
|
Parse a book info page and extract metadata and chapter structure.
|
51
51
|
|
52
52
|
Depending on the site structure, the return dict may include a
|
53
53
|
flat `chapters` list or nested `volumes` with chapter groups.
|
54
54
|
|
55
|
-
:param
|
55
|
+
:param html_str: Raw HTML of the book info page.
|
56
56
|
:return: Parsed metadata and chapter structure as a dictionary.
|
57
57
|
"""
|
58
58
|
...
|
@@ -62,7 +62,7 @@ class BaseParser(ParserProtocol, abc.ABC):
|
|
62
62
|
"""
|
63
63
|
Parse a single chapter page and extract clean text or simplified HTML.
|
64
64
|
|
65
|
-
:param
|
65
|
+
:param html_str: Raw HTML of the chapter page.
|
66
66
|
:param chapter_id: Identifier of the chapter being parsed.
|
67
67
|
:return: Cleaned chapter content as plain text or minimal HTML.
|
68
68
|
"""
|
@@ -188,7 +188,7 @@ class HTMLExtractor:
|
|
188
188
|
current = sep.join(current)
|
189
189
|
|
190
190
|
elif t == "attr":
|
191
|
-
name = step.get("attr")
|
191
|
+
name = step.get("attr") or ""
|
192
192
|
if isinstance(current, list):
|
193
193
|
current = [elem.get(name, "") for elem in current]
|
194
194
|
elif isinstance(current, Tag):
|
@@ -216,9 +216,9 @@ class HTMLExtractor:
|
|
216
216
|
"""
|
217
217
|
list_selector = volume_rule.get("list_selector")
|
218
218
|
volume_selector = volume_rule.get("volume_selector")
|
219
|
-
chapter_selector = volume_rule.get("chapter_selector")
|
220
219
|
volume_name_steps = volume_rule.get("volume_name_steps")
|
221
|
-
|
220
|
+
chapter_selector = volume_rule["chapter_selector"]
|
221
|
+
chapter_steps_list = volume_rule["chapter_steps"]
|
222
222
|
|
223
223
|
if not (
|
224
224
|
list_selector and volume_selector and chapter_selector and volume_name_steps
|
@@ -241,6 +241,8 @@ class HTMLExtractor:
|
|
241
241
|
for elem in list_area.find_all(
|
242
242
|
[volume_selector, chapter_selector], recursive=True
|
243
243
|
):
|
244
|
+
if not isinstance(elem, Tag):
|
245
|
+
continue
|
244
246
|
if elem.name == volume_selector:
|
245
247
|
extractor = HTMLExtractor(str(elem))
|
246
248
|
volume_name = extractor.extract_field(volume_name_steps)
|
@@ -257,9 +259,9 @@ class HTMLExtractor:
|
|
257
259
|
return volumes
|
258
260
|
|
259
261
|
def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
|
260
|
-
volume_selector = volume_rule
|
262
|
+
volume_selector = volume_rule.get("volume_selector")
|
263
|
+
volume_name_steps = volume_rule.get("volume_name_steps")
|
261
264
|
chapter_selector = volume_rule["chapter_selector"]
|
262
|
-
volume_name_steps = volume_rule["volume_name_steps"]
|
263
265
|
chapter_steps_list = volume_rule["chapter_steps"]
|
264
266
|
if not (volume_selector and volume_name_steps):
|
265
267
|
raise ValueError(
|
@@ -69,14 +69,14 @@ class QidianBrowserParser(BaseParser):
|
|
69
69
|
self._font_debug_dir = self._base_cache_dir / "font_debug"
|
70
70
|
self._font_debug_dir.mkdir(parents=True, exist_ok=True)
|
71
71
|
|
72
|
-
def parse_book_info(self,
|
72
|
+
def parse_book_info(self, html_str: str) -> Dict[str, Any]:
|
73
73
|
"""
|
74
74
|
Parse a book info page and extract metadata and chapter structure.
|
75
75
|
|
76
|
-
:param
|
76
|
+
:param html_str: Raw HTML of the book info page.
|
77
77
|
:return: Parsed metadata and chapter structure as a dictionary.
|
78
78
|
"""
|
79
|
-
return parse_book_info(
|
79
|
+
return parse_book_info(html_str)
|
80
80
|
|
81
81
|
def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
|
82
82
|
"""
|
@@ -72,14 +72,14 @@ class QidianSessionParser(BaseParser):
|
|
72
72
|
self._font_debug_dir = self._base_cache_dir / "font_debug"
|
73
73
|
self._font_debug_dir.mkdir(parents=True, exist_ok=True)
|
74
74
|
|
75
|
-
def parse_book_info(self,
|
75
|
+
def parse_book_info(self, html_str: str) -> Dict[str, Any]:
|
76
76
|
"""
|
77
77
|
Parse a book info page and extract metadata and chapter structure.
|
78
78
|
|
79
|
-
:param
|
79
|
+
:param html_str: Raw HTML of the book info page.
|
80
80
|
:return: Parsed metadata and chapter structure as a dictionary.
|
81
81
|
"""
|
82
|
-
return parse_book_info(
|
82
|
+
return parse_book_info(html_str)
|
83
83
|
|
84
84
|
def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
|
85
85
|
"""
|
@@ -0,0 +1,151 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core.parsers.qidian_parser.shared.book_info_parser
|
5
|
+
-------------------------------------------------------------------
|
6
|
+
|
7
|
+
This module provides parsing of Qidian book info pages.
|
8
|
+
|
9
|
+
It extracts metadata such as title, author, cover URL, update
|
10
|
+
time, status, word count, summary, and volume-chapter structure.
|
11
|
+
"""
|
12
|
+
|
13
|
+
import logging
|
14
|
+
import re
|
15
|
+
from typing import Any, Dict
|
16
|
+
|
17
|
+
from bs4.element import Tag
|
18
|
+
|
19
|
+
from .helpers import html_to_soup
|
20
|
+
|
21
|
+
logger = logging.getLogger(__name__)
|
22
|
+
|
23
|
+
|
24
|
+
def _chapter_url_to_id(url: str) -> str:
    """
    Extract the chapter ID as the last non-empty path segment of a URL.

    Any query string (``?...``) or fragment (``#...``) is discarded first, so
    a link such as ``.../12345/?source=m`` still yields ``"12345"`` instead
    of ``"?source=m"``. URLs without ``?`` or ``#`` give the same result as
    a plain "strip trailing slashes, take the last segment" lookup.

    :param url: Chapter URL (absolute, protocol-relative or relative).
    :return: The trailing segment, or ``""`` if nothing is left.
    """
    # Cut at the first "#" and then at the first "?" to keep only the path part.
    path = url.split("#", 1)[0].split("?", 1)[0]
    # Trailing slashes would otherwise produce an empty last segment.
    return path.rstrip("/").rsplit("/", 1)[-1]
|
29
|
+
|
30
|
+
|
31
|
+
def _get_volume_name(vol_div: Tag) -> str:
    """
    Return the title of a volume block.

    The title is read from the first ``<h3>`` matched inside ``vol_div``.
    Every ``<a>`` nested in that heading is removed from the tree first
    (note: this mutates ``vol_div``), and only the text before the first
    middle dot (``chr(183)``) is kept.

    :param vol_div: The ``<div class="volume">`` element to inspect.
    :return: The stripped volume title, or ``""`` if there is no ``<h3>``.
    """
    heading = vol_div.select_one("h3")
    if heading:
        # Links inside the heading are not part of the title text.
        for link in heading.find_all("a"):
            link.decompose()
        title, _, _ = heading.get_text(strip=True).partition(chr(183))
        return title.strip()
    return ""
|
42
|
+
|
43
|
+
|
44
|
+
def safe_select_text(
    soup: Tag,
    selector: str,
    *,
    separator: str = "",
    strip: bool = False,
    default: str = "",
) -> str:
    """
    Return the text of the first element matching a CSS selector.

    :param soup: A BeautifulSoup Tag or sub-tree to query.
    :param selector: A CSS selector string.
    :param separator: Passed to ``get_text`` as the joiner between strings.
    :param strip: Passed to ``get_text``; whether to strip whitespace.
    :param default: Value to return when the selector matches no ``Tag``.
    :return: The matched element's text, or ``default`` if nothing matched.
    """
    node = soup.select_one(selector)
    if not isinstance(node, Tag):
        return default
    return node.get_text(separator=separator, strip=strip)
|
68
|
+
|
69
|
+
|
70
|
+
def safe_select_attr(
    soup: Tag,
    selector: str,
    attr: str,
    *,
    default: str = "",
) -> str:
    """
    Return one attribute of the first element matching a CSS selector.

    A list-valued attribute is joined with single spaces; the result is
    stripped of surrounding whitespace in either case.

    :param soup: A BeautifulSoup Tag or sub-tree to query.
    :param selector: A CSS selector string.
    :param attr: The attribute name to retrieve from the selected element.
    :param default: Value to return when no element matches, the attribute
        is absent, or its value is neither a list nor a string.
    :return: The stripped attribute value, or ``default``.
    """
    node = soup.select_one(selector)
    if not isinstance(node, Tag) or attr not in node.attrs:
        return default

    raw = node.attrs[attr]
    if isinstance(raw, list):
        return " ".join(raw).strip()
    if isinstance(raw, str):
        return raw.strip()
    return default
|
94
|
+
|
95
|
+
|
96
|
+
def parse_book_info(html_str: str) -> Dict[str, Any]:
    """
    Parse a Qidian book info page into a metadata dictionary.

    Keys are filled in this order: ``book_name``, ``author``, ``cover_url``,
    ``update_time``, ``serial_status``, ``word_count``, ``summary`` and
    ``volumes``. Parsing is best-effort: if any step raises, the error is
    logged as a warning and the keys collected so far are returned.

    :param html_str: Raw HTML of the book info page.
    :return: A dict containing the book metadata that could be extracted.
    """
    info: Dict[str, Any] = {}
    try:
        soup = html_to_soup(html_str)
        info["book_name"] = safe_select_text(soup, "em#bookName", strip=True)
        info["author"] = safe_select_text(soup, "a.writer", strip=True)
        info["cover_url"] = safe_select_attr(soup, "div.book-img img", "src")

        # Drop the literal "更新时间" label from the element text.
        update_text = safe_select_text(soup, "span.book-update-time", strip=True)
        info["update_time"] = update_text.replace("更新时间", "").strip()

        info["serial_status"] = safe_select_text(soup, "span.blue", strip=True)

        # Word count is matched on the raw HTML: a number inside <em>
        # followed by a <cite> whose text ends with "字".
        wc = re.search(r"<em>([\d.]+)</em>\s*<cite>(.*?)字</cite>", html_str)
        if wc:
            info["word_count"] = f"{wc.group(1)}{wc.group(2)}字"
        else:
            info["word_count"] = "Unknown"

        info["summary"] = safe_select_text(
            soup, "div.book-intro p", separator="\n", strip=True
        )

        # Volume name is resolved before the chapters of the same volume.
        info["volumes"] = [
            {
                "volume_name": _get_volume_name(vol_div),
                "chapters": _collect_chapters(vol_div),
            }
            for vol_div in soup.select("div.volume-wrap div.volume")
        ]
    except Exception as e:
        logger.warning("[Parser] Error parsing book info: %s", e)
    return info


def _collect_chapters(vol_div: Tag) -> list:
    """Build ``title``/``url``/``chapterId`` dicts for each ``<li>`` link."""
    chapters = []
    for item in vol_div.select("li"):
        link = item.select_one("a")
        if isinstance(link, Tag) and "href" in link.attrs:
            raw_href = link["href"]
            # A list-valued href contributes its first entry only.
            first = raw_href[0] if isinstance(raw_href, list) else str(raw_href)
            href = first.strip()
            chapters.append(
                {
                    "title": link.get_text(strip=True),
                    "url": href,
                    "chapterId": _chapter_url_to_id(href),
                }
            )
    return chapters
|
@@ -16,7 +16,7 @@ import json
|
|
16
16
|
import logging
|
17
17
|
from typing import Any, Dict, Union
|
18
18
|
|
19
|
-
from bs4 import BeautifulSoup
|
19
|
+
from bs4 import BeautifulSoup, Tag
|
20
20
|
|
21
21
|
logger = logging.getLogger(__name__)
|
22
22
|
|
@@ -103,7 +103,7 @@ def find_ssr_page_context(soup: BeautifulSoup) -> Dict[str, Any]:
|
|
103
103
|
"""
|
104
104
|
try:
|
105
105
|
tag = soup.find("script", id="vite-plugin-ssr_pageContext")
|
106
|
-
if tag and tag.string:
|
106
|
+
if isinstance(tag, Tag) and tag.string:
|
107
107
|
data: Dict[str, Any] = json.loads(tag.string.strip())
|
108
108
|
return data
|
109
109
|
except Exception as e:
|
@@ -13,7 +13,7 @@ cookie handling, and defines abstract methods for subclasses.
|
|
13
13
|
import abc
|
14
14
|
import asyncio
|
15
15
|
import time
|
16
|
-
from typing import Any, Dict, Optional, Union
|
16
|
+
from typing import Any, Dict, Literal, Optional, Union
|
17
17
|
|
18
18
|
import aiohttp
|
19
19
|
from aiohttp import ClientResponse, ClientSession, ClientTimeout, TCPConnector
|
@@ -58,6 +58,9 @@ class BaseAsyncSession(AsyncRequesterProtocol, abc.ABC):
|
|
58
58
|
_cookies (Dict[str, str]): Optional cookie jar for the session.
|
59
59
|
"""
|
60
60
|
|
61
|
+
def is_async(self) -> Literal[True]:
|
62
|
+
return True
|
63
|
+
|
61
64
|
def _init_session(
|
62
65
|
self,
|
63
66
|
config: RequesterConfig,
|
{novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/base_browser.py
RENAMED
@@ -11,9 +11,10 @@ specialized purposes.
|
|
11
11
|
|
12
12
|
import abc
|
13
13
|
import logging
|
14
|
-
from typing import Any, Dict, Optional
|
14
|
+
from typing import Any, Dict, Literal, Optional, cast
|
15
15
|
|
16
|
-
from DrissionPage import Chromium, ChromiumOptions
|
16
|
+
from DrissionPage import Chromium, ChromiumOptions
|
17
|
+
from DrissionPage._pages.mix_tab import MixTab
|
17
18
|
|
18
19
|
from novel_downloader.config.models import RequesterConfig
|
19
20
|
from novel_downloader.core.interfaces import RequesterProtocol
|
@@ -42,6 +43,9 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
42
43
|
_page (ChromiumPage): The active browser tab.
|
43
44
|
"""
|
44
45
|
|
46
|
+
def is_async(self) -> Literal[False]:
|
47
|
+
return False
|
48
|
+
|
45
49
|
def _init_browser(self, config: RequesterConfig) -> None:
|
46
50
|
"""
|
47
51
|
Initialize the browser with specified options from RequesterConfig.
|
@@ -99,7 +103,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
99
103
|
Set up the browser instance and open the default tab.
|
100
104
|
"""
|
101
105
|
self._browser = Chromium(self._options)
|
102
|
-
self._page = self._browser.get_tab()
|
106
|
+
self._page = cast(MixTab, self._browser.get_tab())
|
103
107
|
|
104
108
|
def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
|
105
109
|
"""
|
@@ -151,7 +155,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
151
155
|
)
|
152
156
|
|
153
157
|
@property
|
154
|
-
def page(self) ->
|
158
|
+
def page(self) -> Optional[MixTab]:
|
155
159
|
"""
|
156
160
|
Return the current Chromium page object.
|
157
161
|
|
@@ -160,7 +164,7 @@ class BaseBrowser(RequesterProtocol, abc.ABC):
|
|
160
164
|
return self._page
|
161
165
|
|
162
166
|
@property
|
163
|
-
def browser(self) -> Chromium:
|
167
|
+
def browser(self) -> Optional[Chromium]:
|
164
168
|
"""
|
165
169
|
Return the Chromium browser instance.
|
166
170
|
|
{novel_downloader-1.2.1 → novel_downloader-1.2.2}/novel_downloader/core/requesters/base_session.py
RENAMED
@@ -10,7 +10,7 @@ persistent session and supports retries, headers, and timeout configurations.
|
|
10
10
|
"""
|
11
11
|
|
12
12
|
import abc
|
13
|
-
from typing import Any, Dict, Optional, Union
|
13
|
+
from typing import Any, Dict, Literal, Optional, Union
|
14
14
|
|
15
15
|
import requests
|
16
16
|
from requests import Response, Session
|
@@ -31,6 +31,9 @@ class BaseSession(RequesterProtocol, abc.ABC):
|
|
31
31
|
_timeout (float): Timeout for each request in seconds.
|
32
32
|
"""
|
33
33
|
|
34
|
+
def is_async(self) -> Literal[False]:
|
35
|
+
return False
|
36
|
+
|
34
37
|
def _init_session(
|
35
38
|
self, config: RequesterConfig, cookies: Optional[Dict[str, str]] = None
|
36
39
|
) -> None:
|
@@ -64,7 +64,7 @@ class CommonSession(BaseSession):
|
|
64
64
|
with self.session.get(url, timeout=self.timeout) as response:
|
65
65
|
response.raise_for_status()
|
66
66
|
content = response.text
|
67
|
-
sleep_with_random_delay(base)
|
67
|
+
sleep_with_random_delay(base, add_spread=1.0)
|
68
68
|
return content
|
69
69
|
except Exception as e:
|
70
70
|
if attempt == self.retry_times:
|
@@ -94,7 +94,7 @@ class CommonSession(BaseSession):
|
|
94
94
|
with self.session.get(url, timeout=self.timeout) as response:
|
95
95
|
response.raise_for_status()
|
96
96
|
content = response.text
|
97
|
-
sleep_with_random_delay(base)
|
97
|
+
sleep_with_random_delay(base, add_spread=1.0)
|
98
98
|
return content
|
99
99
|
except Exception as e:
|
100
100
|
if attempt == self.retry_times:
|