novel-downloader 1.2.0-py3-none-any.whl → 1.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +2 -0
- novel_downloader/config/adapter.py +41 -13
- novel_downloader/config/models.py +13 -8
- novel_downloader/core/downloaders/base_async_downloader.py +1 -1
- novel_downloader/core/downloaders/common_downloader.py +1 -2
- novel_downloader/core/downloaders/qidian_downloader.py +1 -2
- novel_downloader/core/factory/downloader_factory.py +13 -11
- novel_downloader/core/interfaces/async_requester_protocol.py +9 -4
- novel_downloader/core/interfaces/requester_protocol.py +7 -4
- novel_downloader/core/parsers/base_parser.py +3 -3
- novel_downloader/core/parsers/common_parser/helper.py +7 -5
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +1 -1
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +5 -3
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +1 -1
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +5 -3
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +74 -18
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +2 -2
- novel_downloader/core/requesters/base_async_session.py +11 -6
- novel_downloader/core/requesters/base_browser.py +12 -8
- novel_downloader/core/requesters/base_session.py +9 -6
- novel_downloader/core/requesters/common_requester/common_async_session.py +4 -2
- novel_downloader/core/requesters/common_requester/common_session.py +4 -4
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +38 -19
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +6 -6
- novel_downloader/core/savers/common_saver/common_epub.py +1 -1
- novel_downloader/locales/en.json +4 -0
- novel_downloader/locales/zh.json +4 -0
- novel_downloader/resources/config/settings.yaml +16 -13
- novel_downloader/utils/constants.py +2 -1
- novel_downloader/utils/fontocr/ocr_v2.py +6 -0
- novel_downloader/utils/time_utils/datetime_utils.py +1 -1
- novel_downloader/utils/time_utils/sleep_utils.py +27 -11
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/METADATA +1 -1
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/RECORD +39 -39
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/WHEEL +1 -1
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.2.0.dist-info → novel_downloader-1.2.2.dist-info}/top_level.txt +0 -0
novel_downloader/__init__.py
CHANGED
novel_downloader/cli/download.py
CHANGED
@@ -57,6 +57,8 @@ def download_cli(ctx: Context, book_ids: List[str], site: str) -> None:
     parser_cfg = adapter.get_parser_config()
     saver_cfg = adapter.get_saver_config()
 
+    click.echo(t("download_site_mode", mode=downloader_cfg.mode))
+
     # If no book_ids provided on the command line, try to load them from config
     if not book_ids:
         try:

novel_downloader/config/adapter.py
CHANGED
@@ -23,6 +23,7 @@ from .models import (
     RequesterConfig,
     SaverConfig,
 )
+from .site_rules import load_site_rules
 
 
 class ConfigAdapter:
@@ -38,19 +39,43 @@ class ConfigAdapter:
         self._config = config
         self._site = site
 
+        site_rules = load_site_rules()  # -> Dict[str, SiteRules]
+        self._supported_sites = set(site_rules.keys())
+
     def set_site(self, site: str) -> None:
         """
         切换当前适配的站点
         """
         self._site = site
 
+    def _get_site_cfg(self) -> Dict[str, Any]:
+        """
+        统一获取站点配置:
+
+        1. 先尝试从 self._config["sites"][self._site] 取配置
+        2. 如果没有配置, 且 self._site 在 self._supported_sites 中, 则取 sites["common"]
+        3. 否则返回空 dict
+        """
+        sites_cfg = self._config.get("sites", {}) or {}
+
+        # 1. site-specific config
+        if self._site in sites_cfg:
+            return sites_cfg[self._site] or {}
+
+        # 2. fallback to "common" only if site is supported
+        if self._site in self._supported_sites:
+            return sites_cfg.get("common", {}) or {}
+
+        # 3. completely unsupported site
+        return {}
+
     def get_requester_config(self) -> RequesterConfig:
         """
         从 config["requests"] 中读取通用请求配置 (含 DrissionPage 设置)
         返回 RequesterConfig 实例
         """
         req = self._config.get("requests", {})
-        site_cfg = self.
+        site_cfg = self._get_site_cfg()
         return RequesterConfig(
             wait_time=req.get("wait_time", 5),
             retry_times=req.get("retry_times", 3),
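
The new `_get_site_cfg` fallback can be exercised roughly as follows. This is a minimal sketch: the `ConfigAdapter(config, site)` constructor shape is inferred from the hunk above, and the book id is a placeholder.

    # Sketch: a site-specific entry wins; sites["common"] is used only for
    # sites that appear in the loaded site rules; anything else gets {}.
    config = {
        "sites": {
            "common": {"mode": "session"},
            "qidian": {"mode": "browser", "book_ids": ["<book_id>"]},
        }
    }
    adapter = ConfigAdapter(config, "qidian")
    adapter._get_site_cfg()  # -> {"mode": "browser", "book_ids": ["<book_id>"]}
    adapter.set_site("some_unsupported_site")
    adapter._get_site_cfg()  # -> {} (no sites entry, not in site rules)
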
@@ -73,7 +98,7 @@ class ConfigAdapter:
         """
         gen = self._config.get("general", {})
         debug = gen.get("debug", {})
-        site_cfg = self.
+        site_cfg = self._get_site_cfg()
         return DownloaderConfig(
             request_interval=gen.get("request_interval", 5),
             raw_data_dir=gen.get("raw_data_dir", "./raw_data"),
@@ -93,18 +118,21 @@ class ConfigAdapter:
         config["sites"][site] 中读取解析器相关配置, 返回 ParserConfig 实例
         """
         gen = self._config.get("general", {})
-
+        font_ocr = gen.get("font_ocr", {})
+        site_cfg = self._get_site_cfg()
         return ParserConfig(
             cache_dir=gen.get("cache_dir", "./cache"),
-            decode_font=
-            use_freq=
-            use_ocr=
-            use_vec=
-            ocr_version=
-            save_font_debug=
-            batch_size=
-
-
+            decode_font=font_ocr.get("decode_font", False),
+            use_freq=font_ocr.get("use_freq", False),
+            use_ocr=font_ocr.get("use_ocr", True),
+            use_vec=font_ocr.get("use_vec", False),
+            ocr_version=font_ocr.get("ocr_version", "v1.0"),
+            save_font_debug=font_ocr.get("save_font_debug", False),
+            batch_size=font_ocr.get("batch_size", 32),
+            gpu_mem=font_ocr.get("gpu_mem", 500),
+            gpu_id=font_ocr.get("gpu_id", None),
+            ocr_weight=font_ocr.get("ocr_weight", 0.6),
+            vec_weight=font_ocr.get("vec_weight", 0.4),
             mode=site_cfg.get("mode", "session"),
         )
 
@@ -136,7 +164,7 @@ class ConfigAdapter:
         """
         从 config["sites"][site]["book_ids"] 中提取目标书籍列表
         """
-        site_cfg = self.
+        site_cfg = self._get_site_cfg()
        raw_ids = site_cfg.get("book_ids", [])
 
         if isinstance(raw_ids, str):
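
The continuation of this hunk is not shown, but the `isinstance(raw_ids, str)` guard implies `book_ids` may be given either as a single string or as a list in the YAML. A normalization along these lines is the likely shape (sketch, not the package's verbatim code):

    raw_ids = site_cfg.get("book_ids", [])
    if isinstance(raw_ids, str):
        book_ids = [raw_ids]  # one id given as a bare string
    elif isinstance(raw_ids, list):
        book_ids = [str(b) for b in raw_ids]
    else:
        book_ids = []
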

novel_downloader/config/models.py
CHANGED
@@ -24,10 +24,10 @@ from typing import Any, Dict, List, Literal, Optional, TypedDict
 # === Requesters ===
 @dataclass
 class RequesterConfig:
-    wait_time:
+    wait_time: float = 5.0
     retry_times: int = 3
-    retry_interval:
-    timeout:
+    retry_interval: float = 5.0
+    timeout: float = 30.0
     headless: bool = True
     user_data_folder: str = ""
     profile_name: str = ""
@@ -41,7 +41,7 @@ class RequesterConfig:
 # === Downloaders ===
 @dataclass
 class DownloaderConfig:
-    request_interval:
+    request_interval: float = 5.0
     raw_data_dir: str = "./raw_data"
     cache_dir: str = "./novel_cache"
     download_workers: int = 4
@@ -63,6 +63,8 @@ class ParserConfig:
     use_vec: bool = False
     ocr_version: str = "v1.0"
     batch_size: int = 32
+    gpu_mem: int = 500
+    gpu_id: Optional[int] = None
     ocr_weight: float = 0.6
     vec_weight: float = 0.4
     save_font_debug: bool = False
@@ -133,16 +135,19 @@ class ChapterFieldRules(TypedDict):
     steps: List[RuleStep]
 
 
-class
-    has_volume: bool  # 是否存在卷,false=未分卷
+class VolumesRulesOptional(TypedDict, total=False):
     volume_selector: str  # 有卷时选择 volume 块的 selector
-    chapter_selector: str  # 选择 chapter 节点的 selector
     volume_name_steps: List[RuleStep]
-    chapter_steps: List[ChapterFieldRules]  # 提取章节信息的步骤列表
     volume_mode: str  # Optional: "normal" (default) or "mixed"
     list_selector: str  # Optional: If "mixed" mode, parent container selector
 
 
+class VolumesRules(VolumesRulesOptional):
+    has_volume: bool  # 是否存在卷,false=未分卷
+    chapter_selector: str  # 选择 chapter 节点的 selector
+    chapter_steps: List[ChapterFieldRules]  # 提取章节信息的步骤列表
+
+
 class BookInfoRules(TypedDict, total=False):
     book_name: FieldRules
     author: FieldRules
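
The two-class split above is the standard TypedDict idiom for mixing required and optional keys: keys declared on a `total=False` base stay optional, while keys declared on the subclass (where totality defaults to True) are required. A self-contained illustration with simplified fields:

    from typing import TypedDict

    class _Optional(TypedDict, total=False):
        volume_selector: str       # may be omitted

    class _Rules(_Optional):
        chapter_selector: str      # required

    ok: _Rules = {"chapter_selector": "li"}       # type-checks: optional key omitted
    # bad: _Rules = {"volume_selector": "div"}    # error: missing chapter_selector
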

novel_downloader/core/downloaders/base_async_downloader.py
CHANGED
@@ -94,7 +94,7 @@ class BaseAsyncDownloader(AsyncDownloaderProtocol, abc.ABC):
         return self._config.login_required
 
     @property
-    def request_interval(self) ->
+    def request_interval(self) -> float:
         return self._config.request_interval
 
     async def prepare(self) -> None:

novel_downloader/core/downloaders/common_downloader.py
CHANGED
@@ -67,8 +67,7 @@ class CommonDownloader(BaseDownloader):
         cache_base = self.cache_dir / site / book_id
         info_path = raw_base / "book_info.json"
         chapter_dir = raw_base / "chapters"
-
-        chapters_html_dir = cache_base / "html"
+        chapters_html_dir = cache_base / "html"
 
         raw_base.mkdir(parents=True, exist_ok=True)
         chapter_dir.mkdir(parents=True, exist_ok=True)

novel_downloader/core/downloaders/qidian_downloader.py
CHANGED
@@ -87,8 +87,7 @@ class QidianDownloader(BaseDownloader):
         info_path = raw_base / "book_info.json"
         chapter_dir = raw_base / "chapters"
         encrypted_chapter_dir = raw_base / "encrypted_chapters"
-
-        chapters_html_dir = cache_base / "html"
+        chapters_html_dir = cache_base / "html"
 
         raw_base.mkdir(parents=True, exist_ok=True)
         chapter_dir.mkdir(parents=True, exist_ok=True)

novel_downloader/core/factory/downloader_factory.py
CHANGED
@@ -14,7 +14,7 @@ based on the site name and parser mode specified in the configuration.
 To add support for new sites or modes, extend the `_site_map` accordingly.
 """
 
-from typing import Union
+from typing import Union, cast
 
 from novel_downloader.config import DownloaderConfig, load_site_rules
 from novel_downloader.core.downloaders import (
@@ -137,13 +137,15 @@ def get_downloader(
     :raises TypeError: If the provided requester does not match the required protocol
                        for the chosen mode (sync vs async).
     """
-
-
-
-
-        return get_async_downloader(
-
-    if not
-        raise TypeError(
-
-
+    if requester.is_async():
+        if config.mode.lower() != "async":
+            raise TypeError("Requester is async, but config.mode is not 'async'")
+        async_requester = cast(AsyncRequesterProtocol, requester)
+        return get_async_downloader(async_requester, parser, saver, site, config)
+    else:
+        if config.mode.lower() not in ("browser", "session"):
+            raise TypeError(
+                "Requester is sync, but config.mode is not 'browser' or 'session'"
+            )
+        sync_requester = cast(RequesterProtocol, requester)
+        return get_sync_downloader(sync_requester, parser, saver, site, config)
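
The dispatch leans on `is_async()` returning `Literal[True]`/`Literal[False]` plus an explicit `cast`: calling a method that returns a literal does not narrow a union-typed `requester` for the type checker, hence the casts in the hunk above. A toy version of the same shape (stub classes, not the real factory):

    from typing import Literal, Union

    class AsyncReq:
        def is_async(self) -> Literal[True]:
            return True

    class SyncReq:
        def is_async(self) -> Literal[False]:
            return False

    def dispatch(requester: Union[AsyncReq, SyncReq], mode: str) -> str:
        if requester.is_async():
            if mode.lower() != "async":
                raise TypeError("Requester is async, but config.mode is not 'async'")
            return "async downloader"
        if mode.lower() not in ("browser", "session"):
            raise TypeError("Requester is sync, but config.mode is not 'browser' or 'session'")
        return "sync downloader"

    assert dispatch(AsyncReq(), "async") == "async downloader"
    assert dispatch(SyncReq(), "session") == "sync downloader"
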

novel_downloader/core/interfaces/async_requester_protocol.py
CHANGED
@@ -9,7 +9,7 @@ for book info pages, individual chapters, managing request lifecycle,
 and optionally retrieving a user's authenticated bookcase — all in async style.
 """
 
-from typing import Optional, Protocol, runtime_checkable
+from typing import Literal, Optional, Protocol, runtime_checkable
 
 
 @runtime_checkable
@@ -21,6 +21,9 @@ class AsyncRequesterProtocol(Protocol):
     and manage login/shutdown asynchronously.
     """
 
+    def is_async(self) -> Literal[True]:
+        ...
+
     async def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
         """
         Attempt to log in asynchronously.
@@ -28,7 +31,9 @@ class AsyncRequesterProtocol(Protocol):
         """
         ...
 
-    async def get_book_info(
+    async def get_book_info(
+        self, book_id: str, wait_time: Optional[float] = None
+    ) -> str:
         """
         Fetch the raw HTML (or JSON) of the book info page asynchronously.
 
@@ -39,7 +44,7 @@ class AsyncRequesterProtocol(Protocol):
         ...
 
     async def get_book_chapter(
-        self, book_id: str, chapter_id: str, wait_time: Optional[
+        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
     ) -> str:
         """
         Fetch the raw HTML (or JSON) of a single chapter asynchronously.
@@ -51,7 +56,7 @@ class AsyncRequesterProtocol(Protocol):
         """
         ...
 
-    async def get_bookcase(self, wait_time: Optional[
+    async def get_bookcase(self, wait_time: Optional[float] = None) -> str:
         """
         Optional: Retrieve the HTML content of the authenticated
         user's bookcase page asynchronously.

novel_downloader/core/interfaces/requester_protocol.py
CHANGED
@@ -9,7 +9,7 @@ for book info pages, individual chapters, managing request lifecycle,
 and optionally retrieving a user's authenticated bookcase.
 """
 
-from typing import Optional, Protocol, runtime_checkable
+from typing import Literal, Optional, Protocol, runtime_checkable
 
 
 @runtime_checkable
@@ -20,13 +20,16 @@ class RequesterProtocol(Protocol):
     - a specific chapter page.
     """
 
+    def is_async(self) -> Literal[False]:
+        ...
+
     def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
         """
         Attempt to log in
         """
         ...
 
-    def get_book_info(self, book_id: str, wait_time: Optional[
+    def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
         """
         Fetch the raw HTML (or JSON) of the book info page.
 
@@ -37,7 +40,7 @@ class RequesterProtocol(Protocol):
         ...
 
     def get_book_chapter(
-        self, book_id: str, chapter_id: str, wait_time: Optional[
+        self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
     ) -> str:
         """
         Fetch the raw HTML (or JSON) of a single chapter.
@@ -55,7 +58,7 @@ class RequesterProtocol(Protocol):
         """
         ...
 
-    def get_bookcase(self, wait_time: Optional[
+    def get_bookcase(self, wait_time: Optional[float] = None) -> str:
         """
         Optional: Retrieve the HTML content of the authenticated user's bookcase page.
 
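
A skeletal class satisfying the methods this diff shows for `RequesterProtocol` (the real protocol may declare more members; this is only a conformance sketch, and `@runtime_checkable` isinstance checks verify method presence, not signatures):

    from typing import Literal, Optional

    class DummyRequester:
        def is_async(self) -> Literal[False]:
            return False

        def login(self, max_retries: int = 3, manual_login: bool = False) -> bool:
            return True

        def get_book_info(self, book_id: str, wait_time: Optional[float] = None) -> str:
            return "<html>book info</html>"

        def get_book_chapter(
            self, book_id: str, chapter_id: str, wait_time: Optional[float] = None
        ) -> str:
            return "<html>chapter</html>"

        def get_bookcase(self, wait_time: Optional[float] = None) -> str:
            return "<html>bookcase</html>"

    # isinstance(DummyRequester(), RequesterProtocol) would be True at runtime.
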

novel_downloader/core/parsers/base_parser.py
CHANGED
@@ -45,14 +45,14 @@ class BaseParser(ParserProtocol, abc.ABC):
         self._base_cache_dir = Path(config.cache_dir)
 
     @abc.abstractmethod
-    def parse_book_info(self,
+    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
         Depending on the site structure, the return dict may include a
         flat `chapters` list or nested `volumes` with chapter groups.
 
-        :param
+        :param html_str: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
         ...
@@ -62,7 +62,7 @@ class BaseParser(ParserProtocol, abc.ABC):
         """
         Parse a single chapter page and extract clean text or simplified HTML.
 
-        :param
+        :param html_str: Raw HTML of the chapter page.
         :param chapter_id: Identifier of the chapter being parsed.
         :return: Cleaned chapter content as plain text or minimal HTML.
         """

novel_downloader/core/parsers/common_parser/helper.py
CHANGED
@@ -188,7 +188,7 @@ class HTMLExtractor:
             current = sep.join(current)
 
         elif t == "attr":
-            name = step.get("attr")
+            name = step.get("attr") or ""
             if isinstance(current, list):
                 current = [elem.get(name, "") for elem in current]
             elif isinstance(current, Tag):
@@ -216,9 +216,9 @@ class HTMLExtractor:
         """
         list_selector = volume_rule.get("list_selector")
         volume_selector = volume_rule.get("volume_selector")
-        chapter_selector = volume_rule.get("chapter_selector")
         volume_name_steps = volume_rule.get("volume_name_steps")
-
+        chapter_selector = volume_rule["chapter_selector"]
+        chapter_steps_list = volume_rule["chapter_steps"]
 
         if not (
             list_selector and volume_selector and chapter_selector and volume_name_steps
@@ -241,6 +241,8 @@ class HTMLExtractor:
         for elem in list_area.find_all(
             [volume_selector, chapter_selector], recursive=True
         ):
+            if not isinstance(elem, Tag):
+                continue
             if elem.name == volume_selector:
                 extractor = HTMLExtractor(str(elem))
                 volume_name = extractor.extract_field(volume_name_steps)
@@ -257,9 +259,9 @@ class HTMLExtractor:
         return volumes
 
     def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
-        volume_selector = volume_rule
+        volume_selector = volume_rule.get("volume_selector")
+        volume_name_steps = volume_rule.get("volume_name_steps")
         chapter_selector = volume_rule["chapter_selector"]
-        volume_name_steps = volume_rule["volume_name_steps"]
         chapter_steps_list = volume_rule["chapter_steps"]
         if not (volume_selector and volume_name_steps):
             raise ValueError(

novel_downloader/core/parsers/qidian_parser/browser/main_parser.py
CHANGED
@@ -60,6 +60,8 @@ class QidianBrowserParser(BaseParser):
             use_ocr=config.use_ocr,
             use_vec=config.use_vec,
             batch_size=config.batch_size,
+            gpu_mem=config.gpu_mem,
+            gpu_id=config.gpu_id,
             ocr_weight=config.ocr_weight,
             vec_weight=config.vec_weight,
             font_debug=config.save_font_debug,
@@ -67,14 +69,14 @@ class QidianBrowserParser(BaseParser):
         self._font_debug_dir = self._base_cache_dir / "font_debug"
         self._font_debug_dir.mkdir(parents=True, exist_ok=True)
 
-    def parse_book_info(self,
+    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
-        :param
+        :param html_str: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        return parse_book_info(
+        return parse_book_info(html_str)
 
     def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
         """

novel_downloader/core/parsers/qidian_parser/session/main_parser.py
CHANGED
@@ -63,6 +63,8 @@ class QidianSessionParser(BaseParser):
             use_ocr=config.use_ocr,
             use_vec=config.use_vec,
             batch_size=config.batch_size,
+            gpu_mem=config.gpu_mem,
+            gpu_id=config.gpu_id,
             ocr_weight=config.ocr_weight,
             vec_weight=config.vec_weight,
             font_debug=config.save_font_debug,
@@ -70,14 +72,14 @@ class QidianSessionParser(BaseParser):
         self._font_debug_dir = self._base_cache_dir / "font_debug"
         self._font_debug_dir.mkdir(parents=True, exist_ok=True)
 
-    def parse_book_info(self,
+    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
         """
         Parse a book info page and extract metadata and chapter structure.
 
-        :param
+        :param html_str: Raw HTML of the book info page.
         :return: Parsed metadata and chapter structure as a dictionary.
         """
-        return parse_book_info(
+        return parse_book_info(html_str)
 
     def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
         """

novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py
CHANGED
@@ -41,6 +41,58 @@ def _get_volume_name(vol_div: Tag) -> str:
     return text.split(chr(183))[0].strip()
 
 
+def safe_select_text(
+    soup: Tag,
+    selector: str,
+    *,
+    separator: str = "",
+    strip: bool = False,
+    default: str = "",
+) -> str:
+    """
+    Safely select the first element matching a CSS selector and return its text.
+
+    :param soup: A BeautifulSoup Tag or sub-tree to query.
+    :param selector: A CSS selector string.
+    :param separator: Separator to use between strings when joining.
+    :param strip: Whether to strip whitespace from the result.
+    :param default: Value to return if no element is found.
+    :return: The element's text, or `default` if not found.
+    """
+    tag = soup.select_one(selector)
+    return (
+        tag.get_text(separator=separator, strip=strip)
+        if isinstance(tag, Tag)
+        else default
+    )
+
+
+def safe_select_attr(
+    soup: Tag,
+    selector: str,
+    attr: str,
+    *,
+    default: str = "",
+) -> str:
+    """
+    Safely select the first element matching a CSS selector and return one attribute.
+
+    :param soup: A BeautifulSoup Tag or sub-tree to query.
+    :param selector: A CSS selector string.
+    :param attr: The attribute name to retrieve from the selected element.
+    :param default: Value to return if no element or attribute is found.
+    :return: The attribute's value stripped of whitespace, or `default` if not found.
+    """
+    tag = soup.select_one(selector)
+    if isinstance(tag, Tag) and attr in tag.attrs:
+        value = tag.attrs[attr]
+        if isinstance(value, list):
+            return " ".join(value).strip()
+        elif isinstance(value, str):
+            return value.strip()
+    return default
+
+
 def parse_book_info(html_str: str) -> Dict[str, Any]:
     """
     Extract metadata: title, author, cover_url, update_time, status,
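
Usage sketch for the two new helpers (beautifulsoup4 required; the markup is invented for the example):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<em id="bookName">Book</em><div class="book-img"><img src="/cover.jpg"/></div>',
        "html.parser",
    )
    safe_select_text(soup, "em#bookName", strip=True)      # -> "Book"
    safe_select_text(soup, "a.writer", default="Unknown")  # -> "Unknown" (no match)
    safe_select_attr(soup, "div.book-img img", "src")      # -> "/cover.jpg"
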
@@ -52,27 +104,24 @@ def parse_book_info(html_str: str) -> Dict[str, Any]:
     info: Dict[str, Any] = {}
     try:
         soup = html_to_soup(html_str)
-        info["book_name"] = soup
-        info["author"] = soup
-        info["cover_url"] = soup
+        info["book_name"] = safe_select_text(soup, "em#bookName", strip=True)
+        info["author"] = safe_select_text(soup, "a.writer", strip=True)
+        info["cover_url"] = safe_select_attr(soup, "div.book-img img", "src")
         info["update_time"] = (
-            soup
-            .get_text(strip=True)
+            safe_select_text(soup, "span.book-update-time", strip=True)
             .replace("更新时间", "")
             .strip()
         )
-        info["serial_status"] = soup
-
-
-
-
+        info["serial_status"] = safe_select_text(soup, "span.blue", strip=True)
+
+        # Word count via regex fallback
+        match = re.search(r"<em>([\d.]+)</em>\s*<cite>(.*?)字</cite>", html_str)
+        info["word_count"] = (
+            f"{match.group(1)}{match.group(2)}字" if match else "Unknown"
         )
-
-
-
-        info["word_count"] = "Unknown"
-        info["summary"] = soup.select_one("div.book-intro p").get_text(
-            separator="\n", strip=True
+
+        info["summary"] = safe_select_text(
+            soup, "div.book-intro p", separator="\n", strip=True
         )
         # volumes
         vols = []
@@ -81,11 +130,18 @@ def parse_book_info(html_str: str) -> Dict[str, Any]:
             chaps = []
             for li in vol_div.select("li"):
                 a = li.select_one("a")
+                if not isinstance(a, Tag) or "href" not in a.attrs:
+                    continue
+                href_val = a["href"]
+                if isinstance(href_val, list):
+                    href = href_val[0].strip()
+                else:
+                    href = str(href_val).strip()
                 chaps.append(
                     {
                         "title": a.get_text(strip=True),
-                        "url":
-                        "chapterId": _chapter_url_to_id(
+                        "url": href,
+                        "chapterId": _chapter_url_to_id(href),
                     }
                 )
             vols.append({"volume_name": name, "chapters": chaps})

novel_downloader/core/parsers/qidian_parser/shared/helpers.py
CHANGED
@@ -16,7 +16,7 @@ import json
 import logging
 from typing import Any, Dict, Union
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
 logger = logging.getLogger(__name__)
 
@@ -103,7 +103,7 @@ def find_ssr_page_context(soup: BeautifulSoup) -> Dict[str, Any]:
     """
     try:
         tag = soup.find("script", id="vite-plugin-ssr_pageContext")
-        if tag and tag.string:
+        if isinstance(tag, Tag) and tag.string:
             data: Dict[str, Any] = json.loads(tag.string.strip())
             return data
     except Exception as e:
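
The `isinstance(tag, Tag)` guard matters because bs4's `find()` can also return a `NavigableString` (and is typed that way), so a bare truthiness test leaves the `.string` access unverified for type checkers. Minimal demonstration:

    from bs4 import BeautifulSoup, Tag

    soup = BeautifulSoup(
        '<script id="vite-plugin-ssr_pageContext">{"a": 1}</script>', "html.parser"
    )
    tag = soup.find("script", id="vite-plugin-ssr_pageContext")
    if isinstance(tag, Tag) and tag.string:
        print(tag.string.strip())  # {"a": 1}; tag is known to be a Tag here
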