novel-downloader 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +14 -0
- novel_downloader/cli/__init__.py +14 -0
- novel_downloader/cli/clean.py +134 -0
- novel_downloader/cli/download.py +132 -0
- novel_downloader/cli/interactive.py +67 -0
- novel_downloader/cli/main.py +45 -0
- novel_downloader/cli/settings.py +177 -0
- novel_downloader/config/__init__.py +52 -0
- novel_downloader/config/adapter.py +153 -0
- novel_downloader/config/loader.py +177 -0
- novel_downloader/config/models.py +173 -0
- novel_downloader/config/site_rules.py +97 -0
- novel_downloader/core/__init__.py +25 -0
- novel_downloader/core/downloaders/__init__.py +22 -0
- novel_downloader/core/downloaders/base_async_downloader.py +157 -0
- novel_downloader/core/downloaders/base_downloader.py +187 -0
- novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
- novel_downloader/core/downloaders/common_downloader.py +191 -0
- novel_downloader/core/downloaders/qidian_downloader.py +208 -0
- novel_downloader/core/factory/__init__.py +33 -0
- novel_downloader/core/factory/downloader_factory.py +149 -0
- novel_downloader/core/factory/parser_factory.py +62 -0
- novel_downloader/core/factory/requester_factory.py +106 -0
- novel_downloader/core/factory/saver_factory.py +49 -0
- novel_downloader/core/interfaces/__init__.py +32 -0
- novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
- novel_downloader/core/interfaces/downloader_protocol.py +37 -0
- novel_downloader/core/interfaces/parser_protocol.py +40 -0
- novel_downloader/core/interfaces/requester_protocol.py +65 -0
- novel_downloader/core/interfaces/saver_protocol.py +61 -0
- novel_downloader/core/parsers/__init__.py +28 -0
- novel_downloader/core/parsers/base_parser.py +96 -0
- novel_downloader/core/parsers/common_parser/__init__.py +14 -0
- novel_downloader/core/parsers/common_parser/helper.py +321 -0
- novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
- novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
- novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
- novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
- novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
- novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
- novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
- novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
- novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
- novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
- novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
- novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
- novel_downloader/core/requesters/__init__.py +31 -0
- novel_downloader/core/requesters/base_async_session.py +297 -0
- novel_downloader/core/requesters/base_browser.py +210 -0
- novel_downloader/core/requesters/base_session.py +243 -0
- novel_downloader/core/requesters/common_requester/__init__.py +18 -0
- novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
- novel_downloader/core/requesters/common_requester/common_session.py +126 -0
- novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
- novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
- novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
- novel_downloader/core/savers/__init__.py +20 -0
- novel_downloader/core/savers/base_saver.py +169 -0
- novel_downloader/core/savers/common_saver/__init__.py +13 -0
- novel_downloader/core/savers/common_saver/common_epub.py +232 -0
- novel_downloader/core/savers/common_saver/common_txt.py +176 -0
- novel_downloader/core/savers/common_saver/main_saver.py +86 -0
- novel_downloader/core/savers/epub_utils/__init__.py +27 -0
- novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
- novel_downloader/core/savers/epub_utils/initializer.py +98 -0
- novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
- novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
- novel_downloader/core/savers/qidian_saver.py +22 -0
- novel_downloader/locales/en.json +91 -0
- novel_downloader/locales/zh.json +91 -0
- novel_downloader/resources/config/rules.toml +196 -0
- novel_downloader/resources/config/settings.yaml +73 -0
- novel_downloader/resources/css_styles/main.css +104 -0
- novel_downloader/resources/css_styles/volume-intro.css +56 -0
- novel_downloader/resources/images/volume_border.png +0 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
- novel_downloader/resources/json/replace_word_map.json +4 -0
- novel_downloader/resources/text/blacklist.txt +22 -0
- novel_downloader/utils/__init__.py +0 -0
- novel_downloader/utils/cache.py +24 -0
- novel_downloader/utils/constants.py +158 -0
- novel_downloader/utils/crypto_utils.py +144 -0
- novel_downloader/utils/file_utils/__init__.py +43 -0
- novel_downloader/utils/file_utils/io.py +252 -0
- novel_downloader/utils/file_utils/normalize.py +68 -0
- novel_downloader/utils/file_utils/sanitize.py +77 -0
- novel_downloader/utils/fontocr/__init__.py +23 -0
- novel_downloader/utils/fontocr/ocr_v1.py +304 -0
- novel_downloader/utils/fontocr/ocr_v2.py +658 -0
- novel_downloader/utils/hash_store.py +288 -0
- novel_downloader/utils/hash_utils.py +103 -0
- novel_downloader/utils/i18n.py +41 -0
- novel_downloader/utils/logger.py +104 -0
- novel_downloader/utils/model_loader.py +72 -0
- novel_downloader/utils/network.py +287 -0
- novel_downloader/utils/state.py +156 -0
- novel_downloader/utils/text_utils/__init__.py +27 -0
- novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
- novel_downloader/utils/text_utils/diff_display.py +75 -0
- novel_downloader/utils/text_utils/font_mapping.py +31 -0
- novel_downloader/utils/text_utils/text_cleaning.py +57 -0
- novel_downloader/utils/time_utils/__init__.py +22 -0
- novel_downloader/utils/time_utils/datetime_utils.py +146 -0
- novel_downloader/utils/time_utils/sleep_utils.py +49 -0
- novel_downloader-1.1.0.dist-info/METADATA +157 -0
- novel_downloader-1.1.0.dist-info/RECORD +115 -0
- novel_downloader-1.1.0.dist-info/WHEEL +5 -0
- novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
- novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
- novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,321 @@
|
|
1
|
+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.common_parser.helper
--------------------------------------------------

Shared utility functions for parsing Common pages.
"""

import logging
import re
from typing import Any, Dict, Iterable, Iterator, List, Optional, cast

from bs4 import BeautifulSoup, Tag

# Typed rule models describing how to extract fields from a site's pages.
from novel_downloader.config import (
    BookInfoRules,
    FieldRules,
    RuleStep,
    VolumesRules,
)

# Module-level logger; handlers and levels are configured by the application.
logger = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
def html_to_soup(html_str: str) -> BeautifulSoup:
    """
    Parse raw HTML into a BeautifulSoup tree, preferring the fast
    ``lxml`` backend and degrading gracefully to the stdlib parser.

    :param html_str: Raw HTML string.
    :return: Parsed BeautifulSoup object.
    """
    try:
        soup = BeautifulSoup(html_str, "lxml")
    except Exception as exc:
        # lxml may be unavailable or reject malformed markup; html.parser
        # is slower but always present.
        logger.warning("[Parser] lxml parse failed, falling back: %s", exc)
        soup = BeautifulSoup(html_str, "html.parser")
    return soup
|
38
|
+
|
39
|
+
|
40
|
+
class HTMLExtractor:
    """
    HTML extraction engine that applies a sequence of RuleSteps to
    pull data out of a page.

    Each public method interprets a piece of the site-rule configuration
    (see ``novel_downloader.config``) against the parsed HTML held by
    this instance.
    """

    def __init__(self, html: str):
        # Keep both the raw string and the parsed tree; sub-extractors are
        # built from ``str(tag)`` of fragments (see the volume methods).
        self._html = html
        self._soup = html_to_soup(html)

    def extract_book_info(self, rules: BookInfoRules) -> Dict[str, Any]:
        """
        Extract structured book information from HTML according to the given rules.

        Only non-empty fields in the rules are processed.

        :param rules: Extraction configuration specifying how to extract.
        :return: A dictionary containing extracted book information.
        """
        book_info: Dict[str, Any] = {}

        for field_name, field_rules in rules.items():
            if field_rules is None:
                continue

            # "volumes" has a nested structure of its own; every other field
            # is a flat list of steps producing a string.
            if field_name == "volumes":
                book_info[field_name] = self.extract_volumes_structure(
                    cast(VolumesRules, field_rules)
                )
            else:
                steps = cast(FieldRules, field_rules)["steps"]
                book_info[field_name] = self.extract_field(steps)

        return book_info

    def extract_field(self, steps: List[RuleStep]) -> str:
        """
        Execute a list of extraction steps on the given HTML.

        - If any step yields None, stops processing further steps.
        - At the end, always returns a str:
            * If current is a list, converts items to text and joins with '\n'.
            * If current is a Tag, extracts its .get_text().
            * Else, uses str().

        :param steps: Ordered RuleStep dicts; each step's "type" selects the
            operation and the remaining keys are its parameters.
        :return: Extracted text (possibly empty), never None.
        """

        # Flatten arbitrarily nested lists produced by chained select/find_all
        # steps into a single stream of items.
        def flatten_list(items: Iterable[Any]) -> Iterator[Any]:
            for item in items:
                if isinstance(item, list):
                    yield from flatten_list(item)
                else:
                    yield item

        # Convert a single item (Tag or anything else) to stripped text.
        def to_text(item: Any) -> str:
            if isinstance(item, Tag):
                return str(item.get_text().strip())
            return str(item).strip()

        # ``current`` threads through the step pipeline; it may be a Tag,
        # a list of Tags, a string, a list of strings, or None.
        current: Any = self._soup

        for step in steps:
            t = step.get("type")
            if t == "select_one":
                # CSS select, first match (or None when no selector given).
                sel = step.get("selector")
                current = current.select_one(sel) if sel else None

            elif t == "select":
                # CSS select, all matches; optional "index" picks one element.
                # NOTE(review): an out-of-range index silently falls back to
                # the whole list instead of failing — confirm this is intended.
                sel = step.get("selector")
                lst = current.select(sel) if sel else []
                idx = step.get("index")
                current = lst[idx] if idx is not None and idx < len(lst) else lst

            elif t == "exclude":
                # Remove matching sub-elements in place (e.g. ads, nav bars).
                sel = step.get("selector")
                for elem in current.select(sel or ""):
                    elem.decompose()

            elif t == "find":
                # BeautifulSoup find() by tag name and attributes.
                nm = step.get("name")
                attrs = step.get("attrs") or {}
                current = current.find(nm, attrs=attrs)

            elif t == "find_all":
                # BeautifulSoup find_all(); optional "limit" and "index".
                nm = step.get("name")
                attrs = step.get("attrs") or {}
                lst = current.find_all(nm, attrs=attrs, limit=step.get("limit"))
                idx = step.get("index")
                current = lst[idx] if idx is not None and idx < len(lst) else lst

            elif t == "text":
                # Collapse Tag(s) to their visible text.
                if isinstance(current, list):
                    current = [elem.get_text() for elem in current]
                elif isinstance(current, Tag):
                    current = current.get_text()

            elif t == "strip":
                # str.strip with optional "chars"; no-op on non-strings.
                chars = step.get("chars")
                if isinstance(current, list):
                    current = [c.strip(chars) for c in current]
                elif isinstance(current, str):
                    current = current.strip(chars)

            elif t == "regex":
                # Search ``current`` (stringified) with "pattern"/"flags".
                # Either expand "template" ($1, $2, ... placeholders) or take
                # a single "group" (default: whole match). No match -> "".
                txt = str(current or "")
                pat = step.get("pattern") or ""
                flags = step.get("flags")
                flags = flags if flags is not None else 0
                match = re.compile(pat, flags).search(txt)
                if match:
                    template = step.get("template")
                    if template:
                        s = template
                        for i in range(1, len(match.groups()) + 1):
                            s = s.replace(f"${i}", match.group(i) or "")
                        current = s
                    else:
                        grp = step.get("group")
                        grp = grp if grp is not None else 0
                        current = match.group(grp)
                else:
                    current = ""

            elif t == "replace":
                # str.replace with optional occurrence "count" (default: all).
                old = step.get("old")
                old = old if old is not None else ""

                new = step.get("new")
                new = new if new is not None else ""

                cnt = step.get("count")
                cnt = cnt if cnt is not None else -1

                if isinstance(current, list):
                    current = [c.replace(old, new, cnt) for c in current]
                elif isinstance(current, str):
                    current = current.replace(old, new, cnt)

            elif t == "split":
                # Split and pick one part; out-of-range index yields "".
                # NOTE(review): the default "sep" is "" and str.split("")
                # raises ValueError — rules are presumably expected to always
                # supply a non-empty "sep"; confirm against rules.toml.
                sep = step.get("sep", "")
                idx = step.get("index")
                idx = idx if idx is not None else 0
                parts = (current or "").split(sep)
                current = parts[idx] if idx < len(parts) else ""

            elif t == "join":
                # Join a list of strings with "sep" (default: "").
                sep = step.get("sep")
                sep = sep if sep is not None else ""
                if isinstance(current, list):
                    current = sep.join(current)

            elif t == "attr":
                # Read an HTML attribute from Tag(s); missing attr -> "".
                name = step.get("attr")
                if isinstance(current, list):
                    current = [elem.get(name, "") for elem in current]
                elif isinstance(current, Tag):
                    current = current.get(name, "")

            else:
                raise ValueError(f"Unsupported step type: {t}")

            # A step that produced nothing ends the pipeline early.
            if current is None:
                break

        # Final normalization
        if isinstance(current, list):
            flat = list(flatten_list(current))
            texts = [to_text(x) for x in flat if x is not None]
            return "\n".join(texts)
        if isinstance(current, Tag):
            return str(current.get_text().strip())
        return str(current or "").strip()

    def extract_mixed_volumes(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
        """
        Special mode: mixed <volume> and <chapter> under same parent.
        (e.g., dt / dd pattern in BiQuGe)

        Walks the children of ``list_selector`` in document order; each
        volume-tag starts a new volume and subsequent chapter-tags are
        appended to it. Chapter-tags seen before any volume-tag are dropped.

        :param volume_rule: Rules with list/volume/chapter selectors and steps.
        :return: List of {"volume_name": ..., "chapters": [...]} dicts.
        :raises ValueError: If required selectors/steps are missing or the
            list area cannot be found.
        """
        list_selector = volume_rule.get("list_selector")
        volume_selector = volume_rule.get("volume_selector")
        chapter_selector = volume_rule.get("chapter_selector")
        volume_name_steps = volume_rule.get("volume_name_steps")
        chapter_steps_list = volume_rule.get("chapter_steps")

        if not (
            list_selector and volume_selector and chapter_selector and volume_name_steps
        ):
            raise ValueError(
                "volume_mode='mixed' 时, 必须提供 list_selector, volume_selector, "
                "chapter_selector 和 volume_name_steps"
            )

        volumes: List[Dict[str, Any]] = []
        current_volume: Optional[Dict[str, Any]] = None
        if not chapter_steps_list:
            chapter_steps_list = []
        # Map each chapter field name to its extraction steps.
        chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}

        list_area = self._soup.select_one(list_selector)
        if not list_area:
            raise ValueError(f"找不到 list_selector: {list_selector}")

        # NOTE: volume_selector / chapter_selector are used here as *tag
        # names* (find_all + elem.name), not CSS selectors.
        for elem in list_area.find_all(
            [volume_selector, chapter_selector], recursive=True
        ):
            if elem.name == volume_selector:
                # New volume header: run the name steps on this fragment only.
                extractor = HTMLExtractor(str(elem))
                volume_name = extractor.extract_field(volume_name_steps)
                current_volume = {"volume_name": volume_name, "chapters": []}
                volumes.append(current_volume)

            elif elem.name == chapter_selector and current_volume is not None:
                chap_extractor = HTMLExtractor(str(elem))
                chapter_data = {}
                for field, steps in chapter_info_steps.items():
                    chapter_data[field] = chap_extractor.extract_field(steps)
                current_volume["chapters"].append(chapter_data)

        return volumes

    def extract_volume_blocks(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
        """
        Extract volumes when each volume is a self-contained block that
        contains its own chapter elements (``has_volume=True`` mode).

        :param volume_rule: Rules with volume/chapter selectors and steps.
            NOTE(review): "chapter_selector", "volume_name_steps" and
            "chapter_steps" are read with [] and would raise KeyError if
            absent — presumably guaranteed by rule validation; confirm.
        :return: List of {"volume_name": ..., "chapters": [...]} dicts.
        :raises ValueError: If volume_selector or volume_name_steps is falsy.
        """
        volume_selector = volume_rule["volume_selector"]
        chapter_selector = volume_rule["chapter_selector"]
        volume_name_steps = volume_rule["volume_name_steps"]
        chapter_steps_list = volume_rule["chapter_steps"]
        if not (volume_selector and volume_name_steps):
            raise ValueError(
                "has_volume=True 时, 必须提供 volume_selector 和 volume_name_steps"
            )
        volumes = []
        # Map each chapter field name to its extraction steps.
        chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}
        for vol in self._soup.select(volume_selector):
            # Run the volume-name steps against this volume fragment only.
            extractor = HTMLExtractor(str(vol))
            volume_name = extractor.extract_field(volume_name_steps)

            chapters = []
            for chap in vol.select(chapter_selector):
                chap_extractor = HTMLExtractor(str(chap))
                chapter_data = {}
                for field, steps in chapter_info_steps.items():
                    chapter_data[field] = chap_extractor.extract_field(steps)
                chapters.append(chapter_data)

            volumes.append({"volume_name": volume_name, "chapters": chapters})

        return volumes

    def extract_flat_chapters(self, volume_rule: VolumesRules) -> List[Dict[str, Any]]:
        """
        Extract a flat chapter list (``has_volume=False`` mode) and wrap it
        in a single synthetic volume named "未分卷" ("no volume").

        :param volume_rule: Rules with chapter selector/steps; an optional
            "volume_selector" narrows the search areas first.
        :return: Single-element list: [{"volume_name": "未分卷", "chapters": [...]}].
        """
        chapter_selector = volume_rule["chapter_selector"]
        chapter_steps_list = volume_rule["chapter_steps"]
        volume_selector = volume_rule.get("volume_selector")
        volumes = []
        # Map each chapter field name to its extraction steps.
        chapter_info_steps = {item["key"]: item["steps"] for item in chapter_steps_list}

        # Search inside each selected area, or the whole document.
        if volume_selector:
            candidates = self._soup.select(volume_selector)
        else:
            candidates = [self._soup]

        all_chapters = []
        for area in candidates:
            for chap in area.select(chapter_selector):
                chap_extractor = HTMLExtractor(str(chap))
                chapter_data = {}
                for field, steps in chapter_info_steps.items():
                    chapter_data[field] = chap_extractor.extract_field(steps)
                all_chapters.append(chapter_data)

        volumes.append({"volume_name": "未分卷", "chapters": all_chapters})

        return volumes

    def extract_volumes_structure(
        self, volume_rule: VolumesRules
    ) -> List[Dict[str, Any]]:
        """
        Dispatch to the appropriate volume-extraction strategy:
        "mixed" mode, volume blocks (default), or a flat chapter list.

        :param volume_rule: Volume extraction rules.
        :return: List of volume dicts (see the strategy methods).
        """
        volume_mode = volume_rule.get("volume_mode", "normal")
        if volume_mode == "mixed":
            return self.extract_mixed_volumes(volume_rule)

        if volume_rule.get("has_volume", True):
            return self.extract_volume_blocks(volume_rule)
        else:
            return self.extract_flat_chapters(volume_rule)
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
novel_downloader.core.parsers.common_parser.main_parser
-------------------------------------------------------

This module provides the rule-driven ``CommonParser`` for handling
Common pages.
"""

from typing import Any, Dict

from novel_downloader.config import ParserConfig, SiteRules

from ..base_parser import BaseParser
from .helper import HTMLExtractor
|
17
|
+
|
18
|
+
|
19
|
+
class CommonParser(BaseParser):
    """
    Rule-driven parser for generic ("common") novel sites.

    Extends :class:`BaseParser` with a target site name and a
    :class:`SiteRules` bundle describing how to extract data from
    that site's pages.
    """

    def __init__(self, config: ParserConfig, site: str, site_rule: SiteRules):
        """
        Initialize the parser with configuration, site name, and site-specific rules.

        :param config: ParserConfig object controlling parsing behavior.
        :param site: Name of the site this parser is targeting.
        :param site_rule: SiteRules object containing parsing rules for the site.
        """
        super().__init__(config)
        self._site = site
        self._site_rule = site_rule

    def parse_book_info(self, html_str: str) -> Dict[str, Any]:
        """
        Parse a book info page and extract metadata and chapter structure.

        :param html_str: Raw HTML of the book info page.
        :return: Parsed metadata and chapter structure as a dictionary.
        """
        info_rules = self._site_rule["book_info"]
        return HTMLExtractor(html_str).extract_book_info(info_rules)

    def parse_chapter(self, html_str: str, chapter_id: str) -> Dict[str, Any]:
        """
        Parse a single chapter page and extract clean text or simplified HTML.

        :param html_str: Raw HTML of the chapter page.
        :param chapter_id: Identifier of the chapter being parsed.
        :return: Chapter dict with id/title/content/site, or {} when the
            page yielded no content.
        :raises ValueError: If the site rules define no content steps.
        """
        extractor = HTMLExtractor(html_str)
        chapter_rules = self._site_rule["chapter"]

        # Body-content steps are mandatory; a rule set without them is unusable.
        content_steps = chapter_rules.get("content")
        if not content_steps:
            raise ValueError(f"No chapter content steps defined for site: {self._site}")

        title_steps = chapter_rules.get("title")
        title = extractor.extract_field(title_steps["steps"]) if title_steps else ""
        content = extractor.extract_field(content_steps["steps"])
        if not content:
            return {}

        return {
            "id": chapter_id,
            "title": title or "Untitled",
            "content": content,
            "site": self._site,
        }

    @property
    def site(self) -> str:
        """Name of the site this parser targets."""
        return self._site

    @property
    def site_rule(self) -> SiteRules:
        """Site-specific parsing rules."""
        return self._site_rule
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core.parsers.qidian_parser
|
5
|
+
-------------------------------------------
|
6
|
+
|
7
|
+
This package provides parsing implementations for the Qidian platform.
|
8
|
+
|
9
|
+
Modules:
|
10
|
+
- browser: Contains `QidianBrowserParser` for browser-rendered page parsing.
|
11
|
+
- session: Contains `QidianSessionParser` for session page parsing.
|
12
|
+
"""
|
13
|
+
|
14
|
+
from .browser import QidianBrowserParser
|
15
|
+
from .session import QidianSessionParser
|
16
|
+
|
17
|
+
__all__ = [
|
18
|
+
"QidianBrowserParser",
|
19
|
+
"QidianSessionParser",
|
20
|
+
]
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
novel_downloader.core.parsers.qidian_parser.browser
|
5
|
+
---------------------------------------------------
|
6
|
+
|
7
|
+
This package provides parsing components for handling Qidian
|
8
|
+
pages that have been rendered by a browser engine.
|
9
|
+
"""
|
10
|
+
|
11
|
+
from .main_parser import QidianBrowserParser
|
12
|
+
|
13
|
+
__all__ = ["QidianBrowserParser"]
|