novel-downloader 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +3 -3
  3. novel_downloader/cli/export.py +1 -1
  4. novel_downloader/cli/ui.py +7 -7
  5. novel_downloader/config/adapter.py +191 -154
  6. novel_downloader/core/__init__.py +5 -6
  7. novel_downloader/core/exporters/common/txt.py +9 -9
  8. novel_downloader/core/exporters/linovelib/txt.py +9 -9
  9. novel_downloader/core/fetchers/qidian.py +20 -35
  10. novel_downloader/core/interfaces/fetcher.py +2 -2
  11. novel_downloader/core/interfaces/parser.py +2 -2
  12. novel_downloader/core/parsers/base.py +1 -0
  13. novel_downloader/core/parsers/eightnovel.py +2 -2
  14. novel_downloader/core/parsers/esjzone.py +3 -3
  15. novel_downloader/core/parsers/qidian/main_parser.py +747 -12
  16. novel_downloader/core/parsers/qidian/utils/__init__.py +2 -21
  17. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
  18. novel_downloader/core/parsers/xiguashuwu.py +6 -12
  19. novel_downloader/locales/en.json +3 -3
  20. novel_downloader/locales/zh.json +3 -3
  21. novel_downloader/utils/__init__.py +0 -2
  22. novel_downloader/utils/chapter_storage.py +2 -3
  23. novel_downloader/utils/constants.py +1 -3
  24. novel_downloader/utils/cookies.py +32 -17
  25. novel_downloader/utils/crypto_utils/__init__.py +0 -6
  26. novel_downloader/utils/crypto_utils/rc4.py +40 -50
  27. novel_downloader/utils/epub/__init__.py +2 -3
  28. novel_downloader/utils/epub/builder.py +6 -6
  29. novel_downloader/utils/epub/constants.py +5 -5
  30. novel_downloader/utils/epub/documents.py +7 -7
  31. novel_downloader/utils/epub/models.py +8 -8
  32. novel_downloader/utils/epub/utils.py +10 -10
  33. novel_downloader/utils/file_utils/io.py +48 -73
  34. novel_downloader/utils/file_utils/normalize.py +1 -7
  35. novel_downloader/utils/file_utils/sanitize.py +4 -11
  36. novel_downloader/utils/fontocr/__init__.py +13 -0
  37. novel_downloader/utils/{fontocr.py → fontocr/core.py} +70 -61
  38. novel_downloader/utils/fontocr/loader.py +50 -0
  39. novel_downloader/utils/logger.py +80 -56
  40. novel_downloader/utils/network.py +16 -40
  41. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  42. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  43. novel_downloader/utils/time_utils/sleep_utils.py +53 -43
  44. novel_downloader/web/main.py +1 -1
  45. novel_downloader/web/pages/search.py +3 -3
  46. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/METADATA +2 -1
  47. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/RECORD +51 -55
  48. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
  49. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
  50. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
  51. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  52. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
  53. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
  54. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
  55. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +0 -0
  56. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
  57. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -1,470 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_encrypted
4
- ------------------------------------------------------
5
-
6
- Support for parsing encrypted chapters from Qidian using font OCR mapping,
7
- CSS rules, and custom rendering logic.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import json
13
- import logging
14
- import re
15
- from contextlib import suppress
16
- from typing import TYPE_CHECKING, TypedDict
17
-
18
- from lxml import html
19
-
20
- from novel_downloader.models import ChapterDict
21
- from novel_downloader.utils import (
22
- download,
23
- truncate_half_lines,
24
- )
25
-
26
- from .utils import (
27
- extract_chapter_info,
28
- find_ssr_page_context,
29
- get_decryptor,
30
- is_duplicated,
31
- vip_status,
32
- )
33
- from .utils.fontmap_recover import (
34
- apply_font_mapping,
35
- generate_font_map,
36
- )
37
-
38
- if TYPE_CHECKING:
39
- from .main_parser import QidianParser
40
-
41
- logger = logging.getLogger(__name__)
42
- _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
43
- _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
44
-
45
-
46
- class Rule(TypedDict, total=False):
47
- delete_all: bool
48
- delete_first: bool
49
- transform_flip_x: bool
50
- append_start_char: str
51
- append_end_char: str
52
- append_start_attr: str
53
- append_end_attr: str
54
-
55
-
56
- class Rules(TypedDict):
57
- # e.g., orders = ["i", "em", "span"]
58
- orders: list[str]
59
- # e.g., sy["sy-3"] -> Rule
60
- sy: dict[str, Rule]
61
- # e.g., p_rules["p3"]["i"] -> Rule
62
- p_rules: dict[str, dict[str, Rule]]
63
-
64
-
65
- def parse_encrypted_chapter(
66
- parser: QidianParser,
67
- html_str: str,
68
- chapter_id: str,
69
- ) -> ChapterDict | None:
70
- """
71
- Extract and return the formatted textual content of an encrypted chapter.
72
-
73
- Steps:
74
- 1. Load SSR JSON context for CSS, fonts, and metadata.
75
- 2. Decode and save randomFont bytes; download fixedFont via download().
76
- 3. Decrypt the chapter content when the chapter is VIP.
77
- 4. Parse CSS rules and render the visible paragraphs (optionally saving debug text).
78
- 5. Run OCR font-mapping on the rendered text (optionally saving debug files).
79
- 6. Strip and join the mapped lines into the final chapter text.
80
-
81
- :param html_str: Raw HTML content of the chapter page.
82
- :return: Parsed chapter dict, or None if the chapter cannot be parsed.
83
- """
84
- try:
85
- if not parser._decode_font:
86
- return None
87
- ssr_data = find_ssr_page_context(html_str)
88
- chapter_info = extract_chapter_info(ssr_data)
89
- if not chapter_info:
90
- logger.warning(
91
- "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
92
- )
93
- return None
94
-
95
- debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
96
- if parser.save_font_debug:
97
- debug_dir.mkdir(parents=True, exist_ok=True)
98
-
99
- css_str = chapter_info["css"]
100
- randomFont_str = chapter_info["randomFont"]
101
- fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
102
-
103
- title = chapter_info.get("chapterName", "Untitled")
104
- duplicated = is_duplicated(ssr_data)
105
- raw_html = chapter_info.get("content", "")
106
- chapter_id = chapter_info.get("chapterId", chapter_id)
107
- fkp = chapter_info.get("fkp", "")
108
- author_say = chapter_info.get("authorSay", "")
109
- update_time = chapter_info.get("updateTime", "")
110
- update_timestamp = chapter_info.get("updateTimestamp", 0)
111
- modify_time = chapter_info.get("modifyTime", 0)
112
- word_count = chapter_info.get("actualWords", 0)
113
- seq = chapter_info.get("seq", None)
114
- volume = chapter_info.get("extra", {}).get("volumeName", "")
115
-
116
- # extract + save font
117
- rf = json.loads(randomFont_str)
118
- rand_path = parser._base_cache_dir / "randomFont.ttf"
119
- rand_path.parent.mkdir(parents=True, exist_ok=True)
120
- rand_path.write_bytes(bytes(rf["data"]))
121
-
122
- fixed_path = download(
123
- url=fixedFontWoff2_url,
124
- target_dir=parser._fixed_font_dir,
125
- stream=True,
126
- )
127
- if fixed_path is None:
128
- raise ValueError("fixed_path is None: failed to download font")
129
-
130
- # Extract and render paragraphs from HTML with CSS rules
131
- if vip_status(ssr_data):
132
- try:
133
- decryptor = get_decryptor()
134
- raw_html = decryptor.decrypt(
135
- raw_html,
136
- chapter_id,
137
- fkp,
138
- parser._fuid,
139
- )
140
- except Exception as e:
141
- logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
142
- return None
143
-
144
- css_rules = parse_css_rules(css_str)
145
- paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
146
- if parser.save_font_debug:
147
- paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
148
- paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
149
-
150
- # Run OCR + fallback mapping
151
- char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
152
- refl_set = set(refl_list)
153
- char_set = char_set - refl_set
154
- if parser.save_font_debug:
155
- char_sets_path = debug_dir / "char_set_debug.txt"
156
- temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
157
- char_sets_path.write_text(
158
- temp,
159
- encoding="utf-8",
160
- )
161
-
162
- mapping_result = generate_font_map(
163
- fixed_font_path=fixed_path,
164
- random_font_path=rand_path,
165
- char_set=char_set,
166
- refl_set=refl_set,
167
- cache_dir=parser._base_cache_dir,
168
- batch_size=parser._config.batch_size,
169
- )
170
- if not mapping_result:
171
- return None
172
-
173
- if parser.save_font_debug:
174
- mapping_json_path = debug_dir / "font_mapping.json"
175
- mapping_json_path.write_text(
176
- json.dumps(mapping_result, ensure_ascii=False, indent=2),
177
- encoding="utf-8",
178
- )
179
-
180
- # Reconstruct final readable text
181
- original_text = apply_font_mapping(
182
- text=paragraphs_str,
183
- font_map=mapping_result,
184
- )
185
-
186
- final_paragraphs_str = "\n".join(
187
- line.strip() for line in original_text.splitlines() if line.strip()
188
- )
189
- if parser._use_truncation and duplicated:
190
- final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
191
-
192
- return {
193
- "id": str(chapter_id),
194
- "title": str(title),
195
- "content": final_paragraphs_str,
196
- "extra": {
197
- "author_say": author_say.strip() if author_say else "",
198
- "updated_at": update_time,
199
- "update_timestamp": update_timestamp,
200
- "modify_time": modify_time,
201
- "word_count": word_count,
202
- "duplicated": duplicated,
203
- "seq": seq,
204
- "volume": volume,
205
- "encrypted": True,
206
- },
207
- }
208
-
209
- except Exception as e:
210
- logger.warning(
211
- "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
212
- )
213
- return None
214
-
215
-
216
- def _only_tag(selector: str) -> str | None:
217
- """
218
- Normalize a selector into just its tag name for ordering.
219
-
220
- Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
221
-
222
- Returns None if a tag name cannot be extracted.
223
- """
224
- sel = selector.strip()
225
- # If it has spaces, take the rightmost simple selector
226
- last = sel.split()[-1]
227
- # Drop ::pseudo
228
- last = last.split("::", 1)[0]
229
- # If it's like 'span[attr=..]' keep 'span'
230
- last = last.split("[", 1)[0]
231
- # If it starts with '.', it's not a tag
232
- if not last or last.startswith("."):
233
- return None
234
- return last
235
-
236
-
237
- def _parse_decls(block: str) -> list[tuple[str, str]]:
238
- """
239
- Parse 'name:value;...' inside a block. Tolerates quotes and attr().
240
- """
241
- decls: list[tuple[str, str]] = []
242
- i = 0
243
- n = len(block)
244
- name: list[str] = []
245
- val: list[str] = []
246
- in_name = True
247
- quote = None # track ' or "
248
- while i < n:
249
- c = block[i]
250
- if quote:
251
- # inside quotes
252
- if c == "\\" and i + 1 < n:
253
- # keep escaped char
254
- (name if in_name else val).append(c)
255
- i += 1
256
- (name if in_name else val).append(block[i])
257
- elif c == quote:
258
- (name if in_name else val).append(c)
259
- quote = None
260
- else:
261
- (name if in_name else val).append(c)
262
- else:
263
- if c in ("'", '"'):
264
- (name if in_name else val).append(c)
265
- quote = c
266
- elif in_name and c == ":":
267
- in_name = False
268
- elif c == ";":
269
- nm = "".join(name).strip().lower()
270
- vl = "".join(val).strip()
271
- if nm:
272
- decls.append((nm, vl))
273
- name.clear()
274
- val.clear()
275
- in_name = True
276
- else:
277
- (name if in_name else val).append(c)
278
- i += 1
279
-
280
- if name or val:
281
- nm = "".join(name).strip().lower()
282
- vl = "".join(val).strip()
283
- if nm:
284
- decls.append((nm, vl))
285
- return decls
286
-
287
-
288
- def parse_css_rules(css_str: str) -> Rules:
289
- """
290
- Produces normalized Rules with:
291
- - orders: list[str] of tag names sorted by numeric 'order'
292
- - sy: '.sy-*' class rules
293
- - p_rules: '.p* <tag>' rules, indexed by p-class then tag
294
- """
295
- rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
296
- order_pairs: list[tuple[str, int]] = []
297
-
298
- i = 0
299
- while True:
300
- b1 = css_str.find("{", i)
301
- if b1 == -1:
302
- break
303
- selector = css_str[i:b1].strip().lower()
304
- b2 = css_str.find("}", b1 + 1)
305
- if b2 == -1:
306
- break
307
- block = css_str[b1 + 1 : b2]
308
- i = b2 + 1
309
-
310
- decls = _parse_decls(block)
311
-
312
- new_rule: Rule = {}
313
- order_val: int | None = None
314
-
315
- for name, value in decls:
316
- v = value.strip()
317
- if name == "font-size" and v == "0":
318
- if "::first-letter" in selector:
319
- new_rule["delete_first"] = True
320
- else:
321
- new_rule["delete_all"] = True
322
- elif name == "transform":
323
- if _RE_SCALEX.search(v.replace(" ", "")):
324
- new_rule["transform_flip_x"] = True
325
- elif name == "order":
326
- with suppress(ValueError, TypeError):
327
- order_val = int(v)
328
- elif name == "content":
329
- # normalize: remove outer quotes
330
- if "::after" in selector:
331
- m = _RE_ATTR.search(v)
332
- if m:
333
- new_rule["append_end_attr"] = m.group(1)
334
- else:
335
- s = v.strip().strip("\"'")
336
- new_rule["append_end_char"] = s
337
- elif "::before" in selector:
338
- m = _RE_ATTR.search(v)
339
- if m:
340
- new_rule["append_start_attr"] = m.group(1)
341
- else:
342
- s = v.strip().strip("\"'")
343
- new_rule["append_start_char"] = s
344
-
345
- # classification
346
- if selector.startswith(".sy-"):
347
- key = selector.lstrip(".")
348
- old = rules["sy"].get(key)
349
- rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
350
-
351
- elif selector.startswith(".p") and " " in selector:
352
- p_cls, right = selector.split(" ", 1)
353
- p_cls = p_cls.lstrip(".")
354
- tag = _only_tag(right)
355
- if tag:
356
- prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
357
- rules["p_rules"][p_cls][tag] = (
358
- {**prev, **new_rule} if prev else (new_rule or {})
359
- )
360
-
361
- if order_val is not None:
362
- tag_for_order = _only_tag(selector)
363
- if tag_for_order:
364
- order_pairs.append((tag_for_order, order_val))
365
-
366
- # normalize orders
367
- order_pairs.sort(key=lambda t: t[1])
368
- seen = set()
369
- orders: list[str] = []
370
- for tag, _num in order_pairs:
371
- if tag not in seen:
372
- seen.add(tag)
373
- orders.append(tag)
374
- rules["orders"] = orders
375
- return rules
376
-
377
-
378
- def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
379
- """
380
- Render the HTML using pre-parsed Rules.
381
- """
382
- tree = html.fromstring(html_str)
383
- paragraphs_out: list[str] = []
384
- refl_list: list[str] = []
385
- orders = rules.get("orders") or []
386
- p_rules = rules.get("p_rules") or {}
387
- sy_rules = rules.get("sy") or {}
388
-
389
- def _class_list(el: html.HtmlElement) -> list[str]:
390
- cls = el.get("class")
391
- return cls.split() if cls else []
392
-
393
- def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
394
- if rule.get("delete_all"):
395
- return ""
396
-
397
- parts: list[str] = []
398
- if "append_start_char" in rule:
399
- parts.append(rule["append_start_char"])
400
- if "append_start_attr" in rule:
401
- parts.append(el.get(rule["append_start_attr"], ""))
402
-
403
- text = el.text or ""
404
- if rule.get("delete_first") and text:
405
- text = text[1:]
406
- parts.append(text)
407
-
408
- if "append_end_char" in rule:
409
- parts.append(rule["append_end_char"])
410
- if "append_end_attr" in rule:
411
- parts.append(el.get(rule["append_end_attr"], ""))
412
-
413
- s = "".join(parts)
414
-
415
- if rule.get("transform_flip_x") and s:
416
- refl_list.append(s)
417
-
418
- return s
419
-
420
- for p in tree.findall(".//p"):
421
- p_classes = _class_list(p)
422
- p_key = next((c for c in p_classes if c.startswith("p")), None)
423
- has_ordered_rules = p_key in p_rules
424
-
425
- buf_parts: list[str] = []
426
-
427
- if p.text and not has_ordered_rules:
428
- buf_parts.append(p.text)
429
-
430
- ordered_cache: dict[str, list[str]] = {}
431
-
432
- for child in p:
433
- tag = str(child.tag)
434
-
435
- # Handle inline <y class="sy-*"> spans
436
- if tag == "y" and not has_ordered_rules:
437
- y_cls = next(
438
- (c for c in _class_list(child) if c.startswith("sy-")), None
439
- )
440
- if y_cls and y_cls in sy_rules:
441
- buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
442
- else:
443
- buf_parts.append(child.text or "")
444
- if child.tail:
445
- buf_parts.append(child.tail)
446
- continue
447
-
448
- # Handle ordered paragraphs: only cache tags that appear in `orders`
449
- if p_key and has_ordered_rules and tag in orders:
450
- rule = p_rules[p_key].get(tag, {})
451
- ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
452
- continue
453
-
454
- # Non-ordered, non-<y> nodes: include text + tails as-is
455
- if not has_ordered_rules:
456
- buf_parts.append(child.text or "")
457
- if child.tail:
458
- buf_parts.append(child.tail)
459
-
460
- # If ordered, flush in global orders with all duplicates preserved
461
- if has_ordered_rules:
462
- for tag in orders:
463
- if tag in ordered_cache:
464
- buf_parts.extend(ordered_cache[tag])
465
-
466
- para = "".join(buf_parts)
467
- if para:
468
- paragraphs_out.append(para)
469
-
470
- return "\n".join(paragraphs_out), refl_list
@@ -1,126 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_normal
4
- ---------------------------------------------------
5
-
6
- Parser logic for extracting readable text from Qidian chapters
7
- that use plain (non-encrypted) browser-rendered HTML.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import logging
13
- from typing import TYPE_CHECKING
14
-
15
- from lxml import html
16
-
17
- from novel_downloader.models import ChapterDict
18
- from novel_downloader.utils import truncate_half_lines
19
-
20
- from .utils import (
21
- extract_chapter_info,
22
- find_ssr_page_context,
23
- get_decryptor,
24
- is_duplicated,
25
- vip_status,
26
- )
27
-
28
- if TYPE_CHECKING:
29
- from .main_parser import QidianParser
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
-
34
- def parse_normal_chapter(
35
- parser: QidianParser,
36
- html_str: str,
37
- chapter_id: str,
38
- ) -> ChapterDict | None:
39
- """
40
- Extract structured chapter info from a normal Qidian page.
41
-
42
- :param html_str: Chapter HTML.
43
- :param chapter_id: Chapter identifier (string).
44
- :return: A dictionary with keys like 'id', 'title', 'content', etc., or None on failure.
45
- """
46
- try:
47
- ssr_data = find_ssr_page_context(html_str)
48
- chapter_info = extract_chapter_info(ssr_data)
49
- if not chapter_info:
50
- logger.warning(
51
- "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
52
- )
53
- return None
54
-
55
- title = chapter_info.get("chapterName", "Untitled")
56
- duplicated = is_duplicated(ssr_data)
57
- raw_html = chapter_info.get("content", "")
58
- chapter_id = chapter_info.get("chapterId", chapter_id)
59
- fkp = chapter_info.get("fkp", "")
60
- author_say = chapter_info.get("authorSay", "")
61
- update_time = chapter_info.get("updateTime", "")
62
- update_timestamp = chapter_info.get("updateTimestamp", 0)
63
- modify_time = chapter_info.get("modifyTime", 0)
64
- word_count = chapter_info.get("actualWords", 0)
65
- seq = chapter_info.get("seq", None)
66
- volume = chapter_info.get("extra", {}).get("volumeName", "")
67
-
68
- chapter_text = _parse_paragraph(
69
- html_str=raw_html,
70
- is_vip=vip_status(ssr_data),
71
- chapter_id=chapter_id,
72
- fkp=fkp,
73
- fuid=parser._fuid,
74
- )
75
- if not chapter_text:
76
- return None
77
-
78
- if parser._use_truncation and duplicated:
79
- chapter_text = truncate_half_lines(chapter_text)
80
-
81
- return {
82
- "id": str(chapter_id),
83
- "title": title,
84
- "content": chapter_text,
85
- "extra": {
86
- "author_say": author_say.strip() if author_say else "",
87
- "updated_at": update_time,
88
- "update_timestamp": update_timestamp,
89
- "modify_time": modify_time,
90
- "word_count": word_count,
91
- "duplicated": duplicated,
92
- "seq": seq,
93
- "volume": volume,
94
- "encrypted": False,
95
- },
96
- }
97
- except Exception as e:
98
- logger.warning(
99
- "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
100
- )
101
- return None
102
-
103
-
104
- def _parse_paragraph(
105
- html_str: str,
106
- is_vip: bool,
107
- chapter_id: str,
108
- fkp: str,
109
- fuid: str,
110
- ) -> str:
111
- raw_html = html_str
112
-
113
- if is_vip:
114
- try:
115
- decryptor = get_decryptor()
116
- raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, fuid)
117
- except Exception as e:
118
- logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
119
- return ""
120
-
121
- tree = html.fromstring(raw_html)
122
- paras = tree.xpath(".//p")
123
- paragraph_texts = [
124
- p.text_content().strip() for p in paras if p.text_content().strip()
125
- ]
126
- return "\n".join(paragraph_texts)
@@ -1,68 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_router
4
- ---------------------------------------------------
5
-
6
- Routing logic for selecting the correct chapter parser for Qidian pages.
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- import logging
12
- from typing import TYPE_CHECKING
13
-
14
- from novel_downloader.models import ChapterDict
15
-
16
- from .chapter_normal import parse_normal_chapter
17
- from .utils import (
18
- can_view_chapter,
19
- find_ssr_page_context,
20
- is_encrypted,
21
- )
22
-
23
- if TYPE_CHECKING:
24
- from .main_parser import QidianParser
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
-
29
- def parse_chapter(
30
- parser: QidianParser,
31
- html_str: str,
32
- chapter_id: str,
33
- ) -> ChapterDict | None:
34
- """
35
- Extract and return the formatted textual content of chapter.
36
-
37
- :param parser: Instance of QidianParser.
38
- :param html_str: Raw HTML content of the chapter page.
39
- :param chapter_id: Identifier of the chapter being parsed.
40
- :return: Parsed chapter dict, or None if the chapter cannot be parsed.
41
- """
42
- try:
43
- ssr_data = find_ssr_page_context(html_str)
44
-
45
- if not can_view_chapter(ssr_data):
46
- logger.warning(
47
- "[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
48
- )
49
- return None
50
-
51
- if is_encrypted(ssr_data):
52
- if not parser._decode_font:
53
- return None
54
- try:
55
- from .chapter_encrypted import parse_encrypted_chapter
56
-
57
- return parse_encrypted_chapter(parser, html_str, chapter_id)
58
- except ImportError:
59
- logger.warning(
60
- "[Parser] Encrypted chapter '%s' requires extra dependencies.",
61
- chapter_id,
62
- )
63
- return None
64
-
65
- return parse_normal_chapter(parser, html_str, chapter_id)
66
- except Exception as e:
67
- logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
68
- return None