novel_downloader-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. novel_downloader/__init__.py +14 -0
  2. novel_downloader/cli/__init__.py +14 -0
  3. novel_downloader/cli/clean.py +134 -0
  4. novel_downloader/cli/download.py +132 -0
  5. novel_downloader/cli/interactive.py +67 -0
  6. novel_downloader/cli/main.py +45 -0
  7. novel_downloader/cli/settings.py +177 -0
  8. novel_downloader/config/__init__.py +52 -0
  9. novel_downloader/config/adapter.py +153 -0
  10. novel_downloader/config/loader.py +177 -0
  11. novel_downloader/config/models.py +173 -0
  12. novel_downloader/config/site_rules.py +97 -0
  13. novel_downloader/core/__init__.py +25 -0
  14. novel_downloader/core/downloaders/__init__.py +22 -0
  15. novel_downloader/core/downloaders/base_async_downloader.py +157 -0
  16. novel_downloader/core/downloaders/base_downloader.py +187 -0
  17. novel_downloader/core/downloaders/common_asynb_downloader.py +207 -0
  18. novel_downloader/core/downloaders/common_downloader.py +191 -0
  19. novel_downloader/core/downloaders/qidian_downloader.py +208 -0
  20. novel_downloader/core/factory/__init__.py +33 -0
  21. novel_downloader/core/factory/downloader_factory.py +149 -0
  22. novel_downloader/core/factory/parser_factory.py +62 -0
  23. novel_downloader/core/factory/requester_factory.py +106 -0
  24. novel_downloader/core/factory/saver_factory.py +49 -0
  25. novel_downloader/core/interfaces/__init__.py +32 -0
  26. novel_downloader/core/interfaces/async_downloader_protocol.py +37 -0
  27. novel_downloader/core/interfaces/async_requester_protocol.py +68 -0
  28. novel_downloader/core/interfaces/downloader_protocol.py +37 -0
  29. novel_downloader/core/interfaces/parser_protocol.py +40 -0
  30. novel_downloader/core/interfaces/requester_protocol.py +65 -0
  31. novel_downloader/core/interfaces/saver_protocol.py +61 -0
  32. novel_downloader/core/parsers/__init__.py +28 -0
  33. novel_downloader/core/parsers/base_parser.py +96 -0
  34. novel_downloader/core/parsers/common_parser/__init__.py +14 -0
  35. novel_downloader/core/parsers/common_parser/helper.py +321 -0
  36. novel_downloader/core/parsers/common_parser/main_parser.py +86 -0
  37. novel_downloader/core/parsers/qidian_parser/__init__.py +20 -0
  38. novel_downloader/core/parsers/qidian_parser/browser/__init__.py +13 -0
  39. novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py +498 -0
  40. novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py +97 -0
  41. novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py +70 -0
  42. novel_downloader/core/parsers/qidian_parser/browser/main_parser.py +110 -0
  43. novel_downloader/core/parsers/qidian_parser/session/__init__.py +13 -0
  44. novel_downloader/core/parsers/qidian_parser/session/chapter_encrypted.py +451 -0
  45. novel_downloader/core/parsers/qidian_parser/session/chapter_normal.py +119 -0
  46. novel_downloader/core/parsers/qidian_parser/session/chapter_router.py +67 -0
  47. novel_downloader/core/parsers/qidian_parser/session/main_parser.py +113 -0
  48. novel_downloader/core/parsers/qidian_parser/session/node_decryptor.py +164 -0
  49. novel_downloader/core/parsers/qidian_parser/shared/__init__.py +38 -0
  50. novel_downloader/core/parsers/qidian_parser/shared/book_info_parser.py +95 -0
  51. novel_downloader/core/parsers/qidian_parser/shared/helpers.py +133 -0
  52. novel_downloader/core/requesters/__init__.py +31 -0
  53. novel_downloader/core/requesters/base_async_session.py +297 -0
  54. novel_downloader/core/requesters/base_browser.py +210 -0
  55. novel_downloader/core/requesters/base_session.py +243 -0
  56. novel_downloader/core/requesters/common_requester/__init__.py +18 -0
  57. novel_downloader/core/requesters/common_requester/common_async_session.py +96 -0
  58. novel_downloader/core/requesters/common_requester/common_session.py +126 -0
  59. novel_downloader/core/requesters/qidian_requester/__init__.py +22 -0
  60. novel_downloader/core/requesters/qidian_requester/qidian_broswer.py +377 -0
  61. novel_downloader/core/requesters/qidian_requester/qidian_session.py +202 -0
  62. novel_downloader/core/savers/__init__.py +20 -0
  63. novel_downloader/core/savers/base_saver.py +169 -0
  64. novel_downloader/core/savers/common_saver/__init__.py +13 -0
  65. novel_downloader/core/savers/common_saver/common_epub.py +232 -0
  66. novel_downloader/core/savers/common_saver/common_txt.py +176 -0
  67. novel_downloader/core/savers/common_saver/main_saver.py +86 -0
  68. novel_downloader/core/savers/epub_utils/__init__.py +27 -0
  69. novel_downloader/core/savers/epub_utils/css_builder.py +68 -0
  70. novel_downloader/core/savers/epub_utils/initializer.py +98 -0
  71. novel_downloader/core/savers/epub_utils/text_to_html.py +132 -0
  72. novel_downloader/core/savers/epub_utils/volume_intro.py +61 -0
  73. novel_downloader/core/savers/qidian_saver.py +22 -0
  74. novel_downloader/locales/en.json +91 -0
  75. novel_downloader/locales/zh.json +91 -0
  76. novel_downloader/resources/config/rules.toml +196 -0
  77. novel_downloader/resources/config/settings.yaml +73 -0
  78. novel_downloader/resources/css_styles/main.css +104 -0
  79. novel_downloader/resources/css_styles/volume-intro.css +56 -0
  80. novel_downloader/resources/images/volume_border.png +0 -0
  81. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +82 -0
  82. novel_downloader/resources/json/replace_word_map.json +4 -0
  83. novel_downloader/resources/text/blacklist.txt +22 -0
  84. novel_downloader/utils/__init__.py +0 -0
  85. novel_downloader/utils/cache.py +24 -0
  86. novel_downloader/utils/constants.py +158 -0
  87. novel_downloader/utils/crypto_utils.py +144 -0
  88. novel_downloader/utils/file_utils/__init__.py +43 -0
  89. novel_downloader/utils/file_utils/io.py +252 -0
  90. novel_downloader/utils/file_utils/normalize.py +68 -0
  91. novel_downloader/utils/file_utils/sanitize.py +77 -0
  92. novel_downloader/utils/fontocr/__init__.py +23 -0
  93. novel_downloader/utils/fontocr/ocr_v1.py +304 -0
  94. novel_downloader/utils/fontocr/ocr_v2.py +658 -0
  95. novel_downloader/utils/hash_store.py +288 -0
  96. novel_downloader/utils/hash_utils.py +103 -0
  97. novel_downloader/utils/i18n.py +41 -0
  98. novel_downloader/utils/logger.py +104 -0
  99. novel_downloader/utils/model_loader.py +72 -0
  100. novel_downloader/utils/network.py +287 -0
  101. novel_downloader/utils/state.py +156 -0
  102. novel_downloader/utils/text_utils/__init__.py +27 -0
  103. novel_downloader/utils/text_utils/chapter_formatting.py +46 -0
  104. novel_downloader/utils/text_utils/diff_display.py +75 -0
  105. novel_downloader/utils/text_utils/font_mapping.py +31 -0
  106. novel_downloader/utils/text_utils/text_cleaning.py +57 -0
  107. novel_downloader/utils/time_utils/__init__.py +22 -0
  108. novel_downloader/utils/time_utils/datetime_utils.py +146 -0
  109. novel_downloader/utils/time_utils/sleep_utils.py +49 -0
  110. novel_downloader-1.1.0.dist-info/METADATA +157 -0
  111. novel_downloader-1.1.0.dist-info/RECORD +115 -0
  112. novel_downloader-1.1.0.dist-info/WHEEL +5 -0
  113. novel_downloader-1.1.0.dist-info/entry_points.txt +2 -0
  114. novel_downloader-1.1.0.dist-info/licenses/LICENSE +21 -0
  115. novel_downloader-1.1.0.dist-info/top_level.txt +1 -0
novel_downloader/core/parsers/qidian_parser/browser/chapter_encrypted.py
@@ -0,0 +1,498 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.core.parsers.qidian_parser.browser.chapter_encrypted
+ ---------------------------------------------------------------------
+
+ Support for parsing encrypted chapters from Qidian using font OCR mapping,
+ CSS rules, and custom rendering logic.
+
+ Includes:
+ - Font downloading and caching
+ - Encrypted paragraph extraction
+ - Custom CSS parsing and layout restoration
+ - Font-based OCR decryption and mapping
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
+
+ import tinycss2
+ from bs4 import BeautifulSoup, Tag
+
+ from novel_downloader.utils.network import download_font_file
+ from novel_downloader.utils.text_utils import apply_font_mapping
+
+ from ..shared import (
+     extract_chapter_info,
+     find_ssr_page_context,
+ )
+
+ if TYPE_CHECKING:
+     from .main_parser import QidianBrowserParser
+
+ logger = logging.getLogger(__name__)
+ IGNORED_CLASS_LISTS = {"title", "review"}
+
+
+ def parse_encrypted_chapter(
+     parser: QidianBrowserParser,
+     soup: BeautifulSoup,
+     chapter_id: str,
+ ) -> Dict[str, Any]:
+     """
+     Extract and return the formatted textual content of an encrypted chapter.
+
+     Steps:
+     1. Load the SSR JSON context for CSS, fonts, and metadata.
+     2. Decode and save the randomFont bytes; download the fixedFont via
+        download_font_file().
+     3. Extract paragraph structures and save debug JSON.
+     4. Parse CSS rules and save debug JSON.
+     5. Determine paragraph name prefixes and the ending number; save debug text.
+     6. Render the encrypted paragraphs, then run OCR font mapping.
+     7. Extract the paragraph texts and format them.
+
+     :param parser: Instance of QidianBrowserParser.
+     :param soup: Parsed BeautifulSoup tree of the chapter page.
+     :param chapter_id: Identifier of the chapter being parsed.
+     :return: Chapter info dict, or an empty dict if not parsable.
+     """
+     try:
+         if not (parser._decode_font and parser._font_ocr):
+             return {}
+         ssr_data = find_ssr_page_context(soup)
+         chapter_info = extract_chapter_info(ssr_data)
+         if not chapter_info:
+             logger.warning(
+                 "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
+             )
+             return {}
+         debug_base_dir: Optional[Path] = None
+         if parser._font_debug_dir:
+             debug_base_dir = parser._font_debug_dir / chapter_id
+             debug_base_dir.mkdir(parents=True, exist_ok=True)
+
+         css_str = chapter_info["css"]
+         randomFont_str = chapter_info["randomFont"]
+         fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
+
+         title = chapter_info.get("chapterName", "Untitled")
+         chapter_id = chapter_info.get("chapterId", "")
+         author_say = chapter_info.get("authorSay", "")
+         update_time = chapter_info.get("updateTime", "")
+         update_timestamp = chapter_info.get("updateTimestamp", 0)
+         modify_time = chapter_info.get("modifyTime", 0)
+         word_count = chapter_info.get("wordsCount", 0)
+         vip = bool(chapter_info.get("vipStatus", 0))
+         is_buy = bool(chapter_info.get("isBuy", 0))
+         seq = chapter_info.get("seq", None)
+         order = chapter_info.get("chapterOrder", None)
+         volume = chapter_info.get("extra", {}).get("volumeName", "")
+
+         # Extract and save the randomly generated font
+         rf = json.loads(randomFont_str)
+         rand_path = parser._base_cache_dir / "randomFont.ttf"
+         rand_path.parent.mkdir(parents=True, exist_ok=True)
+         rand_path.write_bytes(bytes(rf["data"]))
+
+         fixed_path = download_font_file(
+             url=fixedFontWoff2_url, target_folder=parser._fixed_font_dir
+         )
+         if fixed_path is None:
+             raise ValueError("fixed_path is None: failed to download font")
+
+         # Extract and render paragraphs from HTML with CSS rules
+         main_paragraphs = extract_paragraphs_recursively(soup, chapter_id)
+         if debug_base_dir:
+             main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
+             main_paragraphs_path.write_text(
+                 json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
+                 encoding="utf-8",
+             )
+
+         paragraphs_rules = parse_rule(css_str)
+         if debug_base_dir:
+             paragraphs_rules_path = debug_base_dir / "paragraphs_rules_debug.json"
+             paragraphs_rules_path.write_text(
+                 json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
+                 encoding="utf-8",
+             )
+
+         paragraph_names = parse_paragraph_names(paragraphs_rules)
+         end_number = parse_end_number(main_paragraphs, paragraph_names)
+         if debug_base_dir:
+             paragraph_names_path = debug_base_dir / "paragraph_names_debug.txt"
+             temp = f"names:\n{paragraph_names}\n\nend_number: {end_number}"
+             paragraph_names_path.write_text(
+                 temp,
+                 encoding="utf-8",
+             )
+         if not end_number:
+             logger.warning(
+                 "[Parser] No end_number found after parsing chapter '%s'", chapter_id
+             )
+             return {}
+
+         paragraphs_str, refl_list = render_paragraphs(
+             main_paragraphs, paragraphs_rules, end_number
+         )
+         if debug_base_dir:
+             paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
+             paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
+
+         # Run OCR + fallback mapping
+         char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
+         refl_set = set(refl_list)
+         char_set = char_set - refl_set
+         if debug_base_dir:
+             char_sets_path = debug_base_dir / "char_set_debug.txt"
+             temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
+             char_sets_path.write_text(
+                 temp,
+                 encoding="utf-8",
+             )
+
+         mapping_result = parser._font_ocr.generate_font_map(
+             fixed_font_path=fixed_path,
+             random_font_path=rand_path,
+             char_set=char_set,
+             refl_set=refl_set,
+             chapter_id=chapter_id,
+         )
+         if debug_base_dir:
+             mapping_json_path = debug_base_dir / "font_mapping.json"
+             mapping_json_path.write_text(
+                 json.dumps(mapping_result, ensure_ascii=False, indent=2),
+                 encoding="utf-8",
+             )
+
+         # Reconstruct the final readable text
+         original_text = apply_font_mapping(paragraphs_str, mapping_result)
+
+         final_paragraphs_str = "\n\n".join(
+             line.strip() for line in original_text.splitlines() if line.strip()
+         )
+         chapter_info = {
+             "id": str(chapter_id),
+             "title": title,
+             "content": final_paragraphs_str,
+             "author_say": author_say.strip() if author_say else "",
+             "updated_at": update_time,
+             "update_timestamp": update_timestamp,
+             "modify_time": modify_time,
+             "word_count": word_count,
+             "vip": vip,
+             "purchased": is_buy,
+             "order": order,
+             "seq": seq,
+             "volume": volume,
+         }
+         return chapter_info
+
+     except Exception as e:
+         logger.warning(
+             "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
+         )
+         return {}
+
+
+ def extract_paragraphs_recursively(
+     soup: BeautifulSoup, chapter_id: str = ""
+ ) -> List[Dict[str, Any]]:
+     """
+     Extract paragraph elements under <main id="c-{chapter_id}"> from the HTML
+     and convert them into a nested data structure for further processing.
+
+     :param soup: Parsed BeautifulSoup tree of the chapter page.
+     :param chapter_id: ID used to locate <main id="c-{chapter_id}">.
+     :return: List of parsed <p> paragraph data.
+     """
+
+     def parse_element(elem: Any) -> Union[Dict[str, Any], None]:
+         if not isinstance(elem, Tag):
+             return None
+         result = {"tag": elem.name, "attrs": dict(elem.attrs), "data": []}
+         for child in elem.contents:
+             if isinstance(child, Tag):
+                 parsed = parse_element(child)
+                 if parsed:
+                     result["data"].append(parsed)
+             else:
+                 text = child
+                 if text:
+                     result["data"].append(text)
+         return result
+
+     if chapter_id:
+         main_id = f"c-{chapter_id}"
+         main_tag = soup.find("main", id=main_id)
+         if not main_tag:
+             return []
+     else:
+         main_tag = soup
+
+     result = []
+     for p in main_tag.find_all("p"):
+         parsed_p = parse_element(p)
+         if parsed_p:
+             result.append(parsed_p)
+
+     return result
+
+
+ def parse_rule(css_str: str) -> Dict[str, Any]:
+     """
+     Parse a CSS string and extract style rules for rendering.
+
+     Handles:
+     - font-size: 0 (mark for deletion)
+     - scaleX(-1) (mark as mirrored)
+     - ::before / ::after with content or attr()
+     - class + tag selector mapping
+     - custom rendering order via 'order'
+
+     :param css_str: Raw CSS stylesheet string.
+     :return: Dict with "rules" and "orders" for rendering.
+     """
+
+     rules: Dict[str, Any] = {}
+     orders = []
+
+     stylesheet = tinycss2.parse_stylesheet(
+         css_str, skip_comments=True, skip_whitespace=True
+     )
+
+     for rule in stylesheet:
+         if rule.type != "qualified-rule":
+             continue
+
+         selector = tinycss2.serialize(rule.prelude).strip()
+         declarations = tinycss2.parse_declaration_list(rule.content)
+
+         parsed = {}
+         order_val = None
+
+         for decl in declarations:
+             if decl.type != "declaration":
+                 continue
+             name = decl.lower_name
+             value = tinycss2.serialize(decl.value).strip()
+
+             if name == "font-size" and value == "0":
+                 if "::first-letter" in selector:
+                     parsed["delete-first"] = True
+                 else:
+                     parsed["delete-all"] = True
+             elif name == "transform" and value.lower() == "scalex(-1)":
+                 parsed["transform-x_-1"] = True
+             elif name == "order":
+                 order_val = value
+             elif name == "content":
+                 if "::after" in selector:
+                     if "attr(" in value:
+                         parsed["append-end-attr"] = value.split("attr(")[1].split(")")[0]
+                     else:
+                         parsed["append-end-char"] = value.strip("\"'")
+                 elif "::before" in selector:
+                     if "attr(" in value:
+                         parsed["append-start-attr"] = value.split("attr(")[1].split(")")[0]
+                     else:
+                         parsed["append-start-char"] = value.strip("\"'")
+
+         # Store in the rules structure
+         if selector.startswith(".sy-"):
+             rules.setdefault("sy", {})[selector[1:]] = parsed
+         elif selector.startswith(".p") and " " in selector:
+             class_str, tag_part = selector.split(" ", 1)
+             class_str = class_str.lstrip(".")
+             tag_part = tag_part.split("::")[0]
+             rules.setdefault(class_str, {}).setdefault(tag_part, {}).update(parsed)
+
+         if order_val:
+             orders.append((selector, order_val))
+
+     orders.sort(key=lambda x: int(x[1]))
+     return {"rules": rules, "orders": orders}
+
+
+ def parse_paragraph_names(rules: Dict[str, Any]) -> Set[str]:
+     """
+     Extract all paragraph selector names from parsed rules, excluding "sy".
+     """
+     paragraph_names = set()
+     for group, group_rules in rules.get("rules", {}).items():
+         if group == "sy":
+             continue
+         paragraph_names.update(group_rules.keys())
+     return paragraph_names
+
+
+ def parse_end_number(
+     main_paragraphs: List[Dict[str, Any]], paragraph_names: Set[str]
+ ) -> Optional[int]:
+     """
+     Find the most frequent numeric suffix among tag names
+     matched by the given paragraph prefixes.
+     """
+     end_numbers: Dict[int, int] = {}
+     sorted_names = sorted(paragraph_names, key=len, reverse=True)
+
+     def rec_parse(item: Union[List[Any], Dict[str, Any]]) -> None:
+         if isinstance(item, list):
+             for element in item:
+                 rec_parse(element)
+         elif isinstance(item, dict):
+             tag = item.get("tag")
+             if isinstance(tag, str):
+                 for prefix in sorted_names:
+                     if tag.startswith(prefix):
+                         remain = tag[len(prefix):]
+                         if remain.isdigit():
+                             num = int(remain)
+                             end_numbers[num] = end_numbers.get(num, 0) + 1
+                         break
+             for val in item.values():
+                 if isinstance(val, (list, dict)):
+                     rec_parse(val)
+
+     rec_parse(main_paragraphs)
+
+     if not end_numbers:
+         logger.warning("[Parser] No valid ending numbers found")
+         return None
+
+     sorted_numbers = sorted(
+         end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
+     )
+
+     logger.debug(
+         "[Parser] Top 3 end numbers:\n%s",
+         "\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
+     )
+
+     return sorted_numbers[0][0]
+
+
+ def render_paragraphs(
+     main_paragraphs: List[Dict[str, Any]],
+     rules: Dict[str, Any],
+     end_number: int,
+ ) -> Tuple[str, List[str]]:
+     """
+     Apply the parsed CSS rules to the paragraph structure and
+     reconstruct the visible text.
+
+     Handles special class styles like .sy-*, text order control,
+     mirrored characters, etc.
+
+     :param main_paragraphs: A list of paragraph dictionaries, each with 'attrs'
+                             and 'data' fields representing structured content.
+     :param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
+                   - rules['orders']: List of (selector, id) tuples.
+                   - rules['rules']: Nested dict containing transformation rules.
+     :param end_number: HTML tag suffix (e.g. span123 -> 123).
+
+     :return:
+         - A reconstructed paragraph string with line breaks.
+         - A list of mirrored (reflected) characters for later OCR processing.
+     """
+     orders: List[Tuple[str, str]] = rules.get("orders", [])
+     rules = rules.get("rules", {})
+     refl_list: List[str] = []
+
+     def apply_rule(data: Dict[str, Any], rule: Dict[str, Any]) -> str:
+         if rule.get("delete-all", False):
+             return ""
+
+         curr_str = ""
+         if isinstance(data.get("data"), list) and data["data"]:
+             first_data = data["data"][0]
+             if isinstance(first_data, str):
+                 curr_str += first_data
+
+         if rule.get("delete-first", False):
+             if len(curr_str) <= 1:
+                 curr_str = ""
+             else:
+                 curr_str = curr_str[1:]
+
+         curr_str += rule.get("append-end-char", "")
+
+         attr_name = rule.get("append-end-attr", "")
+         if attr_name:
+             curr_str += data.get("attrs", {}).get(f"{attr_name}{end_number}", "")
+
+         curr_str = rule.get("append-start-char", "") + curr_str
+
+         attr_name = rule.get("append-start-attr", "")
+         if attr_name:
+             curr_str = (
+                 data.get("attrs", {}).get(f"{attr_name}{end_number}", "") + curr_str
+             )
+
+         if rule.get("transform-x_-1", False):
+             refl_list.append(curr_str)
+         return curr_str
+
+     paragraphs_str = ""
+     for paragraph in main_paragraphs:
+         class_list = paragraph.get("attrs", {}).get("class", [])
+         p_class_str = next((c for c in class_list if c.startswith("p")), None)
+         curr_datas = paragraph.get("data", [])
+
+         ordered_cache = {}
+         for data in curr_datas:
+             # Text nodes are appended directly
+             if isinstance(data, str):
+                 paragraphs_str += data
+                 continue
+
+             if isinstance(data, dict):
+                 tag = data.get("tag", "")
+                 attrs = data.get("attrs", {})
+
+                 # Skip review spans
+                 if tag == "span" and "class" in attrs and "review" in attrs["class"]:
+                     continue
+
+                 # Handle sy-style tags
+                 if tag == "y":
+                     tag_class_list = attrs.get("class", [])
+                     tag_class = next(
+                         (c for c in tag_class_list if c.startswith("sy-")), None
+                     )
+
+                     if tag_class in rules.get("sy", {}):
+                         curr_rule = rules["sy"][tag_class]
+                         paragraphs_str += apply_rule(data, curr_rule)
+                     continue
+
+                 if not p_class_str:
+                     if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
+                         continue
+                     logger.debug("[parser] p_class_str not found: %s", class_list)
+                     continue
+
+                 # Handle regular tags, matched in the order given by 'orders'
+                 for ord_selector, ord_id in orders:
+                     tag_name = f"{ord_selector}{end_number}"
+                     if data.get("tag") != tag_name:
+                         continue
+                     curr_rule = rules.get(p_class_str, {}).get(ord_selector)
+                     curr_rule = curr_rule if curr_rule else {}
+                     ordered_cache[ord_selector] = apply_rule(data, curr_rule)
+                     break
+
+         # Finally, concatenate the pieces in 'orders' order
+         for ord_selector, ord_id in orders:
+             if ord_selector in ordered_cache:
+                 paragraphs_str += ordered_cache[ord_selector]
+
+         paragraphs_str += "\n\n"
+
+     return paragraphs_str, refl_list
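
Note on the deobfuscation pipeline above: parse_rule() reduces the obfuscation stylesheet to a small rule dictionary, and parse_end_number() recovers the numeric suffix shared by the scrambled tag names. The following is a minimal sketch of both steps on toy inputs; the selectors, tag names, and suffixes here are invented for illustration (real Qidian stylesheets use obfuscated equivalents), and the expected values follow from tracing the code above rather than from the package's own tests.

from novel_downloader.core.parsers.qidian_parser.browser.chapter_encrypted import (
    parse_end_number,
    parse_rule,
)

css = """
.p1 y::first-letter { font-size: 0; }
.sy-1 { transform: scaleX(-1); }
i { order: 2; }
"""
# font-size:0 on ::first-letter marks "delete the first character";
# scaleX(-1) marks mirrored glyphs; bare `i { order: 2 }` only feeds "orders".
assert parse_rule(css) == {
    "rules": {
        "p1": {"y": {"delete-first": True}},
        "sy": {"sy-1": {"transform-x_-1": True}},
    },
    "orders": [("i", "2")],
}

# Tags i9 and span9 share the suffix 9 while em12 is the outlier,
# so the most frequent suffix (9) is chosen as the end_number.
paragraphs = [
    {"tag": "p", "attrs": {}, "data": [
        {"tag": "i9", "attrs": {}, "data": []},
        {"tag": "span9", "attrs": {}, "data": []},
        {"tag": "em12", "attrs": {}, "data": []},
    ]},
]
assert parse_end_number(paragraphs, {"i", "span", "em"}) == 9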
novel_downloader/core/parsers/qidian_parser/browser/chapter_normal.py
@@ -0,0 +1,97 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.core.parsers.qidian_parser.browser.chapter_normal
+ ------------------------------------------------------------------
+
+ Parser logic for extracting readable text from Qidian chapters
+ that use plain (non-encrypted) browser-rendered HTML.
+ """
+
+ import logging
+ from typing import Any, Dict
+
+ from bs4 import BeautifulSoup
+
+ from ..shared import (
+     extract_chapter_info,
+     find_ssr_page_context,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_normal_chapter(
+     soup: BeautifulSoup,
+     chapter_id: str,
+ ) -> Dict[str, Any]:
+     """
+     Extract and format the chapter text from a normal Qidian page.
+     (Encrypted chapters are handled separately; see chapter_router.)
+
+     This function performs the following steps:
+     1. Locates the main content container.
+     2. Extracts SSR-rendered chapter info (title, metadata, author note).
+     3. Removes review spans.
+     4. Extracts paragraph texts and formats them.
+
+     :param soup: Parsed BeautifulSoup tree of the chapter page.
+     :param chapter_id: Identifier of the chapter being parsed.
+     :return: Chapter info dict, or an empty dict if not parsable.
+     """
+     try:
+         main = soup.select_one("div#app div#reader-content main")
+         if not main:
+             logger.warning("[Parser] Main content not found for chapter")
+             return {}
+
+         ssr_data = find_ssr_page_context(soup)
+         chapter_info = extract_chapter_info(ssr_data)
+         if not chapter_info:
+             logger.warning(
+                 "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
+             )
+             return {}
+
+         title = chapter_info.get("chapterName", "Untitled")
+         chapter_id = chapter_info.get("chapterId", "")
+         author_say = chapter_info.get("authorSay", "")
+         update_time = chapter_info.get("updateTime", "")
+         update_timestamp = chapter_info.get("updateTimestamp", 0)
+         modify_time = chapter_info.get("modifyTime", 0)
+         word_count = chapter_info.get("wordsCount", 0)
+         vip = bool(chapter_info.get("vipStatus", 0))
+         is_buy = bool(chapter_info.get("isBuy", 0))
+         seq = chapter_info.get("seq", None)
+         order = chapter_info.get("chapterOrder", None)
+         volume = chapter_info.get("extra", {}).get("volumeName", "")
+
+         # Remove review spans
+         for span in main.select("span.review"):
+             span.decompose()
+
+         paras = [p.get_text(strip=True) for p in main.find_all("p")]
+         chapter_text = "\n\n".join(paras)
+
+         return {
+             "id": str(chapter_id),
+             "title": title,
+             "content": chapter_text,
+             "author_say": author_say.strip() if author_say else "",
+             "updated_at": update_time,
+             "update_timestamp": update_timestamp,
+             "modify_time": modify_time,
+             "word_count": word_count,
+             "vip": vip,
+             "purchased": is_buy,
+             "order": order,
+             "seq": seq,
+             "volume": volume,
+         }
+
+     except Exception as e:
+         logger.warning(
+             "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
+         )
+         return {}
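
For context, here is a hypothetical caller of the normal-chapter parser above. It assumes the shared html_to_soup helper (the same one imported by chapter_router below) and a locally saved chapter page; the file name and chapter id are placeholders, not values from the package.

from pathlib import Path

from novel_downloader.core.parsers.qidian_parser.browser.chapter_normal import (
    parse_normal_chapter,
)
from novel_downloader.core.parsers.qidian_parser.shared import html_to_soup

html_str = Path("chapter.html").read_text(encoding="utf-8")  # placeholder dump
chapter = parse_normal_chapter(html_to_soup(html_str), "0")  # placeholder id
if chapter:
    # The returned dict carries both the text and the SSR metadata fields.
    print(chapter["title"], chapter["word_count"])
    print(chapter["content"][:200])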
novel_downloader/core/parsers/qidian_parser/browser/chapter_router.py
@@ -0,0 +1,70 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ novel_downloader.core.parsers.qidian_parser.browser.chapter_router
+ ------------------------------------------------------------------
+
+ Routing logic for selecting the correct chapter parser for Qidian browser pages.
+
+ This module acts as a dispatcher that analyzes a chapter's HTML content and
+ routes the parsing task to either the encrypted or the normal chapter parser.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import TYPE_CHECKING, Any, Dict
+
+ from ..shared import (
+     can_view_chapter,
+     html_to_soup,
+     is_encrypted,
+ )
+ from .chapter_normal import parse_normal_chapter
+
+ if TYPE_CHECKING:
+     from .main_parser import QidianBrowserParser
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_chapter(
+     parser: QidianBrowserParser,
+     html_str: str,
+     chapter_id: str,
+ ) -> Dict[str, Any]:
+     """
+     Extract and return the formatted textual content of a chapter.
+
+     :param parser: Instance of QidianBrowserParser.
+     :param html_str: Raw HTML content of the chapter page.
+     :param chapter_id: Identifier of the chapter being parsed.
+     :return: Chapter info dict, or an empty dict if not parsable.
+     """
+     try:
+         soup = html_to_soup(html_str)
+
+         if not can_view_chapter(soup):
+             logger.warning(
+                 "[Parser] Chapter '%s' is not purchased or inaccessible.", chapter_id
+             )
+             return {}
+
+         if is_encrypted(soup):
+             if not parser._decode_font:
+                 return {}
+             try:
+                 from .chapter_encrypted import parse_encrypted_chapter
+
+                 return parse_encrypted_chapter(parser, soup, chapter_id)
+             except ImportError:
+                 logger.warning(
+                     "[Parser] Encrypted chapter '%s' requires extra dependencies.",
+                     chapter_id,
+                 )
+                 return {}
+
+         return parse_normal_chapter(soup, chapter_id)
+     except Exception as e:
+         logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
+         return {}
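
Taken together, the router gives downstream code a single entry point. A hedged sketch of the dispatch follows; construction of QidianBrowserParser is elided, and the `parser` and `html_str` values are assumed to come from the surrounding downloader/requester code in this release rather than being shown here.

from novel_downloader.core.parsers.qidian_parser.browser.chapter_router import (
    parse_chapter,
)

# `parser` is an already-configured QidianBrowserParser and `html_str` the
# raw chapter HTML fetched by the requester (both assumed available here).
chapter = parse_chapter(parser, html_str, "0")  # placeholder chapter id
if not chapter:
    # Empty dict covers all failure modes: unpurchased chapter, encrypted
    # chapter without font decoding enabled, or a parse error.
    print("chapter not parsable")
else:
    print(chapter["title"])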