novel-downloader 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/download.py +14 -11
  3. novel_downloader/cli/export.py +19 -19
  4. novel_downloader/cli/ui.py +35 -8
  5. novel_downloader/config/adapter.py +216 -153
  6. novel_downloader/core/__init__.py +5 -6
  7. novel_downloader/core/archived/deqixs/fetcher.py +1 -28
  8. novel_downloader/core/downloaders/__init__.py +2 -0
  9. novel_downloader/core/downloaders/base.py +34 -85
  10. novel_downloader/core/downloaders/common.py +147 -171
  11. novel_downloader/core/downloaders/qianbi.py +30 -64
  12. novel_downloader/core/downloaders/qidian.py +157 -184
  13. novel_downloader/core/downloaders/qqbook.py +292 -0
  14. novel_downloader/core/downloaders/registry.py +2 -2
  15. novel_downloader/core/exporters/__init__.py +2 -0
  16. novel_downloader/core/exporters/base.py +37 -59
  17. novel_downloader/core/exporters/common.py +620 -0
  18. novel_downloader/core/exporters/linovelib.py +47 -0
  19. novel_downloader/core/exporters/qidian.py +41 -12
  20. novel_downloader/core/exporters/qqbook.py +28 -0
  21. novel_downloader/core/exporters/registry.py +2 -2
  22. novel_downloader/core/fetchers/__init__.py +4 -2
  23. novel_downloader/core/fetchers/aaatxt.py +2 -22
  24. novel_downloader/core/fetchers/b520.py +3 -23
  25. novel_downloader/core/fetchers/base.py +80 -105
  26. novel_downloader/core/fetchers/biquyuedu.py +2 -22
  27. novel_downloader/core/fetchers/dxmwx.py +10 -22
  28. novel_downloader/core/fetchers/esjzone.py +6 -29
  29. novel_downloader/core/fetchers/guidaye.py +2 -22
  30. novel_downloader/core/fetchers/hetushu.py +9 -29
  31. novel_downloader/core/fetchers/i25zw.py +2 -16
  32. novel_downloader/core/fetchers/ixdzs8.py +2 -16
  33. novel_downloader/core/fetchers/jpxs123.py +2 -16
  34. novel_downloader/core/fetchers/lewenn.py +2 -22
  35. novel_downloader/core/fetchers/linovelib.py +4 -20
  36. novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
  37. novel_downloader/core/fetchers/piaotia.py +2 -16
  38. novel_downloader/core/fetchers/qbtr.py +2 -16
  39. novel_downloader/core/fetchers/qianbi.py +1 -20
  40. novel_downloader/core/fetchers/qidian.py +27 -68
  41. novel_downloader/core/fetchers/qqbook.py +177 -0
  42. novel_downloader/core/fetchers/quanben5.py +9 -29
  43. novel_downloader/core/fetchers/rate_limiter.py +22 -53
  44. novel_downloader/core/fetchers/sfacg.py +3 -16
  45. novel_downloader/core/fetchers/shencou.py +2 -16
  46. novel_downloader/core/fetchers/shuhaige.py +2 -22
  47. novel_downloader/core/fetchers/tongrenquan.py +2 -22
  48. novel_downloader/core/fetchers/ttkan.py +3 -14
  49. novel_downloader/core/fetchers/wanbengo.py +2 -22
  50. novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
  51. novel_downloader/core/fetchers/xiguashuwu.py +4 -20
  52. novel_downloader/core/fetchers/xs63b.py +3 -15
  53. novel_downloader/core/fetchers/xshbook.py +2 -22
  54. novel_downloader/core/fetchers/yamibo.py +4 -28
  55. novel_downloader/core/fetchers/yibige.py +13 -26
  56. novel_downloader/core/interfaces/exporter.py +19 -7
  57. novel_downloader/core/interfaces/fetcher.py +23 -49
  58. novel_downloader/core/interfaces/parser.py +2 -2
  59. novel_downloader/core/parsers/__init__.py +4 -2
  60. novel_downloader/core/parsers/b520.py +2 -2
  61. novel_downloader/core/parsers/base.py +5 -39
  62. novel_downloader/core/parsers/esjzone.py +3 -3
  63. novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +7 -7
  64. novel_downloader/core/parsers/qidian.py +717 -0
  65. novel_downloader/core/parsers/qqbook.py +709 -0
  66. novel_downloader/core/parsers/xiguashuwu.py +8 -15
  67. novel_downloader/core/searchers/__init__.py +2 -2
  68. novel_downloader/core/searchers/b520.py +1 -1
  69. novel_downloader/core/searchers/base.py +2 -2
  70. novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
  71. novel_downloader/locales/en.json +3 -3
  72. novel_downloader/locales/zh.json +3 -3
  73. novel_downloader/models/__init__.py +2 -0
  74. novel_downloader/models/book.py +1 -0
  75. novel_downloader/models/config.py +12 -0
  76. novel_downloader/resources/config/settings.toml +23 -5
  77. novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
  78. novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
  79. novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
  80. novel_downloader/utils/__init__.py +0 -2
  81. novel_downloader/utils/chapter_storage.py +2 -3
  82. novel_downloader/utils/constants.py +7 -3
  83. novel_downloader/utils/cookies.py +32 -17
  84. novel_downloader/utils/crypto_utils/__init__.py +0 -6
  85. novel_downloader/utils/crypto_utils/aes_util.py +1 -1
  86. novel_downloader/utils/crypto_utils/rc4.py +40 -50
  87. novel_downloader/utils/epub/__init__.py +2 -3
  88. novel_downloader/utils/epub/builder.py +6 -6
  89. novel_downloader/utils/epub/constants.py +1 -6
  90. novel_downloader/utils/epub/documents.py +7 -7
  91. novel_downloader/utils/epub/models.py +8 -8
  92. novel_downloader/utils/epub/utils.py +10 -10
  93. novel_downloader/utils/file_utils/io.py +48 -73
  94. novel_downloader/utils/file_utils/normalize.py +1 -7
  95. novel_downloader/utils/file_utils/sanitize.py +4 -11
  96. novel_downloader/utils/fontocr/__init__.py +13 -0
  97. novel_downloader/utils/{fontocr.py → fontocr/core.py} +72 -61
  98. novel_downloader/utils/fontocr/loader.py +52 -0
  99. novel_downloader/utils/logger.py +80 -56
  100. novel_downloader/utils/network.py +16 -40
  101. novel_downloader/utils/node_decryptor/__init__.py +13 -0
  102. novel_downloader/utils/node_decryptor/decryptor.py +342 -0
  103. novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
  104. novel_downloader/utils/text_utils/text_cleaner.py +39 -30
  105. novel_downloader/utils/text_utils/truncate_utils.py +3 -14
  106. novel_downloader/utils/time_utils/sleep_utils.py +53 -43
  107. novel_downloader/web/main.py +1 -1
  108. novel_downloader/web/pages/download.py +1 -1
  109. novel_downloader/web/pages/search.py +4 -4
  110. novel_downloader/web/services/task_manager.py +2 -0
  111. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +5 -1
  112. novel_downloader-2.0.2.dist-info/RECORD +203 -0
  113. novel_downloader/core/exporters/common/__init__.py +0 -11
  114. novel_downloader/core/exporters/common/epub.py +0 -198
  115. novel_downloader/core/exporters/common/main_exporter.py +0 -64
  116. novel_downloader/core/exporters/common/txt.py +0 -146
  117. novel_downloader/core/exporters/epub_util.py +0 -215
  118. novel_downloader/core/exporters/linovelib/__init__.py +0 -11
  119. novel_downloader/core/exporters/linovelib/epub.py +0 -349
  120. novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
  121. novel_downloader/core/exporters/linovelib/txt.py +0 -139
  122. novel_downloader/core/exporters/txt_util.py +0 -67
  123. novel_downloader/core/parsers/qidian/__init__.py +0 -10
  124. novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
  125. novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
  126. novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
  127. novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
  128. novel_downloader/core/parsers/qidian/main_parser.py +0 -101
  129. novel_downloader/core/parsers/qidian/utils/__init__.py +0 -30
  130. novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
  131. novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
  132. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
  133. novel_downloader-2.0.0.dist-info/RECORD +0 -210
  134. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
  135. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
  136. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
  137. {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
@@ -1,89 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.book_info_parser
4
- -----------------------------------------------------
5
-
6
- This module provides parsing of Qidian book info pages.
7
-
8
- It extracts metadata such as title, author, cover URL, update
9
- time, status, word count, summary, and volume-chapter structure.
10
- """
11
-
12
- import logging
13
- import re
14
- from datetime import datetime
15
-
16
- from lxml import html
17
-
18
- from novel_downloader.models import BookInfoDict, ChapterInfoDict, VolumeInfoDict
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- def _chapter_url_to_id(url: str) -> str:
24
- return url.rstrip("/").split("/")[-1]
25
-
26
-
27
- def parse_book_info(html_str: str) -> BookInfoDict | None:
28
- """
29
- Extract metadata: title, author, cover_url, update_time, status,
30
- word_count, summary, and volumes with chapters.
31
-
32
- :param html_str: Raw HTML of the book info page.
33
- :return: A dict containing book metadata.
34
- """
35
- doc = html.fromstring(html_str)
36
-
37
- book_name = doc.xpath('string(//h1[@id="bookName"])').strip()
38
-
39
- author = doc.xpath('string(//a[@class="writer-name"])').strip()
40
-
41
- book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
42
- cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"
43
-
44
- ut = doc.xpath('string(//span[@class="update-time"])')
45
- ut = ut.replace("更新时间:", "").strip()
46
- if re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", ut):
47
- update_time = ut
48
- else:
49
- update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
50
-
51
- serial_status = doc.xpath('string(//p[@class="book-attribute"]/span[1])').strip()
52
-
53
- tags_elem = doc.xpath('//p[contains(@class,"all-label")]//a/text()')
54
- tags = [t.strip() for t in tags_elem if t.strip()]
55
-
56
- word_count = doc.xpath('string(//p[@class="count"]/em[1])').strip()
57
-
58
- summary_brief = doc.xpath('string(//p[@class="intro"])').strip()
59
-
60
- raw = doc.xpath('//p[@id="book-intro-detail"]//text()')
61
- summary = "\n".join(line.strip() for line in raw if line.strip())
62
-
63
- volumes: list[VolumeInfoDict] = []
64
- for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
65
- vol_name = vol.xpath('string(.//h3[@class="volume-name"])').strip()
66
- vol_name = vol_name.split(chr(183))[0].strip()
67
- chapters: list[ChapterInfoDict] = []
68
- for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
69
- a = li.xpath('.//a[@class="chapter-name"]')[0]
70
- title = a.text.strip()
71
- url = a.get("href")
72
- chapters.append(
73
- {"title": title, "url": url, "chapterId": _chapter_url_to_id(url)}
74
- )
75
- volumes.append({"volume_name": vol_name, "chapters": chapters})
76
-
77
- return {
78
- "book_name": book_name,
79
- "author": author,
80
- "cover_url": cover_url,
81
- "update_time": update_time,
82
- "word_count": word_count,
83
- "serial_status": serial_status,
84
- "tags": tags,
85
- "summary_brief": summary_brief,
86
- "summary": summary,
87
- "volumes": volumes,
88
- "extra": {},
89
- }
@@ -1,470 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_encrypted
4
- ------------------------------------------------------
5
-
6
- Support for parsing encrypted chapters from Qidian using font OCR mapping,
7
- CSS rules, and custom rendering logic.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import json
13
- import logging
14
- import re
15
- from contextlib import suppress
16
- from typing import TYPE_CHECKING, TypedDict
17
-
18
- from lxml import html
19
-
20
- from novel_downloader.models import ChapterDict
21
- from novel_downloader.utils import (
22
- download,
23
- truncate_half_lines,
24
- )
25
-
26
- from .utils import (
27
- extract_chapter_info,
28
- find_ssr_page_context,
29
- get_decryptor,
30
- is_duplicated,
31
- vip_status,
32
- )
33
- from .utils.fontmap_recover import (
34
- apply_font_mapping,
35
- generate_font_map,
36
- )
37
-
38
- if TYPE_CHECKING:
39
- from .main_parser import QidianParser
40
-
41
- logger = logging.getLogger(__name__)
42
- _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
43
- _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
44
-
45
-
46
- class Rule(TypedDict, total=False):
47
- delete_all: bool
48
- delete_first: bool
49
- transform_flip_x: bool
50
- append_start_char: str
51
- append_end_char: str
52
- append_start_attr: str
53
- append_end_attr: str
54
-
55
-
56
- class Rules(TypedDict):
57
- # e.g., orders = ["i", "em", "span"]
58
- orders: list[str]
59
- # e.g., sy["sy-3"] -> Rule
60
- sy: dict[str, Rule]
61
- # e.g., p_rules["p3"]["i"] -> Rule
62
- p_rules: dict[str, dict[str, Rule]]
63
-
64
-
65
- def parse_encrypted_chapter(
66
- parser: QidianParser,
67
- html_str: str,
68
- chapter_id: str,
69
- ) -> ChapterDict | None:
70
- """
71
- Extract and return the formatted textual content of an encrypted chapter.
72
-
73
- Steps:
74
- 1. Load SSR JSON context for CSS, fonts, and metadata.
75
- 2. Decode and save randomFont bytes; download fixedFont via download_font().
76
- 3. Extract paragraph structures and save debug JSON.
77
- 4. Parse CSS rules and save debug JSON.
78
- 5. Render encrypted paragraphs, then run OCR font-mapping.
79
- 6. Extract paragraph texts and format them.
80
-
81
- :param html_str: Raw HTML content of the chapter page.
82
- :return: Chapter data dict, or None if the chapter cannot be parsed.
83
- """
84
- try:
85
- if not parser._decode_font:
86
- return None
87
- ssr_data = find_ssr_page_context(html_str)
88
- chapter_info = extract_chapter_info(ssr_data)
89
- if not chapter_info:
90
- logger.warning(
91
- "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
92
- )
93
- return None
94
-
95
- debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
96
- if parser.save_font_debug:
97
- debug_dir.mkdir(parents=True, exist_ok=True)
98
-
99
- css_str = chapter_info["css"]
100
- randomFont_str = chapter_info["randomFont"]
101
- fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
102
-
103
- title = chapter_info.get("chapterName", "Untitled")
104
- duplicated = is_duplicated(ssr_data)
105
- raw_html = chapter_info.get("content", "")
106
- chapter_id = chapter_info.get("chapterId", chapter_id)
107
- fkp = chapter_info.get("fkp", "")
108
- author_say = chapter_info.get("authorSay", "")
109
- update_time = chapter_info.get("updateTime", "")
110
- update_timestamp = chapter_info.get("updateTimestamp", 0)
111
- modify_time = chapter_info.get("modifyTime", 0)
112
- word_count = chapter_info.get("actualWords", 0)
113
- seq = chapter_info.get("seq", None)
114
- volume = chapter_info.get("extra", {}).get("volumeName", "")
115
-
116
- # extract + save font
117
- rf = json.loads(randomFont_str)
118
- rand_path = parser._base_cache_dir / "randomFont.ttf"
119
- rand_path.parent.mkdir(parents=True, exist_ok=True)
120
- rand_path.write_bytes(bytes(rf["data"]))
121
-
122
- fixed_path = download(
123
- url=fixedFontWoff2_url,
124
- target_dir=parser._fixed_font_dir,
125
- stream=True,
126
- )
127
- if fixed_path is None:
128
- raise ValueError("fixed_path is None: failed to download font")
129
-
130
- # Extract and render paragraphs from HTML with CSS rules
131
- if vip_status(ssr_data):
132
- try:
133
- decryptor = get_decryptor()
134
- raw_html = decryptor.decrypt(
135
- raw_html,
136
- chapter_id,
137
- fkp,
138
- parser._fuid,
139
- )
140
- except Exception as e:
141
- logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
142
- return None
143
-
144
- css_rules = parse_css_rules(css_str)
145
- paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
146
- if parser.save_font_debug:
147
- paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
148
- paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
149
-
150
- # Run OCR + fallback mapping
151
- char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
152
- refl_set = set(refl_list)
153
- char_set = char_set - refl_set
154
- if parser.save_font_debug:
155
- char_sets_path = debug_dir / "char_set_debug.txt"
156
- temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
157
- char_sets_path.write_text(
158
- temp,
159
- encoding="utf-8",
160
- )
161
-
162
- mapping_result = generate_font_map(
163
- fixed_font_path=fixed_path,
164
- random_font_path=rand_path,
165
- char_set=char_set,
166
- refl_set=refl_set,
167
- cache_dir=parser._base_cache_dir,
168
- batch_size=parser._config.batch_size,
169
- )
170
- if not mapping_result:
171
- return None
172
-
173
- if parser.save_font_debug:
174
- mapping_json_path = debug_dir / "font_mapping.json"
175
- mapping_json_path.write_text(
176
- json.dumps(mapping_result, ensure_ascii=False, indent=2),
177
- encoding="utf-8",
178
- )
179
-
180
- # Reconstruct final readable text
181
- original_text = apply_font_mapping(
182
- text=paragraphs_str,
183
- font_map=mapping_result,
184
- )
185
-
186
- final_paragraphs_str = "\n".join(
187
- line.strip() for line in original_text.splitlines() if line.strip()
188
- )
189
- if parser._use_truncation and duplicated:
190
- final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
191
-
192
- return {
193
- "id": str(chapter_id),
194
- "title": str(title),
195
- "content": final_paragraphs_str,
196
- "extra": {
197
- "author_say": author_say.strip() if author_say else "",
198
- "updated_at": update_time,
199
- "update_timestamp": update_timestamp,
200
- "modify_time": modify_time,
201
- "word_count": word_count,
202
- "duplicated": duplicated,
203
- "seq": seq,
204
- "volume": volume,
205
- "encrypted": True,
206
- },
207
- }
208
-
209
- except Exception as e:
210
- logger.warning(
211
- "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, e
212
- )
213
- return None
214
-
215
-
216
- def _only_tag(selector: str) -> str | None:
217
- """
218
- Normalize a selector into just its tag name for ordering.
219
-
220
- Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
221
-
222
- Returns None if a tag can't be extracted.
223
- """
224
- sel = selector.strip()
225
- # If it has spaces, take the rightmost simple selector
226
- last = sel.split()[-1]
227
- # Drop ::pseudo
228
- last = last.split("::", 1)[0]
229
- # If it's like 'span[attr=..]' keep 'span'
230
- last = last.split("[", 1)[0]
231
- # If it starts with '.', it's not a tag
232
- if not last or last.startswith("."):
233
- return None
234
- return last
235
-
236
-
237
- def _parse_decls(block: str) -> list[tuple[str, str]]:
238
- """
239
- Parse 'name:value;...' inside a block. Tolerates quotes and attr().
240
- """
241
- decls: list[tuple[str, str]] = []
242
- i = 0
243
- n = len(block)
244
- name: list[str] = []
245
- val: list[str] = []
246
- in_name = True
247
- quote = None # track ' or "
248
- while i < n:
249
- c = block[i]
250
- if quote:
251
- # inside quotes
252
- if c == "\\" and i + 1 < n:
253
- # keep escaped char
254
- (name if in_name else val).append(c)
255
- i += 1
256
- (name if in_name else val).append(block[i])
257
- elif c == quote:
258
- (name if in_name else val).append(c)
259
- quote = None
260
- else:
261
- (name if in_name else val).append(c)
262
- else:
263
- if c in ("'", '"'):
264
- (name if in_name else val).append(c)
265
- quote = c
266
- elif in_name and c == ":":
267
- in_name = False
268
- elif c == ";":
269
- nm = "".join(name).strip().lower()
270
- vl = "".join(val).strip()
271
- if nm:
272
- decls.append((nm, vl))
273
- name.clear()
274
- val.clear()
275
- in_name = True
276
- else:
277
- (name if in_name else val).append(c)
278
- i += 1
279
-
280
- if name or val:
281
- nm = "".join(name).strip().lower()
282
- vl = "".join(val).strip()
283
- if nm:
284
- decls.append((nm, vl))
285
- return decls
286
-
287
-
288
- def parse_css_rules(css_str: str) -> Rules:
289
- """
290
- Produces normalized Rules with:
291
- - orders: list[str] of tag names sorted by numeric 'order'
292
- - sy: '.sy-*' class rules
293
- - p_rules: '.p* <tag>' rules, indexed by p-class then tag
294
- """
295
- rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
296
- order_pairs: list[tuple[str, int]] = []
297
-
298
- i = 0
299
- while True:
300
- b1 = css_str.find("{", i)
301
- if b1 == -1:
302
- break
303
- selector = css_str[i:b1].strip().lower()
304
- b2 = css_str.find("}", b1 + 1)
305
- if b2 == -1:
306
- break
307
- block = css_str[b1 + 1 : b2]
308
- i = b2 + 1
309
-
310
- decls = _parse_decls(block)
311
-
312
- new_rule: Rule = {}
313
- order_val: int | None = None
314
-
315
- for name, value in decls:
316
- v = value.strip()
317
- if name == "font-size" and v == "0":
318
- if "::first-letter" in selector:
319
- new_rule["delete_first"] = True
320
- else:
321
- new_rule["delete_all"] = True
322
- elif name == "transform":
323
- if _RE_SCALEX.search(v.replace(" ", "")):
324
- new_rule["transform_flip_x"] = True
325
- elif name == "order":
326
- with suppress(ValueError, TypeError):
327
- order_val = int(v)
328
- elif name == "content":
329
- # normalize: remove outer quotes
330
- if "::after" in selector:
331
- m = _RE_ATTR.search(v)
332
- if m:
333
- new_rule["append_end_attr"] = m.group(1)
334
- else:
335
- s = v.strip().strip("\"'")
336
- new_rule["append_end_char"] = s
337
- elif "::before" in selector:
338
- m = _RE_ATTR.search(v)
339
- if m:
340
- new_rule["append_start_attr"] = m.group(1)
341
- else:
342
- s = v.strip().strip("\"'")
343
- new_rule["append_start_char"] = s
344
-
345
- # classification
346
- if selector.startswith(".sy-"):
347
- key = selector.lstrip(".")
348
- old = rules["sy"].get(key)
349
- rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
350
-
351
- elif selector.startswith(".p") and " " in selector:
352
- p_cls, right = selector.split(" ", 1)
353
- p_cls = p_cls.lstrip(".")
354
- tag = _only_tag(right)
355
- if tag:
356
- prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
357
- rules["p_rules"][p_cls][tag] = (
358
- {**prev, **new_rule} if prev else (new_rule or {})
359
- )
360
-
361
- if order_val is not None:
362
- tag_for_order = _only_tag(selector)
363
- if tag_for_order:
364
- order_pairs.append((tag_for_order, order_val))
365
-
366
- # normalize orders
367
- order_pairs.sort(key=lambda t: t[1])
368
- seen = set()
369
- orders: list[str] = []
370
- for tag, _num in order_pairs:
371
- if tag not in seen:
372
- seen.add(tag)
373
- orders.append(tag)
374
- rules["orders"] = orders
375
- return rules
376
-
377
-
378
- def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
379
- """
380
- Render the HTML using pre-parsed Rules.
381
- """
382
- tree = html.fromstring(html_str)
383
- paragraphs_out: list[str] = []
384
- refl_list: list[str] = []
385
- orders = rules.get("orders") or []
386
- p_rules = rules.get("p_rules") or {}
387
- sy_rules = rules.get("sy") or {}
388
-
389
- def _class_list(el: html.HtmlElement) -> list[str]:
390
- cls = el.get("class")
391
- return cls.split() if cls else []
392
-
393
- def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
394
- if rule.get("delete_all"):
395
- return ""
396
-
397
- parts: list[str] = []
398
- if "append_start_char" in rule:
399
- parts.append(rule["append_start_char"])
400
- if "append_start_attr" in rule:
401
- parts.append(el.get(rule["append_start_attr"], ""))
402
-
403
- text = el.text or ""
404
- if rule.get("delete_first") and text:
405
- text = text[1:]
406
- parts.append(text)
407
-
408
- if "append_end_char" in rule:
409
- parts.append(rule["append_end_char"])
410
- if "append_end_attr" in rule:
411
- parts.append(el.get(rule["append_end_attr"], ""))
412
-
413
- s = "".join(parts)
414
-
415
- if rule.get("transform_flip_x") and s:
416
- refl_list.append(s)
417
-
418
- return s
419
-
420
- for p in tree.findall(".//p"):
421
- p_classes = _class_list(p)
422
- p_key = next((c for c in p_classes if c.startswith("p")), None)
423
- has_ordered_rules = p_key in p_rules
424
-
425
- buf_parts: list[str] = []
426
-
427
- if p.text and not has_ordered_rules:
428
- buf_parts.append(p.text)
429
-
430
- ordered_cache: dict[str, list[str]] = {}
431
-
432
- for child in p:
433
- tag = str(child.tag)
434
-
435
- # Handle inline <y class="sy-*"> spans
436
- if tag == "y" and not has_ordered_rules:
437
- y_cls = next(
438
- (c for c in _class_list(child) if c.startswith("sy-")), None
439
- )
440
- if y_cls and y_cls in sy_rules:
441
- buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
442
- else:
443
- buf_parts.append(child.text or "")
444
- if child.tail:
445
- buf_parts.append(child.tail)
446
- continue
447
-
448
- # Handle ordered paragraphs: only cache tags that appear in `orders`
449
- if p_key and has_ordered_rules and tag in orders:
450
- rule = p_rules[p_key].get(tag, {})
451
- ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
452
- continue
453
-
454
- # Non-ordered, non-<y> nodes: include text + tails as-is
455
- if not has_ordered_rules:
456
- buf_parts.append(child.text or "")
457
- if child.tail:
458
- buf_parts.append(child.tail)
459
-
460
- # If ordered, flush in global orders with all duplicates preserved
461
- if has_ordered_rules:
462
- for tag in orders:
463
- if tag in ordered_cache:
464
- buf_parts.extend(ordered_cache[tag])
465
-
466
- para = "".join(buf_parts)
467
- if para:
468
- paragraphs_out.append(para)
469
-
470
- return "\n".join(paragraphs_out), refl_list
@@ -1,126 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- novel_downloader.core.parsers.qidian.chapter_normal
4
- ---------------------------------------------------
5
-
6
- Parser logic for extracting readable text from Qidian chapters
7
- that use plain (non-encrypted) browser-rendered HTML.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import logging
13
- from typing import TYPE_CHECKING
14
-
15
- from lxml import html
16
-
17
- from novel_downloader.models import ChapterDict
18
- from novel_downloader.utils import truncate_half_lines
19
-
20
- from .utils import (
21
- extract_chapter_info,
22
- find_ssr_page_context,
23
- get_decryptor,
24
- is_duplicated,
25
- vip_status,
26
- )
27
-
28
- if TYPE_CHECKING:
29
- from .main_parser import QidianParser
30
-
31
- logger = logging.getLogger(__name__)
32
-
33
-
34
- def parse_normal_chapter(
35
- parser: QidianParser,
36
- html_str: str,
37
- chapter_id: str,
38
- ) -> ChapterDict | None:
39
- """
40
- Extract structured chapter info from a normal Qidian page.
41
-
42
- :param html_str: Chapter HTML.
43
- :param chapter_id: Chapter identifier (string).
44
- :return: a dictionary with keys like 'id', 'title', 'content', etc.
45
- """
46
- try:
47
- ssr_data = find_ssr_page_context(html_str)
48
- chapter_info = extract_chapter_info(ssr_data)
49
- if not chapter_info:
50
- logger.warning(
51
- "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
52
- )
53
- return None
54
-
55
- title = chapter_info.get("chapterName", "Untitled")
56
- duplicated = is_duplicated(ssr_data)
57
- raw_html = chapter_info.get("content", "")
58
- chapter_id = chapter_info.get("chapterId", chapter_id)
59
- fkp = chapter_info.get("fkp", "")
60
- author_say = chapter_info.get("authorSay", "")
61
- update_time = chapter_info.get("updateTime", "")
62
- update_timestamp = chapter_info.get("updateTimestamp", 0)
63
- modify_time = chapter_info.get("modifyTime", 0)
64
- word_count = chapter_info.get("actualWords", 0)
65
- seq = chapter_info.get("seq", None)
66
- volume = chapter_info.get("extra", {}).get("volumeName", "")
67
-
68
- chapter_text = _parse_paragraph(
69
- html_str=raw_html,
70
- is_vip=vip_status(ssr_data),
71
- chapter_id=chapter_id,
72
- fkp=fkp,
73
- fuid=parser._fuid,
74
- )
75
- if not chapter_text:
76
- return None
77
-
78
- if parser._use_truncation and duplicated:
79
- chapter_text = truncate_half_lines(chapter_text)
80
-
81
- return {
82
- "id": str(chapter_id),
83
- "title": title,
84
- "content": chapter_text,
85
- "extra": {
86
- "author_say": author_say.strip() if author_say else "",
87
- "updated_at": update_time,
88
- "update_timestamp": update_timestamp,
89
- "modify_time": modify_time,
90
- "word_count": word_count,
91
- "duplicated": duplicated,
92
- "seq": seq,
93
- "volume": volume,
94
- "encrypted": False,
95
- },
96
- }
97
- except Exception as e:
98
- logger.warning(
99
- "[Parser] parse error for normal chapter '%s': %s", chapter_id, e
100
- )
101
- return None
102
-
103
-
104
- def _parse_paragraph(
105
- html_str: str,
106
- is_vip: bool,
107
- chapter_id: str,
108
- fkp: str,
109
- fuid: str,
110
- ) -> str:
111
- raw_html = html_str
112
-
113
- if is_vip:
114
- try:
115
- decryptor = get_decryptor()
116
- raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, fuid)
117
- except Exception as e:
118
- logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
119
- return ""
120
-
121
- tree = html.fromstring(raw_html)
122
- paras = tree.xpath(".//p")
123
- paragraph_texts = [
124
- p.text_content().strip() for p in paras if p.text_content().strip()
125
- ]
126
- return "\n".join(paragraph_texts)