novel_downloader-2.0.0-py3-none-any.whl → novel_downloader-2.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +3 -3
- novel_downloader/cli/export.py +1 -1
- novel_downloader/cli/ui.py +7 -7
- novel_downloader/config/adapter.py +191 -154
- novel_downloader/core/__init__.py +5 -6
- novel_downloader/core/exporters/common/txt.py +9 -9
- novel_downloader/core/exporters/linovelib/txt.py +9 -9
- novel_downloader/core/fetchers/qidian.py +20 -35
- novel_downloader/core/interfaces/fetcher.py +2 -2
- novel_downloader/core/interfaces/parser.py +2 -2
- novel_downloader/core/parsers/base.py +1 -0
- novel_downloader/core/parsers/eightnovel.py +2 -2
- novel_downloader/core/parsers/esjzone.py +3 -3
- novel_downloader/core/parsers/qidian/main_parser.py +747 -12
- novel_downloader/core/parsers/qidian/utils/__init__.py +2 -21
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
- novel_downloader/core/parsers/xiguashuwu.py +6 -12
- novel_downloader/locales/en.json +3 -3
- novel_downloader/locales/zh.json +3 -3
- novel_downloader/utils/__init__.py +0 -2
- novel_downloader/utils/chapter_storage.py +2 -3
- novel_downloader/utils/constants.py +1 -3
- novel_downloader/utils/cookies.py +32 -17
- novel_downloader/utils/crypto_utils/__init__.py +0 -6
- novel_downloader/utils/crypto_utils/rc4.py +40 -50
- novel_downloader/utils/epub/__init__.py +2 -3
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +5 -5
- novel_downloader/utils/epub/documents.py +7 -7
- novel_downloader/utils/epub/models.py +8 -8
- novel_downloader/utils/epub/utils.py +10 -10
- novel_downloader/utils/file_utils/io.py +48 -73
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -11
- novel_downloader/utils/fontocr/__init__.py +13 -0
- novel_downloader/utils/{fontocr.py → fontocr/core.py} +70 -61
- novel_downloader/utils/fontocr/loader.py +50 -0
- novel_downloader/utils/logger.py +80 -56
- novel_downloader/utils/network.py +16 -40
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/sleep_utils.py +53 -43
- novel_downloader/web/main.py +1 -1
- novel_downloader/web/pages/search.py +3 -3
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/METADATA +2 -1
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/RECORD +51 -55
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -8,27 +8,59 @@ Main parser class for handling Qidian HTML
 
 from __future__ import annotations
 
+import json
 import logging
+import re
+from contextlib import suppress
+from html import unescape
 from pathlib import Path
-from typing import Any
+from typing import Any, TypedDict
+
+from lxml import html
 
 from novel_downloader.core.parsers.base import BaseParser
 from novel_downloader.core.parsers.registry import register_parser
 from novel_downloader.models import (
     BookInfoDict,
     ChapterDict,
+    ChapterInfoDict,
     ParserConfig,
+    VolumeInfoDict,
+)
+from novel_downloader.utils import (
+    download,
+    truncate_half_lines,
 )
 from novel_downloader.utils.constants import DATA_DIR
 from novel_downloader.utils.cookies import get_cookie_value
+from novel_downloader.utils.fontocr import get_font_ocr
 
-from .
-
-
+from .utils import (
+    get_decryptor,
+)
 
 logger = logging.getLogger(__name__)
 
 
+class Rule(TypedDict, total=False):
+    delete_all: bool
+    delete_first: bool
+    transform_flip_x: bool
+    append_start_char: str
+    append_end_char: str
+    append_start_attr: str
+    append_end_attr: str
+
+
+class Rules(TypedDict):
+    # e.g., orders = ["i", "em", "span"]
+    orders: list[str]
+    # e.g., sy["sy-3"] -> Rule
+    sy: dict[str, Rule]
+    # e.g., p_rules["p3"]["i"] -> Rule
+    p_rules: dict[str, dict[str, Rule]]
+
+
 @register_parser(
     site_keys=["qidian", "qd"],
 )
@@ -37,6 +69,10 @@ class QidianParser(BaseParser):
     Parser for 起点中文网 site.
     """
 
+    _RE_P_DELIM = re.compile(r"(?i)<\s*p\s*>")
+    _RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
+    _RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
+
     def __init__(
         self,
         config: ParserConfig,
@@ -71,7 +107,64 @@ class QidianParser(BaseParser):
         """
         if not html_list:
             return None
-
+
+        doc = html.fromstring(html_list[0])
+
+        book_name = self._first_str(doc.xpath('//h1[@id="bookName"]/text()'))
+        author = self._first_str(doc.xpath('//a[@class="writer-name"]/text()'))
+
+        book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
+        cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"
+
+        update_time = self._first_str(
+            doc.xpath('//span[@class="update-time"]/text()'),
+            replaces=[("更新时间:", "")],
+        )
+        serial_status = self._first_str(
+            doc.xpath('//p[@class="book-attribute"]/span[1]/text()')
+        )
+
+        tags = [
+            t.strip()
+            for t in doc.xpath('//p[contains(@class,"all-label")]//a/text()')
+            if t.strip()
+        ]
+
+        word_count = self._first_str(doc.xpath('//p[@class="count"]/em[1]/text()'))
+        summary_brief = self._first_str(doc.xpath('//p[@class="intro"]/text()'))
+
+        raw_lines = [
+            s.strip()
+            for s in doc.xpath('//p[@id="book-intro-detail"]//text()')
+            if s.strip()
+        ]
+        summary = "\n".join(raw_lines)
+
+        volumes: list[VolumeInfoDict] = []
+        for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
+            vol_name = self._first_str(vol.xpath('.//h3[@class="volume-name"]/text()'))
+            vol_name = vol_name.split(chr(183))[0].strip()
+            chapters: list[ChapterInfoDict] = []
+            for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
+                title = self._first_str(li.xpath('.//a[@class="chapter-name"]/text()'))
+                url = self._first_str(li.xpath('.//a[@class="chapter-name"]/@href'))
+                cid = url.rstrip("/").split("/")[-1] if url else ""
+                chapters.append({"title": title, "url": url, "chapterId": cid})
+            volumes.append({"volume_name": vol_name, "chapters": chapters})
+
+        return {
+            "book_name": book_name,
+            "author": author,
+            "cover_url": cover_url,
+            "update_time": update_time,
+            "word_count": word_count,
+            "serial_status": serial_status,
+            "tags": tags,
+            "summary_brief": summary_brief,
+            "summary": summary,
+            "volumes": volumes,
+            "extra": {},
+        }
 
     def parse_chapter(
         self,
@@ -86,16 +179,658 @@ class QidianParser(BaseParser):
         """
         if not html_list:
             return None
-
+        try:
+            ssr_data = self._find_ssr_page_context(html_list[0])
+            chapter_info = self._extract_chapter_info(ssr_data)
+            if not chapter_info:
+                logger.warning(
+                    "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
+                )
+                return None
+
+            if not self._can_view_chapter(chapter_info):
+                logger.warning(
+                    "[Parser] Chapter '%s' is not purchased or inaccessible.",
+                    chapter_id,
+                )
+                return None
+
+            if self._is_encrypted(ssr_data):
+                if not self._decode_font:
+                    return None
+                return self.parse_encrypted_chapter(chapter_info, chapter_id)
+
+            return self.parse_normal_chapter(chapter_info, chapter_id)
+
+        except Exception as e:
+            logger.warning("[Parser] parse error for chapter '%s': %s", chapter_id, e)
+            return None
+
+    def parse_normal_chapter(
+        self,
+        chapter_info: dict[str, Any],
+        chapter_id: str,
+    ) -> ChapterDict | None:
+        """
+        Extract structured chapter info from a normal Qidian page.
+
+        :param chapter_info: Parsed chapter info block from ssr data.
+        :param chapter_id: Chapter identifier (string).
+        :return: a dictionary with keys like 'id', 'title', 'content', etc.
+        """
+        duplicated = self._is_duplicated(chapter_info)
+
+        title = chapter_info.get("chapterName", "Untitled")
+        raw_html = chapter_info.get("content", "")
+        chapter_id = chapter_info.get("chapterId", chapter_id)
+        fkp = chapter_info.get("fkp", "")
+        author_say = chapter_info.get("authorSay", "").strip()
+        update_time = chapter_info.get("updateTime", "")
+        update_timestamp = chapter_info.get("updateTimestamp", 0)
+        modify_time = chapter_info.get("modifyTime", 0)
+        word_count = chapter_info.get("actualWords", 0)
+        seq = chapter_info.get("seq")
+        volume = chapter_info.get("extra", {}).get("volumeName", "")
+
+        if self._is_vip(chapter_info):
+            decryptor = get_decryptor()
+            raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, self._fuid)
+
+        parts = self._RE_P_DELIM.split(raw_html)
+        paragraphs = [unescape(p).strip() for p in parts if p.strip()]
+        chapter_text = "\n".join(paragraphs)
+        if not chapter_text:
+            return None
+
+        if self._use_truncation and duplicated:
+            chapter_text = truncate_half_lines(chapter_text)
+
+        return {
+            "id": str(chapter_id),
+            "title": title,
+            "content": chapter_text,
+            "extra": {
+                "author_say": author_say,
+                "updated_at": update_time,
+                "update_timestamp": update_timestamp,
+                "modify_time": modify_time,
+                "word_count": word_count,
+                "duplicated": duplicated,
+                "seq": seq,
+                "volume": volume,
+                "encrypted": False,
+            },
+        }
+
+    def parse_encrypted_chapter(
+        self,
+        chapter_info: dict[str, Any],
+        chapter_id: str,
+    ) -> ChapterDict | None:
+        """
+        Extract and return the formatted textual content of an encrypted chapter.
+
+        Steps:
+        1. Decode and save randomFont bytes; download fixedFont via download().
+        2. Parse CSS rules and save debug JSON.
+        3. Render encrypted paragraphs, then run OCR font-mapping.
+        4. Extract paragraph texts and format them.
+
+        :param chapter_info: Parsed chapter info block from ssr data.
+        :return: Formatted chapter text or empty string if not parsable.
+        """
+        debug_dir = self._debug_dir / "qidian" / "font_debug" / chapter_id
+        if self._save_font_debug:
+            debug_dir.mkdir(parents=True, exist_ok=True)
+
+        duplicated = self._is_duplicated(chapter_info)
+
+        css_str = chapter_info["css"]
+        randomFont_str = chapter_info["randomFont"]
+        fixedFontWoff2_url = chapter_info["fixedFontWoff2"]
+
+        title = chapter_info.get("chapterName", "Untitled")
+        raw_html = chapter_info.get("content", "")
+        chapter_id = chapter_info.get("chapterId", chapter_id)
+        fkp = chapter_info.get("fkp", "")
+        author_say = chapter_info.get("authorSay", "").strip()
+        update_time = chapter_info.get("updateTime", "")
+        update_timestamp = chapter_info.get("updateTimestamp", 0)
+        modify_time = chapter_info.get("modifyTime", 0)
+        word_count = chapter_info.get("actualWords", 0)
+        seq = chapter_info.get("seq")
+        volume = chapter_info.get("extra", {}).get("volumeName", "")
+
+        # extract + save font
+        rf = json.loads(randomFont_str)
+        rand_path = self._base_cache_dir / "randomFont.ttf"
+        rand_path.parent.mkdir(parents=True, exist_ok=True)
+        rand_path.write_bytes(bytes(rf["data"]))
+
+        fixed_path = download(
+            url=fixedFontWoff2_url,
+            target_dir=self._fixed_font_dir,
+        )
+        if fixed_path is None:
+            logger.warning(
+                "[Parser] failed to download fixedfont for chapter '%s'", chapter_id
+            )
+            return None
+
+        # Extract and render paragraphs from HTML with CSS rules
+        if self._is_vip(chapter_info):
+            decryptor = get_decryptor()
+            raw_html = decryptor.decrypt(
+                raw_html,
+                chapter_id,
+                fkp,
+                self._fuid,
+            )
+
+        css_rules = self._parse_css_rules(css_str)
+        paragraphs_str, refl_list = self._render_visible_text(raw_html, css_rules)
+        if self._save_font_debug:
+            paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
+            paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
+
+        # Run OCR + fallback mapping
+        char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
+        refl_set = set(refl_list)
+        char_set = char_set - refl_set
+        if self._save_font_debug:
+            char_sets_path = debug_dir / "char_set_debug.txt"
+            temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
+            char_sets_path.write_text(
+                temp,
+                encoding="utf-8",
+            )
+
+        mapping_result = self._generate_font_map(
+            fixed_font_path=fixed_path,
+            random_font_path=rand_path,
+            char_set=char_set,
+            refl_set=refl_set,
+            cache_dir=self._base_cache_dir,
+            batch_size=self._config.batch_size,
+        )
+        if not mapping_result:
+            return None
+
+        if self._save_font_debug:
+            mapping_json_path = debug_dir / "font_mapping.json"
+            mapping_json_path.write_text(
+                json.dumps(mapping_result, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+
+        # Reconstruct final readable text
+        original_text = self._apply_font_mapping(
+            text=paragraphs_str,
+            font_map=mapping_result,
+        )
+
+        final_paragraphs_str = "\n".join(
+            line.strip() for line in original_text.splitlines() if line.strip()
+        )
+        if self._use_truncation and duplicated:
+            final_paragraphs_str = truncate_half_lines(final_paragraphs_str)
+
+        return {
+            "id": str(chapter_id),
+            "title": str(title),
+            "content": final_paragraphs_str,
+            "extra": {
+                "author_say": author_say,
+                "updated_at": update_time,
+                "update_timestamp": update_timestamp,
+                "modify_time": modify_time,
+                "word_count": word_count,
+                "duplicated": duplicated,
+                "seq": seq,
+                "volume": volume,
+                "encrypted": True,
+            },
+        }
+
+    @staticmethod
+    def _find_ssr_page_context(html_str: str) -> dict[str, Any]:
+        """
+        Extract SSR JSON from <script id="vite-plugin-ssr_pageContext">.
+        """
+        tree = html.fromstring(html_str)
+        script = tree.xpath('//script[@id="vite-plugin-ssr_pageContext"]/text()')
+        return json.loads(script[0].strip()) if script else {}
+
+    @staticmethod
+    def _extract_chapter_info(ssr_data: dict[str, Any]) -> dict[str, Any]:
+        """
+        Extract the 'chapterInfo' dictionary from the SSR page context.
+
+        This handles nested key access and returns an empty dict if missing.
+
+        :param ssr_data: The full SSR data object from _find_ssr_page_context().
+        :return: A dict with chapter metadata such as chapterName, authorSay, etc.
+        """
+        page_context = ssr_data.get("pageContext", {})
+        page_props = page_context.get("pageProps", {})
+        page_data = page_props.get("pageData", {})
+        chapter_info = page_data.get("chapterInfo", {})
+        return chapter_info if isinstance(chapter_info, dict) else {}
+
+    @staticmethod
+    def _is_restricted_page(html_str: str) -> bool:
+        """
+        Return True if page content indicates access restriction
+        (e.g. not subscribed/purchased).
+
+        :param html_str: Raw HTML string.
+        """
+        markers = ["这是VIP章节", "需要订阅", "订阅后才能阅读"]
+        return any(m in html_str for m in markers)
 
-
+    @classmethod
+    def _is_vip(cls, chapter_info: dict[str, Any]) -> bool:
+        """
+        :return: True if VIP, False otherwise.
+        """
+        vip_flag = chapter_info.get("vipStatus", 0)
+        fens_flag = chapter_info.get("fEnS", 0)
+        return bool(vip_flag == 1 and fens_flag != 0)
+
+    @classmethod
+    def _can_view_chapter(cls, chapter_info: dict[str, Any]) -> bool:
+        """
+        A chapter is not viewable if it is marked as VIP
+        and has not been purchased.
+
+        :return: True if viewable, False otherwise.
+        """
+        is_buy = chapter_info.get("isBuy", 0)
+        vip_status = chapter_info.get("vipStatus", 0)
+        return not (vip_status == 1 and is_buy == 0)
+
+    @classmethod
+    def _is_duplicated(cls, chapter_info: dict[str, Any]) -> bool:
+        """
+        Check if chapter is marked as duplicated (eFW = 1).
+        """
+        efw_flag = chapter_info.get("eFW", 0)
+        return bool(efw_flag == 1)
+
+    @classmethod
+    def _is_encrypted(cls, content: str | dict[str, Any]) -> bool:
         """
         Return True if content is encrypted.
 
-
+        Chapter Encryption Status (cES):
+        * 0: 内容是'明文'
+        * 2: 字体加密
+
+        :param content: HTML content, as a raw string or pre-parsed SSR data dict.
+        :return: True if encrypted marker is found, else False.
+        """
+        ssr_data = (
+            cls._find_ssr_page_context(content) if isinstance(content, str) else content
+        )
+        chapter_info = cls._extract_chapter_info(ssr_data)
+        return int(chapter_info.get("cES", 0)) == 2
+
+    @staticmethod
+    def _generate_font_map(
+        fixed_font_path: Path,
+        random_font_path: Path,
+        char_set: set[str],
+        refl_set: set[str],
+        cache_dir: Path,
+        batch_size: int = 32,
+    ) -> dict[str, str]:
+        """
+        Build a mapping from scrambled font chars to real chars.
+
+        Uses OCR to decode and generate mapping from a fixed obfuscated font
+        and a random obfuscated font. Results are cached in JSON.
+
+        :param fixed_font_path: fixed font file.
+        :param random_font_path: random font file.
+        :param char_set: Characters to match directly.
+        :param refl_set: Characters to match in flipped form.
+        :param cache_dir: Directory to save/load cached results.
+        :param batch_size: How many chars to OCR per batch.
+
+        :return: { obf_char: real_char, ... }
+        """
+        font_ocr = get_font_ocr()
+        if not font_ocr:
+            return {}
+
+        mapping_result: dict[str, str] = {}
+        fixed_map_file = cache_dir / "fixed_font_map" / f"{fixed_font_path.stem}.json"
+        fixed_map_file.parent.mkdir(parents=True, exist_ok=True)
+
+        # load existing cache
+        try:
+            with open(fixed_map_file, encoding="utf-8") as f:
+                fixed_map = json.load(f)
+            cached_chars = set(fixed_map.keys())
+            mapping_result.update(
+                {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
+            )
+            mapping_result.update(
+                {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
+            )
+            char_set = char_set - cached_chars
+            refl_set = refl_set - cached_chars
+        except Exception:
+            fixed_map = {}
+            cached_chars = set()
+
+        # prepare font renderers and cmap sets
+        fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
+        random_chars = font_ocr.extract_font_charset(random_font_path)
+        fixed_font = font_ocr.load_render_font(fixed_font_path)
+        random_font = font_ocr.load_render_font(random_font_path)
+
+        # process normal and reflected sets together
+        rendered = []
+        for chars, reflect in [(char_set, False), (refl_set, True)]:
+            for ch in chars:
+                if ch in fixed_chars:
+                    font = fixed_font
+                elif ch in random_chars:
+                    font = random_font
+                else:
+                    continue
+                rendered.append(
+                    (ch, font_ocr.render_char_image_array(ch, font, reflect))
+                )
+
+        if rendered:
+            # query OCR+vec simultaneously
+            imgs_to_query = [img for _, img in rendered]
+            fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)
+
+            # pick best per char, apply threshold + cache
+            for (ch, _), preds in zip(rendered, fused, strict=False):
+                if not preds:
+                    continue
+                real_char, _ = preds
+                mapping_result[ch] = real_char
+                fixed_map[ch] = real_char
+
+        # persist updated fixed_map
+        try:
+            with open(fixed_map_file, "w", encoding="utf-8") as f:
+                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
+        except Exception as e:
+            logger.error("[FontOCR] Failed to save fixed map: %s", e)
+
+        return mapping_result
+
+    @staticmethod
+    def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
         """
-
+        Replace each character in `text` using `font_map`,
+        leaving unmapped characters unchanged.
+
+        :param text: The input string, possibly containing obfuscated font chars.
+        :param font_map: A dict mapping obfuscated chars to real chars.
+        :return: The de-obfuscated text.
+        """
+        return "".join(font_map.get(ch, ch) for ch in text)
+
+    @staticmethod
+    def _only_tag(selector: str) -> str | None:
+        """
+        Normalize a selector into just its tag name for ordering.
+
+        Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
+
+        Returns None if a tag can't be extracted.
+        """
+        sel = selector.strip()
+        # If it has spaces, take the rightmost simple selector
+        last = sel.split()[-1]
+        # Drop ::pseudo
+        last = last.split("::", 1)[0]
+        # If it's like 'span[attr=..]' keep 'span'
+        last = last.split("[", 1)[0]
+        # If it starts with '.', it's not a tag
+        if not last or last.startswith("."):
+            return None
+        return last
+
+    @staticmethod
+    def _parse_decls(block: str) -> list[tuple[str, str]]:
+        """
+        Parse 'name:value;...' inside a block. Tolerates quotes and attr().
+        """
+        decls: list[tuple[str, str]] = []
+        i = 0
+        n = len(block)
+        name: list[str] = []
+        val: list[str] = []
+        in_name = True
+        quote = None  # track ' or "
+        while i < n:
+            c = block[i]
+            if quote:
+                # inside quotes
+                if c == "\\" and i + 1 < n:
+                    # keep escaped char
+                    (name if in_name else val).append(c)
+                    i += 1
+                    (name if in_name else val).append(block[i])
+                elif c == quote:
+                    (name if in_name else val).append(c)
+                    quote = None
+                else:
+                    (name if in_name else val).append(c)
+            else:
+                if c in ("'", '"'):
+                    (name if in_name else val).append(c)
+                    quote = c
+                elif in_name and c == ":":
+                    in_name = False
+                elif c == ";":
+                    nm = "".join(name).strip().lower()
+                    vl = "".join(val).strip()
+                    if nm:
+                        decls.append((nm, vl))
+                    name.clear()
+                    val.clear()
+                    in_name = True
+                else:
+                    (name if in_name else val).append(c)
+            i += 1
+
+        if name or val:
+            nm = "".join(name).strip().lower()
+            vl = "".join(val).strip()
+            if nm:
+                decls.append((nm, vl))
+        return decls
+
+    @classmethod
+    def _parse_css_rules(cls, css_str: str) -> Rules:
+        """
+        Produces normalized Rules with:
+        * orders: list[str] of tag names sorted by numeric 'order'
+        * sy: '.sy-*' class rules
+        * p_rules: '.p* <tag>' rules, indexed by p-class then tag
+        """
+        rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
+        order_pairs: list[tuple[str, int]] = []
+
+        i = 0
+        while True:
+            b1 = css_str.find("{", i)
+            if b1 == -1:
+                break
+            selector = css_str[i:b1].strip().lower()
+            b2 = css_str.find("}", b1 + 1)
+            if b2 == -1:
+                break
+            block = css_str[b1 + 1 : b2]
+            i = b2 + 1
+
+            decls = cls._parse_decls(block)
+
+            new_rule: Rule = {}
+            order_val: int | None = None
+
+            for name, value in decls:
+                v = value.strip()
+                if name == "font-size" and v == "0":
+                    if "::first-letter" in selector:
+                        new_rule["delete_first"] = True
+                    else:
+                        new_rule["delete_all"] = True
+                elif name == "transform":
+                    if cls._RE_SCALEX.search(v.replace(" ", "")):
+                        new_rule["transform_flip_x"] = True
+                elif name == "order":
+                    with suppress(ValueError, TypeError):
+                        order_val = int(v)
+                elif name == "content":
+                    # normalize: remove outer quotes
+                    if "::after" in selector:
+                        m = cls._RE_ATTR.search(v)
+                        if m:
+                            new_rule["append_end_attr"] = m.group(1)
+                        else:
+                            s = v.strip().strip("\"'")
+                            new_rule["append_end_char"] = s
+                    elif "::before" in selector:
+                        m = cls._RE_ATTR.search(v)
+                        if m:
+                            new_rule["append_start_attr"] = m.group(1)
+                        else:
+                            s = v.strip().strip("\"'")
+                            new_rule["append_start_char"] = s
+
+            # classification
+            if selector.startswith(".sy-"):
+                key = selector.lstrip(".")
+                old = rules["sy"].get(key)
+                rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
+
+            elif selector.startswith(".p") and " " in selector:
+                p_cls, right = selector.split(" ", 1)
+                p_cls = p_cls.lstrip(".")
+                tag = cls._only_tag(right)
+                if tag:
+                    prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
+                    rules["p_rules"][p_cls][tag] = (
+                        {**prev, **new_rule} if prev else (new_rule or {})
+                    )
+
+            if order_val is not None:
+                tag_for_order = cls._only_tag(selector)
+                if tag_for_order:
+                    order_pairs.append((tag_for_order, order_val))
+
+        # normalize orders
+        order_pairs.sort(key=lambda t: t[1])
+        seen = set()
+        orders: list[str] = []
+        for tag, _ in order_pairs:
+            if tag not in seen:
+                seen.add(tag)
+                orders.append(tag)
+        rules["orders"] = orders
+        return rules
+
+    @staticmethod
+    def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
+        """
+        Render the HTML using pre-parsed Rules.
+        """
+        tree = html.fromstring(html_str)
+        paragraphs_out: list[str] = []
+        refl_list: list[str] = []
+        orders = rules.get("orders") or []
+        p_rules = rules.get("p_rules") or {}
+        sy_rules = rules.get("sy") or {}
+
+        def _class_list(el: html.HtmlElement) -> list[str]:
+            cls = el.get("class")
+            return cls.split() if cls else []
+
+        def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
+            if rule.get("delete_all"):
+                return ""
+
+            parts: list[str] = []
+            if "append_start_char" in rule:
+                parts.append(rule["append_start_char"])
+            if "append_start_attr" in rule:
+                parts.append(el.get(rule["append_start_attr"], ""))
+
+            text = el.text or ""
+            if rule.get("delete_first") and text:
+                text = text[1:]
+            parts.append(text)
+
+            if "append_end_char" in rule:
+                parts.append(rule["append_end_char"])
+            if "append_end_attr" in rule:
+                parts.append(el.get(rule["append_end_attr"], ""))
+
+            s = "".join(parts)
+
+            if rule.get("transform_flip_x") and s:
+                refl_list.append(s)
+
+            return s
+
+        for p in tree.findall(".//p"):
+            p_classes = _class_list(p)
+            p_key = next((c for c in p_classes if c.startswith("p")), None)
+            has_ordered_rules = p_key in p_rules
+
+            buf_parts: list[str] = []
+
+            if p.text and not has_ordered_rules:
+                buf_parts.append(p.text)
+
+            ordered_cache: dict[str, list[str]] = {}
+
+            for child in p:
+                tag = str(child.tag)
+
+                # Handle inline <y class="sy-*"> spans
+                if tag == "y" and not has_ordered_rules:
+                    y_cls = next(
+                        (c for c in _class_list(child) if c.startswith("sy-")), None
+                    )
+                    if y_cls and y_cls in sy_rules:
+                        buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
+                    else:
+                        buf_parts.append(child.text or "")
+                    if child.tail:
+                        buf_parts.append(child.tail)
+                    continue
+
+                # Handle ordered paragraphs: only cache tags that appear in `orders`
+                if p_key and has_ordered_rules and tag in orders:
+                    rule = p_rules[p_key].get(tag, {})
+                    ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
+                    continue
+
+                # Non-ordered, non-<y> nodes: include text + tails as-is
+                if not has_ordered_rules:
+                    buf_parts.append(child.text or "")
+                    if child.tail:
+                        buf_parts.append(child.tail)
+
+            # If ordered, flush in global orders with all duplicates preserved
+            if has_ordered_rules:
+                for tag in orders:
+                    if tag in ordered_cache:
+                        buf_parts.extend(ordered_cache[tag])
+
+            para = "".join(buf_parts)
+            if para:
+                paragraphs_out.append(para)
 
-
-    def save_font_debug(self) -> bool:
-        return self._config.save_font_debug
+        return "\n".join(paragraphs_out), refl_list