novel-downloader: novel_downloader-2.0.0-py3-none-any.whl → novel_downloader-2.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +14 -11
- novel_downloader/cli/export.py +19 -19
- novel_downloader/cli/ui.py +35 -8
- novel_downloader/config/adapter.py +216 -153
- novel_downloader/core/__init__.py +5 -6
- novel_downloader/core/archived/deqixs/fetcher.py +1 -28
- novel_downloader/core/downloaders/__init__.py +2 -0
- novel_downloader/core/downloaders/base.py +34 -85
- novel_downloader/core/downloaders/common.py +147 -171
- novel_downloader/core/downloaders/qianbi.py +30 -64
- novel_downloader/core/downloaders/qidian.py +157 -184
- novel_downloader/core/downloaders/qqbook.py +292 -0
- novel_downloader/core/downloaders/registry.py +2 -2
- novel_downloader/core/exporters/__init__.py +2 -0
- novel_downloader/core/exporters/base.py +37 -59
- novel_downloader/core/exporters/common.py +620 -0
- novel_downloader/core/exporters/linovelib.py +47 -0
- novel_downloader/core/exporters/qidian.py +41 -12
- novel_downloader/core/exporters/qqbook.py +28 -0
- novel_downloader/core/exporters/registry.py +2 -2
- novel_downloader/core/fetchers/__init__.py +4 -2
- novel_downloader/core/fetchers/aaatxt.py +2 -22
- novel_downloader/core/fetchers/b520.py +3 -23
- novel_downloader/core/fetchers/base.py +80 -105
- novel_downloader/core/fetchers/biquyuedu.py +2 -22
- novel_downloader/core/fetchers/dxmwx.py +10 -22
- novel_downloader/core/fetchers/esjzone.py +6 -29
- novel_downloader/core/fetchers/guidaye.py +2 -22
- novel_downloader/core/fetchers/hetushu.py +9 -29
- novel_downloader/core/fetchers/i25zw.py +2 -16
- novel_downloader/core/fetchers/ixdzs8.py +2 -16
- novel_downloader/core/fetchers/jpxs123.py +2 -16
- novel_downloader/core/fetchers/lewenn.py +2 -22
- novel_downloader/core/fetchers/linovelib.py +4 -20
- novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
- novel_downloader/core/fetchers/piaotia.py +2 -16
- novel_downloader/core/fetchers/qbtr.py +2 -16
- novel_downloader/core/fetchers/qianbi.py +1 -20
- novel_downloader/core/fetchers/qidian.py +27 -68
- novel_downloader/core/fetchers/qqbook.py +177 -0
- novel_downloader/core/fetchers/quanben5.py +9 -29
- novel_downloader/core/fetchers/rate_limiter.py +22 -53
- novel_downloader/core/fetchers/sfacg.py +3 -16
- novel_downloader/core/fetchers/shencou.py +2 -16
- novel_downloader/core/fetchers/shuhaige.py +2 -22
- novel_downloader/core/fetchers/tongrenquan.py +2 -22
- novel_downloader/core/fetchers/ttkan.py +3 -14
- novel_downloader/core/fetchers/wanbengo.py +2 -22
- novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
- novel_downloader/core/fetchers/xiguashuwu.py +4 -20
- novel_downloader/core/fetchers/xs63b.py +3 -15
- novel_downloader/core/fetchers/xshbook.py +2 -22
- novel_downloader/core/fetchers/yamibo.py +4 -28
- novel_downloader/core/fetchers/yibige.py +13 -26
- novel_downloader/core/interfaces/exporter.py +19 -7
- novel_downloader/core/interfaces/fetcher.py +23 -49
- novel_downloader/core/interfaces/parser.py +2 -2
- novel_downloader/core/parsers/__init__.py +4 -2
- novel_downloader/core/parsers/b520.py +2 -2
- novel_downloader/core/parsers/base.py +5 -39
- novel_downloader/core/parsers/esjzone.py +3 -3
- novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +7 -7
- novel_downloader/core/parsers/qidian.py +717 -0
- novel_downloader/core/parsers/qqbook.py +709 -0
- novel_downloader/core/parsers/xiguashuwu.py +8 -15
- novel_downloader/core/searchers/__init__.py +2 -2
- novel_downloader/core/searchers/b520.py +1 -1
- novel_downloader/core/searchers/base.py +2 -2
- novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/locales/en.json +3 -3
- novel_downloader/locales/zh.json +3 -3
- novel_downloader/models/__init__.py +2 -0
- novel_downloader/models/book.py +1 -0
- novel_downloader/models/config.py +12 -0
- novel_downloader/resources/config/settings.toml +23 -5
- novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
- novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
- novel_downloader/utils/__init__.py +0 -2
- novel_downloader/utils/chapter_storage.py +2 -3
- novel_downloader/utils/constants.py +7 -3
- novel_downloader/utils/cookies.py +32 -17
- novel_downloader/utils/crypto_utils/__init__.py +0 -6
- novel_downloader/utils/crypto_utils/aes_util.py +1 -1
- novel_downloader/utils/crypto_utils/rc4.py +40 -50
- novel_downloader/utils/epub/__init__.py +2 -3
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +1 -6
- novel_downloader/utils/epub/documents.py +7 -7
- novel_downloader/utils/epub/models.py +8 -8
- novel_downloader/utils/epub/utils.py +10 -10
- novel_downloader/utils/file_utils/io.py +48 -73
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -11
- novel_downloader/utils/fontocr/__init__.py +13 -0
- novel_downloader/utils/{fontocr.py → fontocr/core.py} +72 -61
- novel_downloader/utils/fontocr/loader.py +52 -0
- novel_downloader/utils/logger.py +80 -56
- novel_downloader/utils/network.py +16 -40
- novel_downloader/utils/node_decryptor/__init__.py +13 -0
- novel_downloader/utils/node_decryptor/decryptor.py +342 -0
- novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/sleep_utils.py +53 -43
- novel_downloader/web/main.py +1 -1
- novel_downloader/web/pages/download.py +1 -1
- novel_downloader/web/pages/search.py +4 -4
- novel_downloader/web/services/task_manager.py +2 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +5 -1
- novel_downloader-2.0.2.dist-info/RECORD +203 -0
- novel_downloader/core/exporters/common/__init__.py +0 -11
- novel_downloader/core/exporters/common/epub.py +0 -198
- novel_downloader/core/exporters/common/main_exporter.py +0 -64
- novel_downloader/core/exporters/common/txt.py +0 -146
- novel_downloader/core/exporters/epub_util.py +0 -215
- novel_downloader/core/exporters/linovelib/__init__.py +0 -11
- novel_downloader/core/exporters/linovelib/epub.py +0 -349
- novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
- novel_downloader/core/exporters/linovelib/txt.py +0 -139
- novel_downloader/core/exporters/txt_util.py +0 -67
- novel_downloader/core/parsers/qidian/__init__.py +0 -10
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/main_parser.py +0 -101
- novel_downloader/core/parsers/qidian/utils/__init__.py +0 -30
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
- novel_downloader-2.0.0.dist-info/RECORD +0 -210
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
novel_downloader/core/parsers/qidian.py (new file)
@@ -0,0 +1,717 @@
```python
#!/usr/bin/env python3
"""
novel_downloader.core.parsers.qidian
------------------------------------

"""

from __future__ import annotations

import json
import logging
from contextlib import suppress
from html import unescape
from pathlib import Path
from typing import Any, TypedDict

from lxml import html

from novel_downloader.core.parsers.base import BaseParser
from novel_downloader.core.parsers.registry import register_parser
from novel_downloader.models import (
    BookInfoDict,
    ChapterDict,
    ChapterInfoDict,
    ParserConfig,
    VolumeInfoDict,
)
from novel_downloader.utils import (
    download,
    truncate_half_lines,
)
from novel_downloader.utils.constants import DATA_DIR
from novel_downloader.utils.cookies import get_cookie_value
from novel_downloader.utils.fontocr import get_font_ocr
from novel_downloader.utils.node_decryptor import get_decryptor

logger = logging.getLogger(__name__)


class Rule(TypedDict, total=False):
    delete_all: bool
    delete_first: bool
    transform_flip_x: bool
    append_start_char: str
    append_end_char: str
    append_start_attr: str
    append_end_attr: str


class Rules(TypedDict):
    # e.g., orders = ["i", "em", "span"]
    orders: list[str]
    # e.g., sy["sy-3"] -> Rule
    sy: dict[str, Rule]
    # e.g., p_rules["p3"]["i"] -> Rule
    p_rules: dict[str, dict[str, Rule]]


@register_parser(
    site_keys=["qidian", "qd"],
)
class QidianParser(BaseParser):
    """
    Parser for the 起点中文网 (Qidian) site.
    """

    def __init__(self, config: ParserConfig, fuid: str = ""):
        """
        Initialize the QidianParser with the given configuration.
        """
        super().__init__(config)

        self._rand_path = self._base_cache_dir / "qidian" / "randomFont.ttf"
        self._fixed_font_dir = self._base_cache_dir / "qidian" / "fixed_fonts"
        self._fixed_map_dir = self._base_cache_dir / "qidian" / "fixed_font_map"
        self._debug_dir = Path.cwd() / "debug" / "qidian"

        state_files = [
            DATA_DIR / "qidian" / "session_state.cookies",
        ]
        self._fuid: str = fuid or get_cookie_value(state_files, "ywguid")

    def parse_book_info(
        self,
        html_list: list[str],
        **kwargs: Any,
    ) -> BookInfoDict | None:
        if not html_list:
            return None

        doc = html.fromstring(html_list[0])

        book_name = self._first_str(doc.xpath('//h1[@id="bookName"]/text()'))
        author = self._first_str(doc.xpath('//a[@class="writer-name"]/text()'))

        book_id = doc.xpath('//a[@id="bookImg"]/@data-bid')[0]
        cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"

        update_time = self._first_str(
            doc.xpath('//span[@class="update-time"]/text()'),
            replaces=[("更新时间:", "")],
        )
        serial_status = self._first_str(
            doc.xpath('//p[@class="book-attribute"]/span[1]/text()')
        )

        tags = [
            t.strip()
            for t in doc.xpath('//p[contains(@class,"all-label")]//a/text()')
            if t.strip()
        ]

        word_count = self._first_str(doc.xpath('//p[@class="count"]/em[1]/text()'))
        summary_brief = self._first_str(doc.xpath('//p[@class="intro"]/text()'))

        raw_lines = [
            s.strip()
            for s in doc.xpath('//p[@id="book-intro-detail"]//text()')
            if s.strip()
        ]
        summary = "\n".join(raw_lines)

        volumes: list[VolumeInfoDict] = []
        for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
            vol_name = self._first_str(vol.xpath('.//h3[@class="volume-name"]/text()'))
            vol_name = vol_name.split(chr(183))[0].strip()
            chapters: list[ChapterInfoDict] = []
            for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
                title = self._first_str(li.xpath('.//a[@class="chapter-name"]/text()'))
                url = self._first_str(li.xpath('.//a[@class="chapter-name"]/@href'))
                cid = url.rstrip("/").split("/")[-1] if url else ""
                chapters.append({"title": title, "url": url, "chapterId": cid})
            volumes.append({"volume_name": vol_name, "chapters": chapters})

        return {
            "book_name": book_name,
            "author": author,
            "cover_url": cover_url,
            "update_time": update_time,
            "word_count": word_count,
            "serial_status": serial_status,
            "tags": tags,
            "summary_brief": summary_brief,
            "summary": summary,
            "volumes": volumes,
            "extra": {},
        }

    def parse_chapter(
        self,
        html_list: list[str],
        chapter_id: str,
        **kwargs: Any,
    ) -> ChapterDict | None:
        if not html_list:
            logger.warning("[Parser] chapter_id=%s :: html_list is empty", chapter_id)
            return None
        try:
            ssr_data = self._find_ssr_page_context(html_list[0])
            chapter_info = self._extract_chapter_info(ssr_data)
        except Exception as e:
            logger.warning(
                "[Parser] chapter_id=%s :: failed to locate ssr_pageContext block: %s",
                chapter_id,
                e,
            )
            return None

        if not chapter_info:
            logger.warning(
                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
            )
            return None

        if not self._can_view_chapter(chapter_info):
            logger.warning(
                "[Parser] Chapter '%s' is not purchased or inaccessible.",
                chapter_id,
            )
            return None

        duplicated = self._is_duplicated(chapter_info)
        encrypted = self._is_encrypted(chapter_info)

        title = chapter_info.get("chapterName", "Untitled")
        raw_html = chapter_info.get("content", "")
        cid = str(chapter_info.get("chapterId") or chapter_id)
        fkp = chapter_info.get("fkp", "")
        author_say = chapter_info.get("authorSay", "").strip()
        update_time = chapter_info.get("updateTime", "")
        update_timestamp = chapter_info.get("updateTimestamp", 0)
        modify_time = chapter_info.get("modifyTime", 0)
        word_count = chapter_info.get("actualWords", 0)
        seq = chapter_info.get("seq")
        volume = chapter_info.get("extra", {}).get("volumeName", "")

        if self._is_vip(chapter_info):
            decryptor = get_decryptor()
            raw_html = decryptor.decrypt_qd(raw_html, cid, fkp, self._fuid)

        chapter_text = (
            self._parse_font_encrypted(raw_html, chapter_info, cid)
            if encrypted
            else self._parse_normal(raw_html)
        )
        if not chapter_text:
            logger.warning(
                "[Parser] chapter_id=%s :: content empty after decryption/font-mapping",
                chapter_id,
            )
            return None

        if self._use_truncation and duplicated:
            chapter_text = truncate_half_lines(chapter_text)

        return {
            "id": cid,
            "title": title,
            "content": chapter_text,
            "extra": {
                "author_say": author_say,
                "updated_at": update_time,
                "update_timestamp": update_timestamp,
                "modify_time": modify_time,
                "word_count": word_count,
                "duplicated": duplicated,
                "seq": seq,
                "volume": volume,
                "encrypted": encrypted,
            },
        }

    def _parse_normal(self, raw_html: str) -> str:
        """
        Extract structured chapter content from a normal Qidian page.
        """
        parts = raw_html.split("<p>")
        paragraphs = [unescape(p).strip() for p in parts if p.strip()]
        chapter_text = "\n".join(paragraphs)
        if not chapter_text:
            return ""
        return chapter_text

    def _parse_font_encrypted(
        self,
        raw_html: str,
        chapter_info: dict[str, Any],
        cid: str,
    ) -> str:
        """
        Steps:
        1. Decode and save randomFont bytes; download fixedFont via download().
        2. Parse CSS rules and save debug JSON.
        3. Render encrypted paragraphs, then run OCR font-mapping.
        4. Extract paragraph texts and format them.
        """
        if not self._decode_font:
            logger.warning(
                "[Parser] chapter_id=%s :: font decryption skipped "
                "(set `decode_font=True` to enable)",
                cid,
            )
            return ""

        css_str = chapter_info.get("css")
        random_font_str = chapter_info.get("randomFont")
        rf = json.loads(random_font_str) if isinstance(random_font_str, str) else None
        rf_data = rf.get("data") if rf else None
        fixed_woff2_url = chapter_info.get("fixedFontWoff2")

        if not css_str:
            logger.warning("[Parser] cid=%s :: css missing or empty", cid)
            return ""
        if not rf_data:
            logger.warning("[Parser] cid=%s :: randomFont.data missing or empty", cid)
            return ""
        if not fixed_woff2_url:
            logger.warning("[Parser] cid=%s :: fixedFontWoff2 missing or empty", cid)
            return ""

        debug_dir = self._debug_dir / "font_debug" / cid
        if self._save_font_debug:
            debug_dir.mkdir(parents=True, exist_ok=True)

        try:
            self._rand_path.parent.mkdir(parents=True, exist_ok=True)
            self._rand_path.write_bytes(bytes(rf_data))
        except Exception as e:
            logger.error(
                "[Parser] cid=%s :: failed to write randomFont.ttf",
                cid,
                exc_info=e,
            )
            return ""

        fixed_path = download(
            url=fixed_woff2_url,
            target_dir=self._fixed_font_dir,
            on_exist="skip",
        )
        if fixed_path is None:
            logger.warning(
                "[Parser] failed to download fixedfont for chapter '%s'", cid
            )
            return ""

        css_rules = self._parse_css_rules(css_str)
        paragraphs_str, refl_list = self._render_visible_text(raw_html, css_rules)
        if self._save_font_debug:
            (debug_dir / f"{cid}_debug.txt").write_text(
                paragraphs_str, encoding="utf-8"
            )

        # Run OCR + fallback mapping
        char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
        refl_set = set(refl_list)
        char_set = char_set - refl_set
        if self._save_font_debug:
            (debug_dir / "char_set_debug.txt").write_text(
                f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}",
                encoding="utf-8",
            )

        mapping_result = self._generate_font_map(
            fixed_font_path=fixed_path,
            random_font_path=self._rand_path,
            char_set=char_set,
            refl_set=refl_set,
            batch_size=self._batch_size,
        )
        if not mapping_result:
            logger.warning(
                "[Parser] font mapping returned empty result for chapter '%s'", cid
            )
            return ""

        if self._save_font_debug:
            (debug_dir / "font_mapping.json").write_text(
                json.dumps(mapping_result, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

        # Reconstruct final readable text
        original_text = self._apply_font_mapping(
            text=paragraphs_str,
            font_map=mapping_result,
        )

        return "\n".join(
            line.strip() for line in original_text.splitlines() if line.strip()
        )

    @staticmethod
    def _find_ssr_page_context(html_str: str) -> dict[str, Any]:
        """
        Extract SSR JSON from <script id="vite-plugin-ssr_pageContext">.
        """
        tree = html.fromstring(html_str)
        script = tree.xpath('//script[@id="vite-plugin-ssr_pageContext"]/text()')
        return json.loads(script[0].strip()) if script else {}

    @staticmethod
    def _extract_chapter_info(ssr_data: dict[str, Any]) -> dict[str, Any]:
        """
        Extract the 'chapterInfo' dictionary from the SSR page context.

        This handles nested key access and returns an empty dict if missing.

        :param ssr_data: The full SSR data object from _find_ssr_page_context().
        :return: A dict with chapter metadata such as chapterName, authorSay, etc.
        """
        page_context = ssr_data.get("pageContext", {})
        page_props = page_context.get("pageProps", {})
        page_data = page_props.get("pageData", {})
        chapter_info = page_data.get("chapterInfo", {})
        return chapter_info if isinstance(chapter_info, dict) else {}

    @classmethod
    def _is_vip(cls, chapter_info: dict[str, Any]) -> bool:
        """
        :return: True if VIP, False otherwise.
        """
        vip_flag = chapter_info.get("vipStatus", 0)
        fens_flag = chapter_info.get("fEnS", 0)
        return bool(vip_flag == 1 and fens_flag != 0)

    @classmethod
    def _can_view_chapter(cls, chapter_info: dict[str, Any]) -> bool:
        """
        A chapter is not viewable if it is marked as VIP
        and has not been purchased.

        :return: True if viewable, False otherwise.
        """
        is_buy = chapter_info.get("isBuy", 0)
        vip_status = chapter_info.get("vipStatus", 0)
        return not (vip_status == 1 and is_buy == 0)

    @classmethod
    def _is_duplicated(cls, chapter_info: dict[str, Any]) -> bool:
        """
        Check if chapter is marked as duplicated (eFW = 1).
        """
        efw_flag = chapter_info.get("eFW", 0)
        return bool(efw_flag == 1)

    @classmethod
    def _is_encrypted(cls, chapter_info: dict[str, Any]) -> bool:
        """
        Return True if content is encrypted.

        Chapter Encryption Status (cES):
        * 0: content is plaintext
        * 2: font-encrypted
        """
        return int(chapter_info.get("cES", 0)) == 2

    def _generate_font_map(
        self,
        fixed_font_path: Path,
        random_font_path: Path,
        char_set: set[str],
        refl_set: set[str],
        batch_size: int = 32,
    ) -> dict[str, str]:
        """
        Build a mapping from scrambled font chars to real chars.

        Uses OCR to decode and generate a mapping from a fixed obfuscated font
        and a random obfuscated font. Results are cached in JSON.

        :param fixed_font_path: fixed font file.
        :param random_font_path: random font file.
        :param char_set: Characters to match directly.
        :param refl_set: Characters to match in flipped form.
        :param batch_size: How many chars to OCR per batch.

        :return: { obf_char: real_char, ... }
        """
        font_ocr = get_font_ocr(self._fontocr_cfg)
        if not font_ocr:
            return {}

        mapping_result: dict[str, str] = {}
        fixed_map_file = self._fixed_map_dir / f"{fixed_font_path.stem}.json"
        fixed_map_file.parent.mkdir(parents=True, exist_ok=True)

        # load existing cache
        try:
            with open(fixed_map_file, encoding="utf-8") as f:
                fixed_map = json.load(f)
            cached_chars = set(fixed_map.keys())
            mapping_result.update(
                {ch: fixed_map[ch] for ch in char_set if ch in fixed_map}
            )
            mapping_result.update(
                {ch: fixed_map[ch] for ch in refl_set if ch in fixed_map}
            )
            char_set = char_set - cached_chars
            refl_set = refl_set - cached_chars
        except Exception:
            fixed_map = {}
            cached_chars = set()

        # prepare font renderers and cmap sets
        fixed_chars = font_ocr.extract_font_charset(fixed_font_path)
        random_chars = font_ocr.extract_font_charset(random_font_path)
        fixed_font = font_ocr.load_render_font(fixed_font_path)
        random_font = font_ocr.load_render_font(random_font_path)

        # process normal and reflected sets together
        rendered = []
        for chars, reflect in [(char_set, False), (refl_set, True)]:
            for ch in chars:
                if ch in fixed_chars:
                    font = fixed_font
                elif ch in random_chars:
                    font = random_font
                else:
                    continue
                rendered.append(
                    (ch, font_ocr.render_char_image_array(ch, font, reflect))
                )

        if rendered:
            # query OCR+vec simultaneously
            imgs_to_query = [img for _, img in rendered]
            fused = font_ocr.predict(imgs_to_query, batch_size=batch_size)

            # pick best per char, apply threshold + cache
            for (ch, _), preds in zip(rendered, fused, strict=False):
                if not preds:
                    continue
                real_char, _ = preds
                mapping_result[ch] = real_char
                fixed_map[ch] = real_char

        # persist updated fixed_map
        try:
            with open(fixed_map_file, "w", encoding="utf-8") as f:
                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error("[FontOCR] Failed to save fixed map: %s", e)

        return mapping_result

    @staticmethod
    def _apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
        """
        Replace each character in `text` using `font_map`,
        leaving unmapped characters unchanged.

        :param text: The input string, possibly containing obfuscated font chars.
        :param font_map: A dict mapping obfuscated chars to real chars.
        :return: The de-obfuscated text.
        """
        return "".join(font_map.get(ch, ch) for ch in text)

    @staticmethod
    def _only_tag(selector: str) -> str | None:
        """
        Normalize a selector into just its tag name for ordering.

        Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.

        Returns None if no tag can be extracted.
        """
        # If it has spaces, take the rightmost simple selector
        last = selector.strip().split()[-1]
        # Drop ::pseudo
        last = last.split("::", 1)[0]
        # If it's like 'span[attr=..]' keep 'span'
        last = last.split("[", 1)[0]
        # If it starts with '.', it's not a tag
        if not last or last.startswith("."):
            return None
        return last

    @staticmethod
    def _parse_decls(block: str) -> list[tuple[str, str]]:
        """
        Parse 'name:value;...' inside a block. Tolerates quotes and attr().
        """
        parts = [d.strip() for d in block.split(";") if d.strip()]
        decls = []
        for p in parts:
            if ":" in p:
                name, val = p.split(":", 1)
                decls.append((name.strip().lower(), val.strip()))
        return decls

    @classmethod
    def _parse_css_rules(cls, css_str: str) -> Rules:
        """
        Produces normalized Rules with:
        * orders: list[str] of tag names sorted by numeric 'order'
        * sy: '.sy-*' class rules
        * p_rules: '.p* <tag>' rules, indexed by p-class then tag
        """
        rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
        order_pairs: list[tuple[str, int]] = []

        pos = 0
        while True:
            b1 = css_str.find("{", pos)
            if b1 == -1:
                break
            selector = css_str[pos:b1].strip().lower()
            b2 = css_str.find("}", b1 + 1)
            if b2 == -1:
                break
            block = css_str[b1 + 1 : b2]
            pos = b2 + 1

            decls = cls._parse_decls(block)
            new_rule: Rule = {}
            order_val: int | None = None

            for name, value in decls:
                v = value.strip()
                if name == "font-size" and v == "0":
                    new_rule[
                        "delete_first" if "::first-letter" in selector else "delete_all"
                    ] = True
                elif name == "transform" and "scalex(-1" in v.replace(" ", "").lower():
                    new_rule["transform_flip_x"] = True
                elif name == "order":
                    with suppress(ValueError):
                        order_val = int(v)
                elif name == "content":
                    if "::after" in selector:
                        if v.lower().startswith("attr("):
                            new_rule["append_end_attr"] = v[5:-1].strip()
                        else:
                            new_rule["append_end_char"] = v.strip().strip("\"'")
                    elif "::before" in selector:
                        if v.lower().startswith("attr("):
                            new_rule["append_start_attr"] = v[5:-1].strip()
                        else:
                            new_rule["append_start_char"] = v.strip().strip("\"'")

            if selector.startswith(".sy-"):
                key = selector.lstrip(".")
                rules["sy"][key] = {**rules["sy"].get(key, {}), **new_rule}
            elif selector.startswith(".p") and " " in selector:
                p_cls, right = selector.split(" ", 1)
                tag = cls._only_tag(right)
                if tag:
                    p_cls = p_cls.lstrip(".")
                    rules["p_rules"].setdefault(p_cls, {})
                    rules["p_rules"][p_cls][tag] = {
                        **rules["p_rules"][p_cls].get(tag, {}),
                        **new_rule,
                    }

            if order_val is not None:
                tag = cls._only_tag(selector)
                if tag:
                    order_pairs.append((tag, order_val))

        rules["orders"] = [t for t, _ in sorted(order_pairs, key=lambda x: x[1])]
        return rules

    @staticmethod
    def _render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
        """
        Render the HTML using pre-parsed Rules.
        """
        tree = html.fromstring(html_str)
        paragraphs_out: list[str] = []
        refl_list: list[str] = []
        orders = rules.get("orders") or []
        p_rules = rules.get("p_rules") or {}
        sy_rules = rules.get("sy") or {}

        def _class_list(el: html.HtmlElement) -> list[str]:
            cls = el.get("class")
            return cls.split() if cls else []

        def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
            if rule.get("delete_all"):
                return ""

            parts: list[str] = []
            if "append_start_char" in rule:
                parts.append(rule["append_start_char"])
            if "append_start_attr" in rule:
                parts.append(el.get(rule["append_start_attr"], ""))

            text = el.text or ""
            if rule.get("delete_first") and text:
                text = text[1:]
            parts.append(text)

            if "append_end_char" in rule:
                parts.append(rule["append_end_char"])
            if "append_end_attr" in rule:
                parts.append(el.get(rule["append_end_attr"], ""))

            s = "".join(parts)

            if rule.get("transform_flip_x") and s:
                refl_list.append(s)

            return s

        for p in tree.findall(".//p"):
            p_classes = _class_list(p)
            p_key = next((c for c in p_classes if c.startswith("p")), None)
            has_ordered_rules = p_key in p_rules

            buf_parts: list[str] = []

            if p.text and not has_ordered_rules:
                buf_parts.append(p.text)

            ordered_cache: dict[str, list[str]] = {}

            for child in p:
                tag = str(child.tag)

                # Handle inline <y class="sy-*"> spans
                if tag == "y" and not has_ordered_rules:
                    y_cls = next(
                        (c for c in _class_list(child) if c.startswith("sy-")), None
                    )
                    if y_cls and y_cls in sy_rules:
                        buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
                    else:
                        buf_parts.append(child.text or "")
                    if child.tail:
                        buf_parts.append(child.tail)
                    continue

                # Handle ordered paragraphs: only cache tags that appear in `orders`
                if p_key and has_ordered_rules and tag in orders:
                    rule = p_rules[p_key].get(tag, {})
                    ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
                    continue

                # Non-ordered, non-<y> nodes: include text + tails as-is
                if not has_ordered_rules:
                    buf_parts.append(child.text or "")
                    if child.tail:
                        buf_parts.append(child.tail)

            # If ordered, flush in global orders with all duplicates preserved
            if has_ordered_rules:
                for tag in orders:
                    if tag in ordered_cache:
                        buf_parts.extend(ordered_cache[tag])

            para = "".join(buf_parts)
            if para:
                paragraphs_out.append(para)

        return "\n".join(paragraphs_out), refl_list
```
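
The SSR helpers (`_find_ssr_page_context`, `_extract_chapter_info`) assume the chapter page serializes its state as JSON inside a `<script id="vite-plugin-ssr_pageContext">` tag. A minimal sketch of the nested shape `_extract_chapter_info` walks, using a trimmed, hypothetical payload (real pages carry many more keys):

```python
from novel_downloader.core.parsers.qidian import QidianParser

# Hypothetical, heavily trimmed ssr_pageContext payload.
ssr_data = {
    "pageContext": {
        "pageProps": {
            "pageData": {
                "chapterInfo": {
                    "chapterName": "第一章",
                    "chapterId": 1001,
                    "content": "<p>...</p>",
                    "vipStatus": 0,
                    "isBuy": 0,
                    "cES": 0,
                }
            }
        }
    }
}

info = QidianParser._extract_chapter_info(ssr_data)
assert info["chapterName"] == "第一章"  # missing keys yield {} instead of raising
```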
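
The viewability and encryption guards combine a handful of integer flags from `chapterInfo`. An illustration of the combinations they check, restricted to keys that appear in the source (`vipStatus`, `isBuy`, `fEnS`, `eFW`, `cES`):

```python
from novel_downloader.core.parsers.qidian import QidianParser

# A VIP chapter that has not been purchased, with font-encrypted content.
info = {"vipStatus": 1, "isBuy": 0, "fEnS": 1, "eFW": 0, "cES": 2}

QidianParser._is_vip(info)            # True:  vipStatus == 1 and fEnS != 0
QidianParser._can_view_chapter(info)  # False: VIP and isBuy == 0
QidianParser._is_duplicated(info)     # False: eFW != 1
QidianParser._is_encrypted(info)      # True:  cES == 2 means font encryption
```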
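
Once `_generate_font_map` has produced its `{obfuscated_char: real_char}` dict, reconstruction is a pure character-for-character lookup:

```python
from novel_downloader.core.parsers.qidian import QidianParser

font_map = {"甲": "床", "乙": "前"}  # hypothetical OCR output
QidianParser._apply_font_mapping("甲乙明月光", font_map)
# -> "床前明月光"; characters without a mapping pass through unchanged
```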
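
Finally, a minimal, self-contained sketch of how `_parse_css_rules` and `_render_visible_text` compose. The CSS fragment and markup below are simplified stand-ins for what the site serves; the real `css` payload is far larger:

```python
from novel_downloader.core.parsers.qidian import QidianParser

css = (
    ".p2 span::after { content: attr(data-s) }"  # char hidden in an attribute
    "em { order: 1 } span { order: 2 }"          # visual order != DOM order
    ".sy-1 { transform: scalex(-1) }"            # glyph rendered mirrored
)
rules = QidianParser._parse_css_rules(css)
# rules == {
#     "orders": ["em", "span"],
#     "sy": {"sy-1": {"transform_flip_x": True}},
#     "p_rules": {"p2": {"span": {"append_end_attr": "data-s"}}},
# }

html_str = (
    '<p class="p2"><span data-s="上">山</span><em>明月</em></p>'
    '<p>今<y class="sy-1">天</y>气</p>'
)
text, flipped = QidianParser._render_visible_text(html_str, rules)
# text    == "明月山上\n今天气"  (<em> flushed before <span>, per `orders`)
# flipped == ["天"]              (mirrored glyphs are OCR'd in flipped form)
```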