novel-downloader 2.0.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +3 -3
- novel_downloader/cli/export.py +1 -1
- novel_downloader/cli/ui.py +7 -7
- novel_downloader/config/adapter.py +191 -154
- novel_downloader/core/__init__.py +5 -6
- novel_downloader/core/exporters/common/txt.py +9 -9
- novel_downloader/core/exporters/linovelib/txt.py +9 -9
- novel_downloader/core/fetchers/qidian.py +20 -35
- novel_downloader/core/interfaces/fetcher.py +2 -2
- novel_downloader/core/interfaces/parser.py +2 -2
- novel_downloader/core/parsers/base.py +1 -0
- novel_downloader/core/parsers/eightnovel.py +2 -2
- novel_downloader/core/parsers/esjzone.py +3 -3
- novel_downloader/core/parsers/qidian/main_parser.py +747 -12
- novel_downloader/core/parsers/qidian/utils/__init__.py +2 -21
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +4 -4
- novel_downloader/core/parsers/xiguashuwu.py +6 -12
- novel_downloader/locales/en.json +3 -3
- novel_downloader/locales/zh.json +3 -3
- novel_downloader/utils/__init__.py +0 -2
- novel_downloader/utils/chapter_storage.py +2 -3
- novel_downloader/utils/constants.py +1 -3
- novel_downloader/utils/cookies.py +32 -17
- novel_downloader/utils/crypto_utils/__init__.py +0 -6
- novel_downloader/utils/crypto_utils/rc4.py +40 -50
- novel_downloader/utils/epub/__init__.py +2 -3
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +5 -5
- novel_downloader/utils/epub/documents.py +7 -7
- novel_downloader/utils/epub/models.py +8 -8
- novel_downloader/utils/epub/utils.py +10 -10
- novel_downloader/utils/file_utils/io.py +48 -73
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -11
- novel_downloader/utils/fontocr/__init__.py +13 -0
- novel_downloader/utils/{fontocr.py → fontocr/core.py} +70 -61
- novel_downloader/utils/fontocr/loader.py +50 -0
- novel_downloader/utils/logger.py +80 -56
- novel_downloader/utils/network.py +16 -40
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/sleep_utils.py +53 -43
- novel_downloader/web/main.py +1 -1
- novel_downloader/web/pages/search.py +3 -3
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/METADATA +2 -1
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/RECORD +51 -55
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.1.dist-info}/top_level.txt +0 -0
@@ -1,143 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
novel_downloader.core.parsers.qidian.utils.fontmap_recover
|
4
|
-
----------------------------------------------------------
|
5
|
-
|
6
|
-
Tools for generating and applying font character mappings
|
7
|
-
to recover obfuscated Qidian text.
|
8
|
-
"""
|
9
|
-
|
10
|
-
__all__ = [
|
11
|
-
"generate_font_map",
|
12
|
-
"apply_font_mapping",
|
13
|
-
]
|
14
|
-
|
15
|
-
import json
|
16
|
-
import logging
|
17
|
-
from pathlib import Path
|
18
|
-
|
19
|
-
import numpy as np
|
20
|
-
from fontTools.ttLib import TTFont
|
21
|
-
from PIL import ImageFont
|
22
|
-
|
23
|
-
logger = logging.getLogger(__name__)
|
24
|
-
CHAR_FONT_SIZE = 52
|
25
|
-
|
26
|
-
|
27
|
-
def _read_cmap_chars(font_path: Path) -> set[str]:
    """Return the characters covered by the font's best cmap, closing the file."""
    ttf = TTFont(font_path)
    try:
        return {chr(c) for c in ttf.getBestCmap()}
    finally:
        # TTFont keeps the underlying file open until closed explicitly.
        ttf.close()


def generate_font_map(
    fixed_font_path: Path,
    random_font_path: Path,
    char_set: set[str],
    refl_set: set[str],
    cache_dir: Path,
    batch_size: int = 32,
) -> dict[str, str]:
    """
    Build a mapping from scrambled font chars to real chars.

    Uses OCR to compare rendered glyphs from a known (fixed) font and an
    obfuscated (random) font. Results for the fixed font are cached in JSON
    so repeated runs are faster.

    :param fixed_font_path: fixed font file.
    :param random_font_path: random font file.
    :param char_set: Characters to match directly.
    :param refl_set: Characters to match in flipped form.
    :param cache_dir: Directory to save/load cached results.
    :param batch_size: How many chars to OCR per batch.

    :return: { obf_char: real_char, ... }; empty when FontOCR cannot be
             imported, and limited to cached entries when the fonts fail
             to load.
    """
    try:
        from novel_downloader.utils.fontocr import get_font_ocr

        font_ocr = get_font_ocr(batch_size=batch_size)
    except ImportError:
        logger.warning("[QidianParser] FontOCR not available, font decoding will skip")
        return {}

    mapping_result: dict[str, str] = {}
    fixed_map_file = cache_dir / "fixed_font_map" / f"{Path(fixed_font_path).stem}.json"
    fixed_map_file.parent.mkdir(parents=True, exist_ok=True)

    # load existing cache
    fixed_map: dict[str, str] = {}
    try:
        with open(fixed_map_file, encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # First run for this fixed font: nothing cached yet.
        pass
    except Exception as e:
        # Best effort: a corrupt cache must not abort decoding, but say so.
        logger.warning(
            "[FontOCR] Ignoring unreadable fixed map %s: %s", fixed_map_file, e
        )
    else:
        if isinstance(loaded, dict):
            fixed_map = loaded
        else:
            logger.warning(
                "[FontOCR] Ignoring fixed map %s: expected a JSON object",
                fixed_map_file,
            )

    cached_chars = set(fixed_map)
    requested = set(char_set) | set(refl_set)
    mapping_result.update({ch: fixed_map[ch] for ch in requested & cached_chars})
    char_set = set(char_set) - cached_chars
    refl_set = set(refl_set) - cached_chars

    # prepare font renderers and cmap sets
    try:
        fixed_chars = _read_cmap_chars(fixed_font_path)
        fixed_font = ImageFont.truetype(str(fixed_font_path), CHAR_FONT_SIZE)

        random_chars = _read_cmap_chars(random_font_path)
        random_font = ImageFont.truetype(str(random_font_path), CHAR_FONT_SIZE)
    except Exception as e:
        logger.error("[FontOCR] Failed to load TTF fonts: %s", e)
        return mapping_result

    def _render_batch(chars: list[tuple[str, bool]]) -> list[tuple[str, np.ndarray]]:
        out = []
        for ch, reflect in chars:
            # The fixed font takes priority when both fonts cover a char.
            if ch in fixed_chars:
                font = fixed_font
            elif ch in random_chars:
                font = random_font
            else:
                continue
            img = font_ocr.render_char_image_array(ch, font, reflect)
            if img is not None:
                out.append((ch, img))
        return out

    cache_dirty = False

    # process normal and reflected sets together
    for chars, reflect in [(list(char_set), False), (list(refl_set), True)]:
        for batch_chars in font_ocr._chunked(chars, font_ocr._batch_size):
            # render all images in this batch
            rendered = _render_batch([(ch, reflect) for ch in batch_chars])
            if not rendered:
                continue

            # query OCR+vec simultaneously
            fused = font_ocr.predict([img for _, img in rendered], top_k=1)

            # pick best per char and record it
            for (ch, _), preds in zip(rendered, fused, strict=False):
                if not preds:
                    continue
                real_char, _ = preds[0]
                mapping_result[ch] = real_char
                # The cache file is keyed by the fixed font alone, so only
                # glyphs rendered from the fixed font may be stored in it;
                # random-font results would be reused for other random fonts.
                if ch in fixed_chars:
                    fixed_map[ch] = real_char
                    cache_dirty = True

    # persist updated fixed_map (skipped when nothing new was learned)
    if cache_dirty:
        try:
            with open(fixed_map_file, "w", encoding="utf-8") as f:
                json.dump(fixed_map, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error("[FontOCR] Failed to save fixed map: %s", e)

    return mapping_result
|
132
|
-
|
133
|
-
|
134
|
-
def apply_font_mapping(text: str, font_map: dict[str, str]) -> str:
    """
    Replace each character in `text` using `font_map`,
    leaving unmapped characters unchanged.

    :param text: The input string, possibly containing obfuscated font chars.
    :param font_map: A dict mapping obfuscated chars to real chars.
    :return: The de-obfuscated text.
    """
    if not text or not font_map:
        # Nothing to substitute: skip rebuilding the string char by char.
        return text
    lookup = font_map.get
    return "".join(lookup(ch, ch) for ch in text)
|
@@ -1,110 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
novel_downloader.core.parsers.qidian.utils.helpers
|
4
|
-
--------------------------------------------------
|
5
|
-
|
6
|
-
Shared utility functions for parsing Qidian pages.
|
7
|
-
"""
|
8
|
-
|
9
|
-
import json
|
10
|
-
import logging
|
11
|
-
from typing import Any
|
12
|
-
|
13
|
-
from lxml import html
|
14
|
-
|
15
|
-
logger = logging.getLogger(__name__)
|
16
|
-
|
17
|
-
|
18
|
-
def find_ssr_page_context(html_str: str) -> dict[str, Any]:
    """
    Extract SSR JSON from <script id="vite-plugin-ssr_pageContext">.

    :param html_str: Raw HTML string of the page.
    :return: The decoded JSON object, or an empty dict when the input is
             blank, the script tag is absent, parsing fails, or the payload
             is not a JSON object.
    """
    if not html_str or not html_str.strip():
        # lxml raises on an empty document; a blank page just has no context,
        # so return quietly instead of logging a parse warning.
        return {}

    try:
        tree = html.fromstring(html_str)
        script = tree.xpath('//script[@id="vite-plugin-ssr_pageContext"]/text()')
        if not script:
            return {}
        data = json.loads(script[0].strip())
    except Exception as e:
        # Best effort: any parser or JSON failure degrades to "no context".
        logger.warning("[Parser] SSR JSON parse error: %s", e)
        return {}

    if not isinstance(data, dict):
        # Honour the declared return type so callers can use .get() safely.
        logger.warning(
            "[Parser] SSR JSON is not an object: %s", type(data).__name__
        )
        return {}
    return data
|
31
|
-
|
32
|
-
|
33
|
-
def extract_chapter_info(ssr_data: dict[str, Any]) -> dict[str, Any]:
    """
    Extract the 'chapterInfo' dictionary from the SSR page context.

    Walks pageContext -> pageProps -> pageData -> chapterInfo and returns an
    empty dict if any level is missing or is not a dict.

    :param ssr_data: The full SSR data object from find_ssr_page_context().
    :return: The 'chapterInfo' dict, or {} when it cannot be reached.
    """
    node: Any = ssr_data
    for key in ("pageContext", "pageProps", "pageData", "chapterInfo"):
        # Explicit type check instead of `assert`, which is stripped under -O.
        if not isinstance(node, dict):
            return {}
        node = node.get(key, {})
    return node if isinstance(node, dict) else {}
|
52
|
-
|
53
|
-
|
54
|
-
def is_restricted_page(html_str: str) -> bool:
    """
    Tell whether the page text carries one of the known access-restriction
    notices (e.g. chapter not subscribed/purchased).

    :param html_str: Raw HTML string.
    :return: True as soon as any notice is found in the page, else False.
    """
    for notice in ("这是VIP章节", "需要订阅", "订阅后才能阅读"):
        if notice in html_str:
            return True
    return False
|
63
|
-
|
64
|
-
|
65
|
-
def vip_status(ssr_data: dict[str, Any]) -> bool:
    """
    Report whether the chapter counts as VIP: 'vipStatus' equals 1 and
    'fEnS' is non-zero (both default to 0 when absent).

    :param ssr_data: SSR page context to read 'chapterInfo' from.
    :return: True if VIP, False otherwise.
    """
    info = extract_chapter_info(ssr_data)
    is_vip = info.get("vipStatus", 0) == 1
    has_fens = info.get("fEnS", 0) != 0
    return bool(is_vip and has_fens)
|
73
|
-
|
74
|
-
|
75
|
-
def can_view_chapter(ssr_data: dict[str, Any]) -> bool:
    """
    Decide whether the chapter body is viewable.

    The only blocked case is a chapter whose 'vipStatus' is 1 while its
    'isBuy' is 0 (both default to 0 when absent).

    :param ssr_data: SSR page context to read 'chapterInfo' from.
    :return: True if viewable, False otherwise.
    """
    info = extract_chapter_info(ssr_data)
    purchased_flag = info.get("isBuy", 0)
    vip_flag = info.get("vipStatus", 0)
    blocked = vip_flag == 1 and purchased_flag == 0
    return not blocked
|
86
|
-
|
87
|
-
|
88
|
-
def is_duplicated(ssr_data: dict[str, Any]) -> bool:
    """
    Report whether 'chapterInfo' flags the chapter as duplicated (eFW == 1).

    :param ssr_data: SSR page context to read 'chapterInfo' from.
    :return: True when the 'eFW' flag equals 1, else False.
    """
    flag = extract_chapter_info(ssr_data).get("eFW", 0)
    return bool(flag == 1)
|
95
|
-
|
96
|
-
|
97
|
-
def is_encrypted(content: str | dict[str, Any]) -> bool:
    """
    Return True if content is encrypted.

    Chapter Encryption Status (cES):
      - 0: content is plaintext
      - 2: font encryption

    :param content: Either the raw HTML string (its SSR page context is
                    extracted first) or an already-extracted SSR data dict.
    :return: True if 'cES' equals 2, else False (including when 'cES' is
             missing, null, or not convertible to an integer).
    """
    ssr_data = find_ssr_page_context(content) if isinstance(content, str) else content
    flag = extract_chapter_info(ssr_data).get("cES", 0)
    try:
        return int(flag) == 2
    except (TypeError, ValueError):
        # A null or non-numeric flag is treated as "not encrypted" rather
        # than raising out of a simple predicate.
        return False
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|