novel-downloader 2.0.0__py3-none-any.whl → 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/download.py +14 -11
- novel_downloader/cli/export.py +19 -19
- novel_downloader/cli/ui.py +35 -8
- novel_downloader/config/adapter.py +216 -153
- novel_downloader/core/__init__.py +5 -6
- novel_downloader/core/archived/deqixs/fetcher.py +1 -28
- novel_downloader/core/downloaders/__init__.py +2 -0
- novel_downloader/core/downloaders/base.py +34 -85
- novel_downloader/core/downloaders/common.py +147 -171
- novel_downloader/core/downloaders/qianbi.py +30 -64
- novel_downloader/core/downloaders/qidian.py +157 -184
- novel_downloader/core/downloaders/qqbook.py +292 -0
- novel_downloader/core/downloaders/registry.py +2 -2
- novel_downloader/core/exporters/__init__.py +2 -0
- novel_downloader/core/exporters/base.py +37 -59
- novel_downloader/core/exporters/common.py +620 -0
- novel_downloader/core/exporters/linovelib.py +47 -0
- novel_downloader/core/exporters/qidian.py +41 -12
- novel_downloader/core/exporters/qqbook.py +28 -0
- novel_downloader/core/exporters/registry.py +2 -2
- novel_downloader/core/fetchers/__init__.py +4 -2
- novel_downloader/core/fetchers/aaatxt.py +2 -22
- novel_downloader/core/fetchers/b520.py +3 -23
- novel_downloader/core/fetchers/base.py +80 -105
- novel_downloader/core/fetchers/biquyuedu.py +2 -22
- novel_downloader/core/fetchers/dxmwx.py +10 -22
- novel_downloader/core/fetchers/esjzone.py +6 -29
- novel_downloader/core/fetchers/guidaye.py +2 -22
- novel_downloader/core/fetchers/hetushu.py +9 -29
- novel_downloader/core/fetchers/i25zw.py +2 -16
- novel_downloader/core/fetchers/ixdzs8.py +2 -16
- novel_downloader/core/fetchers/jpxs123.py +2 -16
- novel_downloader/core/fetchers/lewenn.py +2 -22
- novel_downloader/core/fetchers/linovelib.py +4 -20
- novel_downloader/core/fetchers/{eightnovel.py → n8novel.py} +12 -40
- novel_downloader/core/fetchers/piaotia.py +2 -16
- novel_downloader/core/fetchers/qbtr.py +2 -16
- novel_downloader/core/fetchers/qianbi.py +1 -20
- novel_downloader/core/fetchers/qidian.py +27 -68
- novel_downloader/core/fetchers/qqbook.py +177 -0
- novel_downloader/core/fetchers/quanben5.py +9 -29
- novel_downloader/core/fetchers/rate_limiter.py +22 -53
- novel_downloader/core/fetchers/sfacg.py +3 -16
- novel_downloader/core/fetchers/shencou.py +2 -16
- novel_downloader/core/fetchers/shuhaige.py +2 -22
- novel_downloader/core/fetchers/tongrenquan.py +2 -22
- novel_downloader/core/fetchers/ttkan.py +3 -14
- novel_downloader/core/fetchers/wanbengo.py +2 -22
- novel_downloader/core/fetchers/xiaoshuowu.py +2 -16
- novel_downloader/core/fetchers/xiguashuwu.py +4 -20
- novel_downloader/core/fetchers/xs63b.py +3 -15
- novel_downloader/core/fetchers/xshbook.py +2 -22
- novel_downloader/core/fetchers/yamibo.py +4 -28
- novel_downloader/core/fetchers/yibige.py +13 -26
- novel_downloader/core/interfaces/exporter.py +19 -7
- novel_downloader/core/interfaces/fetcher.py +23 -49
- novel_downloader/core/interfaces/parser.py +2 -2
- novel_downloader/core/parsers/__init__.py +4 -2
- novel_downloader/core/parsers/b520.py +2 -2
- novel_downloader/core/parsers/base.py +5 -39
- novel_downloader/core/parsers/esjzone.py +3 -3
- novel_downloader/core/parsers/{eightnovel.py → n8novel.py} +7 -7
- novel_downloader/core/parsers/qidian.py +717 -0
- novel_downloader/core/parsers/qqbook.py +709 -0
- novel_downloader/core/parsers/xiguashuwu.py +8 -15
- novel_downloader/core/searchers/__init__.py +2 -2
- novel_downloader/core/searchers/b520.py +1 -1
- novel_downloader/core/searchers/base.py +2 -2
- novel_downloader/core/searchers/{eightnovel.py → n8novel.py} +5 -5
- novel_downloader/locales/en.json +3 -3
- novel_downloader/locales/zh.json +3 -3
- novel_downloader/models/__init__.py +2 -0
- novel_downloader/models/book.py +1 -0
- novel_downloader/models/config.py +12 -0
- novel_downloader/resources/config/settings.toml +23 -5
- novel_downloader/resources/js_scripts/expr_to_json.js +14 -0
- novel_downloader/resources/js_scripts/qidian_decrypt_node.js +21 -16
- novel_downloader/resources/js_scripts/qq_decrypt_node.js +92 -0
- novel_downloader/utils/__init__.py +0 -2
- novel_downloader/utils/chapter_storage.py +2 -3
- novel_downloader/utils/constants.py +7 -3
- novel_downloader/utils/cookies.py +32 -17
- novel_downloader/utils/crypto_utils/__init__.py +0 -6
- novel_downloader/utils/crypto_utils/aes_util.py +1 -1
- novel_downloader/utils/crypto_utils/rc4.py +40 -50
- novel_downloader/utils/epub/__init__.py +2 -3
- novel_downloader/utils/epub/builder.py +6 -6
- novel_downloader/utils/epub/constants.py +1 -6
- novel_downloader/utils/epub/documents.py +7 -7
- novel_downloader/utils/epub/models.py +8 -8
- novel_downloader/utils/epub/utils.py +10 -10
- novel_downloader/utils/file_utils/io.py +48 -73
- novel_downloader/utils/file_utils/normalize.py +1 -7
- novel_downloader/utils/file_utils/sanitize.py +4 -11
- novel_downloader/utils/fontocr/__init__.py +13 -0
- novel_downloader/utils/{fontocr.py → fontocr/core.py} +72 -61
- novel_downloader/utils/fontocr/loader.py +52 -0
- novel_downloader/utils/logger.py +80 -56
- novel_downloader/utils/network.py +16 -40
- novel_downloader/utils/node_decryptor/__init__.py +13 -0
- novel_downloader/utils/node_decryptor/decryptor.py +342 -0
- novel_downloader/{core/parsers/qidian/utils → utils/node_decryptor}/decryptor_fetcher.py +5 -6
- novel_downloader/utils/text_utils/text_cleaner.py +39 -30
- novel_downloader/utils/text_utils/truncate_utils.py +3 -14
- novel_downloader/utils/time_utils/sleep_utils.py +53 -43
- novel_downloader/web/main.py +1 -1
- novel_downloader/web/pages/download.py +1 -1
- novel_downloader/web/pages/search.py +4 -4
- novel_downloader/web/services/task_manager.py +2 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/METADATA +5 -1
- novel_downloader-2.0.2.dist-info/RECORD +203 -0
- novel_downloader/core/exporters/common/__init__.py +0 -11
- novel_downloader/core/exporters/common/epub.py +0 -198
- novel_downloader/core/exporters/common/main_exporter.py +0 -64
- novel_downloader/core/exporters/common/txt.py +0 -146
- novel_downloader/core/exporters/epub_util.py +0 -215
- novel_downloader/core/exporters/linovelib/__init__.py +0 -11
- novel_downloader/core/exporters/linovelib/epub.py +0 -349
- novel_downloader/core/exporters/linovelib/main_exporter.py +0 -66
- novel_downloader/core/exporters/linovelib/txt.py +0 -139
- novel_downloader/core/exporters/txt_util.py +0 -67
- novel_downloader/core/parsers/qidian/__init__.py +0 -10
- novel_downloader/core/parsers/qidian/book_info_parser.py +0 -89
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +0 -470
- novel_downloader/core/parsers/qidian/chapter_normal.py +0 -126
- novel_downloader/core/parsers/qidian/chapter_router.py +0 -68
- novel_downloader/core/parsers/qidian/main_parser.py +0 -101
- novel_downloader/core/parsers/qidian/utils/__init__.py +0 -30
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +0 -143
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -110
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +0 -175
- novel_downloader-2.0.0.dist-info/RECORD +0 -210
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/WHEEL +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/entry_points.txt +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-2.0.0.dist-info → novel_downloader-2.0.2.dist-info}/top_level.txt +0 -0
@@ -1,89 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
novel_downloader.core.parsers.qidian.book_info_parser
|
4
|
-
-----------------------------------------------------
|
5
|
-
|
6
|
-
This module provides parsing of Qidian book info pages.
|
7
|
-
|
8
|
-
It extracts metadata such as title, author, cover URL, update
|
9
|
-
time, status, word count, summary, and volume-chapter structure.
|
10
|
-
"""
|
11
|
-
|
12
|
-
import logging
|
13
|
-
import re
|
14
|
-
from datetime import datetime
|
15
|
-
|
16
|
-
from lxml import html
|
17
|
-
|
18
|
-
from novel_downloader.models import BookInfoDict, ChapterInfoDict, VolumeInfoDict
|
19
|
-
|
20
|
-
logger = logging.getLogger(__name__)
|
21
|
-
|
22
|
-
|
23
|
-
def _chapter_url_to_id(url: str) -> str:
    """
    Return the last path segment of a chapter URL.

    Trailing slashes are removed first, so ``".../123/456/"`` yields
    ``"456"``. A value without any ``/`` is returned as-is.

    :param url: Chapter URL or path.
    :return: The text after the final ``/`` of the trimmed URL.
    """
    trimmed = url.rstrip("/")
    _, _, last_segment = trimmed.rpartition("/")
    return last_segment
|
25
|
-
|
26
|
-
|
27
|
-
def parse_book_info(html_str: str) -> BookInfoDict | None:
    """
    Parse a Qidian book info page into a metadata dict.

    Extracts title, author, cover URL, update time, serial status, tags,
    word count, brief and full summaries, and the volume/chapter catalog.

    :param html_str: Raw HTML of the book info page.
    :return: A dict containing book metadata, or ``None`` when the page
             carries no book id (``a#bookImg/@data-bid``).
    """
    doc = html.fromstring(html_str)

    # The cover URL is built from the book id. A page without one is
    # reported as "no result" instead of raising IndexError.
    book_ids = doc.xpath('//a[@id="bookImg"]/@data-bid')
    if not book_ids:
        logger.warning("[Parser] book id not found in book info page")
        return None
    book_id = book_ids[0]
    cover_url = f"https://bookcover.yuewen.com/qdbimg/349573/{book_id}/600.webp"

    book_name = doc.xpath('string(//h1[@id="bookName"])').strip()

    author = doc.xpath('string(//a[@class="writer-name"])').strip()

    # Keep the page's timestamp only when it has the expected format;
    # otherwise fall back to the current local time.
    ut = doc.xpath('string(//span[@class="update-time"])')
    ut = ut.replace("更新时间:", "").strip()
    if re.match(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", ut):
        update_time = ut
    else:
        update_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    serial_status = doc.xpath('string(//p[@class="book-attribute"]/span[1])').strip()

    tags_elem = doc.xpath('//p[contains(@class,"all-label")]//a/text()')
    tags = [t.strip() for t in tags_elem if t.strip()]

    word_count = doc.xpath('string(//p[@class="count"]/em[1])').strip()

    summary_brief = doc.xpath('string(//p[@class="intro"])').strip()

    raw = doc.xpath('//p[@id="book-intro-detail"]//text()')
    summary = "\n".join(line.strip() for line in raw if line.strip())

    volumes: list[VolumeInfoDict] = []
    for vol in doc.xpath('//div[@id="allCatalog"]//div[@class="catalog-volume"]'):
        vol_name = vol.xpath('string(.//h3[@class="volume-name"])').strip()
        # Drop everything after the first middle dot (U+00B7) in the heading.
        vol_name = vol_name.split(chr(183))[0].strip()
        chapters: list[ChapterInfoDict] = []
        for li in vol.xpath('.//ul[contains(@class,"volume-chapters")]/li'):
            anchors = li.xpath('.//a[@class="chapter-name"]')
            if not anchors:
                # List item without a chapter link: nothing to record.
                continue
            a = anchors[0]
            url = a.get("href")
            if not url:
                # Without an href there is no URL and no chapter id.
                continue
            # ``a.text`` is None when the anchor has no direct text node.
            title = (a.text or "").strip()
            chapters.append(
                {"title": title, "url": url, "chapterId": _chapter_url_to_id(url)}
            )
        volumes.append({"volume_name": vol_name, "chapters": chapters})

    return {
        "book_name": book_name,
        "author": author,
        "cover_url": cover_url,
        "update_time": update_time,
        "word_count": word_count,
        "serial_status": serial_status,
        "tags": tags,
        "summary_brief": summary_brief,
        "summary": summary,
        "volumes": volumes,
        "extra": {},
    }
|
@@ -1,470 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
novel_downloader.core.parsers.qidian.chapter_encrypted
|
4
|
-
------------------------------------------------------
|
5
|
-
|
6
|
-
Support for parsing encrypted chapters from Qidian using font OCR mapping,
|
7
|
-
CSS rules, and custom rendering logic.
|
8
|
-
"""
|
9
|
-
|
10
|
-
from __future__ import annotations
|
11
|
-
|
12
|
-
import json
|
13
|
-
import logging
|
14
|
-
import re
|
15
|
-
from contextlib import suppress
|
16
|
-
from typing import TYPE_CHECKING, TypedDict
|
17
|
-
|
18
|
-
from lxml import html
|
19
|
-
|
20
|
-
from novel_downloader.models import ChapterDict
|
21
|
-
from novel_downloader.utils import (
|
22
|
-
download,
|
23
|
-
truncate_half_lines,
|
24
|
-
)
|
25
|
-
|
26
|
-
from .utils import (
|
27
|
-
extract_chapter_info,
|
28
|
-
find_ssr_page_context,
|
29
|
-
get_decryptor,
|
30
|
-
is_duplicated,
|
31
|
-
vip_status,
|
32
|
-
)
|
33
|
-
from .utils.fontmap_recover import (
|
34
|
-
apply_font_mapping,
|
35
|
-
generate_font_map,
|
36
|
-
)
|
37
|
-
|
38
|
-
if TYPE_CHECKING:
|
39
|
-
from .main_parser import QidianParser
|
40
|
-
|
41
|
-
logger = logging.getLogger(__name__)
|
42
|
-
_RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
|
43
|
-
_RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
|
44
|
-
|
45
|
-
|
46
|
-
class Rule(TypedDict, total=False):
    """
    Rendering directives collected for one CSS selector.

    All keys are optional; a key is present only when the matching
    declaration was found in the stylesheet.
    """

    # Drop the element's text entirely.
    delete_all: bool
    # Drop only the first character of the element's text.
    delete_first: bool
    # The element is drawn mirrored on the X axis.
    transform_flip_x: bool
    # Literal text placed before / after the element's own text.
    append_start_char: str
    append_end_char: str
    # Name of the attribute whose value is placed before / after the text.
    append_start_attr: str
    append_end_attr: str
|
54
|
-
|
55
|
-
|
56
|
-
class Rules(TypedDict):
    """Normalized view of a chapter stylesheet."""

    # Tag names in display order, e.g. ["i", "em", "span"].
    orders: list[str]
    # Rules keyed by class name, e.g. sy["sy-3"] -> Rule.
    sy: dict[str, Rule]
    # Rules keyed by paragraph class, then tag, e.g. p_rules["p3"]["i"] -> Rule.
    p_rules: dict[str, dict[str, Rule]]
|
63
|
-
|
64
|
-
|
65
|
-
def parse_encrypted_chapter(
    parser: QidianParser,
    html_str: str,
    chapter_id: str,
) -> ChapterDict | None:
    """
    Decode a font-obfuscated Qidian chapter into plain text.

    Steps:
      1. Load the SSR page context and pull the chapter info out of it.
      2. Write the embedded randomFont bytes to the cache directory and
         download the fixedFont file.
      3. Decrypt the content first when the chapter is VIP.
      4. Parse the chapter CSS and render the visible paragraph text.
      5. Generate the font mapping for the characters that were rendered.
      6. Apply the mapping, drop blank lines, and optionally truncate
         duplicated content.

    When ``parser.save_font_debug`` is set, intermediate results are
    written below ``<debug_dir>/font_debug/qidian/<chapter_id>``.

    :param parser: Parser instance providing configuration and paths.
    :param html_str: Raw HTML content of the chapter page.
    :param chapter_id: Chapter identifier used for logging and debug paths.
    :return: The chapter dict, or ``None`` when font decoding is disabled,
             the chapter info is missing, or any step fails.
    """
    try:
        if not parser._decode_font:
            return None

        ssr_data = find_ssr_page_context(html_str)
        info = extract_chapter_info(ssr_data)
        if not info:
            logger.warning(
                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
            )
            return None

        debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
        if parser.save_font_debug:
            debug_dir.mkdir(parents=True, exist_ok=True)

        # Mandatory fields: a missing key aborts through the outer handler.
        css_text = info["css"]
        random_font_json = info["randomFont"]
        fixed_font_url = info["fixedFontWoff2"]

        # Optional metadata with defaults.
        title = info.get("chapterName", "Untitled")
        duplicated = is_duplicated(ssr_data)
        content_html = info.get("content", "")
        chapter_id = info.get("chapterId", chapter_id)
        fkp = info.get("fkp", "")
        author_say = info.get("authorSay", "")
        update_time = info.get("updateTime", "")
        update_timestamp = info.get("updateTimestamp", 0)
        modify_time = info.get("modifyTime", 0)
        word_count = info.get("actualWords", 0)
        seq = info.get("seq", None)
        volume = info.get("extra", {}).get("volumeName", "")

        # Persist the per-chapter font so it can be read from disk later.
        random_font = json.loads(random_font_json)
        random_font_path = parser._base_cache_dir / "randomFont.ttf"
        random_font_path.parent.mkdir(parents=True, exist_ok=True)
        random_font_path.write_bytes(bytes(random_font["data"]))

        fixed_font_path = download(
            url=fixed_font_url,
            target_dir=parser._fixed_font_dir,
            stream=True,
        )
        if fixed_font_path is None:
            raise ValueError("fixed_path is None: failed to download font")

        # VIP content has to be decrypted before it can be rendered.
        if vip_status(ssr_data):
            try:
                content_html = get_decryptor().decrypt(
                    content_html,
                    chapter_id,
                    fkp,
                    parser._fuid,
                )
            except Exception as exc:
                logger.error(
                    "[Parser] decryption failed for '%s': %s", chapter_id, exc
                )
                return None

        rendered, refl_list = render_visible_text(
            content_html, parse_css_rules(css_text)
        )
        if parser.save_font_debug:
            (debug_dir / f"{chapter_id}_debug.txt").write_text(
                rendered, encoding="utf-8"
            )

        # Characters to map: everything rendered except whitespace, minus
        # the entries that were rendered mirrored.
        visible_chars = {c for c in rendered if c not in {" ", "\n", "\u3000"}}
        mirrored = set(refl_list)
        visible_chars = visible_chars - mirrored
        if parser.save_font_debug:
            (debug_dir / "char_set_debug.txt").write_text(
                f"char_set:\n{visible_chars}\n\nrefl_set:\n{mirrored}",
                encoding="utf-8",
            )

        font_map = generate_font_map(
            fixed_font_path=fixed_font_path,
            random_font_path=random_font_path,
            char_set=visible_chars,
            refl_set=mirrored,
            cache_dir=parser._base_cache_dir,
            batch_size=parser._config.batch_size,
        )
        if not font_map:
            return None

        if parser.save_font_debug:
            (debug_dir / "font_mapping.json").write_text(
                json.dumps(font_map, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

        # Rebuild readable text and discard blank lines.
        decoded = apply_font_mapping(text=rendered, font_map=font_map)
        stripped_lines = (line.strip() for line in decoded.splitlines())
        content = "\n".join(line for line in stripped_lines if line)
        if parser._use_truncation and duplicated:
            content = truncate_half_lines(content)

        extra = {
            "author_say": author_say.strip() if author_say else "",
            "updated_at": update_time,
            "update_timestamp": update_timestamp,
            "modify_time": modify_time,
            "word_count": word_count,
            "duplicated": duplicated,
            "seq": seq,
            "volume": volume,
            "encrypted": True,
        }
        return {
            "id": str(chapter_id),
            "title": str(title),
            "content": content,
            "extra": extra,
        }

    except Exception as exc:
        logger.warning(
            "[Parser] parse error for encrypted chapter '%s': %s", chapter_id, exc
        )
        return None
|
214
|
-
|
215
|
-
|
216
|
-
def _only_tag(selector: str) -> str | None:
|
217
|
-
"""
|
218
|
-
Normalize a selector into just its tag name for ordering.
|
219
|
-
|
220
|
-
Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
|
221
|
-
|
222
|
-
Returns None if can't extract a tag.
|
223
|
-
"""
|
224
|
-
sel = selector.strip()
|
225
|
-
# If it has spaces, take the rightmost simple selector
|
226
|
-
last = sel.split()[-1]
|
227
|
-
# Drop ::pseudo
|
228
|
-
last = last.split("::", 1)[0]
|
229
|
-
# If it's like 'span[attr=..]' keep 'span'
|
230
|
-
last = last.split("[", 1)[0]
|
231
|
-
# If it starts with '.', it's not a tag
|
232
|
-
if not last or last.startswith("."):
|
233
|
-
return None
|
234
|
-
return last
|
235
|
-
|
236
|
-
|
237
|
-
def _parse_decls(block: str) -> list[tuple[str, str]]:
    """
    Split the inside of a CSS rule block into (property, value) pairs.

    Declarations are separated by ';' and the first ':' outside quotes
    separates the property name from its value. Text inside single or
    double quotes is copied verbatim, including backslash escapes.
    Property names are lower-cased; both parts are stripped, and
    declarations with an empty name are discarded.

    :param block: Text between '{' and '}' of one rule.
    :return: Declarations in source order.
    """
    pairs: list[tuple[str, str]] = []
    name_buf: list[str] = []
    value_buf: list[str] = []
    target = name_buf  # buffer currently receiving characters
    open_quote: str | None = None

    def _flush() -> None:
        prop = "".join(name_buf).strip().lower()
        value = "".join(value_buf).strip()
        if prop:
            pairs.append((prop, value))
        name_buf.clear()
        value_buf.clear()

    chars = iter(block)
    for ch in chars:
        if open_quote is not None:
            target.append(ch)
            if ch == "\\":
                # Keep the escaped character and do not interpret it.
                escaped = next(chars, None)
                if escaped is not None:
                    target.append(escaped)
            elif ch == open_quote:
                open_quote = None
        elif ch in ("'", '"'):
            target.append(ch)
            open_quote = ch
        elif ch == ":" and target is name_buf:
            target = value_buf
        elif ch == ";":
            _flush()
            target = name_buf
        else:
            target.append(ch)

    # Final declaration without a trailing ';'.
    if name_buf or value_buf:
        _flush()
    return pairs
|
286
|
-
|
287
|
-
|
288
|
-
def parse_css_rules(css_str: str) -> Rules:
    """
    Convert a chapter stylesheet into normalized Rules.

    The result contains:
      - orders: tag names sorted by their numeric 'order' value,
        de-duplicated while keeping the first occurrence
      - sy: rules for selectors starting with '.sy-'
      - p_rules: rules for '.p* <tag>' selectors, indexed by the
        paragraph class and then by tag

    :param css_str: Raw CSS text.
    :return: The normalized rule set.
    """
    parsed: Rules = {"orders": [], "sy": {}, "p_rules": {}}
    ranked_tags: list[tuple[str, int]] = []

    cursor = 0
    while True:
        # Locate the next "selector { body }" triple; stop on a missing brace.
        open_at = css_str.find("{", cursor)
        if open_at == -1:
            break
        selector = css_str[cursor:open_at].strip().lower()
        close_at = css_str.find("}", open_at + 1)
        if close_at == -1:
            break
        body = css_str[open_at + 1 : close_at]
        cursor = close_at + 1

        rule: Rule = {}
        order_num: int | None = None

        for prop, raw_value in _parse_decls(body):
            value = raw_value.strip()
            if prop == "font-size":
                if value == "0":
                    if "::first-letter" in selector:
                        rule["delete_first"] = True
                    else:
                        rule["delete_all"] = True
            elif prop == "transform":
                if _RE_SCALEX.search(value.replace(" ", "")):
                    rule["transform_flip_x"] = True
            elif prop == "order":
                # Non-numeric order values are ignored.
                try:
                    order_num = int(value)
                except (ValueError, TypeError):
                    pass
            elif prop == "content":
                attr_ref = _RE_ATTR.search(value)
                # Literal content with surrounding quotes removed.
                literal = value.strip().strip("\"'")
                if "::after" in selector:
                    if attr_ref:
                        rule["append_end_attr"] = attr_ref.group(1)
                    else:
                        rule["append_end_char"] = literal
                elif "::before" in selector:
                    if attr_ref:
                        rule["append_start_attr"] = attr_ref.group(1)
                    else:
                        rule["append_start_char"] = literal

        # Classify the rule; later blocks for the same key are merged in.
        if selector.startswith(".sy-"):
            sy_key = selector.lstrip(".")
            parsed["sy"][sy_key] = {**(parsed["sy"].get(sy_key) or {}), **rule}
        elif selector.startswith(".p") and " " in selector:
            p_cls, _, remainder = selector.partition(" ")
            p_cls = p_cls.lstrip(".")
            tag = _only_tag(remainder)
            if tag:
                per_tag = parsed["p_rules"].setdefault(p_cls, {})
                per_tag[tag] = {**(per_tag.get(tag) or {}), **rule}

        if order_num is not None:
            order_tag = _only_tag(selector)
            if order_tag:
                ranked_tags.append((order_tag, order_num))

    # Stable sort by order value, then keep the first occurrence of each tag.
    ranked_tags.sort(key=lambda pair: pair[1])
    parsed["orders"] = list(dict.fromkeys(tag for tag, _ in ranked_tags))
    return parsed
|
376
|
-
|
377
|
-
|
378
|
-
def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
    """
    Render the visible text of chapter HTML using pre-parsed Rules.

    Each <p> element becomes one output line. A paragraph whose class has
    an entry in ``rules["p_rules"]`` is "ordered": only child tags listed
    in ``rules["orders"]`` contribute, emitted grouped by tag in that
    order, and the paragraph's own text and child tails are ignored.
    Other paragraphs are emitted in document order, with <y class="sy-*">
    children passed through the matching ``rules["sy"]`` entry.

    :param html_str: Chapter content HTML.
    :param rules: Normalized CSS rules.
    :return: ``(text, refl_list)`` where ``text`` joins the non-empty
             paragraphs with newlines and ``refl_list`` holds every
             non-empty string produced under a ``transform_flip_x`` rule.
    """
    tree = html.fromstring(html_str)
    paragraphs_out: list[str] = []
    refl_list: list[str] = []
    orders = rules.get("orders") or []
    p_rules = rules.get("p_rules") or {}
    sy_rules = rules.get("sy") or {}

    def _class_list(el: html.HtmlElement) -> list[str]:
        cls = el.get("class")
        return cls.split() if cls else []

    def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
        if rule.get("delete_all"):
            return ""

        parts: list[str] = []
        if "append_start_char" in rule:
            parts.append(rule["append_start_char"])
        if "append_start_attr" in rule:
            parts.append(el.get(rule["append_start_attr"], ""))

        text = el.text or ""
        if rule.get("delete_first") and text:
            text = text[1:]
        parts.append(text)

        if "append_end_char" in rule:
            parts.append(rule["append_end_char"])
        if "append_end_attr" in rule:
            parts.append(el.get(rule["append_end_attr"], ""))

        s = "".join(parts)

        # Record mirrored output so the caller can treat it separately.
        if rule.get("transform_flip_x") and s:
            refl_list.append(s)

        return s

    # iter() includes the root element itself. lxml returns a lone <p>
    # fragment as the root, which findall(".//p") would never match.
    for p in tree.iter("p"):
        p_classes = _class_list(p)
        p_key = next((c for c in p_classes if c.startswith("p")), None)
        has_ordered_rules = p_key in p_rules

        buf_parts: list[str] = []

        if p.text and not has_ordered_rules:
            buf_parts.append(p.text)

        ordered_cache: dict[str, list[str]] = {}

        for child in p:
            tag = str(child.tag)

            # Handle inline <y class="sy-*"> spans
            if tag == "y" and not has_ordered_rules:
                y_cls = next(
                    (c for c in _class_list(child) if c.startswith("sy-")), None
                )
                if y_cls and y_cls in sy_rules:
                    buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
                else:
                    buf_parts.append(child.text or "")
                if child.tail:
                    buf_parts.append(child.tail)
                continue

            # Handle ordered paragraphs: only cache tags that appear in `orders`
            if p_key and has_ordered_rules and tag in orders:
                rule = p_rules[p_key].get(tag, {})
                ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
                continue

            # Non-ordered, non-<y> nodes: include text + tails as-is
            if not has_ordered_rules:
                buf_parts.append(child.text or "")
                if child.tail:
                    buf_parts.append(child.tail)

        # If ordered, flush in global orders with all duplicates preserved
        if has_ordered_rules:
            for tag in orders:
                if tag in ordered_cache:
                    buf_parts.extend(ordered_cache[tag])

        para = "".join(buf_parts)
        if para:
            paragraphs_out.append(para)

    return "\n".join(paragraphs_out), refl_list
|
@@ -1,126 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
novel_downloader.core.parsers.qidian.chapter_normal
|
4
|
-
---------------------------------------------------
|
5
|
-
|
6
|
-
Parser logic for extracting readable text from Qidian chapters
|
7
|
-
that use plain (non-encrypted) browser-rendered HTML.
|
8
|
-
"""
|
9
|
-
|
10
|
-
from __future__ import annotations
|
11
|
-
|
12
|
-
import logging
|
13
|
-
from typing import TYPE_CHECKING
|
14
|
-
|
15
|
-
from lxml import html
|
16
|
-
|
17
|
-
from novel_downloader.models import ChapterDict
|
18
|
-
from novel_downloader.utils import truncate_half_lines
|
19
|
-
|
20
|
-
from .utils import (
|
21
|
-
extract_chapter_info,
|
22
|
-
find_ssr_page_context,
|
23
|
-
get_decryptor,
|
24
|
-
is_duplicated,
|
25
|
-
vip_status,
|
26
|
-
)
|
27
|
-
|
28
|
-
if TYPE_CHECKING:
|
29
|
-
from .main_parser import QidianParser
|
30
|
-
|
31
|
-
logger = logging.getLogger(__name__)
|
32
|
-
|
33
|
-
|
34
|
-
def parse_normal_chapter(
    parser: QidianParser,
    html_str: str,
    chapter_id: str,
) -> ChapterDict | None:
    """
    Build the chapter dict for a plain (non font-obfuscated) Qidian page.

    :param parser: Parser instance providing ``_fuid`` and ``_use_truncation``.
    :param html_str: Chapter HTML.
    :param chapter_id: Chapter identifier (string).
    :return: A dict with 'id', 'title', 'content' and 'extra', or ``None``
             when the chapter info is missing, the text is empty, or
             parsing fails.
    """
    try:
        ssr_data = find_ssr_page_context(html_str)
        info = extract_chapter_info(ssr_data)
        if not info:
            logger.warning(
                "[Parser] ssr_chapterInfo not found for chapter '%s'", chapter_id
            )
            return None

        title = info.get("chapterName", "Untitled")
        duplicated = is_duplicated(ssr_data)
        content_html = info.get("content", "")
        # Prefer the id reported by the page over the caller-supplied one.
        chapter_id = info.get("chapterId", chapter_id)
        fkp = info.get("fkp", "")
        author_say = info.get("authorSay", "")
        update_time = info.get("updateTime", "")
        update_timestamp = info.get("updateTimestamp", 0)
        modify_time = info.get("modifyTime", 0)
        word_count = info.get("actualWords", 0)
        seq = info.get("seq", None)
        volume = info.get("extra", {}).get("volumeName", "")

        text = _parse_paragraph(
            html_str=content_html,
            is_vip=vip_status(ssr_data),
            chapter_id=chapter_id,
            fkp=fkp,
            fuid=parser._fuid,
        )
        if not text:
            return None

        if parser._use_truncation and duplicated:
            text = truncate_half_lines(text)

        extra = {
            "author_say": author_say.strip() if author_say else "",
            "updated_at": update_time,
            "update_timestamp": update_timestamp,
            "modify_time": modify_time,
            "word_count": word_count,
            "duplicated": duplicated,
            "seq": seq,
            "volume": volume,
            "encrypted": False,
        }
        return {
            "id": str(chapter_id),
            "title": title,
            "content": text,
            "extra": extra,
        }
    except Exception as exc:
        logger.warning(
            "[Parser] parse error for normal chapter '%s': %s", chapter_id, exc
        )
        return None
|
102
|
-
|
103
|
-
|
104
|
-
def _parse_paragraph(
    html_str: str,
    is_vip: bool,
    chapter_id: str,
    fkp: str,
    fuid: str,
) -> str:
    """
    Extract paragraph text from chapter content HTML.

    VIP content is decrypted first. Each non-empty <p> element then
    contributes one line of stripped text.

    :param html_str: Chapter content HTML (encrypted when ``is_vip``).
    :param is_vip: Whether the content must be decrypted before parsing.
    :param chapter_id: Chapter identifier passed to the decryptor.
    :param fkp: Value of the chapter's 'fkp' field, passed to the decryptor.
    :param fuid: Value passed through to the decryptor.
    :return: Paragraph texts joined with newlines, or an empty string
             when decryption fails or no paragraph has text.
    """
    raw_html = html_str

    if is_vip:
        try:
            decryptor = get_decryptor()
            raw_html = decryptor.decrypt(raw_html, chapter_id, fkp, fuid)
        except Exception as e:
            logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
            return ""

    tree = html.fromstring(raw_html)
    # descendant-or-self: lxml returns a lone <p> fragment as the root
    # element, which ".//p" would never match.
    texts = (p.text_content().strip() for p in tree.xpath("descendant-or-self::p"))
    return "\n".join(text for text in texts if text)
|