novel-downloader 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +1 -3
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +26 -21
- novel_downloader/cli/download.py +77 -64
- novel_downloader/cli/export.py +16 -20
- novel_downloader/cli/main.py +1 -1
- novel_downloader/cli/search.py +62 -65
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +8 -5
- novel_downloader/config/adapter.py +65 -105
- novel_downloader/config/{loader.py → file_io.py} +53 -26
- novel_downloader/core/__init__.py +1 -0
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/{searchers/qidian.py → archived/qidian/searcher.py} +12 -20
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +3 -24
- novel_downloader/core/downloaders/base.py +49 -23
- novel_downloader/core/downloaders/common.py +191 -137
- novel_downloader/core/downloaders/qianbi.py +187 -146
- novel_downloader/core/downloaders/qidian.py +187 -141
- novel_downloader/core/downloaders/registry.py +4 -2
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +3 -20
- novel_downloader/core/exporters/base.py +33 -37
- novel_downloader/core/exporters/common/__init__.py +1 -2
- novel_downloader/core/exporters/common/epub.py +15 -10
- novel_downloader/core/exporters/common/main_exporter.py +19 -12
- novel_downloader/core/exporters/common/txt.py +14 -9
- novel_downloader/core/exporters/epub_util.py +59 -29
- novel_downloader/core/exporters/linovelib/__init__.py +1 -0
- novel_downloader/core/exporters/linovelib/epub.py +23 -25
- novel_downloader/core/exporters/linovelib/main_exporter.py +8 -12
- novel_downloader/core/exporters/linovelib/txt.py +17 -11
- novel_downloader/core/exporters/qidian.py +2 -8
- novel_downloader/core/exporters/registry.py +4 -2
- novel_downloader/core/exporters/txt_util.py +7 -7
- novel_downloader/core/fetchers/__init__.py +54 -48
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +6 -11
- novel_downloader/core/fetchers/{base/session.py → base.py} +37 -46
- novel_downloader/core/fetchers/{biquge/browser.py → biquyuedu.py} +12 -17
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +19 -12
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +19 -28
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/lewenn.py +83 -0
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +12 -13
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +5 -10
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +46 -39
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +5 -16
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +7 -10
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/shuhaige.py +84 -0
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/wanbengo.py +83 -0
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +19 -12
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +1 -9
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +9 -1
- novel_downloader/core/parsers/__init__.py +49 -12
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/esjzone.py +61 -66
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/linovelib.py +48 -64
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/qianbi.py +48 -50
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +272 -330
- novel_downloader/core/parsers/qidian/chapter_normal.py +24 -55
- novel_downloader/core/parsers/qidian/main_parser.py +11 -38
- novel_downloader/core/parsers/qidian/utils/__init__.py +1 -0
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +1 -1
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +5 -16
- novel_downloader/core/parsers/sfacg.py +38 -45
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +87 -131
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +34 -3
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/{biquge.py → b520.py} +29 -28
- novel_downloader/core/searchers/base.py +112 -36
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +43 -25
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +74 -40
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +24 -8
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +31 -82
- novel_downloader/locales/zh.json +32 -83
- novel_downloader/models/__init__.py +21 -22
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +4 -37
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +5 -0
- novel_downloader/resources/config/settings.toml +8 -70
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +13 -22
- novel_downloader/utils/chapter_storage.py +3 -2
- novel_downloader/utils/constants.py +4 -29
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +1 -1
- novel_downloader/utils/epub/constants.py +57 -16
- novel_downloader/utils/epub/documents.py +88 -194
- novel_downloader/utils/epub/models.py +0 -14
- novel_downloader/utils/epub/utils.py +63 -96
- novel_downloader/utils/file_utils/__init__.py +2 -23
- novel_downloader/utils/file_utils/io.py +3 -113
- novel_downloader/utils/file_utils/sanitize.py +0 -4
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/logger.py +8 -16
- novel_downloader/utils/network.py +2 -2
- novel_downloader/utils/state.py +4 -90
- novel_downloader/utils/text_utils/__init__.py +1 -7
- novel_downloader/utils/text_utils/diff_display.py +5 -7
- novel_downloader/utils/time_utils/__init__.py +5 -11
- novel_downloader/utils/time_utils/datetime_utils.py +20 -29
- novel_downloader/utils/time_utils/sleep_utils.py +4 -8
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/core/downloaders/biquge.py +0 -29
- novel_downloader/core/downloaders/esjzone.py +0 -29
- novel_downloader/core/downloaders/linovelib.py +0 -29
- novel_downloader/core/downloaders/sfacg.py +0 -29
- novel_downloader/core/downloaders/yamibo.py +0 -29
- novel_downloader/core/exporters/biquge.py +0 -22
- novel_downloader/core/exporters/esjzone.py +0 -22
- novel_downloader/core/exporters/qianbi.py +0 -22
- novel_downloader/core/exporters/sfacg.py +0 -22
- novel_downloader/core/exporters/yamibo.py +0 -22
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -422
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -209
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -198
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -326
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -194
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -234
- novel_downloader/core/parsers/biquge.py +0 -139
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/types.py +0 -13
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/hash_store.py +0 -280
- novel_downloader/utils/fontocr/hash_utils.py +0 -103
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -315
- novel_downloader/utils/fontocr/ocr_v2.py +0 -764
- novel_downloader/utils/fontocr/ocr_v3.py +0 -744
- novel_downloader-1.5.0.dist-info/METADATA +0 -196
- novel_downloader-1.5.0.dist-info/RECORD +0 -164
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.5.0.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,9 +11,10 @@ from __future__ import annotations
|
|
11
11
|
|
12
12
|
import json
|
13
13
|
import logging
|
14
|
-
|
14
|
+
import re
|
15
|
+
from contextlib import suppress
|
16
|
+
from typing import TYPE_CHECKING, TypedDict
|
15
17
|
|
16
|
-
import tinycss2
|
17
18
|
from lxml import html
|
18
19
|
|
19
20
|
from novel_downloader.models import ChapterDict
|
@@ -29,13 +30,36 @@ from .utils import (
|
|
29
30
|
is_duplicated,
|
30
31
|
vip_status,
|
31
32
|
)
|
33
|
+
from .utils.fontmap_recover import (
|
34
|
+
apply_font_mapping,
|
35
|
+
generate_font_map,
|
36
|
+
)
|
32
37
|
|
33
38
|
if TYPE_CHECKING:
|
34
39
|
from .main_parser import QidianParser
|
35
40
|
|
36
41
|
logger = logging.getLogger(__name__)
|
37
|
-
|
38
|
-
|
42
|
+
_RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
|
43
|
+
_RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
|
44
|
+
|
45
|
+
|
46
|
+
class Rule(TypedDict, total=False):
|
47
|
+
delete_all: bool
|
48
|
+
delete_first: bool
|
49
|
+
transform_flip_x: bool
|
50
|
+
append_start_char: str
|
51
|
+
append_end_char: str
|
52
|
+
append_start_attr: str
|
53
|
+
append_end_attr: str
|
54
|
+
|
55
|
+
|
56
|
+
class Rules(TypedDict):
|
57
|
+
# e.g., orders = ["i", "em", "span"]
|
58
|
+
orders: list[str]
|
59
|
+
# e.g., sy["sy-3"] -> Rule
|
60
|
+
sy: dict[str, Rule]
|
61
|
+
# e.g., p_rules["p3"]["i"] -> Rule
|
62
|
+
p_rules: dict[str, dict[str, Rule]]
|
39
63
|
|
40
64
|
|
41
65
|
def parse_encrypted_chapter(
|
@@ -58,7 +82,7 @@ def parse_encrypted_chapter(
|
|
58
82
|
:return: Formatted chapter text or empty string if not parsable.
|
59
83
|
"""
|
60
84
|
try:
|
61
|
-
if not
|
85
|
+
if not parser._decode_font:
|
62
86
|
return None
|
63
87
|
ssr_data = find_ssr_page_context(html_str)
|
64
88
|
chapter_info = extract_chapter_info(ssr_data)
|
@@ -104,47 +128,21 @@ def parse_encrypted_chapter(
|
|
104
128
|
raise ValueError("fixed_path is None: failed to download font")
|
105
129
|
|
106
130
|
# Extract and render paragraphs from HTML with CSS rules
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
"[Parser] decryption failed for '%s': %s", chapter_id, e
|
123
|
-
)
|
124
|
-
return None
|
125
|
-
main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
|
126
|
-
|
127
|
-
if parser.save_font_debug:
|
128
|
-
main_paragraphs_path = debug_dir / "main_paragraphs_debug.json"
|
129
|
-
main_paragraphs_path.write_text(
|
130
|
-
json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
|
131
|
-
encoding="utf-8",
|
132
|
-
)
|
133
|
-
|
134
|
-
paragraphs_rules = parse_rule(css_str)
|
135
|
-
if parser.save_font_debug:
|
136
|
-
paragraphs_rules_path = debug_dir / "paragraphs_rules_debug.json"
|
137
|
-
paragraphs_rules_path.write_text(
|
138
|
-
json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
|
139
|
-
encoding="utf-8",
|
140
|
-
)
|
141
|
-
|
142
|
-
end_number = parse_end_number(main_paragraphs, paragraphs_rules)
|
143
|
-
paragraphs_str, refl_list = render_paragraphs(
|
144
|
-
main_paragraphs,
|
145
|
-
paragraphs_rules,
|
146
|
-
end_number,
|
147
|
-
)
|
131
|
+
if vip_status(ssr_data):
|
132
|
+
try:
|
133
|
+
decryptor = get_decryptor()
|
134
|
+
raw_html = decryptor.decrypt(
|
135
|
+
raw_html,
|
136
|
+
chapter_id,
|
137
|
+
fkp,
|
138
|
+
parser._fuid,
|
139
|
+
)
|
140
|
+
except Exception as e:
|
141
|
+
logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
|
142
|
+
return None
|
143
|
+
|
144
|
+
css_rules = parse_css_rules(css_str)
|
145
|
+
paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
|
148
146
|
if parser.save_font_debug:
|
149
147
|
paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
|
150
148
|
paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
|
@@ -161,13 +159,17 @@ def parse_encrypted_chapter(
|
|
161
159
|
encoding="utf-8",
|
162
160
|
)
|
163
161
|
|
164
|
-
mapping_result =
|
162
|
+
mapping_result = generate_font_map(
|
165
163
|
fixed_font_path=fixed_path,
|
166
164
|
random_font_path=rand_path,
|
167
165
|
char_set=char_set,
|
168
166
|
refl_set=refl_set,
|
169
|
-
|
167
|
+
cache_dir=parser._base_cache_dir,
|
168
|
+
batch_size=parser._config.batch_size,
|
170
169
|
)
|
170
|
+
if not mapping_result:
|
171
|
+
return None
|
172
|
+
|
171
173
|
if parser.save_font_debug:
|
172
174
|
mapping_json_path = debug_dir / "font_mapping.json"
|
173
175
|
mapping_json_path.write_text(
|
@@ -176,12 +178,12 @@ def parse_encrypted_chapter(
|
|
176
178
|
)
|
177
179
|
|
178
180
|
# Reconstruct final readable text
|
179
|
-
original_text =
|
181
|
+
original_text = apply_font_mapping(
|
180
182
|
text=paragraphs_str,
|
181
183
|
font_map=mapping_result,
|
182
184
|
)
|
183
185
|
|
184
|
-
final_paragraphs_str = "\n
|
186
|
+
final_paragraphs_str = "\n".join(
|
185
187
|
line.strip() for line in original_text.splitlines() if line.strip()
|
186
188
|
)
|
187
189
|
if parser._use_truncation and duplicated:
|
@@ -211,318 +213,258 @@ def parse_encrypted_chapter(
|
|
211
213
|
return None
|
212
214
|
|
213
215
|
|
214
|
-
def
|
215
|
-
html_str: str,
|
216
|
-
chapter_id: str,
|
217
|
-
) -> list[dict[str, Any]]:
|
218
|
-
def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
|
219
|
-
class_attr = elem.attrib.get("class", "")
|
220
|
-
class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
|
221
|
-
if "review" in class_list:
|
222
|
-
return {}
|
223
|
-
|
224
|
-
# Build attrs with class as list
|
225
|
-
attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
|
226
|
-
|
227
|
-
node: dict[str, Any] = {
|
228
|
-
"tag": elem.tag,
|
229
|
-
"attrs": attrs,
|
230
|
-
"data": [],
|
231
|
-
}
|
232
|
-
|
233
|
-
# Append entire elem.text if present (no splitting)
|
234
|
-
if elem.text:
|
235
|
-
node["data"].append(elem.text)
|
236
|
-
|
237
|
-
# Recurse into children
|
238
|
-
for child in elem.iterchildren(tag=None):
|
239
|
-
child_dict = parse_element(child)
|
240
|
-
if child_dict:
|
241
|
-
node["data"].append(child_dict)
|
242
|
-
|
243
|
-
# Append entire tail string (no split)
|
244
|
-
if child.tail:
|
245
|
-
node["data"].append(child.tail)
|
246
|
-
|
247
|
-
return node
|
248
|
-
|
249
|
-
tree = html.fromstring(html_str)
|
250
|
-
|
251
|
-
# Try to find <main id="c-{chapter_id}">
|
252
|
-
main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
|
253
|
-
search_root = main_elem[0] if main_elem else tree
|
254
|
-
return [parse_element(p) for p in search_root.findall(".//p")]
|
255
|
-
|
256
|
-
|
257
|
-
def parse_rule(css_str: str) -> dict[str, Any]:
|
216
|
+
def _only_tag(selector: str) -> str | None:
|
258
217
|
"""
|
259
|
-
|
218
|
+
Normalize a selector into just its tag name for ordering.
|
260
219
|
|
261
|
-
Handles
|
262
|
-
- font-size:0 (mark for deletion)
|
263
|
-
- scaleX(-1) (mark as mirrored)
|
264
|
-
- ::before / ::after with content or attr()
|
265
|
-
- class + tag selector mapping
|
266
|
-
- custom rendering order via 'order'
|
220
|
+
Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
|
267
221
|
|
268
|
-
|
269
|
-
:return: Dict with "rules" and "orders" for rendering.
|
222
|
+
Returns None if can't extract a tag.
|
270
223
|
"""
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
)
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
224
|
+
sel = selector.strip()
|
225
|
+
# If it has spaces, take the rightmost simple selector
|
226
|
+
last = sel.split()[-1]
|
227
|
+
# Drop ::pseudo
|
228
|
+
last = last.split("::", 1)[0]
|
229
|
+
# If it's like 'span[attr=..]' keep 'span'
|
230
|
+
last = last.split("[", 1)[0]
|
231
|
+
# If it starts with '.', it's not a tag
|
232
|
+
if not last or last.startswith("."):
|
233
|
+
return None
|
234
|
+
return last
|
235
|
+
|
236
|
+
|
237
|
+
def _parse_decls(block: str) -> list[tuple[str, str]]:
|
238
|
+
"""
|
239
|
+
Parse 'name:value;...' inside a block. Tolerates quotes and attr().
|
240
|
+
"""
|
241
|
+
decls: list[tuple[str, str]] = []
|
242
|
+
i = 0
|
243
|
+
n = len(block)
|
244
|
+
name: list[str] = []
|
245
|
+
val: list[str] = []
|
246
|
+
in_name = True
|
247
|
+
quote = None # track ' or "
|
248
|
+
while i < n:
|
249
|
+
c = block[i]
|
250
|
+
if quote:
|
251
|
+
# inside quotes
|
252
|
+
if c == "\\" and i + 1 < n:
|
253
|
+
# keep escaped char
|
254
|
+
(name if in_name else val).append(c)
|
255
|
+
i += 1
|
256
|
+
(name if in_name else val).append(block[i])
|
257
|
+
elif c == quote:
|
258
|
+
(name if in_name else val).append(c)
|
259
|
+
quote = None
|
260
|
+
else:
|
261
|
+
(name if in_name else val).append(c)
|
262
|
+
else:
|
263
|
+
if c in ("'", '"'):
|
264
|
+
(name if in_name else val).append(c)
|
265
|
+
quote = c
|
266
|
+
elif in_name and c == ":":
|
267
|
+
in_name = False
|
268
|
+
elif c == ";":
|
269
|
+
nm = "".join(name).strip().lower()
|
270
|
+
vl = "".join(val).strip()
|
271
|
+
if nm:
|
272
|
+
decls.append((nm, vl))
|
273
|
+
name.clear()
|
274
|
+
val.clear()
|
275
|
+
in_name = True
|
276
|
+
else:
|
277
|
+
(name if in_name else val).append(c)
|
278
|
+
i += 1
|
279
|
+
|
280
|
+
if name or val:
|
281
|
+
nm = "".join(name).strip().lower()
|
282
|
+
vl = "".join(val).strip()
|
283
|
+
if nm:
|
284
|
+
decls.append((nm, vl))
|
285
|
+
return decls
|
286
|
+
|
287
|
+
|
288
|
+
def parse_css_rules(css_str: str) -> Rules:
|
289
|
+
"""
|
290
|
+
Produces normalized Rules with:
|
291
|
+
- orders: list[str] of tag names sorted by numeric 'order'
|
292
|
+
- sy: '.sy-*' class rules
|
293
|
+
- p_rules: '.p* <tag>' rules, indexed by p-class then tag
|
294
|
+
"""
|
295
|
+
rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
|
296
|
+
order_pairs: list[tuple[str, int]] = []
|
297
|
+
|
298
|
+
i = 0
|
299
|
+
while True:
|
300
|
+
b1 = css_str.find("{", i)
|
301
|
+
if b1 == -1:
|
302
|
+
break
|
303
|
+
selector = css_str[i:b1].strip().lower()
|
304
|
+
b2 = css_str.find("}", b1 + 1)
|
305
|
+
if b2 == -1:
|
306
|
+
break
|
307
|
+
block = css_str[b1 + 1 : b2]
|
308
|
+
i = b2 + 1
|
309
|
+
|
310
|
+
decls = _parse_decls(block)
|
311
|
+
|
312
|
+
new_rule: Rule = {}
|
313
|
+
order_val: int | None = None
|
314
|
+
|
315
|
+
for name, value in decls:
|
316
|
+
v = value.strip()
|
317
|
+
if name == "font-size" and v == "0":
|
296
318
|
if "::first-letter" in selector:
|
297
|
-
|
319
|
+
new_rule["delete_first"] = True
|
298
320
|
else:
|
299
|
-
|
300
|
-
elif name == "transform"
|
301
|
-
|
321
|
+
new_rule["delete_all"] = True
|
322
|
+
elif name == "transform":
|
323
|
+
if _RE_SCALEX.search(v.replace(" ", "")):
|
324
|
+
new_rule["transform_flip_x"] = True
|
302
325
|
elif name == "order":
|
303
|
-
|
326
|
+
with suppress(ValueError, TypeError):
|
327
|
+
order_val = int(v)
|
304
328
|
elif name == "content":
|
329
|
+
# normalize: remove outer quotes
|
305
330
|
if "::after" in selector:
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
]
|
331
|
+
m = _RE_ATTR.search(v)
|
332
|
+
if m:
|
333
|
+
new_rule["append_end_attr"] = m.group(1)
|
310
334
|
else:
|
311
|
-
|
335
|
+
s = v.strip().strip("\"'")
|
336
|
+
new_rule["append_end_char"] = s
|
312
337
|
elif "::before" in selector:
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
)[0]
|
338
|
+
m = _RE_ATTR.search(v)
|
339
|
+
if m:
|
340
|
+
new_rule["append_start_attr"] = m.group(1)
|
317
341
|
else:
|
318
|
-
|
342
|
+
s = v.strip().strip("\"'")
|
343
|
+
new_rule["append_start_char"] = s
|
319
344
|
|
320
|
-
#
|
345
|
+
# classification
|
321
346
|
if selector.startswith(".sy-"):
|
322
|
-
|
347
|
+
key = selector.lstrip(".")
|
348
|
+
old = rules["sy"].get(key)
|
349
|
+
rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
|
350
|
+
|
323
351
|
elif selector.startswith(".p") and " " in selector:
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
352
|
+
p_cls, right = selector.split(" ", 1)
|
353
|
+
p_cls = p_cls.lstrip(".")
|
354
|
+
tag = _only_tag(right)
|
355
|
+
if tag:
|
356
|
+
prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
|
357
|
+
rules["p_rules"][p_cls][tag] = (
|
358
|
+
{**prev, **new_rule} if prev else (new_rule or {})
|
359
|
+
)
|
360
|
+
|
361
|
+
if order_val is not None:
|
362
|
+
tag_for_order = _only_tag(selector)
|
363
|
+
if tag_for_order:
|
364
|
+
order_pairs.append((tag_for_order, order_val))
|
365
|
+
|
366
|
+
# normalize orders
|
367
|
+
order_pairs.sort(key=lambda t: t[1])
|
368
|
+
seen = set()
|
369
|
+
orders: list[str] = []
|
370
|
+
for tag, _num in order_pairs:
|
371
|
+
if tag not in seen:
|
372
|
+
seen.add(tag)
|
373
|
+
orders.append(tag)
|
374
|
+
rules["orders"] = orders
|
375
|
+
return rules
|
376
|
+
|
377
|
+
|
378
|
+
def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
|
379
|
+
"""
|
380
|
+
Renderer the HTML using pre-parsed Rules.
|
381
|
+
"""
|
382
|
+
tree = html.fromstring(html_str)
|
383
|
+
paragraphs_out: list[str] = []
|
384
|
+
refl_list: list[str] = []
|
385
|
+
orders = rules.get("orders") or []
|
386
|
+
p_rules = rules.get("p_rules") or {}
|
387
|
+
sy_rules = rules.get("sy") or {}
|
328
388
|
|
329
|
-
|
330
|
-
|
389
|
+
def _class_list(el: html.HtmlElement) -> list[str]:
|
390
|
+
cls = el.get("class")
|
391
|
+
return cls.split() if cls else []
|
331
392
|
|
332
|
-
|
333
|
-
|
393
|
+
def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
|
394
|
+
if rule.get("delete_all"):
|
395
|
+
return ""
|
334
396
|
|
397
|
+
parts: list[str] = []
|
398
|
+
if "append_start_char" in rule:
|
399
|
+
parts.append(rule["append_start_char"])
|
400
|
+
if "append_start_attr" in rule:
|
401
|
+
parts.append(el.get(rule["append_start_attr"], ""))
|
335
402
|
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
) -> tuple[str, list[str]]:
|
341
|
-
"""
|
342
|
-
Applies the parsed CSS rules to the paragraph structure and
|
343
|
-
reconstructs the visible text.
|
403
|
+
text = el.text or ""
|
404
|
+
if rule.get("delete_first") and text:
|
405
|
+
text = text[1:]
|
406
|
+
parts.append(text)
|
344
407
|
|
345
|
-
|
346
|
-
|
408
|
+
if "append_end_char" in rule:
|
409
|
+
parts.append(rule["append_end_char"])
|
410
|
+
if "append_end_attr" in rule:
|
411
|
+
parts.append(el.get(rule["append_end_attr"], ""))
|
347
412
|
|
348
|
-
|
349
|
-
and 'data' fields representing structured content.
|
350
|
-
:param rules: A dictionary with keys 'orders' and 'rules', parsed from CSS.
|
351
|
-
- rules['orders']: List of (selector, id) tuples.
|
352
|
-
- rules['rules']: Nested dict containing transformation rules.
|
413
|
+
s = "".join(parts)
|
353
414
|
|
354
|
-
|
355
|
-
|
356
|
-
- A list of mirrored (reflected) characters for later OCR processing.
|
357
|
-
"""
|
358
|
-
orders: list[tuple[str, str]] = rules.get("orders", [])
|
359
|
-
rules = rules.get("rules", {})
|
360
|
-
refl_list: list[str] = []
|
415
|
+
if rule.get("transform_flip_x") and s:
|
416
|
+
refl_list.append(s)
|
361
417
|
|
362
|
-
|
363
|
-
if rule.get("delete-all", False):
|
364
|
-
return ""
|
418
|
+
return s
|
365
419
|
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
curr_str += first_data
|
420
|
+
for p in tree.findall(".//p"):
|
421
|
+
p_classes = _class_list(p)
|
422
|
+
p_key = next((c for c in p_classes if c.startswith("p")), None)
|
423
|
+
has_ordered_rules = p_key in p_rules
|
371
424
|
|
372
|
-
|
373
|
-
curr_str = "" if len(curr_str) <= 1 else curr_str[1:]
|
425
|
+
buf_parts: list[str] = []
|
374
426
|
|
375
|
-
|
427
|
+
if p.text and not has_ordered_rules:
|
428
|
+
buf_parts.append(p.text)
|
376
429
|
|
377
|
-
|
378
|
-
if attr_name:
|
379
|
-
curr_str += data.get("attrs", {}).get(f"{attr_name}{end_number}", "")
|
430
|
+
ordered_cache: dict[str, list[str]] = {}
|
380
431
|
|
381
|
-
|
432
|
+
for child in p:
|
433
|
+
tag = str(child.tag)
|
382
434
|
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
435
|
+
# Handle inline <y class="sy-*"> spans
|
436
|
+
if tag == "y" and not has_ordered_rules:
|
437
|
+
y_cls = next(
|
438
|
+
(c for c in _class_list(child) if c.startswith("sy-")), None
|
439
|
+
)
|
440
|
+
if y_cls and y_cls in sy_rules:
|
441
|
+
buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
|
442
|
+
else:
|
443
|
+
buf_parts.append(child.text or "")
|
444
|
+
if child.tail:
|
445
|
+
buf_parts.append(child.tail)
|
446
|
+
continue
|
388
447
|
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
paragraphs_str = ""
|
394
|
-
for paragraph in main_paragraphs:
|
395
|
-
class_list = paragraph.get("attrs", {}).get("class", [])
|
396
|
-
p_class_str = next((c for c in class_list if c.startswith("p")), None)
|
397
|
-
curr_datas = paragraph.get("data", [])
|
398
|
-
|
399
|
-
ordered_cache = {}
|
400
|
-
for data in curr_datas:
|
401
|
-
# 文本节点直接加
|
402
|
-
if isinstance(data, str):
|
403
|
-
paragraphs_str += data
|
448
|
+
# Handle ordered paragraphs: only cache tags that appear in `orders`
|
449
|
+
if p_key and has_ordered_rules and tag in orders:
|
450
|
+
rule = p_rules[p_key].get(tag, {})
|
451
|
+
ordered_cache.setdefault(tag, []).append(_apply_rule(child, rule))
|
404
452
|
continue
|
405
453
|
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
if tag == "span" and "class" in attrs and "review" in attrs["class"]:
|
412
|
-
continue
|
413
|
-
|
414
|
-
# sy 类型标签处理
|
415
|
-
if tag == "y":
|
416
|
-
tag_class_list = attrs.get("class", [])
|
417
|
-
tag_class = next(
|
418
|
-
(c for c in tag_class_list if c.startswith("sy-")), None
|
419
|
-
)
|
420
|
-
|
421
|
-
if tag_class in rules.get("sy", {}):
|
422
|
-
curr_rule = rules["sy"][tag_class]
|
423
|
-
paragraphs_str += apply_rule(data, curr_rule)
|
424
|
-
continue
|
425
|
-
|
426
|
-
if not p_class_str:
|
427
|
-
if any(cls in IGNORED_CLASS_LISTS for cls in class_list):
|
428
|
-
continue
|
429
|
-
logger.debug(f"[parser] not find p_class_str: {class_list}")
|
430
|
-
continue
|
431
|
-
# 普通标签处理,根据 orders 顺序匹配
|
432
|
-
for ord_selector, _ in orders:
|
433
|
-
tag_name = f"{ord_selector}{end_number}"
|
434
|
-
if data.get("tag") != tag_name:
|
435
|
-
continue
|
436
|
-
curr_rule = rules.get(p_class_str, {}).get(ord_selector)
|
437
|
-
curr_rule = curr_rule if curr_rule else {}
|
438
|
-
ordered_cache[ord_selector] = apply_rule(data, curr_rule)
|
439
|
-
break
|
440
|
-
# 最后按 orders 顺序拼接
|
441
|
-
for ord_selector, _ in orders:
|
442
|
-
if ord_selector in ordered_cache:
|
443
|
-
paragraphs_str += ordered_cache[ord_selector]
|
444
|
-
|
445
|
-
paragraphs_str += "\n\n"
|
446
|
-
|
447
|
-
return paragraphs_str, refl_list
|
448
|
-
|
449
|
-
|
450
|
-
def parse_paragraph_names(rules: dict[str, Any]) -> set[str]:
|
451
|
-
"""
|
452
|
-
Extract all paragraph selector names from parsed rules, excluding "sy".
|
453
|
-
"""
|
454
|
-
paragraph_names = set()
|
455
|
-
for group, group_rules in rules.get("rules", {}).items():
|
456
|
-
if group == "sy":
|
457
|
-
continue
|
458
|
-
paragraph_names.update(group_rules.keys())
|
459
|
-
return paragraph_names
|
460
|
-
|
461
|
-
|
462
|
-
def parse_end_number(
|
463
|
-
main_paragraphs: list[dict[str, Any]],
|
464
|
-
rules: dict[str, Any],
|
465
|
-
) -> str:
|
466
|
-
"""
|
467
|
-
Find the most frequent numeric suffix from tag names
|
468
|
-
matched by given paragraph prefixes.
|
469
|
-
"""
|
470
|
-
paragraph_names = parse_paragraph_names(rules)
|
471
|
-
end_numbers: dict[int, int] = {}
|
472
|
-
prefix_hits = 0
|
473
|
-
sorted_names = sorted(paragraph_names, key=len, reverse=True)
|
474
|
-
|
475
|
-
def rec_parse(item: list[Any] | dict[str, Any]) -> None:
|
476
|
-
nonlocal prefix_hits
|
477
|
-
if isinstance(item, list):
|
478
|
-
for element in item:
|
479
|
-
rec_parse(element)
|
480
|
-
elif isinstance(item, dict):
|
481
|
-
tag = item.get("tag")
|
482
|
-
if isinstance(tag, str):
|
483
|
-
for prefix in sorted_names:
|
484
|
-
if tag.startswith(prefix):
|
485
|
-
prefix_hits += 1
|
486
|
-
remain = tag[len(prefix) :]
|
487
|
-
if remain.isdigit():
|
488
|
-
num = int(remain)
|
489
|
-
end_numbers[num] = end_numbers.get(num, 0) + 1
|
490
|
-
break
|
491
|
-
for val in item.values():
|
492
|
-
if isinstance(val, (list | dict)):
|
493
|
-
rec_parse(val)
|
494
|
-
|
495
|
-
rec_parse(main_paragraphs)
|
496
|
-
|
497
|
-
if not end_numbers:
|
498
|
-
logger.debug("[Parser] No valid ending numbers found")
|
499
|
-
return ""
|
500
|
-
|
501
|
-
sorted_numbers = sorted(
|
502
|
-
end_numbers.items(), key=lambda x: (x[1], x[0]), reverse=True
|
503
|
-
)
|
504
|
-
|
505
|
-
logger.debug(
|
506
|
-
"[Parser] Top 3 end numbers:\n%s",
|
507
|
-
"\n".join(f"{n}: {c}" for n, c in sorted_numbers[:3]),
|
508
|
-
)
|
509
|
-
most_common_number, most_common_count = sorted_numbers[0]
|
510
|
-
if most_common_count <= prefix_hits / 2:
|
511
|
-
logger.debug(
|
512
|
-
"[Parser] Top number (%s) does not exceed 50%% threshold: %d of %d",
|
513
|
-
most_common_number,
|
514
|
-
most_common_count,
|
515
|
-
prefix_hits,
|
516
|
-
)
|
517
|
-
return ""
|
454
|
+
# Non-ordered, non-<y> nodes: include text + tails as-is
|
455
|
+
if not has_ordered_rules:
|
456
|
+
buf_parts.append(child.text or "")
|
457
|
+
if child.tail:
|
458
|
+
buf_parts.append(child.tail)
|
518
459
|
|
519
|
-
|
460
|
+
# If ordered, flush in global orders with all duplicates preserved
|
461
|
+
if has_ordered_rules:
|
462
|
+
for tag in orders:
|
463
|
+
if tag in ordered_cache:
|
464
|
+
buf_parts.extend(ordered_cache[tag])
|
520
465
|
|
466
|
+
para = "".join(buf_parts)
|
467
|
+
if para:
|
468
|
+
paragraphs_out.append(para)
|
521
469
|
|
522
|
-
|
523
|
-
for para in paragraphs:
|
524
|
-
data = para.get("data", [])
|
525
|
-
for item in data:
|
526
|
-
if isinstance(item, str) and any(kw in item for kw in keywords):
|
527
|
-
return True
|
528
|
-
return False
|
470
|
+
return "\n".join(paragraphs_out), refl_list
|