novel-downloader 1.4.5__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -4
- novel_downloader/cli/clean.py +21 -88
- novel_downloader/cli/config.py +27 -104
- novel_downloader/cli/download.py +78 -66
- novel_downloader/cli/export.py +20 -21
- novel_downloader/cli/main.py +3 -1
- novel_downloader/cli/search.py +120 -0
- novel_downloader/cli/ui.py +156 -0
- novel_downloader/config/__init__.py +10 -14
- novel_downloader/config/adapter.py +195 -99
- novel_downloader/config/{loader.py → file_io.py} +53 -27
- novel_downloader/core/__init__.py +14 -13
- novel_downloader/core/archived/deqixs/fetcher.py +115 -0
- novel_downloader/core/archived/deqixs/parser.py +132 -0
- novel_downloader/core/archived/deqixs/searcher.py +89 -0
- novel_downloader/core/archived/qidian/searcher.py +79 -0
- novel_downloader/core/archived/wanbengo/searcher.py +98 -0
- novel_downloader/core/archived/xshbook/searcher.py +93 -0
- novel_downloader/core/downloaders/__init__.py +8 -30
- novel_downloader/core/downloaders/base.py +182 -30
- novel_downloader/core/downloaders/common.py +217 -384
- novel_downloader/core/downloaders/qianbi.py +332 -4
- novel_downloader/core/downloaders/qidian.py +250 -290
- novel_downloader/core/downloaders/registry.py +69 -0
- novel_downloader/core/downloaders/signals.py +46 -0
- novel_downloader/core/exporters/__init__.py +8 -26
- novel_downloader/core/exporters/base.py +107 -31
- novel_downloader/core/exporters/common/__init__.py +3 -4
- novel_downloader/core/exporters/common/epub.py +92 -171
- novel_downloader/core/exporters/common/main_exporter.py +14 -67
- novel_downloader/core/exporters/common/txt.py +90 -86
- novel_downloader/core/exporters/epub_util.py +184 -1327
- novel_downloader/core/exporters/linovelib/__init__.py +3 -2
- novel_downloader/core/exporters/linovelib/epub.py +165 -222
- novel_downloader/core/exporters/linovelib/main_exporter.py +10 -71
- novel_downloader/core/exporters/linovelib/txt.py +76 -66
- novel_downloader/core/exporters/qidian.py +15 -11
- novel_downloader/core/exporters/registry.py +55 -0
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/fetchers/__init__.py +57 -56
- novel_downloader/core/fetchers/aaatxt.py +83 -0
- novel_downloader/core/fetchers/{biquge/session.py → b520.py} +10 -10
- novel_downloader/core/fetchers/{base/session.py → base.py} +63 -47
- novel_downloader/core/fetchers/biquyuedu.py +83 -0
- novel_downloader/core/fetchers/dxmwx.py +110 -0
- novel_downloader/core/fetchers/eightnovel.py +139 -0
- novel_downloader/core/fetchers/{esjzone/session.py → esjzone.py} +23 -11
- novel_downloader/core/fetchers/guidaye.py +85 -0
- novel_downloader/core/fetchers/hetushu.py +92 -0
- novel_downloader/core/fetchers/{qianbi/browser.py → i25zw.py} +22 -26
- novel_downloader/core/fetchers/ixdzs8.py +113 -0
- novel_downloader/core/fetchers/jpxs123.py +101 -0
- novel_downloader/core/fetchers/{biquge/browser.py → lewenn.py} +15 -15
- novel_downloader/core/fetchers/{linovelib/session.py → linovelib.py} +16 -12
- novel_downloader/core/fetchers/piaotia.py +105 -0
- novel_downloader/core/fetchers/qbtr.py +101 -0
- novel_downloader/core/fetchers/{qianbi/session.py → qianbi.py} +9 -9
- novel_downloader/core/fetchers/{qidian/session.py → qidian.py} +55 -40
- novel_downloader/core/fetchers/quanben5.py +92 -0
- novel_downloader/core/fetchers/{base/rate_limiter.py → rate_limiter.py} +2 -2
- novel_downloader/core/fetchers/registry.py +60 -0
- novel_downloader/core/fetchers/{sfacg/session.py → sfacg.py} +11 -9
- novel_downloader/core/fetchers/shencou.py +106 -0
- novel_downloader/core/fetchers/{common/browser.py → shuhaige.py} +24 -19
- novel_downloader/core/fetchers/tongrenquan.py +84 -0
- novel_downloader/core/fetchers/ttkan.py +95 -0
- novel_downloader/core/fetchers/{common/session.py → wanbengo.py} +21 -17
- novel_downloader/core/fetchers/xiaoshuowu.py +106 -0
- novel_downloader/core/fetchers/xiguashuwu.py +177 -0
- novel_downloader/core/fetchers/xs63b.py +171 -0
- novel_downloader/core/fetchers/xshbook.py +85 -0
- novel_downloader/core/fetchers/{yamibo/session.py → yamibo.py} +23 -11
- novel_downloader/core/fetchers/yibige.py +114 -0
- novel_downloader/core/interfaces/__init__.py +8 -14
- novel_downloader/core/interfaces/downloader.py +6 -2
- novel_downloader/core/interfaces/exporter.py +7 -7
- novel_downloader/core/interfaces/fetcher.py +4 -17
- novel_downloader/core/interfaces/parser.py +5 -6
- novel_downloader/core/interfaces/searcher.py +26 -0
- novel_downloader/core/parsers/__init__.py +58 -22
- novel_downloader/core/parsers/aaatxt.py +132 -0
- novel_downloader/core/parsers/b520.py +116 -0
- novel_downloader/core/parsers/base.py +63 -12
- novel_downloader/core/parsers/biquyuedu.py +133 -0
- novel_downloader/core/parsers/dxmwx.py +162 -0
- novel_downloader/core/parsers/eightnovel.py +224 -0
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +67 -67
- novel_downloader/core/parsers/guidaye.py +128 -0
- novel_downloader/core/parsers/hetushu.py +139 -0
- novel_downloader/core/parsers/i25zw.py +137 -0
- novel_downloader/core/parsers/ixdzs8.py +186 -0
- novel_downloader/core/parsers/jpxs123.py +137 -0
- novel_downloader/core/parsers/lewenn.py +142 -0
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +54 -65
- novel_downloader/core/parsers/piaotia.py +189 -0
- novel_downloader/core/parsers/qbtr.py +136 -0
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +54 -51
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/book_info_parser.py +58 -59
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +290 -346
- novel_downloader/core/parsers/qidian/chapter_normal.py +25 -56
- novel_downloader/core/parsers/qidian/main_parser.py +19 -57
- novel_downloader/core/parsers/qidian/utils/__init__.py +12 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +6 -7
- novel_downloader/core/parsers/qidian/utils/fontmap_recover.py +143 -0
- novel_downloader/core/parsers/qidian/utils/helpers.py +0 -4
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/quanben5.py +103 -0
- novel_downloader/core/parsers/registry.py +57 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +46 -48
- novel_downloader/core/parsers/shencou.py +215 -0
- novel_downloader/core/parsers/shuhaige.py +111 -0
- novel_downloader/core/parsers/tongrenquan.py +116 -0
- novel_downloader/core/parsers/ttkan.py +132 -0
- novel_downloader/core/parsers/wanbengo.py +191 -0
- novel_downloader/core/parsers/xiaoshuowu.py +173 -0
- novel_downloader/core/parsers/xiguashuwu.py +435 -0
- novel_downloader/core/parsers/xs63b.py +161 -0
- novel_downloader/core/parsers/xshbook.py +134 -0
- novel_downloader/core/parsers/yamibo.py +155 -0
- novel_downloader/core/parsers/yibige.py +166 -0
- novel_downloader/core/searchers/__init__.py +51 -0
- novel_downloader/core/searchers/aaatxt.py +107 -0
- novel_downloader/core/searchers/b520.py +84 -0
- novel_downloader/core/searchers/base.py +168 -0
- novel_downloader/core/searchers/dxmwx.py +105 -0
- novel_downloader/core/searchers/eightnovel.py +84 -0
- novel_downloader/core/searchers/esjzone.py +102 -0
- novel_downloader/core/searchers/hetushu.py +92 -0
- novel_downloader/core/searchers/i25zw.py +93 -0
- novel_downloader/core/searchers/ixdzs8.py +107 -0
- novel_downloader/core/searchers/jpxs123.py +107 -0
- novel_downloader/core/searchers/piaotia.py +100 -0
- novel_downloader/core/searchers/qbtr.py +106 -0
- novel_downloader/core/searchers/qianbi.py +165 -0
- novel_downloader/core/searchers/quanben5.py +144 -0
- novel_downloader/core/searchers/registry.py +79 -0
- novel_downloader/core/searchers/shuhaige.py +124 -0
- novel_downloader/core/searchers/tongrenquan.py +110 -0
- novel_downloader/core/searchers/ttkan.py +92 -0
- novel_downloader/core/searchers/xiaoshuowu.py +122 -0
- novel_downloader/core/searchers/xiguashuwu.py +95 -0
- novel_downloader/core/searchers/xs63b.py +104 -0
- novel_downloader/locales/en.json +36 -79
- novel_downloader/locales/zh.json +37 -80
- novel_downloader/models/__init__.py +23 -50
- novel_downloader/models/book.py +44 -0
- novel_downloader/models/config.py +16 -43
- novel_downloader/models/login.py +1 -1
- novel_downloader/models/search.py +21 -0
- novel_downloader/resources/config/settings.toml +39 -74
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/resources/json/xiguashuwu.json +718 -0
- novel_downloader/utils/__init__.py +43 -0
- novel_downloader/utils/chapter_storage.py +247 -226
- novel_downloader/utils/constants.py +5 -50
- novel_downloader/utils/cookies.py +6 -18
- novel_downloader/utils/crypto_utils/__init__.py +13 -0
- novel_downloader/utils/crypto_utils/aes_util.py +90 -0
- novel_downloader/utils/crypto_utils/aes_v1.py +619 -0
- novel_downloader/utils/crypto_utils/aes_v2.py +1143 -0
- novel_downloader/utils/{crypto_utils.py → crypto_utils/rc4.py} +3 -10
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +118 -0
- novel_downloader/utils/epub/documents.py +297 -0
- novel_downloader/utils/epub/models.py +120 -0
- novel_downloader/utils/epub/utils.py +179 -0
- novel_downloader/utils/file_utils/__init__.py +5 -30
- novel_downloader/utils/file_utils/io.py +9 -150
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -7
- novel_downloader/utils/fontocr.py +207 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +10 -16
- novel_downloader/utils/network.py +111 -252
- novel_downloader/utils/state.py +5 -90
- novel_downloader/utils/text_utils/__init__.py +16 -21
- novel_downloader/utils/text_utils/diff_display.py +6 -9
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +6 -12
- novel_downloader/utils/time_utils/datetime_utils.py +23 -33
- novel_downloader/utils/time_utils/sleep_utils.py +5 -10
- novel_downloader/web/__init__.py +13 -0
- novel_downloader/web/components/__init__.py +11 -0
- novel_downloader/web/components/navigation.py +35 -0
- novel_downloader/web/main.py +66 -0
- novel_downloader/web/pages/__init__.py +17 -0
- novel_downloader/web/pages/download.py +78 -0
- novel_downloader/web/pages/progress.py +147 -0
- novel_downloader/web/pages/search.py +329 -0
- novel_downloader/web/services/__init__.py +17 -0
- novel_downloader/web/services/client_dialog.py +164 -0
- novel_downloader/web/services/cred_broker.py +113 -0
- novel_downloader/web/services/cred_models.py +35 -0
- novel_downloader/web/services/task_manager.py +264 -0
- novel_downloader-2.0.0.dist-info/METADATA +171 -0
- novel_downloader-2.0.0.dist-info/RECORD +210 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/entry_points.txt +1 -1
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/downloaders/biquge.py +0 -25
- novel_downloader/core/downloaders/esjzone.py +0 -25
- novel_downloader/core/downloaders/linovelib.py +0 -25
- novel_downloader/core/downloaders/sfacg.py +0 -25
- novel_downloader/core/downloaders/yamibo.py +0 -25
- novel_downloader/core/exporters/biquge.py +0 -25
- novel_downloader/core/exporters/esjzone.py +0 -25
- novel_downloader/core/exporters/qianbi.py +0 -25
- novel_downloader/core/exporters/sfacg.py +0 -25
- novel_downloader/core/exporters/yamibo.py +0 -25
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/base/__init__.py +0 -14
- novel_downloader/core/fetchers/base/browser.py +0 -403
- novel_downloader/core/fetchers/biquge/__init__.py +0 -14
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/__init__.py +0 -14
- novel_downloader/core/fetchers/esjzone/browser.py +0 -204
- novel_downloader/core/fetchers/linovelib/__init__.py +0 -14
- novel_downloader/core/fetchers/linovelib/browser.py +0 -193
- novel_downloader/core/fetchers/qianbi/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/__init__.py +0 -14
- novel_downloader/core/fetchers/qidian/browser.py +0 -318
- novel_downloader/core/fetchers/sfacg/__init__.py +0 -14
- novel_downloader/core/fetchers/sfacg/browser.py +0 -189
- novel_downloader/core/fetchers/yamibo/__init__.py +0 -14
- novel_downloader/core/fetchers/yamibo/browser.py +0 -229
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/biquge/main_parser.py +0 -134
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/main_parser.py +0 -194
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/chapter.py +0 -25
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/models/types.py +0 -15
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/tui/__init__.py +0 -7
- novel_downloader/tui/app.py +0 -32
- novel_downloader/tui/main.py +0 -17
- novel_downloader/tui/screens/__init__.py +0 -14
- novel_downloader/tui/screens/home.py +0 -198
- novel_downloader/tui/screens/login.py +0 -74
- novel_downloader/tui/styles/home_layout.tcss +0 -79
- novel_downloader/tui/widgets/richlog_handler.py +0 -24
- novel_downloader/utils/cache.py +0 -24
- novel_downloader/utils/fontocr/__init__.py +0 -22
- novel_downloader/utils/fontocr/model_loader.py +0 -69
- novel_downloader/utils/fontocr/ocr_v1.py +0 -303
- novel_downloader/utils/fontocr/ocr_v2.py +0 -752
- novel_downloader/utils/hash_store.py +0 -279
- novel_downloader/utils/hash_utils.py +0 -103
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.5.dist-info/METADATA +0 -196
- novel_downloader-1.4.5.dist-info/RECORD +0 -165
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.5.dist-info → novel_downloader-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,16 +11,15 @@ from __future__ import annotations
|
|
11
11
|
|
12
12
|
import json
|
13
13
|
import logging
|
14
|
-
|
15
|
-
from
|
14
|
+
import re
|
15
|
+
from contextlib import suppress
|
16
|
+
from typing import TYPE_CHECKING, TypedDict
|
16
17
|
|
17
|
-
import tinycss2
|
18
18
|
from lxml import html
|
19
19
|
|
20
20
|
from novel_downloader.models import ChapterDict
|
21
|
-
from novel_downloader.utils
|
22
|
-
|
23
|
-
apply_font_mapping,
|
21
|
+
from novel_downloader.utils import (
|
22
|
+
download,
|
24
23
|
truncate_half_lines,
|
25
24
|
)
|
26
25
|
|
@@ -31,13 +30,36 @@ from .utils import (
|
|
31
30
|
is_duplicated,
|
32
31
|
vip_status,
|
33
32
|
)
|
33
|
+
from .utils.fontmap_recover import (
|
34
|
+
apply_font_mapping,
|
35
|
+
generate_font_map,
|
36
|
+
)
|
34
37
|
|
35
38
|
if TYPE_CHECKING:
|
36
39
|
from .main_parser import QidianParser
|
37
40
|
|
38
41
|
logger = logging.getLogger(__name__)
|
39
|
-
|
40
|
-
|
42
|
+
_RE_ATTR = re.compile(r"attr\(\s*([^)]+?)\s*\)", re.I)
|
43
|
+
_RE_SCALEX = re.compile(r"scalex\(\s*-?1\s*\)", re.I)
|
44
|
+
|
45
|
+
|
46
|
+
class Rule(TypedDict, total=False):
|
47
|
+
delete_all: bool
|
48
|
+
delete_first: bool
|
49
|
+
transform_flip_x: bool
|
50
|
+
append_start_char: str
|
51
|
+
append_end_char: str
|
52
|
+
append_start_attr: str
|
53
|
+
append_end_attr: str
|
54
|
+
|
55
|
+
|
56
|
+
class Rules(TypedDict):
|
57
|
+
# e.g., orders = ["i", "em", "span"]
|
58
|
+
orders: list[str]
|
59
|
+
# e.g., sy["sy-3"] -> Rule
|
60
|
+
sy: dict[str, Rule]
|
61
|
+
# e.g., p_rules["p3"]["i"] -> Rule
|
62
|
+
p_rules: dict[str, dict[str, Rule]]
|
41
63
|
|
42
64
|
|
43
65
|
def parse_encrypted_chapter(
|
@@ -60,7 +82,7 @@ def parse_encrypted_chapter(
|
|
60
82
|
:return: Formatted chapter text or empty string if not parsable.
|
61
83
|
"""
|
62
84
|
try:
|
63
|
-
if not
|
85
|
+
if not parser._decode_font:
|
64
86
|
return None
|
65
87
|
ssr_data = find_ssr_page_context(html_str)
|
66
88
|
chapter_info = extract_chapter_info(ssr_data)
|
@@ -70,10 +92,9 @@ def parse_encrypted_chapter(
|
|
70
92
|
)
|
71
93
|
return None
|
72
94
|
|
73
|
-
|
74
|
-
if parser.
|
75
|
-
|
76
|
-
debug_base_dir.mkdir(parents=True, exist_ok=True)
|
95
|
+
debug_dir = parser._debug_dir / "font_debug" / "qidian" / chapter_id
|
96
|
+
if parser.save_font_debug:
|
97
|
+
debug_dir.mkdir(parents=True, exist_ok=True)
|
77
98
|
|
78
99
|
css_str = chapter_info["css"]
|
79
100
|
randomFont_str = chapter_info["randomFont"]
|
@@ -98,88 +119,71 @@ def parse_encrypted_chapter(
|
|
98
119
|
rand_path.parent.mkdir(parents=True, exist_ok=True)
|
99
120
|
rand_path.write_bytes(bytes(rf["data"]))
|
100
121
|
|
101
|
-
fixed_path =
|
102
|
-
url=fixedFontWoff2_url,
|
122
|
+
fixed_path = download(
|
123
|
+
url=fixedFontWoff2_url,
|
124
|
+
target_dir=parser._fixed_font_dir,
|
125
|
+
stream=True,
|
103
126
|
)
|
104
127
|
if fixed_path is None:
|
105
128
|
raise ValueError("fixed_path is None: failed to download font")
|
106
129
|
|
107
130
|
# Extract and render paragraphs from HTML with CSS rules
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
return None
|
126
|
-
main_paragraphs = extract_paragraphs_recursively(raw_html, chapter_id)
|
127
|
-
|
128
|
-
if debug_base_dir:
|
129
|
-
main_paragraphs_path = debug_base_dir / "main_paragraphs_debug.json"
|
130
|
-
main_paragraphs_path.write_text(
|
131
|
-
json.dumps(main_paragraphs, ensure_ascii=False, indent=2),
|
132
|
-
encoding="utf-8",
|
133
|
-
)
|
134
|
-
|
135
|
-
paragraphs_rules = parse_rule(css_str)
|
136
|
-
if debug_base_dir:
|
137
|
-
paragraphs_rules_path = debug_base_dir / "paragraphs_rules_debug.json"
|
138
|
-
paragraphs_rules_path.write_text(
|
139
|
-
json.dumps(paragraphs_rules, ensure_ascii=False, indent=2),
|
140
|
-
encoding="utf-8",
|
141
|
-
)
|
142
|
-
|
143
|
-
end_number = parse_end_number(main_paragraphs, paragraphs_rules)
|
144
|
-
paragraphs_str, refl_list = render_paragraphs(
|
145
|
-
main_paragraphs,
|
146
|
-
paragraphs_rules,
|
147
|
-
end_number,
|
148
|
-
)
|
149
|
-
if debug_base_dir:
|
150
|
-
paragraphs_str_path = debug_base_dir / f"{chapter_id}_debug.txt"
|
131
|
+
if vip_status(ssr_data):
|
132
|
+
try:
|
133
|
+
decryptor = get_decryptor()
|
134
|
+
raw_html = decryptor.decrypt(
|
135
|
+
raw_html,
|
136
|
+
chapter_id,
|
137
|
+
fkp,
|
138
|
+
parser._fuid,
|
139
|
+
)
|
140
|
+
except Exception as e:
|
141
|
+
logger.error("[Parser] decryption failed for '%s': %s", chapter_id, e)
|
142
|
+
return None
|
143
|
+
|
144
|
+
css_rules = parse_css_rules(css_str)
|
145
|
+
paragraphs_str, refl_list = render_visible_text(raw_html, css_rules)
|
146
|
+
if parser.save_font_debug:
|
147
|
+
paragraphs_str_path = debug_dir / f"{chapter_id}_debug.txt"
|
151
148
|
paragraphs_str_path.write_text(paragraphs_str, encoding="utf-8")
|
152
149
|
|
153
150
|
# Run OCR + fallback mapping
|
154
151
|
char_set = {c for c in paragraphs_str if c not in {" ", "\n", "\u3000"}}
|
155
152
|
refl_set = set(refl_list)
|
156
153
|
char_set = char_set - refl_set
|
157
|
-
if
|
158
|
-
char_sets_path =
|
154
|
+
if parser.save_font_debug:
|
155
|
+
char_sets_path = debug_dir / "char_set_debug.txt"
|
159
156
|
temp = f"char_set:\n{char_set}\n\nrefl_set:\n{refl_set}"
|
160
157
|
char_sets_path.write_text(
|
161
158
|
temp,
|
162
159
|
encoding="utf-8",
|
163
160
|
)
|
164
161
|
|
165
|
-
mapping_result =
|
162
|
+
mapping_result = generate_font_map(
|
166
163
|
fixed_font_path=fixed_path,
|
167
164
|
random_font_path=rand_path,
|
168
165
|
char_set=char_set,
|
169
166
|
refl_set=refl_set,
|
170
|
-
|
167
|
+
cache_dir=parser._base_cache_dir,
|
168
|
+
batch_size=parser._config.batch_size,
|
171
169
|
)
|
172
|
-
if
|
173
|
-
|
170
|
+
if not mapping_result:
|
171
|
+
return None
|
172
|
+
|
173
|
+
if parser.save_font_debug:
|
174
|
+
mapping_json_path = debug_dir / "font_mapping.json"
|
174
175
|
mapping_json_path.write_text(
|
175
176
|
json.dumps(mapping_result, ensure_ascii=False, indent=2),
|
176
177
|
encoding="utf-8",
|
177
178
|
)
|
178
179
|
|
179
180
|
# Reconstruct final readable text
|
180
|
-
original_text = apply_font_mapping(
|
181
|
+
original_text = apply_font_mapping(
|
182
|
+
text=paragraphs_str,
|
183
|
+
font_map=mapping_result,
|
184
|
+
)
|
181
185
|
|
182
|
-
final_paragraphs_str = "\n
|
186
|
+
final_paragraphs_str = "\n".join(
|
183
187
|
line.strip() for line in original_text.splitlines() if line.strip()
|
184
188
|
)
|
185
189
|
if parser._use_truncation and duplicated:
|
@@ -209,318 +213,258 @@ def parse_encrypted_chapter(
|
|
209
213
|
return None
|
210
214
|
|
211
215
|
|
212
|
-
def
|
213
|
-
html_str: str,
|
214
|
-
chapter_id: str,
|
215
|
-
) -> list[dict[str, Any]]:
|
216
|
-
def parse_element(elem: html.HtmlElement) -> dict[str, Any]:
|
217
|
-
class_attr = elem.attrib.get("class", "")
|
218
|
-
class_list = class_attr.split() if isinstance(class_attr, str) else class_attr
|
219
|
-
if "review" in class_list:
|
220
|
-
return {}
|
221
|
-
|
222
|
-
# Build attrs with class as list
|
223
|
-
attrs = {k: v.split() if k == "class" else v for k, v in elem.attrib.items()}
|
224
|
-
|
225
|
-
node: dict[str, Any] = {
|
226
|
-
"tag": elem.tag,
|
227
|
-
"attrs": attrs,
|
228
|
-
"data": [],
|
229
|
-
}
|
230
|
-
|
231
|
-
# Append entire elem.text if present (no splitting)
|
232
|
-
if elem.text:
|
233
|
-
node["data"].append(elem.text)
|
234
|
-
|
235
|
-
# Recurse into children
|
236
|
-
for child in elem.iterchildren(tag=None):
|
237
|
-
child_dict = parse_element(child)
|
238
|
-
if child_dict:
|
239
|
-
node["data"].append(child_dict)
|
240
|
-
|
241
|
-
# Append entire tail string (no split)
|
242
|
-
if child.tail:
|
243
|
-
node["data"].append(child.tail)
|
244
|
-
|
245
|
-
return node
|
246
|
-
|
247
|
-
tree = html.fromstring(html_str)
|
248
|
-
|
249
|
-
# Try to find <main id="c-{chapter_id}">
|
250
|
-
main_elem = tree.xpath(f'//main[@id="c-{chapter_id}"]')
|
251
|
-
search_root = main_elem[0] if main_elem else tree
|
252
|
-
return [parse_element(p) for p in search_root.findall(".//p")]
|
253
|
-
|
254
|
-
|
255
|
-
def parse_rule(css_str: str) -> dict[str, Any]:
|
216
|
+
def _only_tag(selector: str) -> str | None:
|
256
217
|
"""
|
257
|
-
|
218
|
+
Normalize a selector into just its tag name for ordering.
|
258
219
|
|
259
|
-
Handles
|
260
|
-
- font-size:0 (mark for deletion)
|
261
|
-
- scaleX(-1) (mark as mirrored)
|
262
|
-
- ::before / ::after with content or attr()
|
263
|
-
- class + tag selector mapping
|
264
|
-
- custom rendering order via 'order'
|
220
|
+
Handles forms like 'i', 'em::before', '.p3 i', '.p2 span::after'.
|
265
221
|
|
266
|
-
|
267
|
-
:return: Dict with "rules" and "orders" for rendering.
|
222
|
+
Returns None if can't extract a tag.
|
268
223
|
"""
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
)
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
224
|
+
sel = selector.strip()
|
225
|
+
# If it has spaces, take the rightmost simple selector
|
226
|
+
last = sel.split()[-1]
|
227
|
+
# Drop ::pseudo
|
228
|
+
last = last.split("::", 1)[0]
|
229
|
+
# If it's like 'span[attr=..]' keep 'span'
|
230
|
+
last = last.split("[", 1)[0]
|
231
|
+
# If it starts with '.', it's not a tag
|
232
|
+
if not last or last.startswith("."):
|
233
|
+
return None
|
234
|
+
return last
|
235
|
+
|
236
|
+
|
237
|
+
def _parse_decls(block: str) -> list[tuple[str, str]]:
|
238
|
+
"""
|
239
|
+
Parse 'name:value;...' inside a block. Tolerates quotes and attr().
|
240
|
+
"""
|
241
|
+
decls: list[tuple[str, str]] = []
|
242
|
+
i = 0
|
243
|
+
n = len(block)
|
244
|
+
name: list[str] = []
|
245
|
+
val: list[str] = []
|
246
|
+
in_name = True
|
247
|
+
quote = None # track ' or "
|
248
|
+
while i < n:
|
249
|
+
c = block[i]
|
250
|
+
if quote:
|
251
|
+
# inside quotes
|
252
|
+
if c == "\\" and i + 1 < n:
|
253
|
+
# keep escaped char
|
254
|
+
(name if in_name else val).append(c)
|
255
|
+
i += 1
|
256
|
+
(name if in_name else val).append(block[i])
|
257
|
+
elif c == quote:
|
258
|
+
(name if in_name else val).append(c)
|
259
|
+
quote = None
|
260
|
+
else:
|
261
|
+
(name if in_name else val).append(c)
|
262
|
+
else:
|
263
|
+
if c in ("'", '"'):
|
264
|
+
(name if in_name else val).append(c)
|
265
|
+
quote = c
|
266
|
+
elif in_name and c == ":":
|
267
|
+
in_name = False
|
268
|
+
elif c == ";":
|
269
|
+
nm = "".join(name).strip().lower()
|
270
|
+
vl = "".join(val).strip()
|
271
|
+
if nm:
|
272
|
+
decls.append((nm, vl))
|
273
|
+
name.clear()
|
274
|
+
val.clear()
|
275
|
+
in_name = True
|
276
|
+
else:
|
277
|
+
(name if in_name else val).append(c)
|
278
|
+
i += 1
|
279
|
+
|
280
|
+
if name or val:
|
281
|
+
nm = "".join(name).strip().lower()
|
282
|
+
vl = "".join(val).strip()
|
283
|
+
if nm:
|
284
|
+
decls.append((nm, vl))
|
285
|
+
return decls
|
286
|
+
|
287
|
+
|
288
|
+
def parse_css_rules(css_str: str) -> Rules:
|
289
|
+
"""
|
290
|
+
Produces normalized Rules with:
|
291
|
+
- orders: list[str] of tag names sorted by numeric 'order'
|
292
|
+
- sy: '.sy-*' class rules
|
293
|
+
- p_rules: '.p* <tag>' rules, indexed by p-class then tag
|
294
|
+
"""
|
295
|
+
rules: Rules = {"orders": [], "sy": {}, "p_rules": {}}
|
296
|
+
order_pairs: list[tuple[str, int]] = []
|
297
|
+
|
298
|
+
i = 0
|
299
|
+
while True:
|
300
|
+
b1 = css_str.find("{", i)
|
301
|
+
if b1 == -1:
|
302
|
+
break
|
303
|
+
selector = css_str[i:b1].strip().lower()
|
304
|
+
b2 = css_str.find("}", b1 + 1)
|
305
|
+
if b2 == -1:
|
306
|
+
break
|
307
|
+
block = css_str[b1 + 1 : b2]
|
308
|
+
i = b2 + 1
|
309
|
+
|
310
|
+
decls = _parse_decls(block)
|
311
|
+
|
312
|
+
new_rule: Rule = {}
|
313
|
+
order_val: int | None = None
|
314
|
+
|
315
|
+
for name, value in decls:
|
316
|
+
v = value.strip()
|
317
|
+
if name == "font-size" and v == "0":
|
294
318
|
if "::first-letter" in selector:
|
295
|
-
|
319
|
+
new_rule["delete_first"] = True
|
296
320
|
else:
|
297
|
-
|
298
|
-
elif name == "transform"
|
299
|
-
|
321
|
+
new_rule["delete_all"] = True
|
322
|
+
elif name == "transform":
|
323
|
+
if _RE_SCALEX.search(v.replace(" ", "")):
|
324
|
+
new_rule["transform_flip_x"] = True
|
300
325
|
elif name == "order":
|
301
|
-
|
326
|
+
with suppress(ValueError, TypeError):
|
327
|
+
order_val = int(v)
|
302
328
|
elif name == "content":
|
329
|
+
# normalize: remove outer quotes
|
303
330
|
if "::after" in selector:
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
]
|
331
|
+
m = _RE_ATTR.search(v)
|
332
|
+
if m:
|
333
|
+
new_rule["append_end_attr"] = m.group(1)
|
308
334
|
else:
|
309
|
-
|
335
|
+
s = v.strip().strip("\"'")
|
336
|
+
new_rule["append_end_char"] = s
|
310
337
|
elif "::before" in selector:
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
)[0]
|
338
|
+
m = _RE_ATTR.search(v)
|
339
|
+
if m:
|
340
|
+
new_rule["append_start_attr"] = m.group(1)
|
315
341
|
else:
|
316
|
-
|
342
|
+
s = v.strip().strip("\"'")
|
343
|
+
new_rule["append_start_char"] = s
|
317
344
|
|
318
|
-
#
|
345
|
+
# classification
|
319
346
|
if selector.startswith(".sy-"):
|
320
|
-
|
347
|
+
key = selector.lstrip(".")
|
348
|
+
old = rules["sy"].get(key)
|
349
|
+
rules["sy"][key] = {**old, **new_rule} if old else (new_rule or {})
|
350
|
+
|
321
351
|
elif selector.startswith(".p") and " " in selector:
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
352
|
+
p_cls, right = selector.split(" ", 1)
|
353
|
+
p_cls = p_cls.lstrip(".")
|
354
|
+
tag = _only_tag(right)
|
355
|
+
if tag:
|
356
|
+
prev = rules["p_rules"].setdefault(p_cls, {}).get(tag)
|
357
|
+
rules["p_rules"][p_cls][tag] = (
|
358
|
+
{**prev, **new_rule} if prev else (new_rule or {})
|
359
|
+
)
|
360
|
+
|
361
|
+
if order_val is not None:
|
362
|
+
tag_for_order = _only_tag(selector)
|
363
|
+
if tag_for_order:
|
364
|
+
order_pairs.append((tag_for_order, order_val))
|
365
|
+
|
366
|
+
# normalize orders
|
367
|
+
order_pairs.sort(key=lambda t: t[1])
|
368
|
+
seen = set()
|
369
|
+
orders: list[str] = []
|
370
|
+
for tag, _num in order_pairs:
|
371
|
+
if tag not in seen:
|
372
|
+
seen.add(tag)
|
373
|
+
orders.append(tag)
|
374
|
+
rules["orders"] = orders
|
375
|
+
return rules
|
376
|
+
|
377
|
+
|
378
|
+
def render_visible_text(html_str: str, rules: Rules) -> tuple[str, list[str]]:
    """
    Render the visible text of an HTML snippet using pre-parsed Rules.

    Every ``<p>`` element (including the root element itself when the
    parsed snippet is a single ``<p>``) yields at most one output paragraph:

    * When the first class of the paragraph that starts with ``p`` has an
      entry in ``rules["p_rules"]``, only child elements whose tag is listed
      in ``rules["orders"]`` contribute. Their rendered strings are emitted
      grouped by tag, following the sequence of ``rules["orders"]``; the
      paragraph's own leading text, child tails and unlisted children are
      ignored.
    * Otherwise text is emitted in document order. ``<y>`` children whose
      ``sy-*`` class has an entry in ``rules["sy"]`` are rendered through
      that rule; every other child contributes its own text and tail.

    :param html_str: HTML markup to render (parsed with ``html.fromstring``).
    :param rules: Mapping with the optional keys ``orders``, ``p_rules``
        and ``sy``; missing or empty values are treated as empty.
    :return: ``(text, refl_list)`` where ``text`` is the non-empty paragraphs
        joined with a newline and ``refl_list`` collects every non-empty string
        produced by a rule that has ``transform_flip_x`` set.
    """
    tree = html.fromstring(html_str)

    orders = rules.get("orders") or []
    # Membership is tested once per child element; a set avoids rescanning
    # the list each time while `orders` itself keeps the emission sequence.
    order_set = set(orders)
    p_rules = rules.get("p_rules") or {}
    sy_rules = rules.get("sy") or {}

    paragraphs_out: list[str] = []
    refl_list: list[str] = []

    def _class_list(el: html.HtmlElement) -> list[str]:
        """Return the whitespace-separated class names of *el* (may be empty)."""
        cls = el.get("class")
        return cls.split() if cls else []

    def _apply_rule(el: html.HtmlElement, rule: Rule) -> str:
        """Build the visible string of *el* under *rule*; record flipped text."""
        if rule.get("delete_all"):
            return ""

        text = el.text or ""
        if rule.get("delete_first") and text:
            text = text[1:]

        # Assemble: start char, start attr, element text, end char, end attr.
        parts: list[str] = []
        if "append_start_char" in rule:
            parts.append(rule["append_start_char"])
        if "append_start_attr" in rule:
            parts.append(el.get(rule["append_start_attr"], ""))
        parts.append(text)
        if "append_end_char" in rule:
            parts.append(rule["append_end_char"])
        if "append_end_attr" in rule:
            parts.append(el.get(rule["append_end_attr"], ""))

        rendered = "".join(parts)
        if rule.get("transform_flip_x") and rendered:
            refl_list.append(rendered)
        return rendered

    # `iter("p")` walks the root as well as its descendants in document
    # order; `findall(".//p")` would skip a root-level <p>, which is what
    # `html.fromstring` returns for a snippet made of a single paragraph.
    for p in tree.iter("p"):
        p_key = next((c for c in _class_list(p) if c.startswith("p")), None)
        has_ordered_rules = p_key in p_rules

        buf_parts: list[str] = []
        ordered_cache: dict[str, list[str]] = {}

        if p.text and not has_ordered_rules:
            buf_parts.append(p.text)

        for child in p:
            tag = str(child.tag)

            if has_ordered_rules:
                # Ordered paragraph: only tags named in `orders` are kept;
                # they are buffered per tag and flushed after the loop.
                if p_key and tag in order_set:
                    rule = p_rules[p_key].get(tag, {})
                    ordered_cache.setdefault(tag, []).append(
                        _apply_rule(child, rule)
                    )
                continue

            # Plain paragraph: inline <y class="sy-*"> goes through its rule,
            # anything else contributes its text unchanged.
            y_cls = None
            if tag == "y":
                y_cls = next(
                    (c for c in _class_list(child) if c.startswith("sy-")), None
                )
            if y_cls and y_cls in sy_rules:
                buf_parts.append(_apply_rule(child, sy_rules[y_cls]))
            else:
                buf_parts.append(child.text or "")
            if child.tail:
                buf_parts.append(child.tail)

        if has_ordered_rules:
            # Flush in the global order; repeated tags keep all occurrences.
            for tag in orders:
                if tag in ordered_cache:
                    buf_parts.extend(ordered_cache[tag])

        para = "".join(buf_parts)
        if para:
            paragraphs_out.append(para)

    return "\n".join(paragraphs_out), refl_list
|