novel-downloader 1.4.4__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- novel_downloader/__init__.py +1 -1
- novel_downloader/cli/__init__.py +2 -2
- novel_downloader/cli/config.py +1 -83
- novel_downloader/cli/download.py +4 -5
- novel_downloader/cli/export.py +4 -1
- novel_downloader/cli/main.py +2 -0
- novel_downloader/cli/search.py +123 -0
- novel_downloader/config/__init__.py +3 -10
- novel_downloader/config/adapter.py +190 -54
- novel_downloader/config/loader.py +2 -3
- novel_downloader/core/__init__.py +13 -13
- novel_downloader/core/downloaders/__init__.py +10 -11
- novel_downloader/core/downloaders/base.py +152 -26
- novel_downloader/core/downloaders/biquge.py +5 -1
- novel_downloader/core/downloaders/common.py +157 -378
- novel_downloader/core/downloaders/esjzone.py +5 -1
- novel_downloader/core/downloaders/linovelib.py +5 -1
- novel_downloader/core/downloaders/qianbi.py +291 -4
- novel_downloader/core/downloaders/qidian.py +199 -285
- novel_downloader/core/downloaders/registry.py +67 -0
- novel_downloader/core/downloaders/sfacg.py +5 -1
- novel_downloader/core/downloaders/yamibo.py +5 -1
- novel_downloader/core/exporters/__init__.py +10 -11
- novel_downloader/core/exporters/base.py +87 -7
- novel_downloader/core/exporters/biquge.py +5 -8
- novel_downloader/core/exporters/common/__init__.py +2 -2
- novel_downloader/core/exporters/common/epub.py +82 -166
- novel_downloader/core/exporters/common/main_exporter.py +0 -60
- novel_downloader/core/exporters/common/txt.py +82 -83
- novel_downloader/core/exporters/epub_util.py +157 -1330
- novel_downloader/core/exporters/esjzone.py +5 -8
- novel_downloader/core/exporters/linovelib/__init__.py +2 -2
- novel_downloader/core/exporters/linovelib/epub.py +157 -212
- novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
- novel_downloader/core/exporters/linovelib/txt.py +67 -63
- novel_downloader/core/exporters/qianbi.py +5 -8
- novel_downloader/core/exporters/qidian.py +14 -4
- novel_downloader/core/exporters/registry.py +53 -0
- novel_downloader/core/exporters/sfacg.py +5 -8
- novel_downloader/core/exporters/txt_util.py +67 -0
- novel_downloader/core/exporters/yamibo.py +5 -8
- novel_downloader/core/fetchers/__init__.py +19 -24
- novel_downloader/core/fetchers/base/__init__.py +3 -3
- novel_downloader/core/fetchers/base/browser.py +23 -4
- novel_downloader/core/fetchers/base/session.py +30 -5
- novel_downloader/core/fetchers/biquge/__init__.py +3 -3
- novel_downloader/core/fetchers/biquge/browser.py +5 -0
- novel_downloader/core/fetchers/biquge/session.py +6 -1
- novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
- novel_downloader/core/fetchers/esjzone/browser.py +5 -0
- novel_downloader/core/fetchers/esjzone/session.py +6 -1
- novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
- novel_downloader/core/fetchers/linovelib/browser.py +6 -1
- novel_downloader/core/fetchers/linovelib/session.py +6 -1
- novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
- novel_downloader/core/fetchers/qianbi/browser.py +5 -0
- novel_downloader/core/fetchers/qianbi/session.py +5 -0
- novel_downloader/core/fetchers/qidian/__init__.py +3 -3
- novel_downloader/core/fetchers/qidian/browser.py +12 -4
- novel_downloader/core/fetchers/qidian/session.py +11 -3
- novel_downloader/core/fetchers/registry.py +71 -0
- novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
- novel_downloader/core/fetchers/sfacg/browser.py +5 -0
- novel_downloader/core/fetchers/sfacg/session.py +5 -0
- novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
- novel_downloader/core/fetchers/yamibo/browser.py +5 -0
- novel_downloader/core/fetchers/yamibo/session.py +6 -1
- novel_downloader/core/interfaces/__init__.py +7 -5
- novel_downloader/core/interfaces/searcher.py +18 -0
- novel_downloader/core/parsers/__init__.py +10 -11
- novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
- novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
- novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
- novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
- novel_downloader/core/parsers/qidian/__init__.py +2 -2
- novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
- novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
- novel_downloader/core/parsers/qidian/main_parser.py +10 -21
- novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
- novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
- novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
- novel_downloader/core/parsers/registry.py +68 -0
- novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
- novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
- novel_downloader/core/searchers/__init__.py +20 -0
- novel_downloader/core/searchers/base.py +92 -0
- novel_downloader/core/searchers/biquge.py +83 -0
- novel_downloader/core/searchers/esjzone.py +84 -0
- novel_downloader/core/searchers/qianbi.py +131 -0
- novel_downloader/core/searchers/qidian.py +87 -0
- novel_downloader/core/searchers/registry.py +63 -0
- novel_downloader/locales/en.json +12 -4
- novel_downloader/locales/zh.json +12 -4
- novel_downloader/models/__init__.py +4 -30
- novel_downloader/models/config.py +12 -6
- novel_downloader/models/search.py +16 -0
- novel_downloader/models/types.py +0 -2
- novel_downloader/resources/config/settings.toml +31 -4
- novel_downloader/resources/css_styles/intro.css +83 -0
- novel_downloader/resources/css_styles/main.css +30 -89
- novel_downloader/utils/__init__.py +52 -0
- novel_downloader/utils/chapter_storage.py +244 -224
- novel_downloader/utils/constants.py +1 -21
- novel_downloader/utils/epub/__init__.py +34 -0
- novel_downloader/utils/epub/builder.py +377 -0
- novel_downloader/utils/epub/constants.py +77 -0
- novel_downloader/utils/epub/documents.py +403 -0
- novel_downloader/utils/epub/models.py +134 -0
- novel_downloader/utils/epub/utils.py +212 -0
- novel_downloader/utils/file_utils/__init__.py +10 -14
- novel_downloader/utils/file_utils/io.py +20 -51
- novel_downloader/utils/file_utils/normalize.py +2 -2
- novel_downloader/utils/file_utils/sanitize.py +2 -3
- novel_downloader/utils/fontocr/__init__.py +5 -5
- novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
- novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
- novel_downloader/utils/fontocr/ocr_v1.py +13 -1
- novel_downloader/utils/fontocr/ocr_v2.py +13 -1
- novel_downloader/utils/fontocr/ocr_v3.py +744 -0
- novel_downloader/utils/i18n.py +2 -0
- novel_downloader/utils/logger.py +2 -0
- novel_downloader/utils/network.py +110 -251
- novel_downloader/utils/state.py +1 -0
- novel_downloader/utils/text_utils/__init__.py +18 -17
- novel_downloader/utils/text_utils/diff_display.py +4 -5
- novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
- novel_downloader/utils/text_utils/text_cleaner.py +179 -0
- novel_downloader/utils/text_utils/truncate_utils.py +62 -0
- novel_downloader/utils/time_utils/__init__.py +3 -3
- novel_downloader/utils/time_utils/datetime_utils.py +4 -5
- novel_downloader/utils/time_utils/sleep_utils.py +2 -3
- {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
- novel_downloader-1.5.0.dist-info/RECORD +164 -0
- novel_downloader/config/site_rules.py +0 -94
- novel_downloader/core/factory/__init__.py +0 -20
- novel_downloader/core/factory/downloader.py +0 -73
- novel_downloader/core/factory/exporter.py +0 -58
- novel_downloader/core/factory/fetcher.py +0 -96
- novel_downloader/core/factory/parser.py +0 -86
- novel_downloader/core/fetchers/common/__init__.py +0 -14
- novel_downloader/core/fetchers/common/browser.py +0 -79
- novel_downloader/core/fetchers/common/session.py +0 -79
- novel_downloader/core/parsers/biquge/__init__.py +0 -10
- novel_downloader/core/parsers/common/__init__.py +0 -13
- novel_downloader/core/parsers/common/helper.py +0 -323
- novel_downloader/core/parsers/common/main_parser.py +0 -106
- novel_downloader/core/parsers/esjzone/__init__.py +0 -10
- novel_downloader/core/parsers/linovelib/__init__.py +0 -10
- novel_downloader/core/parsers/qianbi/__init__.py +0 -10
- novel_downloader/core/parsers/sfacg/__init__.py +0 -10
- novel_downloader/core/parsers/yamibo/__init__.py +0 -10
- novel_downloader/models/browser.py +0 -21
- novel_downloader/models/site_rules.py +0 -99
- novel_downloader/models/tasks.py +0 -33
- novel_downloader/resources/css_styles/volume-intro.css +0 -56
- novel_downloader/resources/json/replace_word_map.json +0 -4
- novel_downloader/resources/text/blacklist.txt +0 -22
- novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
- novel_downloader/utils/text_utils/font_mapping.py +0 -28
- novel_downloader/utils/text_utils/text_cleaning.py +0 -107
- novel_downloader-1.4.4.dist-info/RECORD +0 -165
- {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
- {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
- {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
- {novel_downloader-1.4.4.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,253 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.utils.text_utils.numeric_conversion
|
4
|
+
----------------------------------------------------
|
5
|
+
|
6
|
+
Utility functions to convert between Chinese numeral strings
|
7
|
+
and Python integers.
|
8
|
+
"""
|
9
|
+
|
10
|
+
# Digit characters -> value. Each string groups every character (everyday,
# alternate and financial forms) that is read as the digit equal to the
# string's position in the tuple.
CHINESE_NUMERALS = {
    ch: value
    for value, variants in enumerate(
        (
            "零〇",
            "一壹",
            "二两贰貮",
            "三叁",
            "四肆",
            "五伍",
            "六陆",
            "七柒",
            "八捌",
            "九玖",
        )
    )
    for ch in variants
}

# Unit characters -> multiplier, stored as powers of ten. Both the simplified
# and the traditional/financial spellings of a unit map to the same value.
CHINESE_UNITS = {
    ch: 10**exponent
    for variants, exponent in (
        ("十拾", 1),
        ("百佰", 2),
        ("千仟", 3),
        ("万萬", 4),
        ("亿億", 8),
        ("兆", 12),
        ("京", 16),
        ("垓", 20),
    )
    for ch in variants
}

# Units that separate four-digit groups, ordered from the largest to the
# smallest so that a numeral can be split on them in a single left-to-right
# sweep.
LARGE_UNITS = [
    (ch, 10**exponent)
    for variants, exponent in (
        ("垓", 20),
        ("京", 16),
        ("兆", 12),
        ("亿億", 8),
        ("万萬", 4),
    )
    for ch in variants
]
|
60
|
+
|
61
|
+
|
62
|
+
def chinese_to_arabic(s: str) -> int:
    """
    Convert a Chinese numeral string into its integer value.

    The string is split on the large units listed in ``LARGE_UNITS``, from
    the biggest to the smallest. The text in front of each unit is parsed as
    a section and multiplied by that unit; whatever remains after the last
    split is parsed as the final section.

    Examples:
    ---
    >>> chinese_to_arabic("一千二百三十四")
    1234
    >>> chinese_to_arabic("负一千二百三十四")
    -1234
    >>> chinese_to_arabic("一万零三")
    10003
    >>> chinese_to_arabic("三亿二千五百")
    300002500

    :param s: A string of Chinese numerals, e.g. "三千零二十一", "五亿零七万".
              A leading "负" or "-" marks a negative value.
    :return: The integer value represented by the input string.
    :raises ValueError: If `s` is empty or consists of a sign only.
    :raises KeyError: If `s` contains characters not found in the supported
                      numeral or unit mappings.
    """
    if not s:
        raise ValueError("Input string is empty")

    sign = 1
    if s[0] in ("负", "-"):
        sign = -1
        s = s[1:]
        # A bare sign carries no digits; reject it like the empty string
        # instead of silently returning 0.
        if not s:
            raise ValueError("Input string contains only a sign")

    def _parse_section(sec: str) -> int:
        """Sum one section: digits accumulate until a unit multiplies them."""
        num = 0
        section_total = 0
        for ch in sec:
            if ch in CHINESE_NUMERALS:
                # consecutive digits are read positionally ("一二" -> 12)
                num = num * 10 + CHINESE_NUMERALS[ch]
            else:
                # KeyError here means the character is neither digit nor unit
                unit = CHINESE_UNITS[ch]
                # a unit without a preceding digit counts once ("十五" -> 15)
                section_total += (num or 1) * unit
                num = 0
        return section_total + num

    total = 0
    rest = s
    for char, val in LARGE_UNITS:
        if char in rest:
            left, rest = rest.split(char, 1)
            total += _parse_section(left) * val

    total += _parse_section(rest)

    return sign * total
|
113
|
+
|
114
|
+
|
115
|
+
def arabic_to_chinese(num: int) -> str:
    """
    Convert an integer to its Chinese numeral representation.

    The absolute value is cut into four-digit groups from the right. Each
    non-zero group is written with 千/百/十 and followed by its large unit
    (万, 亿, 兆, 京, 垓). A single '零' marks every gap between groups: an
    all-zero group, or a group below 1000 that follows a higher group.

    Examples:
    ---
    >>> arabic_to_chinese(0)
    '零'
    >>> arabic_to_chinese(1234)
    '一千二百三十四'
    >>> arabic_to_chinese(10003)
    '一万零三'
    >>> arabic_to_chinese(-205)
    '负二百零五'
    >>> arabic_to_chinese(3000002500)
    '三十亿零二千五百'

    :param num: The integer to convert (e.g. 42, -1300).
    :return: The Chinese-numeral string for `num`.
    :raises TypeError: If `num` is not an integer.
    :raises IndexError: If `abs(num)` is 10**24 or larger, i.e. needs a large
                        unit beyond 垓.
    """
    if not isinstance(num, int):
        raise TypeError("Input must be an integer.")
    if num == 0:
        return "零"

    digits = "零一二三四五六七八九"
    small_units = ["", "十", "百", "千"]
    big_units = ["", "万", "亿", "兆", "京", "垓"]

    negative = num < 0
    num = -num if negative else num

    def _section_to_chinese(sec: int) -> str:
        """
        Convert a value 1..9999 into Chinese using 千/百/十 units,
        without any large unit (万, 亿, ...) or leading '零'.
        """
        s = ""
        unit_pos = 0
        # starts True so that trailing zeros ("1200") produce no '零'
        zero_flag = True
        while sec > 0:
            d = sec % 10
            if d == 0:
                # only emit one '零' for consecutive zeros
                if not zero_flag:
                    s = digits[0] + s
                    zero_flag = True
            else:
                s = digits[d] + small_units[unit_pos] + s
                zero_flag = False
            unit_pos += 1
            sec //= 10
        return s

    result = ""
    section_pos = 0

    while num > 0:
        # after this line `num` holds only the groups above the current one
        num, section = divmod(num, 10_000)
        if section != 0:
            sec_str = _section_to_chinese(section) + big_units[section_pos]
            # A group below 1000 has an empty thousands place. When a higher
            # group is still to come, that gap must be read as '零'
            # (10003 -> "一万零三", not "一万三").
            if num > 0 and section < 1000:
                sec_str = "零" + sec_str
            result = sec_str + result
        elif result and not result.startswith("零"):
            # an all-zero group between two written parts: one '零' separator
            result = "零" + result

        section_pos += 1

    if negative:
        result = "负" + result

    return result
|
191
|
+
|
192
|
+
|
193
|
+
if __name__ == "__main__":
    # Ad-hoc self-check that runs when the module is executed as a script.
    # It prints coloured PASS/FAIL summaries for three groups of checks.
    import random

    RED = "\033[91m"
    GREEN = "\033[92m"
    RESET = "\033[0m"
    random.seed(42)  # fixed seed: the "random" samples are reproducible

    # --- 1) hand-written string -> integer expectations -------------------
    num_list = [
        ("一千二百三十四", 1234),
        ("一万五千", 15000),
        ("一万零三", 10003),
        ("三亿二千五百", 300002500),
    ]
    print("=== chinese_to_arabic() with fixed cases ===")
    fail_count = 0
    for text, want in num_list:
        got = chinese_to_arabic(text)
        if got == want:
            continue
        print(f"{RED}FAIL:{RESET} “{text}” -> expected {want}, got {got}")
        fail_count += 1

    if not fail_count:
        print(f"{GREEN}All {len(num_list)} chinese_to_arabic() tests passed!{RESET}\n")
    else:
        print(f"{RED}{fail_count} chinese_to_arabic() tests failed.{RESET}\n")

    # --- 2) exhaustive round-trip over one four-digit group ---------------
    print("=== Round-trip test for values 0 - 9999 ===")
    fail_count = 0
    for value in range(10_000):
        text = arabic_to_chinese(value)
        back = chinese_to_arabic(text)
        if back != value:
            print(f'{RED}FAIL round-trip:{RESET} {value} -> "{text}" -> {back}')
            fail_count += 1
            break  # the first mismatch is enough; stop scanning

    if not fail_count:
        print(f"{GREEN}0 - 9999 round-trip all passed!{RESET}\n")
    else:
        print(f"{RED}{fail_count} round-trip failures in 0 - 9999.{RESET}\n")

    # --- 3) sampled round-trip: two values per decade, both signs ---------
    print("=== Random round-trip at larger scales ===")
    fail_count = 0
    for exp in range(5, 22):  # test around 10^5...
        low, high = 10**exp, 10 ** (exp + 1)
        for _ in range(2):
            magnitude = random.randint(low, high - 1)
            for value in (magnitude, -magnitude):
                text = arabic_to_chinese(value)
                back = chinese_to_arabic(text)
                if back == value:
                    continue
                print(f'{RED}FAIL:{RESET} {value} -> "{text}" -> {back}')
                fail_count += 1

    if not fail_count:
        print(f"{GREEN}All random large-scale round-trips passed!{RESET}")
    else:
        print(f"{RED}{fail_count} random large-scale failures.{RESET}")
|
@@ -0,0 +1,179 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.utils.text_utils.text_cleaner
|
4
|
+
----------------------------------------------
|
5
|
+
|
6
|
+
Provides utilities to clean novel titles and content
|
7
|
+
by removing unwanted patterns, replacing strings.
|
8
|
+
"""
|
9
|
+
|
10
|
+
import re
|
11
|
+
from re import Match, Pattern
|
12
|
+
from typing import Protocol, runtime_checkable
|
13
|
+
|
14
|
+
from novel_downloader.models import TextCleanerConfig
|
15
|
+
|
16
|
+
|
17
|
+
@runtime_checkable
class Cleaner(Protocol):
    """
    Structural interface shared by the text cleaners in this module.

    Any object exposing these three methods satisfies the protocol; because
    it is ``runtime_checkable`` this can also be tested with ``isinstance``.
    """

    def clean_title(self, text: str) -> str:
        """Return the cleaned form of a title string."""

    def clean_content(self, text: str) -> str:
        """Return the cleaned form of a content/body string."""

    def clean(self, text: str, *, as_title: bool = False) -> str:
        """Clean `text`; `as_title` selects title rules over content rules."""
|
27
|
+
|
28
|
+
|
29
|
+
class NullCleaner(Cleaner):
    """Cleaner that does nothing: every method hands its input back unchanged."""

    def clean(self, text: str, *, as_title: bool = False) -> str:
        """Return `text` untouched; `as_title` is accepted but has no effect."""
        return text

    def clean_title(self, text: str) -> str:
        """Return the title exactly as given."""
        return text

    def clean_content(self, text: str) -> str:
        """Return the content exactly as given."""
        return text
|
38
|
+
|
39
|
+
|
40
|
+
class TextCleaner(Cleaner):
    """
    TextCleaner removes invisible characters, strips unwanted patterns,
    and applies literal replacements in a single pass using a combined regex.

    When neither removal patterns nor replacement keys are configured, the
    combined regex falls back to ``(?!)``, an empty negative lookahead that
    can never match. For regex that never matches, reference:

    https://stackoverflow.com/questions/2930182/regex-to-not-match-anything
    """

    _INVISIBLE_PATTERN: Pattern[str] = re.compile(r"[\ufeff\u200B\u200C\u200D\u2060]")

    # Pattern used when there is nothing to remove or replace. "$^" is not
    # suitable: it matches an empty string and, under re.MULTILINE, every
    # empty line.
    _NEVER_MATCH: str = r"(?!)"

    def __init__(self, config: TextCleanerConfig) -> None:
        """
        Initialize TextCleaner with the given configuration.

        :param config: TextCleanerConfig instance containing:

            - remove_invisible: whether to strip BOM/zero-width chars
            - title_remove_patterns: list of regex patterns to delete from titles
            - content_remove_patterns: list of regex patterns to delete from content
            - title_replacements: dict of literal replacements for titles
            - content_replacements: dict of literal replacements for content
        """
        self._remove_invisible = config.remove_invisible

        # Literal-to-literal replacement maps. An empty key is dropped: it
        # would add an empty alternative to the regex and match everywhere.
        self._title_repl_map = {
            k: v for k, v in config.title_replacements.items() if k
        }
        self._content_repl_map = {
            k: v for k, v in config.content_replacements.items() if k
        }

        # Single combined regex for titles:
        # all delete-patterns OR all escaped replacement-keys
        self._title_combined_rx: Pattern[str] = self._build_combined_rx(
            config.title_remove_patterns, self._title_repl_map
        )

        # Single combined regex for content (multiline mode)
        self._content_combined_rx: Pattern[str] = self._build_combined_rx(
            config.content_remove_patterns,
            self._content_repl_map,
            flags=re.MULTILINE,
        )

    @staticmethod
    def _build_combined_rx(
        remove_patterns: list[str],
        repl_map: dict[str, str],
        flags: int = 0,
    ) -> Pattern[str]:
        """
        Join removal patterns and escaped replacement keys into one regex.

        :param remove_patterns: Regex sources whose matches are deleted.
        :param repl_map: Literal keys whose matches are replaced.
        :param flags: Flags passed to ``re.compile``.
        :return: Compiled alternation, or a never-matching regex when there
                 is nothing to match.
        """
        # Deduplicate removal patterns (keep order) and skip empty ones
        parts = [p for p in dict.fromkeys(remove_patterns) if p]
        parts += [re.escape(k) for k in repl_map if k]
        # longer first to avoid prefix collisions
        parts.sort(key=len, reverse=True)
        source = "|".join(parts) if parts else TextCleaner._NEVER_MATCH
        return re.compile(source, flags)

    def clean_title(self, text: str) -> str:
        """
        Clean a title string.

        Steps:
          1. Optionally strip BOM & zero-width characters.
          2. Remove unwanted patterns and apply literal replacements in one pass.
          3. Trim leading/trailing whitespace.

        :param text: Raw title text.
        :return: Cleaned title.
        """
        return self._do_clean(text, self._title_combined_rx, self._title_repl_map)

    def clean_content(self, text: str) -> str:
        """
        Clean a content string.

        Steps:
          1. Optionally strip BOM & zero-width characters.
          2. Remove unwanted patterns and apply literal replacements in one pass.
          3. Trim leading/trailing whitespace.

        :param text: Raw content/body text.
        :return: Cleaned content.
        """
        return self._do_clean(text, self._content_combined_rx, self._content_repl_map)

    def clean(self, text: str, *, as_title: bool = False) -> str:
        """
        Unified clean method to process text as either title or content.

        :param text: Raw text to clean.
        :param as_title: If True, use title rules; otherwise content rules.
        :return: Cleaned text.
        """
        return self.clean_title(text) if as_title else self.clean_content(text)

    @classmethod
    def _remove_bom_and_invisible(cls, text: str) -> str:
        """
        Remove BOM and zero-width/invisible characters from the text.

        Matches:
          - U+FEFF (BOM)
          - U+200B ZERO WIDTH SPACE
          - U+200C ZERO WIDTH NON-JOINER
          - U+200D ZERO WIDTH JOINER
          - U+2060 WORD JOINER

        :param text: Input string possibly containing invisible chars.
        :return: String with those characters stripped.
        """
        return cls._INVISIBLE_PATTERN.sub("", text)

    def _do_clean(
        self,
        text: str,
        combined_rx: Pattern[str],
        repl_map: dict[str, str],
    ) -> str:
        """
        Core cleaning logic:
        optional invisible removal, single-pass remove/replace, trimming.

        :param text: Text to clean.
        :param combined_rx: Compiled regex for removal patterns and replacement keys.
        :param repl_map: Mapping from matched token to replacement text.
        :return: Cleaned text.
        """
        # Strip invisible chars if configured
        if self._remove_invisible:
            text = self._remove_bom_and_invisible(text)

        # Single-pass removal & replacement
        def _sub(match: Match[str]) -> str:
            token = match.group(0)
            # If token in repl_map -> replacement; else -> delete (empty string)
            return repl_map.get(token, "")

        text = combined_rx.sub(_sub, text)
        return text.strip()
|
173
|
+
|
174
|
+
|
175
|
+
def get_cleaner(
    enabled: bool,
    config: TextCleanerConfig,
) -> Cleaner:
    """
    Choose the cleaner implementation for the given settings.

    :param enabled: Whether text cleaning is switched on.
    :param config: Settings handed to ``TextCleaner`` when cleaning is enabled.
    :return: A ``TextCleaner`` built from `config` if `enabled` is true,
             otherwise a ``NullCleaner``.
    """
    if not enabled:
        return NullCleaner()
    return TextCleaner(config)
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
novel_downloader.utils.text_utils.truncate_utils
|
4
|
+
------------------------------------------------
|
5
|
+
|
6
|
+
Tools for truncating text.
|
7
|
+
"""
|
8
|
+
|
9
|
+
__all__ = [
|
10
|
+
"content_prefix",
|
11
|
+
"truncate_half_lines",
|
12
|
+
]
|
13
|
+
|
14
|
+
import math
|
15
|
+
|
16
|
+
|
17
|
+
def content_prefix(
|
18
|
+
text: str,
|
19
|
+
n: int,
|
20
|
+
ignore_chars: set[str] | None = None,
|
21
|
+
) -> str:
|
22
|
+
"""
|
23
|
+
Return the prefix of `text` containing the first `n` non-ignored characters.
|
24
|
+
|
25
|
+
:param text: The full input string.
|
26
|
+
:param n: Number of content characters to include.
|
27
|
+
:param ignore_chars: Characters to ignore when counting content.
|
28
|
+
:return: Truncated string preserving original whitespace and line breaks.
|
29
|
+
"""
|
30
|
+
ignore = ignore_chars or set()
|
31
|
+
cnt = 0
|
32
|
+
|
33
|
+
for i, ch in enumerate(text):
|
34
|
+
if ch not in ignore:
|
35
|
+
cnt += 1
|
36
|
+
if cnt >= n:
|
37
|
+
return text[: i + 1]
|
38
|
+
|
39
|
+
return text
|
40
|
+
|
41
|
+
|
42
|
+
def truncate_half_lines(text: str) -> str:
    """
    Keep the leading lines that hold the first half (rounded up) of the
    non-blank lines.

    Blank lines met on the way are kept but do not count toward the half.
    The kept lines are rejoined with a newline character.

    :param text: Full input text
    :return: Truncated text with first half of lines
    """
    rows = text.splitlines()
    target = math.ceil(sum(1 for row in rows if row.strip()) / 2)

    kept = []
    seen = 0
    for row in rows:
        kept.append(row)
        seen += bool(row.strip())
        # NOTE(review): when no line has content `target` is 0, so the loop
        # stops right after the first line - confirm this is the intended
        # handling of all-blank input.
        if seen >= target:
            break

    return "\n".join(kept)
|
@@ -12,11 +12,11 @@ Includes:
|
|
12
12
|
Sleeps for a random duration, useful for human-like delays or rate limiting.
|
13
13
|
"""
|
14
14
|
|
15
|
-
from .datetime_utils import calculate_time_difference
|
16
|
-
from .sleep_utils import async_sleep_with_random_delay, sleep_with_random_delay
|
17
|
-
|
18
15
|
__all__ = [
|
19
16
|
"calculate_time_difference",
|
20
17
|
"async_sleep_with_random_delay",
|
21
18
|
"sleep_with_random_delay",
|
22
19
|
]
|
20
|
+
|
21
|
+
from .datetime_utils import calculate_time_difference
|
22
|
+
from .sleep_utils import async_sleep_with_random_delay, sleep_with_random_delay
|
@@ -12,6 +12,10 @@ Includes:
|
|
12
12
|
Computes timedelta between two datetime strings, with optional timezones.
|
13
13
|
"""
|
14
14
|
|
15
|
+
__all__ = [
|
16
|
+
"calculate_time_difference",
|
17
|
+
]
|
18
|
+
|
15
19
|
import logging
|
16
20
|
import re
|
17
21
|
from datetime import UTC, datetime, timedelta, timezone
|
@@ -139,8 +143,3 @@ def calculate_time_difference(
|
|
139
143
|
except Exception as e:
|
140
144
|
logger.warning("[time] Failed to calculate time difference: %s", e)
|
141
145
|
return 999, 23, 59, 59
|
142
|
-
|
143
|
-
|
144
|
-
__all__ = [
|
145
|
-
"calculate_time_difference",
|
146
|
-
]
|
@@ -10,6 +10,8 @@ Includes:
|
|
10
10
|
optionally capped with a max_sleep limit.
|
11
11
|
"""
|
12
12
|
|
13
|
+
__all__ = ["sleep_with_random_delay", "async_sleep_with_random_delay"]
|
14
|
+
|
13
15
|
import asyncio
|
14
16
|
import logging
|
15
17
|
import random
|
@@ -100,6 +102,3 @@ async def async_sleep_with_random_delay(
|
|
100
102
|
|
101
103
|
logger.debug("[async time] Sleeping for %.2f seconds", duration)
|
102
104
|
await asyncio.sleep(duration)
|
103
|
-
|
104
|
-
|
105
|
-
__all__ = ["sleep_with_random_delay", "async_sleep_with_random_delay"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: novel-downloader
|
3
|
-
Version: 1.4.4
|
3
|
+
Version: 1.5.0
|
4
4
|
Summary: A command-line tool for downloading Chinese web novels from Qidian and similar platforms.
|
5
5
|
Author-email: Saudade Z <saudadez217@gmail.com>
|
6
6
|
License: MIT License
|
@@ -83,7 +83,7 @@ Dynamic: license-file
|
|
83
83
|
- EPUB (可选包含章节插图)
|
84
84
|
- 支持活动广告过滤:
|
85
85
|
- [x] 章节标题
|
86
|
-
- [
|
86
|
+
- [x] 章节正文
|
87
87
|
|
88
88
|
---
|
89
89
|
|