novel-downloader 1.4.5__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. novel_downloader/__init__.py +1 -1
  2. novel_downloader/cli/__init__.py +2 -2
  3. novel_downloader/cli/config.py +1 -83
  4. novel_downloader/cli/download.py +4 -5
  5. novel_downloader/cli/export.py +4 -1
  6. novel_downloader/cli/main.py +2 -0
  7. novel_downloader/cli/search.py +123 -0
  8. novel_downloader/config/__init__.py +3 -10
  9. novel_downloader/config/adapter.py +190 -54
  10. novel_downloader/config/loader.py +2 -3
  11. novel_downloader/core/__init__.py +13 -13
  12. novel_downloader/core/downloaders/__init__.py +10 -11
  13. novel_downloader/core/downloaders/base.py +152 -26
  14. novel_downloader/core/downloaders/biquge.py +5 -1
  15. novel_downloader/core/downloaders/common.py +157 -378
  16. novel_downloader/core/downloaders/esjzone.py +5 -1
  17. novel_downloader/core/downloaders/linovelib.py +5 -1
  18. novel_downloader/core/downloaders/qianbi.py +291 -4
  19. novel_downloader/core/downloaders/qidian.py +199 -285
  20. novel_downloader/core/downloaders/registry.py +67 -0
  21. novel_downloader/core/downloaders/sfacg.py +5 -1
  22. novel_downloader/core/downloaders/yamibo.py +5 -1
  23. novel_downloader/core/exporters/__init__.py +10 -11
  24. novel_downloader/core/exporters/base.py +87 -7
  25. novel_downloader/core/exporters/biquge.py +5 -8
  26. novel_downloader/core/exporters/common/__init__.py +2 -2
  27. novel_downloader/core/exporters/common/epub.py +82 -166
  28. novel_downloader/core/exporters/common/main_exporter.py +0 -60
  29. novel_downloader/core/exporters/common/txt.py +82 -83
  30. novel_downloader/core/exporters/epub_util.py +157 -1330
  31. novel_downloader/core/exporters/esjzone.py +5 -8
  32. novel_downloader/core/exporters/linovelib/__init__.py +2 -2
  33. novel_downloader/core/exporters/linovelib/epub.py +157 -212
  34. novel_downloader/core/exporters/linovelib/main_exporter.py +2 -59
  35. novel_downloader/core/exporters/linovelib/txt.py +67 -63
  36. novel_downloader/core/exporters/qianbi.py +5 -8
  37. novel_downloader/core/exporters/qidian.py +14 -4
  38. novel_downloader/core/exporters/registry.py +53 -0
  39. novel_downloader/core/exporters/sfacg.py +5 -8
  40. novel_downloader/core/exporters/txt_util.py +67 -0
  41. novel_downloader/core/exporters/yamibo.py +5 -8
  42. novel_downloader/core/fetchers/__init__.py +19 -24
  43. novel_downloader/core/fetchers/base/__init__.py +3 -3
  44. novel_downloader/core/fetchers/base/browser.py +23 -4
  45. novel_downloader/core/fetchers/base/session.py +30 -5
  46. novel_downloader/core/fetchers/biquge/__init__.py +3 -3
  47. novel_downloader/core/fetchers/biquge/browser.py +5 -0
  48. novel_downloader/core/fetchers/biquge/session.py +6 -1
  49. novel_downloader/core/fetchers/esjzone/__init__.py +3 -3
  50. novel_downloader/core/fetchers/esjzone/browser.py +5 -0
  51. novel_downloader/core/fetchers/esjzone/session.py +6 -1
  52. novel_downloader/core/fetchers/linovelib/__init__.py +3 -3
  53. novel_downloader/core/fetchers/linovelib/browser.py +6 -1
  54. novel_downloader/core/fetchers/linovelib/session.py +6 -1
  55. novel_downloader/core/fetchers/qianbi/__init__.py +3 -3
  56. novel_downloader/core/fetchers/qianbi/browser.py +5 -0
  57. novel_downloader/core/fetchers/qianbi/session.py +5 -0
  58. novel_downloader/core/fetchers/qidian/__init__.py +3 -3
  59. novel_downloader/core/fetchers/qidian/browser.py +12 -4
  60. novel_downloader/core/fetchers/qidian/session.py +11 -3
  61. novel_downloader/core/fetchers/registry.py +71 -0
  62. novel_downloader/core/fetchers/sfacg/__init__.py +3 -3
  63. novel_downloader/core/fetchers/sfacg/browser.py +5 -0
  64. novel_downloader/core/fetchers/sfacg/session.py +5 -0
  65. novel_downloader/core/fetchers/yamibo/__init__.py +3 -3
  66. novel_downloader/core/fetchers/yamibo/browser.py +5 -0
  67. novel_downloader/core/fetchers/yamibo/session.py +6 -1
  68. novel_downloader/core/interfaces/__init__.py +7 -5
  69. novel_downloader/core/interfaces/searcher.py +18 -0
  70. novel_downloader/core/parsers/__init__.py +10 -11
  71. novel_downloader/core/parsers/{biquge/main_parser.py → biquge.py} +7 -2
  72. novel_downloader/core/parsers/{esjzone/main_parser.py → esjzone.py} +7 -2
  73. novel_downloader/core/parsers/{linovelib/main_parser.py → linovelib.py} +7 -2
  74. novel_downloader/core/parsers/{qianbi/main_parser.py → qianbi.py} +7 -2
  75. novel_downloader/core/parsers/qidian/__init__.py +2 -2
  76. novel_downloader/core/parsers/qidian/chapter_encrypted.py +23 -21
  77. novel_downloader/core/parsers/qidian/chapter_normal.py +1 -1
  78. novel_downloader/core/parsers/qidian/main_parser.py +10 -21
  79. novel_downloader/core/parsers/qidian/utils/__init__.py +11 -11
  80. novel_downloader/core/parsers/qidian/utils/decryptor_fetcher.py +5 -6
  81. novel_downloader/core/parsers/qidian/utils/node_decryptor.py +2 -2
  82. novel_downloader/core/parsers/registry.py +68 -0
  83. novel_downloader/core/parsers/{sfacg/main_parser.py → sfacg.py} +7 -2
  84. novel_downloader/core/parsers/{yamibo/main_parser.py → yamibo.py} +7 -2
  85. novel_downloader/core/searchers/__init__.py +20 -0
  86. novel_downloader/core/searchers/base.py +92 -0
  87. novel_downloader/core/searchers/biquge.py +83 -0
  88. novel_downloader/core/searchers/esjzone.py +84 -0
  89. novel_downloader/core/searchers/qianbi.py +131 -0
  90. novel_downloader/core/searchers/qidian.py +87 -0
  91. novel_downloader/core/searchers/registry.py +63 -0
  92. novel_downloader/locales/en.json +12 -4
  93. novel_downloader/locales/zh.json +12 -4
  94. novel_downloader/models/__init__.py +4 -30
  95. novel_downloader/models/config.py +12 -6
  96. novel_downloader/models/search.py +16 -0
  97. novel_downloader/models/types.py +0 -2
  98. novel_downloader/resources/config/settings.toml +31 -4
  99. novel_downloader/resources/css_styles/intro.css +83 -0
  100. novel_downloader/resources/css_styles/main.css +30 -89
  101. novel_downloader/utils/__init__.py +52 -0
  102. novel_downloader/utils/chapter_storage.py +244 -224
  103. novel_downloader/utils/constants.py +1 -21
  104. novel_downloader/utils/epub/__init__.py +34 -0
  105. novel_downloader/utils/epub/builder.py +377 -0
  106. novel_downloader/utils/epub/constants.py +77 -0
  107. novel_downloader/utils/epub/documents.py +403 -0
  108. novel_downloader/utils/epub/models.py +134 -0
  109. novel_downloader/utils/epub/utils.py +212 -0
  110. novel_downloader/utils/file_utils/__init__.py +10 -14
  111. novel_downloader/utils/file_utils/io.py +20 -51
  112. novel_downloader/utils/file_utils/normalize.py +2 -2
  113. novel_downloader/utils/file_utils/sanitize.py +2 -3
  114. novel_downloader/utils/fontocr/__init__.py +5 -5
  115. novel_downloader/utils/{hash_store.py → fontocr/hash_store.py} +4 -3
  116. novel_downloader/utils/{hash_utils.py → fontocr/hash_utils.py} +2 -2
  117. novel_downloader/utils/fontocr/ocr_v1.py +13 -1
  118. novel_downloader/utils/fontocr/ocr_v2.py +13 -1
  119. novel_downloader/utils/fontocr/ocr_v3.py +744 -0
  120. novel_downloader/utils/i18n.py +2 -0
  121. novel_downloader/utils/logger.py +2 -0
  122. novel_downloader/utils/network.py +110 -251
  123. novel_downloader/utils/state.py +1 -0
  124. novel_downloader/utils/text_utils/__init__.py +18 -17
  125. novel_downloader/utils/text_utils/diff_display.py +4 -5
  126. novel_downloader/utils/text_utils/numeric_conversion.py +253 -0
  127. novel_downloader/utils/text_utils/text_cleaner.py +179 -0
  128. novel_downloader/utils/text_utils/truncate_utils.py +62 -0
  129. novel_downloader/utils/time_utils/__init__.py +3 -3
  130. novel_downloader/utils/time_utils/datetime_utils.py +4 -5
  131. novel_downloader/utils/time_utils/sleep_utils.py +2 -3
  132. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/METADATA +2 -2
  133. novel_downloader-1.5.0.dist-info/RECORD +164 -0
  134. novel_downloader/config/site_rules.py +0 -94
  135. novel_downloader/core/factory/__init__.py +0 -20
  136. novel_downloader/core/factory/downloader.py +0 -73
  137. novel_downloader/core/factory/exporter.py +0 -58
  138. novel_downloader/core/factory/fetcher.py +0 -96
  139. novel_downloader/core/factory/parser.py +0 -86
  140. novel_downloader/core/fetchers/common/__init__.py +0 -14
  141. novel_downloader/core/fetchers/common/browser.py +0 -79
  142. novel_downloader/core/fetchers/common/session.py +0 -79
  143. novel_downloader/core/parsers/biquge/__init__.py +0 -10
  144. novel_downloader/core/parsers/common/__init__.py +0 -13
  145. novel_downloader/core/parsers/common/helper.py +0 -323
  146. novel_downloader/core/parsers/common/main_parser.py +0 -106
  147. novel_downloader/core/parsers/esjzone/__init__.py +0 -10
  148. novel_downloader/core/parsers/linovelib/__init__.py +0 -10
  149. novel_downloader/core/parsers/qianbi/__init__.py +0 -10
  150. novel_downloader/core/parsers/sfacg/__init__.py +0 -10
  151. novel_downloader/core/parsers/yamibo/__init__.py +0 -10
  152. novel_downloader/models/browser.py +0 -21
  153. novel_downloader/models/site_rules.py +0 -99
  154. novel_downloader/models/tasks.py +0 -33
  155. novel_downloader/resources/css_styles/volume-intro.css +0 -56
  156. novel_downloader/resources/json/replace_word_map.json +0 -4
  157. novel_downloader/resources/text/blacklist.txt +0 -22
  158. novel_downloader/utils/text_utils/chapter_formatting.py +0 -46
  159. novel_downloader/utils/text_utils/font_mapping.py +0 -28
  160. novel_downloader/utils/text_utils/text_cleaning.py +0 -107
  161. novel_downloader-1.4.5.dist-info/RECORD +0 -165
  162. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/WHEEL +0 -0
  163. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/entry_points.txt +0 -0
  164. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/licenses/LICENSE +0 -0
  165. {novel_downloader-1.4.5.dist-info → novel_downloader-1.5.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.utils.text_utils.numeric_conversion
4
+ ----------------------------------------------------
5
+
6
+ Utility functions to convert between Chinese numeral strings
7
+ and Python integers.
8
+ """
9
+
10
# Digit characters mapped to their value 0-9.  Each string groups every
# accepted spelling (everyday, alternative and financial forms) of the digit
# whose value equals the string's position in the tuple.
CHINESE_NUMERALS = {
    char: value
    for value, variants in enumerate(
        (
            "零〇",
            "一壹",
            "二两贰貮",
            "三叁",
            "四肆",
            "五伍",
            "六陆",
            "七柒",
            "八捌",
            "九玖",
        )
    )
    for char in variants
}

# Multiplier characters mapped to the power of ten they stand for; characters
# grouped in one string are alternative spellings of the same unit.
CHINESE_UNITS = {
    char: multiplier
    for variants, multiplier in (
        ("十拾", 10),
        ("百佰", 100),
        ("千仟", 1000),
        ("万萬", 10_000),
        ("亿億", 100_000_000),
        ("兆", 10**12),
        ("京", 10**16),
        ("垓", 10**20),
    )
    for char in variants
}

# Section separators (units of 10**4 and above) ordered from largest to
# smallest, paired with their multiplier.
LARGE_UNITS = [(char, CHINESE_UNITS[char]) for char in "垓京兆亿億万萬"]
60
+
61
+
62
def chinese_to_arabic(s: str) -> int:
    """
    Convert a Chinese numeral string into its integer value.

    The string is split on the large units listed in ``LARGE_UNITS``
    (largest first); the text in front of each unit is parsed as a section
    and scaled by that unit, and whatever remains is parsed as the final
    section.

    Examples:
    ---
    >>> chinese_to_arabic("一千二百三十四")
    1234
    >>> chinese_to_arabic("负一千二百三十四")
    -1234
    >>> chinese_to_arabic("一万零三")
    10003
    >>> chinese_to_arabic("三亿二千五百")
    300002500
    >>> chinese_to_arabic("十万亿")
    10000000000000

    :param s: A string of Chinese numerals, e.g. "三千零二十一", "五亿零七万".
              A leading "负" or "-" marks a negative number.
    :return: The integer value represented by the input string.
    :raises ValueError: If `s` is empty or consists of a sign only.
    :raises KeyError: If `s` contains characters not found in the supported
                      numeral or unit mappings.
    """
    if not s:
        raise ValueError("Input string is empty")

    sign = 1
    if s[0] in ("负", "-"):
        sign = -1
        s = s[1:]
        if not s:
            # A bare sign carries no value; do not silently report 0.
            raise ValueError("Input string contains no numerals")

    def _parse_section(sec: str) -> int:
        """Parse one section (the text between two large-unit separators)."""
        num = 0  # digits read since the last unit character
        section_total = 0
        for ch in sec:
            digit = CHINESE_NUMERALS.get(ch)
            if digit is not None:
                # Consecutive digits are read positionally ("二〇" -> 20).
                num = num * 10 + digit
                continue

            unit = CHINESE_UNITS[ch]  # KeyError for unsupported characters
            if unit >= 10_000:
                # A large unit nested inside a section (the 万 of "十万亿")
                # scales everything accumulated so far, not just the last
                # digit run.
                section_total = ((section_total + num) or 1) * unit
            else:
                # A unit without a preceding digit counts once ("十" -> 10).
                section_total += (num or 1) * unit
            num = 0
        return section_total + num

    total = 0
    rest = s
    for char, val in LARGE_UNITS:
        if char in rest:
            left, rest = rest.split(char, 1)
            total += _parse_section(left) * val

    total += _parse_section(rest)

    return sign * total
113
+
114
+
115
def arabic_to_chinese(num: int) -> str:
    """
    Convert an integer to its Chinese numeral representation.

    The number is processed in groups of four decimal digits (sections).
    Each non-zero section is rendered with 千/百/十 and followed by its large
    unit (万, 亿, 兆, 京, 垓).  A single "零" is inserted wherever digits are
    skipped between two sections, i.e. when a lower section is below 1000
    or when whole sections in between are zero.

    Examples:
    ---
    >>> arabic_to_chinese(0)
    '零'
    >>> arabic_to_chinese(1234)
    '一千二百三十四'
    >>> arabic_to_chinese(10003)
    '一万零三'
    >>> arabic_to_chinese(-205)
    '负二百零五'
    >>> arabic_to_chinese(3000002500)
    '三十亿零二千五百'

    :param num: The integer to convert (e.g. 42, -1300).
    :return: The Chinese-numeral string for `num`.
    :raises TypeError: If `num` is not an integer.
    :raises IndexError: If ``abs(num)`` is 10**24 or larger, because no large
                        unit beyond 垓 is available.
    """
    if not isinstance(num, int):
        raise TypeError("Input must be an integer.")
    if num == 0:
        return "零"

    digits = "零一二三四五六七八九"
    small_units = ["", "十", "百", "千"]
    big_units = ["", "万", "亿", "兆", "京", "垓"]

    negative = num < 0
    num = abs(num)

    def _section_to_chinese(sec: int) -> str:
        """
        Convert a value 1..9999 into Chinese using 千/百/十 units,
        without any large unit (万, 亿, ...) or leading '零'.
        """
        pieces: list[str] = []  # collected least-significant digit first
        zero_flag = True  # True while trailing zeros / a zero run is pending
        for unit in small_units:
            if sec == 0:
                break
            sec, d = divmod(sec, 10)
            if d:
                pieces.append(digits[d] + unit)
                zero_flag = False
            elif not zero_flag:
                # only emit one '零' for consecutive zeros
                pieces.append(digits[0])
                zero_flag = True
        return "".join(reversed(pieces))

    groups: list[str] = []  # rendered sections, least-significant first
    section_pos = 0

    while num > 0:
        num, section = divmod(num, 10_000)
        if section:
            text = _section_to_chinese(section) + big_units[section_pos]
            # More digits follow on the left and this section does not fill
            # its thousands place: mark the gap ("一万零三", not "一万三").
            if num > 0 and section < 1000:
                text = "零" + text
            groups.append(text)
        elif groups and not groups[-1].startswith("零"):
            # An all-zero section between non-zero ones needs one '零'
            # separator, unless the text to its right already starts with it.
            groups.append("零")
        section_pos += 1

    result = "".join(reversed(groups))
    return "负" + result if negative else result
191
+
192
+
193
if __name__ == "__main__":
    # Ad-hoc self-test: fixed parsing cases, an exhaustive round-trip over
    # 0..9999 and seeded random round-trips at larger magnitudes.
    import random

    RED, GREEN, RESET = "\033[91m", "\033[92m", "\033[0m"
    random.seed(42)

    # --- 1. fixed (text, expected value) pairs -------------------------------
    num_list = [
        ("一千二百三十四", 1234),
        ("一万五千", 15000),
        ("一万零三", 10003),
        ("三亿二千五百", 300002500),
    ]
    print("=== chinese_to_arabic() with fixed cases ===")
    fail_count = 0
    for text, expected in num_list:
        actual = chinese_to_arabic(text)
        if actual == expected:
            continue
        print(f"{RED}FAIL:{RESET} “{text}” -> expected {expected}, got {actual}")
        fail_count += 1

    if not fail_count:
        print(f"{GREEN}All {len(num_list)} chinese_to_arabic() tests passed!{RESET}\n")
    else:
        print(f"{RED}{fail_count} chinese_to_arabic() tests failed.{RESET}\n")

    # --- 2. exhaustive round-trip, stopping at the first mismatch ------------
    print("=== Round-trip test for values 0 - 9999 ===")
    fail_count = 0
    for value in range(10_000):
        rendered = arabic_to_chinese(value)
        parsed = chinese_to_arabic(rendered)
        if parsed != value:
            print(f'{RED}FAIL round-trip:{RESET} {value} -> "{rendered}" -> {parsed}')
            fail_count += 1
            break

    if not fail_count:
        print(f"{GREEN}0 - 9999 round-trip all passed!{RESET}\n")
    else:
        print(f"{RED}{fail_count} round-trip failures in 0 - 9999.{RESET}\n")

    # --- 3. two random samples per decade from 10^5 up, both signs -----------
    print("=== Random round-trip at larger scales ===")
    fail_count = 0
    for exp in range(5, 22):
        lower, upper = 10**exp, 10 ** (exp + 1)
        for _ in range(2):
            sample = random.randint(lower, upper - 1)
            for value in (sample, -sample):
                rendered = arabic_to_chinese(value)
                parsed = chinese_to_arabic(rendered)
                if parsed != value:
                    print(f'{RED}FAIL:{RESET} {value} -> "{rendered}" -> {parsed}')
                    fail_count += 1

    if not fail_count:
        print(f"{GREEN}All random large-scale round-trips passed!{RESET}")
    else:
        print(f"{RED}{fail_count} random large-scale failures.{RESET}")
@@ -0,0 +1,179 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.utils.text_utils.text_cleaner
4
+ ----------------------------------------------
5
+
6
+ Provides utilities to clean novel titles and content
7
+ by removing unwanted patterns, replacing strings.
8
+ """
9
+
10
+ import re
11
+ from re import Match, Pattern
12
+ from typing import Protocol, runtime_checkable
13
+
14
+ from novel_downloader.models import TextCleanerConfig
15
+
16
+
17
@runtime_checkable
class Cleaner(Protocol):
    """
    Structural interface for text cleaners.

    Any object providing these three methods satisfies the protocol; because
    it is runtime-checkable, ``isinstance(obj, Cleaner)`` tests for the
    presence of the methods.
    """

    def clean_title(self, text: str) -> str:
        """Return `text` cleaned according to the title rules."""
        ...

    def clean_content(self, text: str) -> str:
        """Return `text` cleaned according to the content rules."""
        ...

    def clean(self, text: str, *, as_title: bool = False) -> str:
        """Return `text` cleaned as a title if `as_title`, else as content."""
        ...
27
+
28
+
29
class NullCleaner(Cleaner):
    """No-op cleaner: every method hands the input back unchanged."""

    def clean(self, text: str, *, as_title: bool = False) -> str:
        """Return `text` as is; `as_title` is accepted but has no effect."""
        return text

    def clean_title(self, text: str) -> str:
        """Return the title `text` as is."""
        return text

    def clean_content(self, text: str) -> str:
        """Return the content `text` as is."""
        return text
38
+
39
+
40
class TextCleaner(Cleaner):
    """
    TextCleaner removes invisible characters, strips unwanted patterns,
    and applies literal replacements in a single pass using a combined regex.

    When there is nothing to remove or replace, a pattern that can never
    match is compiled instead; for background see:

    https://stackoverflow.com/questions/2930182/regex-to-not-match-anything
    """

    _INVISIBLE_PATTERN: Pattern[str] = re.compile(r"[\ufeff\u200B\u200C\u200D\u2060]")

    # Negative lookahead on the empty pattern: fails at every position, even
    # on empty strings and (in MULTILINE mode) on blank lines.
    _NEVER_MATCH = r"(?!)"

    def __init__(self, config: TextCleanerConfig) -> None:
        """
        Initialize TextCleaner with the given configuration.

        :param config: TextCleanerConfig instance containing:

            - remove_invisible: whether to strip BOM/zero-width chars
            - title_remove_patterns: list of regex patterns to delete from titles
            - content_remove_patterns: list of regex patterns to delete from content
            - title_replacements: dict of literal replacements for titles
            - content_replacements: dict of literal replacements for content
        """
        self._remove_invisible = config.remove_invisible

        # Literal-to-literal replacement maps (looked up by matched text)
        self._title_repl_map = config.title_replacements
        self._content_repl_map = config.content_replacements

        self._title_combined_rx: Pattern[str] = self._build_combined_rx(
            config.title_remove_patterns, self._title_repl_map
        )
        # Content is matched in multiline mode so ^/$ anchor at line breaks.
        self._content_combined_rx: Pattern[str] = self._build_combined_rx(
            config.content_remove_patterns,
            self._content_repl_map,
            flags=re.MULTILINE,
        )

    @classmethod
    def _build_combined_rx(
        cls,
        remove_patterns: list[str],
        repl_map: dict[str, str],
        flags: int = 0,
    ) -> Pattern[str]:
        """
        Compile one alternation of all removal patterns and all escaped
        replacement keys.

        :param remove_patterns: Regex patterns whose matches are deleted.
        :param repl_map: Literal keys whose occurrences are replaced.
        :param flags: Flags passed to ``re.compile``.
        :return: The compiled combined regex, or a never-matching regex when
                 there is nothing to remove or replace.
        """
        # Deduplicate removal patterns (keep order).  Empty patterns/keys are
        # dropped: an empty alternative would match at every position.
        parts = [pat for pat in dict.fromkeys(remove_patterns) if pat]
        parts.extend(re.escape(key) for key in repl_map if key)
        # longer first to avoid prefix collisions
        parts.sort(key=len, reverse=True)
        return re.compile("|".join(parts) if parts else cls._NEVER_MATCH, flags)

    def clean_title(self, text: str) -> str:
        """
        Clean a title string.

        Steps:
          1. Optionally strip BOM & zero-width characters.
          2. Remove unwanted patterns and apply literal replacements in one pass.
          3. Trim leading/trailing whitespace.

        :param text: Raw title text.
        :return: Cleaned title.
        """
        return self._do_clean(text, self._title_combined_rx, self._title_repl_map)

    def clean_content(self, text: str) -> str:
        """
        Clean a content string.

        Steps:
          1. Optionally strip BOM & zero-width characters.
          2. Remove unwanted patterns and apply literal replacements in one pass.
          3. Trim leading/trailing whitespace.

        :param text: Raw content/body text.
        :return: Cleaned content.
        """
        return self._do_clean(text, self._content_combined_rx, self._content_repl_map)

    def clean(self, text: str, *, as_title: bool = False) -> str:
        """
        Unified clean method to process text as either title or content.

        :param text: Raw text to clean.
        :param as_title: If True, use title rules; otherwise content rules.
        :return: Cleaned text.
        """
        return self.clean_title(text) if as_title else self.clean_content(text)

    @classmethod
    def _remove_bom_and_invisible(cls, text: str) -> str:
        """
        Remove BOM and zero-width/invisible characters from the text.

        Matches:
          - U+FEFF (BOM)
          - U+200B ZERO WIDTH SPACE
          - U+200C ZERO WIDTH NON-JOINER
          - U+200D ZERO WIDTH JOINER
          - U+2060 WORD JOINER

        :param text: Input string possibly containing invisible chars.
        :return: String with those characters stripped.
        """
        return cls._INVISIBLE_PATTERN.sub("", text)

    def _do_clean(
        self,
        text: str,
        combined_rx: Pattern[str],
        repl_map: dict[str, str],
    ) -> str:
        """
        Core cleaning logic:
        optional invisible removal, single-pass remove/replace, trimming.

        :param text: Text to clean.
        :param combined_rx: Compiled regex for removal patterns and replacement keys.
        :param repl_map: Mapping from matched token to replacement text.
        :return: Cleaned text.
        """
        # Strip invisible chars if configured
        if self._remove_invisible:
            text = self._remove_bom_and_invisible(text)

        # Single-pass removal & replacement
        def _sub(match: Match[str]) -> str:
            token = match.group(0)
            # A zero-width match (possible with patterns such as "x*") must
            # never pick up a replacement registered under the empty key.
            if not token:
                return ""
            # If token in repl_map -> replacement; else -> delete (empty string)
            return repl_map.get(token, "")

        text = combined_rx.sub(_sub, text)
        return text.strip()
173
+
174
+
175
def get_cleaner(
    enabled: bool,
    config: TextCleanerConfig,
) -> Cleaner:
    """
    Select the cleaner implementation.

    :param enabled: Whether cleaning is switched on.
    :param config: Configuration passed to TextCleaner when `enabled` is true.
    :return: A TextCleaner built from `config` if `enabled`, otherwise a
             NullCleaner.
    """
    if not enabled:
        return NullCleaner()
    return TextCleaner(config)
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ novel_downloader.utils.text_utils.truncate_utils
4
+ ------------------------------------------------
5
+
6
+ Tools for truncating text.
7
+ """
8
+
9
+ __all__ = [
10
+ "content_prefix",
11
+ "truncate_half_lines",
12
+ ]
13
+
14
+ import math
15
+
16
+
17
+ def content_prefix(
18
+ text: str,
19
+ n: int,
20
+ ignore_chars: set[str] | None = None,
21
+ ) -> str:
22
+ """
23
+ Return the prefix of `text` containing the first `n` non-ignored characters.
24
+
25
+ :param text: The full input string.
26
+ :param n: Number of content characters to include.
27
+ :param ignore_chars: Characters to ignore when counting content.
28
+ :return: Truncated string preserving original whitespace and line breaks.
29
+ """
30
+ ignore = ignore_chars or set()
31
+ cnt = 0
32
+
33
+ for i, ch in enumerate(text):
34
+ if ch not in ignore:
35
+ cnt += 1
36
+ if cnt >= n:
37
+ return text[: i + 1]
38
+
39
+ return text
40
+
41
+
42
def truncate_half_lines(text: str) -> str:
    """
    Keep the leading lines that contain the first half (rounded up) of the
    non-blank lines.

    Blank lines located before the cut-off are kept but do not count toward
    the half.  Lines are split with ``str.splitlines`` and re-joined with
    "\\n", so other line terminators are normalised to "\\n".

    :param text: Full input text
    :return: Truncated text ending with the ceil(k / 2)-th of the k non-blank
             lines; the empty string if `text` has no non-blank line.
    """
    lines = text.splitlines()
    keep_count = math.ceil(sum(1 for line in lines if line.strip()) / 2)
    if keep_count == 0:
        # Empty or whitespace-only input: there is no content to keep.
        return ""

    count = 0
    for idx, line in enumerate(lines):
        if not line.strip():
            continue
        count += 1
        if count == keep_count:
            return "\n".join(lines[: idx + 1])

    # Not reached: keep_count never exceeds the number of non-blank lines.
    return "\n".join(lines)
@@ -12,11 +12,11 @@ Includes:
12
12
  Sleeps for a random duration, useful for human-like delays or rate limiting.
13
13
  """
14
14
 
15
- from .datetime_utils import calculate_time_difference
16
- from .sleep_utils import async_sleep_with_random_delay, sleep_with_random_delay
17
-
18
15
  __all__ = [
19
16
  "calculate_time_difference",
20
17
  "async_sleep_with_random_delay",
21
18
  "sleep_with_random_delay",
22
19
  ]
20
+
21
+ from .datetime_utils import calculate_time_difference
22
+ from .sleep_utils import async_sleep_with_random_delay, sleep_with_random_delay
@@ -12,6 +12,10 @@ Includes:
12
12
  Computes timedelta between two datetime strings, with optional timezones.
13
13
  """
14
14
 
15
+ __all__ = [
16
+ "calculate_time_difference",
17
+ ]
18
+
15
19
  import logging
16
20
  import re
17
21
  from datetime import UTC, datetime, timedelta, timezone
@@ -139,8 +143,3 @@ def calculate_time_difference(
139
143
  except Exception as e:
140
144
  logger.warning("[time] Failed to calculate time difference: %s", e)
141
145
  return 999, 23, 59, 59
142
-
143
-
144
- __all__ = [
145
- "calculate_time_difference",
146
- ]
@@ -10,6 +10,8 @@ Includes:
10
10
  optionally capped with a max_sleep limit.
11
11
  """
12
12
 
13
+ __all__ = ["sleep_with_random_delay", "async_sleep_with_random_delay"]
14
+
13
15
  import asyncio
14
16
  import logging
15
17
  import random
@@ -100,6 +102,3 @@ async def async_sleep_with_random_delay(
100
102
 
101
103
  logger.debug("[async time] Sleeping for %.2f seconds", duration)
102
104
  await asyncio.sleep(duration)
103
-
104
-
105
- __all__ = ["sleep_with_random_delay", "async_sleep_with_random_delay"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: novel-downloader
3
- Version: 1.4.5
3
+ Version: 1.5.0
4
4
  Summary: A command-line tool for downloading Chinese web novels from Qidian and similar platforms.
5
5
  Author-email: Saudade Z <saudadez217@gmail.com>
6
6
  License: MIT License
@@ -83,7 +83,7 @@ Dynamic: license-file
83
83
  - EPUB (可选包含章节插图)
84
84
  - 支持活动广告过滤:
85
85
  - [x] 章节标题
86
- - [ ] 章节正文
86
+ - [x] 章节正文
87
87
 
88
88
  ---
89
89