kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in their public registry.
- kreuzberg/__init__.py +2 -0
- kreuzberg/_config.py +8 -9
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_html.py +1 -1
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +4 -4
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mcp/server.py +1 -1
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +4 -9
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +15 -25
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +35 -3
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +25 -9
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +4 -3
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.16.0.dist-info/RECORD +0 -61
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
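
The headline change is the new `kreuzberg._token_reduction` package: a reducer module, a stopwords manager, and bundled stopword lists for 64 languages. Three of the new files are expanded below (`_reducer.py`, `_stopwords.py`, and the Afrikaans stopword list as a representative data file). As a minimal end-to-end sketch, assuming `TokenReductionConfig` is re-exported at the top level (plausible given the two-line `__init__.py` change) and that `ExtractionConfig` wires it in under a `token_reduction` field, which this summary does not show; the `mode`, `preserve_markdown`, `language_hint`, and `custom_stopwords` fields are taken from the `_reducer.py` hunk below:

```python
import asyncio

from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file


async def main() -> None:
    # mode="off" disables reduction, "light" cleans up formatting only,
    # "moderate" additionally removes stopwords (per _reducer.py below).
    config = ExtractionConfig(
        token_reduction=TokenReductionConfig(  # field name on ExtractionConfig is an assumption
            mode="moderate",
            preserve_markdown=True,  # keep headings, lists, tables, and code fences intact
            language_hint="de",  # fallback language when none is supplied
            custom_stopwords={"de": ["bzw", "usw"]},  # extra per-language stopwords
        )
    )
    result = await extract_file("report.pdf", config=config)
    print(result.content)


asyncio.run(main())
```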
kreuzberg/_token_reduction/_reducer.py

```diff
@@ -0,0 +1,439 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, TypedDict
+
+from kreuzberg._token_reduction._stopwords import get_default_stopwords_manager
+from kreuzberg.exceptions import ValidationError
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from kreuzberg._types import TokenReductionConfig
+
+
+class ReductionStats(TypedDict):
+    """Statistics about token reduction operation."""
+
+    character_reduction_ratio: float
+    token_reduction_ratio: float
+    original_characters: int
+    reduced_characters: int
+    original_tokens: int
+    reduced_tokens: int
+
+
+HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+
+PUNCTUATION_CLEANUP_PATTERN = re.compile(
+    r"([!?.])(?:\1)+"
+    r"|(,)(?:,)+"
+    r"|[!?]+\.+[!?]*|[?!]{3,}"
+)
+
+WHITESPACE_CLEANUP_PATTERN = re.compile(r"\n{3,}|[ \t]+")
+
+MARKDOWN_LIST_PATTERNS = (
+    re.compile(r"^\s*[-*+]\s"),
+    re.compile(r"^\s*\d+\.\s"),
+)
+
+WORD_CLEAN_PATTERN = re.compile(r"[^\w]", re.UNICODE)
+LANGUAGE_CODE_PATTERN = re.compile(r"^[a-zA-Z0-9-]+$")
+
+WORD_SPLIT_PATTERN = re.compile(r"\S+")
+WORD_BOUNDARY_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.UNICODE)
+
+STREAMING_THRESHOLD = 1_000_000
+
+
+def _normalize_unicode(text: str) -> str:
+    """Normalize Unicode text to NFC form for consistent processing."""
+    return unicodedata.normalize("NFC", text)
+
+
+def _normalize_newlines(text: str) -> str:
+    """Remove excessive newlines, keeping at most double newlines."""
+    return WHITESPACE_CLEANUP_PATTERN.sub(lambda m: "\n\n" if m.group().startswith("\n") else " ", text)
+
+
+def _process_text_streaming(
+    text: str, processor_func: Callable[..., str], chunk_size: int = 100_000, **kwargs: Any
+) -> str:
+    """Process large text in chunks to optimize memory usage."""
+    if len(text) <= chunk_size:
+        return processor_func(text, **kwargs)
+
+    chunks = []
+    start = 0
+
+    while start < len(text):
+        end = min(start + chunk_size, len(text))
+
+        if end < len(text):
+            search_start = max(start, end - 1000)
+            for i in range(end - 1, search_start - 1, -1):
+                if text[i] in ".!?\n":
+                    end = i + 1
+                    break
+
+        chunk = text[start:end]
+        processed_chunk = processor_func(chunk, **kwargs)
+        chunks.append(processed_chunk)
+        start = end
+
+    return " ".join(chunks).strip()
+
+
+def _is_markdown_structural_line(line: str, in_code_block: bool) -> bool:
+    """Check if a line contains markdown structural elements that should be preserved."""
+    if in_code_block:
+        return True
+
+    stripped = line.strip()
+
+    if stripped.startswith("#"):
+        return True
+
+    if "|" in line:
+        pipe_count = line.count("|")
+        if pipe_count >= 2 and (line.strip().startswith("|") or line.strip().endswith("|") or " | " in line):
+            return True
+
+    return MARKDOWN_LIST_PATTERNS[0].match(line) is not None or MARKDOWN_LIST_PATTERNS[1].match(line) is not None
+
+
+@lru_cache(maxsize=64)
+def _get_stopwords_with_custom(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+    """Get stopwords for a language, optionally with custom additions."""
+    manager = get_default_stopwords_manager()
+    base_stopwords = manager.get_stopwords(language)
+
+    if custom_words_tuple:
+        return base_stopwords | set(custom_words_tuple)
+    return base_stopwords
+
+
+@lru_cache(maxsize=64)
+def _get_lowercase_stopwords(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+    """Get pre-lowercased stopwords for faster comparison."""
+    stopwords = _get_stopwords_with_custom(language, custom_words_tuple)
+    return {sw.lower() for sw in stopwords}
+
+
+def reduce_tokens(
+    text: str,
+    *,
+    config: TokenReductionConfig,
+    language: str | None = None,
+) -> str:
+    """Reduce tokens in text based on the specified configuration.
+
+    Args:
+        text: The text to reduce.
+        config: Configuration for token reduction.
+        language: Optional language code for stopword selection.
+
+    Returns:
+        The reduced text.
+
+    Raises:
+        ValidationError: If inputs are invalid.
+    """
+    if config is None:
+        raise ValidationError("Config cannot be None")
+
+    if text is None:
+        raise ValidationError("Text cannot be None")
+
+    if not isinstance(text, str):
+        raise ValidationError(f"Text must be a string, got {type(text).__name__}")
+
+    if language is not None and not isinstance(language, str):
+        raise ValidationError(f"Language must be a string or None, got {type(language).__name__}")
+
+    if language is not None and len(language.strip()) == 0:
+        raise ValidationError("Language cannot be empty or whitespace-only")
+
+    if config.mode == "off":
+        return text
+
+    use_streaming = len(text) > STREAMING_THRESHOLD
+
+    if language and not LANGUAGE_CODE_PATTERN.match(language):
+        raise ValidationError(f"Invalid language code format: {language}")
+
+    if not text or not text.strip():
+        return ""
+
+    text = _normalize_unicode(text)
+
+    if config.mode == "light":
+        return _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+    if config.mode == "moderate":
+        return _apply_moderate_reduction(
+            text,
+            config=config,
+            language=language,
+            use_streaming=use_streaming,
+        )
+
+    return text
+
+
+def _apply_light_reduction(text: str, *, preserve_markdown: bool, use_streaming: bool = False) -> str:
+    """Apply light reduction (formatting only)."""
+    if use_streaming:
+        if preserve_markdown:
+            return str(_process_text_streaming(text, _apply_light_reduction_markdown_aware))
+        return str(_process_text_streaming(text, _apply_light_reduction_plain))
+
+    if preserve_markdown:
+        return _apply_light_reduction_markdown_aware(text)
+    return _apply_light_reduction_plain(text)
+
+
+def _apply_light_reduction_plain(text: str) -> str:
+    """Apply light reduction to plain text."""
+    text = HTML_COMMENT_PATTERN.sub("", text)
+
+    def punctuation_replacer(match: re.Match[str]) -> str:
+        if match.group(1):
+            return match.group(1)
+        if match.group(2):
+            return ","
+        return "?"
+
+    text = PUNCTUATION_CLEANUP_PATTERN.sub(punctuation_replacer, text)
+
+    def whitespace_replacer(match: re.Match[str]) -> str:
+        if match.group().startswith("\n"):
+            return "\n\n"
+        return " "
+
+    text = WHITESPACE_CLEANUP_PATTERN.sub(whitespace_replacer, text)
+
+    return text.strip()
+
+
+def _apply_light_reduction_markdown_aware(text: str) -> str:
+    """Apply light reduction preserving markdown structure."""
+    lines = text.split("\n")
+    processed_lines = []
+    in_code_block = False
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            processed_lines.append(line)
+            continue
+
+        if _is_markdown_structural_line(line, in_code_block) or in_code_block:
+            processed_lines.append(line)
+            continue
+
+        if line.strip():
+            reduced = _apply_light_reduction_plain(line)
+            processed_lines.append(reduced)
+        else:
+            processed_lines.append(line)
+
+    result = "\n".join(processed_lines)
+
+    lines = result.split("\n")
+    normalized_lines = []
+    in_code_block = False
+    consecutive_empty = 0
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            normalized_lines.append(line)
+            consecutive_empty = 0
+            continue
+
+        if in_code_block:
+            normalized_lines.append(line)
+            consecutive_empty = 0
+        elif not line.strip():
+            consecutive_empty += 1
+            if consecutive_empty <= 2:
+                normalized_lines.append(line)
+        else:
+            normalized_lines.append(line)
+            consecutive_empty = 0
+
+    return "\n".join(normalized_lines).strip()
+
+
+def _apply_moderate_reduction(
+    text: str,
+    *,
+    config: TokenReductionConfig,
+    language: str | None = None,
+    use_streaming: bool = False,
+) -> str:
+    """Apply moderate reduction (formatting + stopwords)."""
+    text = _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+    lang = language or config.language_hint or "en"
+
+    manager = get_default_stopwords_manager()
+    if not manager.has_language(lang):
+        lang = "en"
+        if not manager.has_language("en"):
+            return text
+
+    custom_words_tuple = None
+    if config.custom_stopwords and lang in config.custom_stopwords:
+        custom_words_tuple = tuple(sorted(config.custom_stopwords[lang]))
+
+    if use_streaming:
+        if config.preserve_markdown:
+            return str(
+                _process_text_streaming(
+                    text,
+                    _apply_stopword_reduction_markdown_aware,
+                    stopwords=_get_lowercase_stopwords(lang, custom_words_tuple),
+                )
+            )
+        return str(
+            _process_text_streaming(
+                text, _apply_stopword_reduction_plain, stopwords=_get_lowercase_stopwords(lang, custom_words_tuple)
+            )
+        )
+
+    stopwords = _get_lowercase_stopwords(lang, custom_words_tuple)
+
+    if config.preserve_markdown:
+        return _apply_stopword_reduction_markdown_aware(text, stopwords=stopwords)
+    return _apply_stopword_reduction_plain(text, stopwords=stopwords)
+
+
+def _apply_stopword_reduction_plain(text: str, *, stopwords: set[str]) -> str:
+    """Apply stopword reduction to plain text.
+
+    Args:
+        text: Text to process
+        stopwords: Pre-lowercased stopwords set for faster comparison
+    """
+    words = WORD_SPLIT_PATTERN.findall(text)
+    if not words:
+        return ""
+
+    filtered_words = []
+
+    for word in words:
+        if len(word) <= 3 and word.isalpha():
+            if word.lower() not in stopwords or word.isupper() or len(word) == 1:
+                filtered_words.append(word)
+            continue
+
+        match = WORD_BOUNDARY_PATTERN.match(word)
+        if not match:
+            filtered_words.append(word)
+            continue
+
+        _prefix_punct, core_word, suffix_punct = match.groups()
+
+        if not core_word:
+            filtered_words.append(word)
+            continue
+
+        clean_word = core_word.lower() if core_word.isalpha() else WORD_CLEAN_PATTERN.sub("", core_word).lower()
+
+        if not clean_word:
+            filtered_words.append(word)
+            continue
+
+        is_stopword = clean_word in stopwords
+        should_keep = (
+            not is_stopword
+            or len(clean_word) <= 1
+            or (len(core_word) > 1 and core_word.isupper())
+            or any(c.isdigit() for c in core_word)
+        )
+
+        if should_keep:
+            filtered_words.append(word)
+        elif (
+            suffix_punct
+            and suffix_punct in ".,;:!?"
+            and filtered_words
+            and not filtered_words[-1].endswith(suffix_punct)
+        ):
+            filtered_words[-1] += suffix_punct
+
+    return " ".join(filtered_words) if filtered_words else ""
+
+
+def _apply_stopword_reduction_markdown_aware(text: str, *, stopwords: set[str]) -> str:
+    """Apply stopword reduction preserving markdown structure."""
+    lines = text.split("\n")
+    processed_lines = []
+    in_code_block = False
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            processed_lines.append(line)
+            continue
+
+        if _is_markdown_structural_line(line, in_code_block):
+            processed_lines.append(line)
+            continue
+
+        if line.strip():
+            reduced = _apply_stopword_reduction_plain(line, stopwords=stopwords)
+            processed_lines.append(reduced)
+        else:
+            processed_lines.append(line)
+
+    result = "\n".join(processed_lines)
+    return _normalize_newlines(result).strip()
+
+
+def get_reduction_stats(original: str, reduced: str) -> ReductionStats:
+    """Get detailed statistics about the reduction.
+
+    Args:
+        original: The original text.
+        reduced: The reduced text.
+
+    Returns:
+        Statistics about the reduction.
+
+    Raises:
+        ValidationError: If inputs are invalid.
+    """
+    if original is None:
+        raise ValidationError("Original text cannot be None")
+
+    if reduced is None:
+        raise ValidationError("Reduced text cannot be None")
+
+    if not isinstance(original, str):
+        raise ValidationError(f"Original text must be a string, got {type(original).__name__}")
+
+    if not isinstance(reduced, str):
+        raise ValidationError(f"Reduced text must be a string, got {type(reduced).__name__}")
+
+    original_chars = len(original)
+    reduced_chars = len(reduced)
+    original_tokens = len(original.split()) if original else 0
+    reduced_tokens = len(reduced.split()) if reduced else 0
+
+    char_reduction = (original_chars - reduced_chars) / original_chars if original_chars > 0 else 0.0
+    token_reduction = (original_tokens - reduced_tokens) / original_tokens if original_tokens > 0 else 0.0
+
+    return ReductionStats(
+        character_reduction_ratio=char_reduction,
+        token_reduction_ratio=token_reduction,
+        original_characters=original_chars,
+        reduced_characters=reduced_chars,
+        original_tokens=original_tokens,
+        reduced_tokens=reduced_tokens,
+    )
```
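
The module's public surface is `reduce_tokens` and `get_reduction_stats`. A minimal sketch of calling them directly; only the field names are confirmed by this diff, so constructing `TokenReductionConfig` with keyword arguments is an assumption:

```python
from kreuzberg._token_reduction._reducer import get_reduction_stats, reduce_tokens
from kreuzberg._types import TokenReductionConfig

text = "This is a rather long sentence, and it is only here to exercise the reducer!!!"

# Keyword construction of TokenReductionConfig is an assumption; the field
# names come from the hunk above.
config = TokenReductionConfig(mode="moderate", preserve_markdown=False)

reduced = reduce_tokens(text, config=config, language="en")
stats = get_reduction_stats(text, reduced)

print(reduced)
print(
    f"tokens {stats['original_tokens']} -> {stats['reduced_tokens']} "
    f"({stats['token_reduction_ratio']:.0%} removed)"
)
```

Two details worth noting from the implementation: inputs over 1,000,000 characters are processed in roughly 100,000-character chunks split at sentence boundaries, and the stopword pass deliberately keeps one-character words, multi-character all-caps words (likely acronyms), and words containing digits.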
kreuzberg/_token_reduction/_stopwords.py

```diff
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+from functools import lru_cache
+from pathlib import Path
+
+import msgspec
+
+from kreuzberg._utils._ref import Ref
+
+_STOPWORDS_DIR = Path(__file__).parent / "stopwords"
+
+
+@lru_cache(maxsize=16)
+def _load_language_stopwords(lang_code: str) -> set[str]:
+    """Load stopwords for a specific language from its JSON file."""
+    if not lang_code or "/" in lang_code or "\\" in lang_code or ".." in lang_code:
+        return set()
+
+    file_path = _STOPWORDS_DIR / f"{lang_code}_stopwords.json"
+
+    try:
+        file_path = file_path.resolve()
+        if not file_path.parent.samefile(_STOPWORDS_DIR):
+            return set()
+    except (OSError, ValueError):
+        return set()
+
+    if not file_path.exists():
+        return set()
+
+    try:
+        with file_path.open("rb") as f:
+            words: list[str] = msgspec.json.decode(f.read())
+        return set(words)
+    except (OSError, msgspec.DecodeError):
+        return set()
+
+
+def _get_available_languages() -> frozenset[str]:
+    """Get list of available stopword languages by scanning directory."""
+    try:
+        if not _STOPWORDS_DIR.exists():
+            return frozenset()
+
+        languages = set()
+        for file_path in _STOPWORDS_DIR.glob("*_stopwords.json"):
+            lang_code = file_path.stem.replace("_stopwords", "")
+            languages.add(lang_code)
+
+        return frozenset(languages)
+    except (OSError, ValueError):
+        return frozenset()
+
+
+_available_languages_ref = Ref("available_languages", _get_available_languages)
+
+
+class StopwordsManager:
+    """Manages stopwords for multiple languages with lazy loading."""
+
+    def __init__(
+        self,
+        custom_stopwords: dict[str, list[str]] | None = None,
+    ) -> None:
+        """Initialize with optional custom stopwords.
+
+        Args:
+            custom_stopwords: Additional stopwords per language.
+        """
+        self._custom_stopwords: dict[str, set[str]] = {}
+
+        if custom_stopwords:
+            self._custom_stopwords = {lang: set(words) for lang, words in custom_stopwords.items()}
+
+    def get_stopwords(self, language: str) -> set[str]:
+        """Get stopwords for a language, combining default and custom."""
+        result = _load_language_stopwords(language)
+
+        if language in self._custom_stopwords:
+            result = result | self._custom_stopwords[language]
+
+        return result
+
+    def has_language(self, language: str) -> bool:
+        """Check if stopwords are available for a language."""
+        available = _available_languages_ref.get()
+        return language in available or language in self._custom_stopwords
+
+    def supported_languages(self) -> list[str]:
+        """Get sorted list of all supported languages."""
+        available = _available_languages_ref.get()
+        all_langs = set(available)
+        all_langs.update(self._custom_stopwords.keys())
+        return sorted(all_langs)
+
+    def add_custom_stopwords(self, language: str, words: list[str] | set[str]) -> None:
+        """Add custom stopwords for a language."""
+        if language not in self._custom_stopwords:
+            self._custom_stopwords[language] = set()
+
+        if isinstance(words, list):
+            words = set(words)
+
+        self._custom_stopwords[language].update(words)
+
+
+def _create_default_manager() -> StopwordsManager:
+    return StopwordsManager()
+
+
+_default_manager_ref = Ref("default_stopwords_manager", _create_default_manager)
+
+
+def get_default_stopwords_manager() -> StopwordsManager:
+    """Get the default global stopwords manager."""
+    return _default_manager_ref.get()
```
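
All of the manager's methods appear in the hunk above, so the following sketch sticks to them; the custom word is an arbitrary example:

```python
from kreuzberg._token_reduction._stopwords import get_default_stopwords_manager

manager = get_default_stopwords_manager()

print(manager.has_language("af"))  # True: af_stopwords.json ships in the wheel
print(manager.supported_languages()[:5])  # one code per bundled *_stopwords.json

# Custom words are layered on top of the bundled JSON list at lookup time.
manager.add_custom_stopwords("af", ["ensovoorts"])
print("ensovoorts" in manager.get_stopwords("af"))  # True
```

Note the defensive checks in `_load_language_stopwords`: because `lang_code` is interpolated into a filename, the loader rejects path separators and `..`, and the `samefile` check confirms the resolved path never leaves the bundled `stopwords` directory.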
kreuzberg/_token_reduction/stopwords/af_stopwords.json

```diff
@@ -0,0 +1,53 @@
+[
+  "'n",
+  "aan",
+  "af",
+  "al",
+  "as",
+  "baie",
+  "by",
+  "daar",
+  "dag",
+  "dat",
+  "die",
+  "dit",
+  "een",
+  "ek",
+  "en",
+  "gaan",
+  "gesê",
+  "haar",
+  "het",
+  "hom",
+  "hulle",
+  "hy",
+  "in",
+  "is",
+  "jou",
+  "jy",
+  "kan",
+  "kom",
+  "ma",
+  "maar",
+  "met",
+  "my",
+  "na",
+  "nie",
+  "om",
+  "ons",
+  "op",
+  "saam",
+  "sal",
+  "se",
+  "sien",
+  "so",
+  "sy",
+  "te",
+  "toe",
+  "uit",
+  "van",
+  "vir",
+  "was",
+  "wat",
+  "ʼn"
+]
```
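
Each of the new `*_stopwords.json` files follows the same shape as this Afrikaans list: a flat JSON array of strings (note that both the ASCII-apostrophe `'n` and the precomposed Unicode `ʼn` spellings of the Afrikaans indefinite article are included). A tiny sketch of the format contract, mirroring what `_load_language_stopwords` above does with the file bytes:

```python
import msgspec

# A bundled stopword file is a plain JSON array of strings; the loader
# decodes the raw bytes and deduplicates them into a set.
raw = b'["aan", "af", "die", "die"]'
words: list[str] = msgspec.json.decode(raw)
print(set(words))  # {'aan', 'af', 'die'}
```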