kreuzberg-3.16.0-py3-none-any.whl → kreuzberg-3.17.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. kreuzberg/__init__.py +2 -0
  2. kreuzberg/_config.py +8 -9
  3. kreuzberg/_extractors/_base.py +0 -46
  4. kreuzberg/_extractors/_html.py +1 -1
  5. kreuzberg/_extractors/_pandoc.py +2 -2
  6. kreuzberg/_extractors/_pdf.py +4 -4
  7. kreuzberg/_gmft.py +2 -2
  8. kreuzberg/_language_detection.py +16 -39
  9. kreuzberg/_mcp/server.py +1 -1
  10. kreuzberg/_mime_types.py +1 -1
  11. kreuzberg/_ocr/_easyocr.py +4 -9
  12. kreuzberg/_ocr/_paddleocr.py +1 -1
  13. kreuzberg/_ocr/_tesseract.py +15 -25
  14. kreuzberg/_token_reduction/__init__.py +11 -0
  15. kreuzberg/_token_reduction/_reducer.py +439 -0
  16. kreuzberg/_token_reduction/_stopwords.py +116 -0
  17. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  18. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  19. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  20. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  21. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  22. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  23. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  24. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  25. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  26. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  27. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  28. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  29. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  30. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  31. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  32. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  33. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  34. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  35. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  36. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  37. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  38. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  39. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  40. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  41. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  42. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  43. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  44. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  45. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  46. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  47. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  48. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  49. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  50. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  51. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  52. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  53. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  54. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  55. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  56. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  57. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  58. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  59. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  60. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  61. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  62. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  63. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  64. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  65. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  66. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  67. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  68. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  69. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  70. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  71. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  72. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  73. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  74. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  75. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  76. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  77. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  78. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  79. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  80. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  81. kreuzberg/_types.py +50 -9
  82. kreuzberg/_utils/_image_preprocessing.py +1 -1
  83. kreuzberg/_utils/_ref.py +14 -6
  84. kreuzberg/exceptions.py +0 -1
  85. kreuzberg/extraction.py +33 -10
  86. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/METADATA +6 -5
  87. kreuzberg-3.17.1.dist-info/RECORD +128 -0
  88. kreuzberg-3.16.0.dist-info/RECORD +0 -61
  89. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/WHEEL +0 -0
  90. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/entry_points.txt +0 -0
  91. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_token_reduction/_reducer.py
@@ -0,0 +1,439 @@
+ from __future__ import annotations
+
+ import re
+ import unicodedata
+ from functools import lru_cache
+ from typing import TYPE_CHECKING, Any, TypedDict
+
+ from kreuzberg._token_reduction._stopwords import get_default_stopwords_manager
+ from kreuzberg.exceptions import ValidationError
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+     from kreuzberg._types import TokenReductionConfig
+
+
+ class ReductionStats(TypedDict):
+     """Statistics about token reduction operation."""
+
+     character_reduction_ratio: float
+     token_reduction_ratio: float
+     original_characters: int
+     reduced_characters: int
+     original_tokens: int
+     reduced_tokens: int
+
+
+ HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+
+ PUNCTUATION_CLEANUP_PATTERN = re.compile(
+     r"([!?.])(?:\1)+"
+     r"|(,)(?:,)+"
+     r"|[!?]+\.+[!?]*|[?!]{3,}"
+ )
+
+ WHITESPACE_CLEANUP_PATTERN = re.compile(r"\n{3,}|[ \t]+")
+
+ MARKDOWN_LIST_PATTERNS = (
+     re.compile(r"^\s*[-*+]\s"),
+     re.compile(r"^\s*\d+\.\s"),
+ )
+
+ WORD_CLEAN_PATTERN = re.compile(r"[^\w]", re.UNICODE)
+ LANGUAGE_CODE_PATTERN = re.compile(r"^[a-zA-Z0-9-]+$")
+
+ WORD_SPLIT_PATTERN = re.compile(r"\S+")
+ WORD_BOUNDARY_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.UNICODE)
+
+ STREAMING_THRESHOLD = 1_000_000
+
+
+ def _normalize_unicode(text: str) -> str:
+     """Normalize Unicode text to NFC form for consistent processing."""
+     return unicodedata.normalize("NFC", text)
+
+
+ def _normalize_newlines(text: str) -> str:
+     """Remove excessive newlines, keeping at most double newlines."""
+     return WHITESPACE_CLEANUP_PATTERN.sub(lambda m: "\n\n" if m.group().startswith("\n") else " ", text)
+
+
+ def _process_text_streaming(
+     text: str, processor_func: Callable[..., str], chunk_size: int = 100_000, **kwargs: Any
+ ) -> str:
+     """Process large text in chunks to optimize memory usage."""
+     if len(text) <= chunk_size:
+         return processor_func(text, **kwargs)
+
+     chunks = []
+     start = 0
+
+     while start < len(text):
+         end = min(start + chunk_size, len(text))
+
+         if end < len(text):
+             search_start = max(start, end - 1000)
+             for i in range(end - 1, search_start - 1, -1):
+                 if text[i] in ".!?\n":
+                     end = i + 1
+                     break
+
+         chunk = text[start:end]
+         processed_chunk = processor_func(chunk, **kwargs)
+         chunks.append(processed_chunk)
+         start = end
+
+     return " ".join(chunks).strip()
+
+
+ def _is_markdown_structural_line(line: str, in_code_block: bool) -> bool:
+     """Check if a line contains markdown structural elements that should be preserved."""
+     if in_code_block:
+         return True
+
+     stripped = line.strip()
+
+     if stripped.startswith("#"):
+         return True
+
+     if "|" in line:
+         pipe_count = line.count("|")
+         if pipe_count >= 2 and (line.strip().startswith("|") or line.strip().endswith("|") or " | " in line):
+             return True
+
+     return MARKDOWN_LIST_PATTERNS[0].match(line) is not None or MARKDOWN_LIST_PATTERNS[1].match(line) is not None
+
+
+ @lru_cache(maxsize=64)
+ def _get_stopwords_with_custom(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+     """Get stopwords for a language, optionally with custom additions."""
+     manager = get_default_stopwords_manager()
+     base_stopwords = manager.get_stopwords(language)
+
+     if custom_words_tuple:
+         return base_stopwords | set(custom_words_tuple)
+     return base_stopwords
+
+
+ @lru_cache(maxsize=64)
+ def _get_lowercase_stopwords(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+     """Get pre-lowercased stopwords for faster comparison."""
+     stopwords = _get_stopwords_with_custom(language, custom_words_tuple)
+     return {sw.lower() for sw in stopwords}
+
+
+ def reduce_tokens(
+     text: str,
+     *,
+     config: TokenReductionConfig,
+     language: str | None = None,
+ ) -> str:
+     """Reduce tokens in text based on the specified configuration.
+
+     Args:
+         text: The text to reduce.
+         config: Configuration for token reduction.
+         language: Optional language code for stopword selection.
+
+     Returns:
+         The reduced text.
+
+     Raises:
+         ValidationError: If inputs are invalid.
+     """
+     if config is None:
+         raise ValidationError("Config cannot be None")
+
+     if text is None:
+         raise ValidationError("Text cannot be None")
+
+     if not isinstance(text, str):
+         raise ValidationError(f"Text must be a string, got {type(text).__name__}")
+
+     if language is not None and not isinstance(language, str):
+         raise ValidationError(f"Language must be a string or None, got {type(language).__name__}")
+
+     if language is not None and len(language.strip()) == 0:
+         raise ValidationError("Language cannot be empty or whitespace-only")
+
+     if config.mode == "off":
+         return text
+
+     use_streaming = len(text) > STREAMING_THRESHOLD
+
+     if language and not LANGUAGE_CODE_PATTERN.match(language):
+         raise ValidationError(f"Invalid language code format: {language}")
+
+     if not text or not text.strip():
+         return ""
+
+     text = _normalize_unicode(text)
+
+     if config.mode == "light":
+         return _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+     if config.mode == "moderate":
+         return _apply_moderate_reduction(
+             text,
+             config=config,
+             language=language,
+             use_streaming=use_streaming,
+         )
+
+     return text
+
+
+ def _apply_light_reduction(text: str, *, preserve_markdown: bool, use_streaming: bool = False) -> str:
+     """Apply light reduction (formatting only)."""
+     if use_streaming:
+         if preserve_markdown:
+             return str(_process_text_streaming(text, _apply_light_reduction_markdown_aware))
+         return str(_process_text_streaming(text, _apply_light_reduction_plain))
+
+     if preserve_markdown:
+         return _apply_light_reduction_markdown_aware(text)
+     return _apply_light_reduction_plain(text)
+
+
+ def _apply_light_reduction_plain(text: str) -> str:
+     """Apply light reduction to plain text."""
+     text = HTML_COMMENT_PATTERN.sub("", text)
+
+     def punctuation_replacer(match: re.Match[str]) -> str:
+         if match.group(1):
+             return match.group(1)
+         if match.group(2):
+             return ","
+         return "?"
+
+     text = PUNCTUATION_CLEANUP_PATTERN.sub(punctuation_replacer, text)
+
+     def whitespace_replacer(match: re.Match[str]) -> str:
+         if match.group().startswith("\n"):
+             return "\n\n"
+         return " "
+
+     text = WHITESPACE_CLEANUP_PATTERN.sub(whitespace_replacer, text)
+
+     return text.strip()
+
+
+ def _apply_light_reduction_markdown_aware(text: str) -> str:
+     """Apply light reduction preserving markdown structure."""
+     lines = text.split("\n")
+     processed_lines = []
+     in_code_block = False
+
+     for line in lines:
+         if line.strip().startswith("```"):
+             in_code_block = not in_code_block
+             processed_lines.append(line)
+             continue
+
+         if _is_markdown_structural_line(line, in_code_block) or in_code_block:
+             processed_lines.append(line)
+             continue
+
+         if line.strip():
+             reduced = _apply_light_reduction_plain(line)
+             processed_lines.append(reduced)
+         else:
+             processed_lines.append(line)
+
+     result = "\n".join(processed_lines)
+
+     lines = result.split("\n")
+     normalized_lines = []
+     in_code_block = False
+     consecutive_empty = 0
+
+     for line in lines:
+         if line.strip().startswith("```"):
+             in_code_block = not in_code_block
+             normalized_lines.append(line)
+             consecutive_empty = 0
+             continue
+
+         if in_code_block:
+             normalized_lines.append(line)
+             consecutive_empty = 0
+         elif not line.strip():
+             consecutive_empty += 1
+             if consecutive_empty <= 2:
+                 normalized_lines.append(line)
+         else:
+             normalized_lines.append(line)
+             consecutive_empty = 0
+
+     return "\n".join(normalized_lines).strip()
+
+
+ def _apply_moderate_reduction(
+     text: str,
+     *,
+     config: TokenReductionConfig,
+     language: str | None = None,
+     use_streaming: bool = False,
+ ) -> str:
+     """Apply moderate reduction (formatting + stopwords)."""
+     text = _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+     lang = language or config.language_hint or "en"
+
+     manager = get_default_stopwords_manager()
+     if not manager.has_language(lang):
+         lang = "en"
+         if not manager.has_language("en"):
+             return text
+
+     custom_words_tuple = None
+     if config.custom_stopwords and lang in config.custom_stopwords:
+         custom_words_tuple = tuple(sorted(config.custom_stopwords[lang]))
+
+     if use_streaming:
+         if config.preserve_markdown:
+             return str(
+                 _process_text_streaming(
+                     text,
+                     _apply_stopword_reduction_markdown_aware,
+                     stopwords=_get_lowercase_stopwords(lang, custom_words_tuple),
+                 )
+             )
+         return str(
+             _process_text_streaming(
+                 text, _apply_stopword_reduction_plain, stopwords=_get_lowercase_stopwords(lang, custom_words_tuple)
+             )
+         )
+
+     stopwords = _get_lowercase_stopwords(lang, custom_words_tuple)
+
+     if config.preserve_markdown:
+         return _apply_stopword_reduction_markdown_aware(text, stopwords=stopwords)
+     return _apply_stopword_reduction_plain(text, stopwords=stopwords)
+
+
+ def _apply_stopword_reduction_plain(text: str, *, stopwords: set[str]) -> str:
+     """Apply stopword reduction to plain text.
+
+     Args:
+         text: Text to process
+         stopwords: Pre-lowercased stopwords set for faster comparison
+     """
+     words = WORD_SPLIT_PATTERN.findall(text)
+     if not words:
+         return ""
+
+     filtered_words = []
+
+     for word in words:
+         if len(word) <= 3 and word.isalpha():
+             if word.lower() not in stopwords or word.isupper() or len(word) == 1:
+                 filtered_words.append(word)
+             continue
+
+         match = WORD_BOUNDARY_PATTERN.match(word)
+         if not match:
+             filtered_words.append(word)
+             continue
+
+         _prefix_punct, core_word, suffix_punct = match.groups()
+
+         if not core_word:
+             filtered_words.append(word)
+             continue
+
+         clean_word = core_word.lower() if core_word.isalpha() else WORD_CLEAN_PATTERN.sub("", core_word).lower()
+
+         if not clean_word:
+             filtered_words.append(word)
+             continue
+
+         is_stopword = clean_word in stopwords
+         should_keep = (
+             not is_stopword
+             or len(clean_word) <= 1
+             or (len(core_word) > 1 and core_word.isupper())
+             or any(c.isdigit() for c in core_word)
+         )
+
+         if should_keep:
+             filtered_words.append(word)
+         elif (
+             suffix_punct
+             and suffix_punct in ".,;:!?"
+             and filtered_words
+             and not filtered_words[-1].endswith(suffix_punct)
+         ):
+             filtered_words[-1] += suffix_punct
+
+     return " ".join(filtered_words) if filtered_words else ""
+
+
+ def _apply_stopword_reduction_markdown_aware(text: str, *, stopwords: set[str]) -> str:
+     """Apply stopword reduction preserving markdown structure."""
+     lines = text.split("\n")
+     processed_lines = []
+     in_code_block = False
+
+     for line in lines:
+         if line.strip().startswith("```"):
+             in_code_block = not in_code_block
+             processed_lines.append(line)
+             continue
+
+         if _is_markdown_structural_line(line, in_code_block):
+             processed_lines.append(line)
+             continue
+
+         if line.strip():
+             reduced = _apply_stopword_reduction_plain(line, stopwords=stopwords)
+             processed_lines.append(reduced)
+         else:
+             processed_lines.append(line)
+
+     result = "\n".join(processed_lines)
+     return _normalize_newlines(result).strip()
+
+
+ def get_reduction_stats(original: str, reduced: str) -> ReductionStats:
+     """Get detailed statistics about the reduction.
+
+     Args:
+         original: The original text.
+         reduced: The reduced text.
+
+     Returns:
+         Statistics about the reduction.
+
+     Raises:
+         ValidationError: If inputs are invalid.
+     """
+     if original is None:
+         raise ValidationError("Original text cannot be None")
+
+     if reduced is None:
+         raise ValidationError("Reduced text cannot be None")
+
+     if not isinstance(original, str):
+         raise ValidationError(f"Original text must be a string, got {type(original).__name__}")
+
+     if not isinstance(reduced, str):
+         raise ValidationError(f"Reduced text must be a string, got {type(reduced).__name__}")
+
+     original_chars = len(original)
+     reduced_chars = len(reduced)
+     original_tokens = len(original.split()) if original else 0
+     reduced_tokens = len(reduced.split()) if reduced else 0
+
+     char_reduction = (original_chars - reduced_chars) / original_chars if original_chars > 0 else 0.0
+     token_reduction = (original_tokens - reduced_tokens) / original_tokens if original_tokens > 0 else 0.0
+
+     return ReductionStats(
+         character_reduction_ratio=char_reduction,
+         token_reduction_ratio=token_reduction,
+         original_characters=original_chars,
+         reduced_characters=reduced_chars,
+         original_tokens=original_tokens,
+         reduced_tokens=reduced_tokens,
+     )
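
The hunk above is the core of the new token-reduction feature. A minimal usage sketch follows; it assumes TokenReductionConfig (defined in kreuzberg/_types.py, whose diff is not shown in full here) accepts the fields the reducer reads — mode, preserve_markdown, language_hint, custom_stopwords — as keyword arguments, and that the new kreuzberg/_token_reduction/__init__.py re-exports reduce_tokens and get_reduction_stats:

    from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
    from kreuzberg._types import TokenReductionConfig

    config = TokenReductionConfig(mode="moderate", preserve_markdown=False)
    original = "The quick brown fox jumps over the lazy dog!!!"
    reduced = reduce_tokens(original, config=config, language="en")

    # "moderate" first applies the light pass (e.g. "!!!" collapses to "!"),
    # then drops lowercase stopwords such as "the" and "over".
    stats = get_reduction_stats(original, reduced)
    print(reduced)
    print(f"tokens: {stats['original_tokens']} -> {stats['reduced_tokens']}")

Note that "light" mode only normalizes punctuation, whitespace, and HTML comments, while "moderate" additionally removes stopwords; texts over 1,000,000 characters are automatically chunked at sentence boundaries via _process_text_streaming.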
kreuzberg/_token_reduction/_stopwords.py
@@ -0,0 +1,116 @@
+ from __future__ import annotations
+
+ from functools import lru_cache
+ from pathlib import Path
+
+ import msgspec
+
+ from kreuzberg._utils._ref import Ref
+
+ _STOPWORDS_DIR = Path(__file__).parent / "stopwords"
+
+
+ @lru_cache(maxsize=16)
+ def _load_language_stopwords(lang_code: str) -> set[str]:
+     """Load stopwords for a specific language from its JSON file."""
+     if not lang_code or "/" in lang_code or "\\" in lang_code or ".." in lang_code:
+         return set()
+
+     file_path = _STOPWORDS_DIR / f"{lang_code}_stopwords.json"
+
+     try:
+         file_path = file_path.resolve()
+         if not file_path.parent.samefile(_STOPWORDS_DIR):
+             return set()
+     except (OSError, ValueError):
+         return set()
+
+     if not file_path.exists():
+         return set()
+
+     try:
+         with file_path.open("rb") as f:
+             words: list[str] = msgspec.json.decode(f.read())
+         return set(words)
+     except (OSError, msgspec.DecodeError):
+         return set()
+
+
+ def _get_available_languages() -> frozenset[str]:
+     """Get list of available stopword languages by scanning directory."""
+     try:
+         if not _STOPWORDS_DIR.exists():
+             return frozenset()
+
+         languages = set()
+         for file_path in _STOPWORDS_DIR.glob("*_stopwords.json"):
+             lang_code = file_path.stem.replace("_stopwords", "")
+             languages.add(lang_code)
+
+         return frozenset(languages)
+     except (OSError, ValueError):
+         return frozenset()
+
+
+ _available_languages_ref = Ref("available_languages", _get_available_languages)
+
+
+ class StopwordsManager:
+     """Manages stopwords for multiple languages with lazy loading."""
+
+     def __init__(
+         self,
+         custom_stopwords: dict[str, list[str]] | None = None,
+     ) -> None:
+         """Initialize with optional custom stopwords.
+
+         Args:
+             custom_stopwords: Additional stopwords per language.
+         """
+         self._custom_stopwords: dict[str, set[str]] = {}
+
+         if custom_stopwords:
+             self._custom_stopwords = {lang: set(words) for lang, words in custom_stopwords.items()}
+
+     def get_stopwords(self, language: str) -> set[str]:
+         """Get stopwords for a language, combining default and custom."""
+         result = _load_language_stopwords(language)
+
+         if language in self._custom_stopwords:
+             result = result | self._custom_stopwords[language]
+
+         return result
+
+     def has_language(self, language: str) -> bool:
+         """Check if stopwords are available for a language."""
+         available = _available_languages_ref.get()
+         return language in available or language in self._custom_stopwords
+
+     def supported_languages(self) -> list[str]:
+         """Get sorted list of all supported languages."""
+         available = _available_languages_ref.get()
+         all_langs = set(available)
+         all_langs.update(self._custom_stopwords.keys())
+         return sorted(all_langs)
+
+     def add_custom_stopwords(self, language: str, words: list[str] | set[str]) -> None:
+         """Add custom stopwords for a language."""
+         if language not in self._custom_stopwords:
+             self._custom_stopwords[language] = set()
+
+         if isinstance(words, list):
+             words = set(words)
+
+         self._custom_stopwords[language].update(words)
+
+
+ def _create_default_manager() -> StopwordsManager:
+     return StopwordsManager()
+
+
+ _default_manager_ref = Ref("default_stopwords_manager", _create_default_manager)
+
+
+ def get_default_stopwords_manager() -> StopwordsManager:
+     """Get the default global stopwords manager."""
+     return _default_manager_ref.get()
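
The loader above keeps all file access inside the package's bundled stopwords directory and fails closed (returns an empty set) on suspicious language codes or unreadable files. A short sketch of the manager API as shown in the hunk:

    from kreuzberg._token_reduction._stopwords import (
        StopwordsManager,
        get_default_stopwords_manager,
    )

    manager = get_default_stopwords_manager()
    print(manager.has_language("af"))      # True: af_stopwords.json ships in the wheel
    print(manager.supported_languages()[:5])

    # Custom words merge with the bundled JSON lists per instance.
    custom = StopwordsManager(custom_stopwords={"en": ["kreuzberg"]})
    assert "kreuzberg" in custom.get_stopwords("en")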
kreuzberg/_token_reduction/stopwords/af_stopwords.json
@@ -0,0 +1,53 @@
+ [
+   "'n",
+   "aan",
+   "af",
+   "al",
+   "as",
+   "baie",
+   "by",
+   "daar",
+   "dag",
+   "dat",
+   "die",
+   "dit",
+   "een",
+   "ek",
+   "en",
+   "gaan",
+   "gesê",
+   "haar",
+   "het",
+   "hom",
+   "hulle",
+   "hy",
+   "in",
+   "is",
+   "jou",
+   "jy",
+   "kan",
+   "kom",
+   "ma",
+   "maar",
+   "met",
+   "my",
+   "na",
+   "nie",
+   "om",
+   "ons",
+   "op",
+   "saam",
+   "sal",
+   "se",
+   "sien",
+   "so",
+   "sy",
+   "te",
+   "toe",
+   "uit",
+   "van",
+   "vir",
+   "was",
+   "wat",
+   "ʼn"
+ ]
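
Each of the 64 bundled stopword lists follows the same contract: a flat JSON array of strings that _load_language_stopwords decodes with msgspec and materializes as a set. A quick sanity check against the Afrikaans list above (the path assumes an unpacked wheel or source checkout):

    import msgspec
    from pathlib import Path

    path = Path("kreuzberg/_token_reduction/stopwords/af_stopwords.json")
    words = set(msgspec.json.decode(path.read_bytes()))
    assert "baie" in words and len(words) == 51  # 53 JSON lines minus the brackets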