kreuzberg-3.15.0-py3-none-any.whl → kreuzberg-3.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/main.py +0 -53
  3. kreuzberg/_config.py +17 -8
  4. kreuzberg/_document_classification.py +1 -1
  5. kreuzberg/_extractors/_base.py +0 -46
  6. kreuzberg/_extractors/_email.py +16 -10
  7. kreuzberg/_extractors/_html.py +39 -12
  8. kreuzberg/_extractors/_pandoc.py +2 -2
  9. kreuzberg/_extractors/_pdf.py +6 -7
  10. kreuzberg/_extractors/_presentation.py +4 -0
  11. kreuzberg/_extractors/_spread_sheet.py +0 -1
  12. kreuzberg/_extractors/_structured.py +83 -15
  13. kreuzberg/_gmft.py +7 -2
  14. kreuzberg/_mcp/server.py +1 -22
  15. kreuzberg/_mime_types.py +1 -1
  16. kreuzberg/_ocr/_easyocr.py +47 -20
  17. kreuzberg/_ocr/_paddleocr.py +1 -1
  18. kreuzberg/_ocr/_tesseract.py +27 -26
  19. kreuzberg/_token_reduction/__init__.py +11 -0
  20. kreuzberg/_token_reduction/_reducer.py +439 -0
  21. kreuzberg/_token_reduction/_stopwords.py +116 -0
  22. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  23. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  24. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  25. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  26. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  27. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  28. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  29. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  30. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  31. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  32. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  33. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  34. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  35. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  36. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  37. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  38. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  39. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  40. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  41. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  42. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  43. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  44. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  45. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  46. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  47. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  48. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  49. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  50. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  51. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  52. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  53. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  54. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  55. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  56. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  57. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  58. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  59. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  60. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  61. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  62. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  63. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  64. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  65. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  66. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  67. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  68. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  69. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  70. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  71. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  72. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  73. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  74. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  75. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  76. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  77. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  78. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  79. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  80. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  81. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  82. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  83. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  84. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  85. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  86. kreuzberg/_types.py +146 -43
  87. kreuzberg/_utils/_html_streaming.py +20 -0
  88. kreuzberg/_utils/_image_preprocessing.py +1 -1
  89. kreuzberg/_utils/_ref.py +14 -6
  90. kreuzberg/_utils/_serialization.py +13 -6
  91. kreuzberg/_utils/_sync.py +15 -16
  92. kreuzberg/exceptions.py +0 -1
  93. kreuzberg/extraction.py +27 -11
  94. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
  95. kreuzberg-3.17.0.dist-info/RECORD +128 -0
  96. kreuzberg-3.15.0.dist-info/RECORD +0 -60
  97. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
  98. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
  99. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_ocr/_tesseract.py
@@ -29,6 +29,7 @@ from kreuzberg._ocr._base import OCRBackend
  from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
  from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
  from kreuzberg._utils._cache import get_ocr_cache
+ from kreuzberg._utils._html_streaming import should_use_streaming
  from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
  from kreuzberg._utils._string import normalize_spaces
  from kreuzberg._utils._sync import run_sync
@@ -214,7 +215,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  try:
  await run_sync(save_image.save, str(image_path), format="PNG")
- except OSError as e:
+ except OSError as e: # pragma: no cover
  if "cannot write mode" not in str(e):
  raise
  save_image = image.convert("RGB")
@@ -356,7 +357,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  try:
  stat = path.stat()
  file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
- except OSError:
+ except OSError: # pragma: no cover
  file_info = {"path": str(path), "size": 0, "mtime": 0}

  cache_kwargs = {
@@ -398,7 +399,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  await ocr_cache.aset(extraction_result, **final_cache_kwargs)

  return extraction_result
- except (RuntimeError, OSError) as e:
+ except (RuntimeError, OSError) as e: # pragma: no cover
  raise OCRError(f"Failed to OCR using tesseract: {e}") from e
  finally:
  await unlink()
@@ -431,7 +432,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  try:
  df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
- except (ImportError, IndexError):
+ except (ImportError, IndexError): # pragma: no cover
  df = None

  table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
@@ -443,7 +444,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  tables=[table],
  chunks=text_result.chunks,
  )
- except (ValueError, KeyError, ImportError):
+ except (ValueError, KeyError, ImportError): # pragma: no cover
  pass

  return text_result
@@ -506,12 +507,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  table_min_confidence: float = 30.0,
  **_kwargs: Any,
  ) -> ExtractionResult:
- config = html_to_markdown_config or HTMLToMarkdownConfig(
- escape_asterisks=False,
- escape_underscores=False,
- extract_metadata=False,
- strip="meta title",
- )
+ config = html_to_markdown_config or HTMLToMarkdownConfig()

  tables: list[TableData] = []
  if enable_table_detection:
@@ -532,6 +528,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  config_dict = config.to_dict()
  config_dict["custom_converters"] = all_converters

+ use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+ config_dict["stream_processing"] = use_streaming
+ config_dict["chunk_size"] = chunk_size
+
  try:
  markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
  markdown_content = normalize_spaces(markdown_content)
@@ -673,15 +673,17 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  html_config = HTMLToMarkdownConfig(
  custom_converters=converters,
- escape_asterisks=False,
- escape_underscores=False,
- extract_metadata=False,
- strip="meta title",
  )

+ config_dict = html_config.to_dict()
+
+ use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+ config_dict["stream_processing"] = use_streaming
+ config_dict["chunk_size"] = chunk_size
+
  markdown_content = html_to_markdown.convert_to_markdown(
  hocr_content,
- **html_config.to_dict(),
+ **config_dict,
  )

  markdown_content = normalize_spaces(markdown_content)
@@ -750,7 +752,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  try:
  df = pl.DataFrame(table_data[1:], schema=table_data[0])
- except (ImportError, IndexError):
+ except (ImportError, IndexError): # pragma: no cover
  df = None

  table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
@@ -762,7 +764,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  tables=[table],
  chunks=text_result.chunks,
  )
- except (ValueError, KeyError, ImportError):
+ except (ValueError, KeyError, ImportError): # pragma: no cover
  pass

  return text_result
@@ -799,7 +801,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

  try:
  df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
- except (ImportError, IndexError):
+ except (ImportError, IndexError): # pragma: no cover
  df = None

  dummy_image = Image.new("RGB", (1, 1), "white")
@@ -812,7 +814,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
  } # type: ignore[typeddict-unknown-key]
  tables.append(table)
- except (ValueError, KeyError, ImportError):
+ except (ValueError, KeyError, ImportError): # pragma: no cover
  pass

  return tables
@@ -868,7 +870,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
  try:
  result = await run_process(command, env=env)
- except (subprocess.CalledProcessError, FileNotFoundError) as e:
+ except (subprocess.CalledProcessError, FileNotFoundError) as e: # pragma: no cover
  raise MissingDependencyError(
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
  ) from e
@@ -879,7 +881,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  )

  cls._version_checked = True
- except FileNotFoundError as e:
+ except FileNotFoundError as e: # pragma: no cover
  raise MissingDependencyError(
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
  ) from e
@@ -1076,7 +1078,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  "size": stat.st_size,
  "mtime": stat.st_mtime,
  }
- except OSError:
+ except OSError: # pragma: no cover
  return {
  "path": str(path),
  "size": 0,
@@ -1084,7 +1086,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  }

  def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
- """Convert a worker result dict to ExtractionResult."""
  if result_dict.get("success"):
  return ExtractionResult(
  content=str(result_dict.get("text", "")),
@@ -1178,7 +1179,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  command = ["tesseract", "--version"]
  try:
  result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
- except (subprocess.CalledProcessError, FileNotFoundError) as e:
+ except (subprocess.CalledProcessError, FileNotFoundError) as e: # pragma: no cover
  raise MissingDependencyError(
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
  ) from e
@@ -1189,7 +1190,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  )

  cls._version_checked = True
- except FileNotFoundError as e:
+ except FileNotFoundError as e: # pragma: no cover
  raise MissingDependencyError(
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
  ) from e
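
Note: the hunks above route hOCR-to-Markdown conversion through a new size-based streaming heuristic, should_use_streaming from kreuzberg/_utils/_html_streaming.py (a new 20-line module whose body is not part of this diff). Purely as an illustration of the call contract visible at the two call sites — a byte count in, a (use_streaming, chunk_size) pair out — a heuristic of roughly this shape would fit; the thresholds and chunk sizes below are assumptions, not the package's actual values:

# Illustrative sketch only; not the implementation shipped in
# kreuzberg/_utils/_html_streaming.py. It mirrors the call contract seen above:
# should_use_streaming(len(hocr_content.encode())) -> (use_streaming, chunk_size).
def should_use_streaming(content_size: int) -> tuple[bool, int]:
    """Decide whether HTML/hOCR conversion should stream, and with what chunk size."""
    if content_size > 10 * 1024 * 1024:  # very large inputs: stream in large chunks (assumed threshold)
        return True, 1024 * 1024
    if content_size > 1024 * 1024:  # large inputs: stream in smaller chunks (assumed threshold)
        return True, 256 * 1024
    return False, 64 * 1024  # small inputs: convert in one pass

At both call sites the returned pair is simply copied into the html_to_markdown keyword arguments as stream_processing and chunk_size.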

kreuzberg/_token_reduction/__init__.py
@@ -0,0 +1,11 @@
+ from __future__ import annotations
+
+ from kreuzberg._token_reduction._reducer import ReductionStats, get_reduction_stats, reduce_tokens
+ from kreuzberg._token_reduction._stopwords import StopwordsManager
+
+ __all__ = [
+     "ReductionStats",
+     "StopwordsManager",
+     "get_reduction_stats",
+     "reduce_tokens",
+ ]

kreuzberg/_token_reduction/_reducer.py
@@ -0,0 +1,439 @@
+ from __future__ import annotations
+
+ import re
+ import unicodedata
+ from functools import lru_cache
+ from typing import TYPE_CHECKING, Any, TypedDict
+
+ from kreuzberg._token_reduction._stopwords import get_default_stopwords_manager
+ from kreuzberg.exceptions import ValidationError
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+     from kreuzberg._types import TokenReductionConfig
+
+
+ class ReductionStats(TypedDict):
+     """Statistics about token reduction operation."""
+
+     character_reduction_ratio: float
+     token_reduction_ratio: float
+     original_characters: int
+     reduced_characters: int
+     original_tokens: int
+     reduced_tokens: int
+
+
+ HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+
+ PUNCTUATION_CLEANUP_PATTERN = re.compile(
+     r"([!?.])(?:\1)+"
+     r"|(,)(?:,)+"
+     r"|[!?]+\.+[!?]*|[?!]{3,}"
+ )
+
+ WHITESPACE_CLEANUP_PATTERN = re.compile(r"\n{3,}|[ \t]+")
+
+ MARKDOWN_LIST_PATTERNS = (
+     re.compile(r"^\s*[-*+]\s"),
+     re.compile(r"^\s*\d+\.\s"),
+ )
+
+ WORD_CLEAN_PATTERN = re.compile(r"[^\w]", re.UNICODE)
+ LANGUAGE_CODE_PATTERN = re.compile(r"^[a-zA-Z0-9-]+$")
+
+ WORD_SPLIT_PATTERN = re.compile(r"\S+")
+ WORD_BOUNDARY_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.UNICODE)
+
+ STREAMING_THRESHOLD = 1_000_000
+
+
+ def _normalize_unicode(text: str) -> str:
+     """Normalize Unicode text to NFC form for consistent processing."""
+     return unicodedata.normalize("NFC", text)
+
+
+ def _normalize_newlines(text: str) -> str:
+     """Remove excessive newlines, keeping at most double newlines."""
+     return WHITESPACE_CLEANUP_PATTERN.sub(lambda m: "\n\n" if m.group().startswith("\n") else " ", text)
+
+
+ def _process_text_streaming(
+     text: str, processor_func: Callable[..., str], chunk_size: int = 100_000, **kwargs: Any
+ ) -> str:
+     """Process large text in chunks to optimize memory usage."""
+     if len(text) <= chunk_size:
+         return processor_func(text, **kwargs)
+
+     chunks = []
+     start = 0
+
+     while start < len(text):
+         end = min(start + chunk_size, len(text))
+
+         if end < len(text):
+             search_start = max(start, end - 1000)
+             for i in range(end - 1, search_start - 1, -1):
+                 if text[i] in ".!?\n":
+                     end = i + 1
+                     break
+
+         chunk = text[start:end]
+         processed_chunk = processor_func(chunk, **kwargs)
+         chunks.append(processed_chunk)
+         start = end
+
+     return " ".join(chunks).strip()
+
+
+ def _is_markdown_structural_line(line: str, in_code_block: bool) -> bool:
+     """Check if a line contains markdown structural elements that should be preserved."""
+     if in_code_block:
+         return True
+
+     stripped = line.strip()
+
+     if stripped.startswith("#"):
+         return True
+
+     if "|" in line:
+         pipe_count = line.count("|")
+         if pipe_count >= 2 and (line.strip().startswith("|") or line.strip().endswith("|") or " | " in line):
+             return True
+
+     return MARKDOWN_LIST_PATTERNS[0].match(line) is not None or MARKDOWN_LIST_PATTERNS[1].match(line) is not None
+
+
+ @lru_cache(maxsize=64)
+ def _get_stopwords_with_custom(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+     """Get stopwords for a language, optionally with custom additions."""
+     manager = get_default_stopwords_manager()
+     base_stopwords = manager.get_stopwords(language)
+
+     if custom_words_tuple:
+         return base_stopwords | set(custom_words_tuple)
+     return base_stopwords
+
+
+ @lru_cache(maxsize=64)
+ def _get_lowercase_stopwords(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+     """Get pre-lowercased stopwords for faster comparison."""
+     stopwords = _get_stopwords_with_custom(language, custom_words_tuple)
+     return {sw.lower() for sw in stopwords}
+
+
+ def reduce_tokens(
+     text: str,
+     *,
+     config: TokenReductionConfig,
+     language: str | None = None,
+ ) -> str:
+     """Reduce tokens in text based on the specified configuration.
+
+     Args:
+         text: The text to reduce.
+         config: Configuration for token reduction.
+         language: Optional language code for stopword selection.
+
+     Returns:
+         The reduced text.
+
+     Raises:
+         ValidationError: If inputs are invalid.
+     """
+     if config is None:
+         raise ValidationError("Config cannot be None")
+
+     if text is None:
+         raise ValidationError("Text cannot be None")
+
+     if not isinstance(text, str):
+         raise ValidationError(f"Text must be a string, got {type(text).__name__}")
+
+     if language is not None and not isinstance(language, str):
+         raise ValidationError(f"Language must be a string or None, got {type(language).__name__}")
+
+     if language is not None and len(language.strip()) == 0:
+         raise ValidationError("Language cannot be empty or whitespace-only")
+
+     if config.mode == "off":
+         return text
+
+     use_streaming = len(text) > STREAMING_THRESHOLD
+
+     if language and not LANGUAGE_CODE_PATTERN.match(language):
+         raise ValidationError(f"Invalid language code format: {language}")
+
+     if not text or not text.strip():
+         return ""
+
+     text = _normalize_unicode(text)
+
+     if config.mode == "light":
+         return _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+     if config.mode == "moderate":
+         return _apply_moderate_reduction(
+             text,
+             config=config,
+             language=language,
+             use_streaming=use_streaming,
+         )
+
+     return text
+
+
+ def _apply_light_reduction(text: str, *, preserve_markdown: bool, use_streaming: bool = False) -> str:
+     """Apply light reduction (formatting only)."""
+     if use_streaming:
+         if preserve_markdown:
+             return str(_process_text_streaming(text, _apply_light_reduction_markdown_aware))
+         return str(_process_text_streaming(text, _apply_light_reduction_plain))
+
+     if preserve_markdown:
+         return _apply_light_reduction_markdown_aware(text)
+     return _apply_light_reduction_plain(text)
+
+
+ def _apply_light_reduction_plain(text: str) -> str:
+     """Apply light reduction to plain text."""
+     text = HTML_COMMENT_PATTERN.sub("", text)
+
+     def punctuation_replacer(match: re.Match[str]) -> str:
+         if match.group(1):
+             return match.group(1)
+         if match.group(2):
+             return ","
+         return "?"
+
+     text = PUNCTUATION_CLEANUP_PATTERN.sub(punctuation_replacer, text)
+
+     def whitespace_replacer(match: re.Match[str]) -> str:
+         if match.group().startswith("\n"):
+             return "\n\n"
+         return " "
+
+     text = WHITESPACE_CLEANUP_PATTERN.sub(whitespace_replacer, text)
+
+     return text.strip()
+
+
+ def _apply_light_reduction_markdown_aware(text: str) -> str:
+     """Apply light reduction preserving markdown structure."""
+     lines = text.split("\n")
+     processed_lines = []
+     in_code_block = False
+
+     for line in lines:
+         if line.strip().startswith("```"):
+             in_code_block = not in_code_block
+             processed_lines.append(line)
+             continue
+
+         if _is_markdown_structural_line(line, in_code_block) or in_code_block:
+             processed_lines.append(line)
+             continue
+
+         if line.strip():
+             reduced = _apply_light_reduction_plain(line)
+             processed_lines.append(reduced)
+         else:
+             processed_lines.append(line)
+
+     result = "\n".join(processed_lines)
+
+     lines = result.split("\n")
+     normalized_lines = []
+     in_code_block = False
+     consecutive_empty = 0
+
+     for line in lines:
+         if line.strip().startswith("```"):
+             in_code_block = not in_code_block
+             normalized_lines.append(line)
+             consecutive_empty = 0
+             continue
+
+         if in_code_block:
+             normalized_lines.append(line)
+             consecutive_empty = 0
+         elif not line.strip():
+             consecutive_empty += 1
+             if consecutive_empty <= 2:
+                 normalized_lines.append(line)
+         else:
+             normalized_lines.append(line)
+             consecutive_empty = 0
+
+     return "\n".join(normalized_lines).strip()
+
+
+ def _apply_moderate_reduction(
+     text: str,
+     *,
+     config: TokenReductionConfig,
+     language: str | None = None,
+     use_streaming: bool = False,
+ ) -> str:
+     """Apply moderate reduction (formatting + stopwords)."""
+     text = _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+     lang = language or config.language_hint or "en"
+
+     manager = get_default_stopwords_manager()
+     if not manager.has_language(lang):
+         lang = "en"
+         if not manager.has_language("en"):
+             return text
+
+     custom_words_tuple = None
+     if config.custom_stopwords and lang in config.custom_stopwords:
+         custom_words_tuple = tuple(sorted(config.custom_stopwords[lang]))
+
+     if use_streaming:
+         if config.preserve_markdown:
+             return str(
+                 _process_text_streaming(
+                     text,
+                     _apply_stopword_reduction_markdown_aware,
+                     stopwords=_get_lowercase_stopwords(lang, custom_words_tuple),
+                 )
+             )
+         return str(
+             _process_text_streaming(
+                 text, _apply_stopword_reduction_plain, stopwords=_get_lowercase_stopwords(lang, custom_words_tuple)
+             )
+         )
+
+     stopwords = _get_lowercase_stopwords(lang, custom_words_tuple)
+
+     if config.preserve_markdown:
+         return _apply_stopword_reduction_markdown_aware(text, stopwords=stopwords)
+     return _apply_stopword_reduction_plain(text, stopwords=stopwords)
+
+
+ def _apply_stopword_reduction_plain(text: str, *, stopwords: set[str]) -> str:
+     """Apply stopword reduction to plain text.
+
+     Args:
+         text: Text to process
+         stopwords: Pre-lowercased stopwords set for faster comparison
+     """
+     words = WORD_SPLIT_PATTERN.findall(text)
+     if not words:
+         return ""
+
+     filtered_words = []
+
+     for word in words:
+         if len(word) <= 3 and word.isalpha():
+             if word.lower() not in stopwords or word.isupper() or len(word) == 1:
+                 filtered_words.append(word)
+             continue
+
+         match = WORD_BOUNDARY_PATTERN.match(word)
+         if not match:
+             filtered_words.append(word)
+             continue
+
+         _prefix_punct, core_word, suffix_punct = match.groups()
+
+         if not core_word:
+             filtered_words.append(word)
+             continue
+
+         clean_word = core_word.lower() if core_word.isalpha() else WORD_CLEAN_PATTERN.sub("", core_word).lower()
+
+         if not clean_word:
+             filtered_words.append(word)
+             continue
+
+         is_stopword = clean_word in stopwords
+         should_keep = (
+             not is_stopword
+             or len(clean_word) <= 1
+             or (len(core_word) > 1 and core_word.isupper())
+             or any(c.isdigit() for c in core_word)
+         )
+
+         if should_keep:
+             filtered_words.append(word)
+         elif (
+             suffix_punct
+             and suffix_punct in ".,;:!?"
+             and filtered_words
+             and not filtered_words[-1].endswith(suffix_punct)
+         ):
+             filtered_words[-1] += suffix_punct
+
+     return " ".join(filtered_words) if filtered_words else ""
+
+
+ def _apply_stopword_reduction_markdown_aware(text: str, *, stopwords: set[str]) -> str:
+     """Apply stopword reduction preserving markdown structure."""
+     lines = text.split("\n")
+     processed_lines = []
+     in_code_block = False
+
+     for line in lines:
+         if line.strip().startswith("```"):
+             in_code_block = not in_code_block
+             processed_lines.append(line)
+             continue
+
+         if _is_markdown_structural_line(line, in_code_block):
+             processed_lines.append(line)
+             continue
+
+         if line.strip():
+             reduced = _apply_stopword_reduction_plain(line, stopwords=stopwords)
+             processed_lines.append(reduced)
+         else:
+             processed_lines.append(line)
+
+     result = "\n".join(processed_lines)
+     return _normalize_newlines(result).strip()
+
+
+ def get_reduction_stats(original: str, reduced: str) -> ReductionStats:
+     """Get detailed statistics about the reduction.
+
+     Args:
+         original: The original text.
+         reduced: The reduced text.
+
+     Returns:
+         Statistics about the reduction.
+
+     Raises:
+         ValidationError: If inputs are invalid.
+     """
+     if original is None:
+         raise ValidationError("Original text cannot be None")
+
+     if reduced is None:
+         raise ValidationError("Reduced text cannot be None")
+
+     if not isinstance(original, str):
+         raise ValidationError(f"Original text must be a string, got {type(original).__name__}")
+
+     if not isinstance(reduced, str):
+         raise ValidationError(f"Reduced text must be a string, got {type(reduced).__name__}")
+
+     original_chars = len(original)
+     reduced_chars = len(reduced)
+     original_tokens = len(original.split()) if original else 0
+     reduced_tokens = len(reduced.split()) if reduced else 0
+
+     char_reduction = (original_chars - reduced_chars) / original_chars if original_chars > 0 else 0.0
+     token_reduction = (original_tokens - reduced_tokens) / original_tokens if original_tokens > 0 else 0.0
+
+     return ReductionStats(
+         character_reduction_ratio=char_reduction,
+         token_reduction_ratio=token_reduction,
+         original_characters=original_chars,
+         reduced_characters=reduced_chars,
+         original_tokens=original_tokens,
+         reduced_tokens=reduced_tokens,
+     )
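
The new kreuzberg._token_reduction package (re-exported names shown in __init__.py above) is driven by a TokenReductionConfig from kreuzberg._types, which also changes in this release (kreuzberg/_types.py +146 -43) but is not shown here. A minimal usage sketch, assuming the config exposes the fields the reducer reads (mode, preserve_markdown, language_hint, custom_stopwords) as constructor keywords; the exact signature is outside this diff:

# Usage sketch only. TokenReductionConfig's keyword arguments are assumed from
# the fields _reducer.py reads; mode may be "off", "light", or "moderate".
from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
from kreuzberg._types import TokenReductionConfig

config = TokenReductionConfig(mode="moderate", preserve_markdown=True, language_hint="en")

original = "# Title\n\nThis is a rather long sentence with many common stopwords in it."
reduced = reduce_tokens(original, config=config, language="en")

stats = get_reduction_stats(original, reduced)
print(stats["token_reduction_ratio"], stats["character_reduction_ratio"])

get_reduction_stats returns the ReductionStats TypedDict defined above, so the ratios can be read directly from the mapping.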