kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +17 -8
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +6 -7
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +7 -2
- kreuzberg/_mcp/server.py +1 -22
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +47 -20
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +27 -26
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +146 -43
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +27 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.15.0.dist-info/RECORD +0 -60
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -29,6 +29,7 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
@@ -214,7 +215,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

         try:
             await run_sync(save_image.save, str(image_path), format="PNG")
-        except OSError as e:
+        except OSError as e:  # pragma: no cover
             if "cannot write mode" not in str(e):
                 raise
             save_image = image.convert("RGB")
@@ -356,7 +357,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         try:
             stat = path.stat()
             file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
-        except OSError:
+        except OSError:  # pragma: no cover
             file_info = {"path": str(path), "size": 0, "mtime": 0}

         cache_kwargs = {
@@ -398,7 +399,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             await ocr_cache.aset(extraction_result, **final_cache_kwargs)

             return extraction_result
-        except (RuntimeError, OSError) as e:
+        except (RuntimeError, OSError) as e:  # pragma: no cover
             raise OCRError(f"Failed to OCR using tesseract: {e}") from e
         finally:
             await unlink()
@@ -431,7 +432,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

             try:
                 df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-            except (ImportError, IndexError):
+            except (ImportError, IndexError):  # pragma: no cover
                 df = None

             table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]
@@ -443,7 +444,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 tables=[table],
                 chunks=text_result.chunks,
             )
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass

         return text_result
@@ -506,12 +507,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         table_min_confidence: float = 30.0,
         **_kwargs: Any,
     ) -> ExtractionResult:
-        config = html_to_markdown_config or HTMLToMarkdownConfig(
-            escape_asterisks=False,
-            escape_underscores=False,
-            extract_metadata=False,
-            strip="meta title",
-        )
+        config = html_to_markdown_config or HTMLToMarkdownConfig()

         tables: list[TableData] = []
         if enable_table_detection:
@@ -532,6 +528,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         config_dict = config.to_dict()
         config_dict["custom_converters"] = all_converters

+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         try:
             markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
             markdown_content = normalize_spaces(markdown_content)
@@ -673,15 +673,17 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

         html_config = HTMLToMarkdownConfig(
             custom_converters=converters,
-            escape_asterisks=False,
-            escape_underscores=False,
-            extract_metadata=False,
-            strip="meta title",
         )

+        config_dict = html_config.to_dict()
+
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         markdown_content = html_to_markdown.convert_to_markdown(
             hocr_content,
-            **
+            **config_dict,
         )

         markdown_content = normalize_spaces(markdown_content)
@@ -750,7 +752,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

             try:
                 df = pl.DataFrame(table_data[1:], schema=table_data[0])
-            except (ImportError, IndexError):
+            except (ImportError, IndexError):  # pragma: no cover
                 df = None

             table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]
@@ -762,7 +764,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 tables=[table],
                 chunks=text_result.chunks,
             )
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass

         return text_result
@@ -799,7 +801,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

                 try:
                     df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-                except (ImportError, IndexError):
+                except (ImportError, IndexError):  # pragma: no cover
                     df = None

                 dummy_image = Image.new("RGB", (1, 1), "white")
@@ -812,7 +814,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
                 }  # type: ignore[typeddict-unknown-key]
                 tables.append(table)
-            except (ValueError, KeyError, ImportError):
+            except (ValueError, KeyError, ImportError):  # pragma: no cover
                 pass

         return tables
@@ -868,7 +870,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
         try:
             result = await run_process(command, env=env)
-        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -879,7 +881,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             )

             cls._version_checked = True
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -1076,7 +1078,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 "size": stat.st_size,
                 "mtime": stat.st_mtime,
             }
-        except OSError:
+        except OSError:  # pragma: no cover
             return {
                 "path": str(path),
                 "size": 0,
@@ -1084,7 +1086,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             }

     def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
-        """Convert a worker result dict to ExtractionResult."""
         if result_dict.get("success"):
             return ExtractionResult(
                 content=str(result_dict.get("text", "")),
@@ -1178,7 +1179,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         command = ["tesseract", "--version"]
         try:
             result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
-        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -1189,7 +1190,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             )

             cls._version_checked = True
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
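The hOCR-to-markdown paths above now ask should_use_streaming for a (use_streaming, chunk_size) pair and feed both into html_to_markdown through the config dict. The helper itself lives in the new kreuzberg/_utils/_html_streaming.py (20 added lines, not shown in this diff), so the sketch below is only an illustration of a size-gated helper with the same call shape; the threshold and chunk-size heuristic are assumptions, not the library's actual values.

# Hypothetical sketch of a size-gated streaming decision with the call shape used above.
# The real kreuzberg/_utils/_html_streaming.py is not shown in this diff; the threshold
# and chunk-size heuristic below are illustrative assumptions only.

_ASSUMED_STREAMING_THRESHOLD = 1024 * 1024  # stream anything larger than ~1 MiB


def should_use_streaming(content_size_bytes: int) -> tuple[bool, int]:
    """Return (use_streaming, chunk_size) for a given HTML/hOCR payload size in bytes."""
    if content_size_bytes <= _ASSUMED_STREAMING_THRESHOLD:
        return False, 0
    # Scale the chunk size with the payload, clamped to a sane range (illustrative heuristic).
    chunk_size = min(max(content_size_bytes // 100, 64 * 1024), 1024 * 1024)
    return True, chunk_size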
kreuzberg/_token_reduction/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from kreuzberg._token_reduction._reducer import ReductionStats, get_reduction_stats, reduce_tokens
+from kreuzberg._token_reduction._stopwords import StopwordsManager
+
+__all__ = [
+    "ReductionStats",
+    "StopwordsManager",
+    "get_reduction_stats",
+    "reduce_tokens",
+]
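Together with the kreuzberg/extraction.py and kreuzberg/_types.py changes listed above, this new package is the public surface of token reduction. A usage sketch based only on the signatures visible in this diff (the reducer follows below); the TokenReductionConfig keyword arguments are assumptions inferred from the attributes the reducer reads (mode, preserve_markdown, language_hint, custom_stopwords), since that class's definition is not shown here.

# Usage sketch; TokenReductionConfig's constructor is assumed from the fields read in
# _reducer.py (mode, preserve_markdown, language_hint, custom_stopwords).
from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
from kreuzberg._types import TokenReductionConfig

original = "This is, , a rather verbose   sentence with repeated punctuation!!!"
config = TokenReductionConfig(mode="moderate", preserve_markdown=False)  # assumed kwargs

reduced = reduce_tokens(original, config=config, language="en")
stats = get_reduction_stats(original, reduced)
print(f"tokens: {stats['original_tokens']} -> {stats['reduced_tokens']}")
print(f"characters: {stats['original_characters']} -> {stats['reduced_characters']}")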
kreuzberg/_token_reduction/_reducer.py
ADDED
@@ -0,0 +1,439 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, TypedDict
+
+from kreuzberg._token_reduction._stopwords import get_default_stopwords_manager
+from kreuzberg.exceptions import ValidationError
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from kreuzberg._types import TokenReductionConfig
+
+
+class ReductionStats(TypedDict):
+    """Statistics about token reduction operation."""
+
+    character_reduction_ratio: float
+    token_reduction_ratio: float
+    original_characters: int
+    reduced_characters: int
+    original_tokens: int
+    reduced_tokens: int
+
+
+HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+
+PUNCTUATION_CLEANUP_PATTERN = re.compile(
+    r"([!?.])(?:\1)+"
+    r"|(,)(?:,)+"
+    r"|[!?]+\.+[!?]*|[?!]{3,}"
+)
+
+WHITESPACE_CLEANUP_PATTERN = re.compile(r"\n{3,}|[ \t]+")
+
+MARKDOWN_LIST_PATTERNS = (
+    re.compile(r"^\s*[-*+]\s"),
+    re.compile(r"^\s*\d+\.\s"),
+)
+
+WORD_CLEAN_PATTERN = re.compile(r"[^\w]", re.UNICODE)
+LANGUAGE_CODE_PATTERN = re.compile(r"^[a-zA-Z0-9-]+$")
+
+WORD_SPLIT_PATTERN = re.compile(r"\S+")
+WORD_BOUNDARY_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.UNICODE)
+
+STREAMING_THRESHOLD = 1_000_000
+
+
+def _normalize_unicode(text: str) -> str:
+    """Normalize Unicode text to NFC form for consistent processing."""
+    return unicodedata.normalize("NFC", text)
+
+
+def _normalize_newlines(text: str) -> str:
+    """Remove excessive newlines, keeping at most double newlines."""
+    return WHITESPACE_CLEANUP_PATTERN.sub(lambda m: "\n\n" if m.group().startswith("\n") else " ", text)
+
+
+def _process_text_streaming(
+    text: str, processor_func: Callable[..., str], chunk_size: int = 100_000, **kwargs: Any
+) -> str:
+    """Process large text in chunks to optimize memory usage."""
+    if len(text) <= chunk_size:
+        return processor_func(text, **kwargs)
+
+    chunks = []
+    start = 0
+
+    while start < len(text):
+        end = min(start + chunk_size, len(text))
+
+        if end < len(text):
+            search_start = max(start, end - 1000)
+            for i in range(end - 1, search_start - 1, -1):
+                if text[i] in ".!?\n":
+                    end = i + 1
+                    break
+
+        chunk = text[start:end]
+        processed_chunk = processor_func(chunk, **kwargs)
+        chunks.append(processed_chunk)
+        start = end
+
+    return " ".join(chunks).strip()
+
+
+def _is_markdown_structural_line(line: str, in_code_block: bool) -> bool:
+    """Check if a line contains markdown structural elements that should be preserved."""
+    if in_code_block:
+        return True
+
+    stripped = line.strip()
+
+    if stripped.startswith("#"):
+        return True
+
+    if "|" in line:
+        pipe_count = line.count("|")
+        if pipe_count >= 2 and (line.strip().startswith("|") or line.strip().endswith("|") or " | " in line):
+            return True
+
+    return MARKDOWN_LIST_PATTERNS[0].match(line) is not None or MARKDOWN_LIST_PATTERNS[1].match(line) is not None
+
+
+@lru_cache(maxsize=64)
+def _get_stopwords_with_custom(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+    """Get stopwords for a language, optionally with custom additions."""
+    manager = get_default_stopwords_manager()
+    base_stopwords = manager.get_stopwords(language)
+
+    if custom_words_tuple:
+        return base_stopwords | set(custom_words_tuple)
+    return base_stopwords
+
+
+@lru_cache(maxsize=64)
+def _get_lowercase_stopwords(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+    """Get pre-lowercased stopwords for faster comparison."""
+    stopwords = _get_stopwords_with_custom(language, custom_words_tuple)
+    return {sw.lower() for sw in stopwords}
+
+
+def reduce_tokens(
+    text: str,
+    *,
+    config: TokenReductionConfig,
+    language: str | None = None,
+) -> str:
+    """Reduce tokens in text based on the specified configuration.
+
+    Args:
+        text: The text to reduce.
+        config: Configuration for token reduction.
+        language: Optional language code for stopword selection.
+
+    Returns:
+        The reduced text.
+
+    Raises:
+        ValidationError: If inputs are invalid.
+    """
+    if config is None:
+        raise ValidationError("Config cannot be None")
+
+    if text is None:
+        raise ValidationError("Text cannot be None")
+
+    if not isinstance(text, str):
+        raise ValidationError(f"Text must be a string, got {type(text).__name__}")
+
+    if language is not None and not isinstance(language, str):
+        raise ValidationError(f"Language must be a string or None, got {type(language).__name__}")
+
+    if language is not None and len(language.strip()) == 0:
+        raise ValidationError("Language cannot be empty or whitespace-only")
+
+    if config.mode == "off":
+        return text
+
+    use_streaming = len(text) > STREAMING_THRESHOLD
+
+    if language and not LANGUAGE_CODE_PATTERN.match(language):
+        raise ValidationError(f"Invalid language code format: {language}")
+
+    if not text or not text.strip():
+        return ""
+
+    text = _normalize_unicode(text)
+
+    if config.mode == "light":
+        return _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+    if config.mode == "moderate":
+        return _apply_moderate_reduction(
+            text,
+            config=config,
+            language=language,
+            use_streaming=use_streaming,
+        )
+
+    return text
+
+
+def _apply_light_reduction(text: str, *, preserve_markdown: bool, use_streaming: bool = False) -> str:
+    """Apply light reduction (formatting only)."""
+    if use_streaming:
+        if preserve_markdown:
+            return str(_process_text_streaming(text, _apply_light_reduction_markdown_aware))
+        return str(_process_text_streaming(text, _apply_light_reduction_plain))
+
+    if preserve_markdown:
+        return _apply_light_reduction_markdown_aware(text)
+    return _apply_light_reduction_plain(text)
+
+
+def _apply_light_reduction_plain(text: str) -> str:
+    """Apply light reduction to plain text."""
+    text = HTML_COMMENT_PATTERN.sub("", text)
+
+    def punctuation_replacer(match: re.Match[str]) -> str:
+        if match.group(1):
+            return match.group(1)
+        if match.group(2):
+            return ","
+        return "?"
+
+    text = PUNCTUATION_CLEANUP_PATTERN.sub(punctuation_replacer, text)
+
+    def whitespace_replacer(match: re.Match[str]) -> str:
+        if match.group().startswith("\n"):
+            return "\n\n"
+        return " "
+
+    text = WHITESPACE_CLEANUP_PATTERN.sub(whitespace_replacer, text)
+
+    return text.strip()
+
+
+def _apply_light_reduction_markdown_aware(text: str) -> str:
+    """Apply light reduction preserving markdown structure."""
+    lines = text.split("\n")
+    processed_lines = []
+    in_code_block = False
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            processed_lines.append(line)
+            continue
+
+        if _is_markdown_structural_line(line, in_code_block) or in_code_block:
+            processed_lines.append(line)
+            continue
+
+        if line.strip():
+            reduced = _apply_light_reduction_plain(line)
+            processed_lines.append(reduced)
+        else:
+            processed_lines.append(line)
+
+    result = "\n".join(processed_lines)
+
+    lines = result.split("\n")
+    normalized_lines = []
+    in_code_block = False
+    consecutive_empty = 0
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            normalized_lines.append(line)
+            consecutive_empty = 0
+            continue
+
+        if in_code_block:
+            normalized_lines.append(line)
+            consecutive_empty = 0
+        elif not line.strip():
+            consecutive_empty += 1
+            if consecutive_empty <= 2:
+                normalized_lines.append(line)
+        else:
+            normalized_lines.append(line)
+            consecutive_empty = 0
+
+    return "\n".join(normalized_lines).strip()
+
+
+def _apply_moderate_reduction(
+    text: str,
+    *,
+    config: TokenReductionConfig,
+    language: str | None = None,
+    use_streaming: bool = False,
+) -> str:
+    """Apply moderate reduction (formatting + stopwords)."""
+    text = _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+    lang = language or config.language_hint or "en"
+
+    manager = get_default_stopwords_manager()
+    if not manager.has_language(lang):
+        lang = "en"
+        if not manager.has_language("en"):
+            return text
+
+    custom_words_tuple = None
+    if config.custom_stopwords and lang in config.custom_stopwords:
+        custom_words_tuple = tuple(sorted(config.custom_stopwords[lang]))
+
+    if use_streaming:
+        if config.preserve_markdown:
+            return str(
+                _process_text_streaming(
+                    text,
+                    _apply_stopword_reduction_markdown_aware,
+                    stopwords=_get_lowercase_stopwords(lang, custom_words_tuple),
+                )
+            )
+        return str(
+            _process_text_streaming(
+                text, _apply_stopword_reduction_plain, stopwords=_get_lowercase_stopwords(lang, custom_words_tuple)
+            )
+        )
+
+    stopwords = _get_lowercase_stopwords(lang, custom_words_tuple)
+
+    if config.preserve_markdown:
+        return _apply_stopword_reduction_markdown_aware(text, stopwords=stopwords)
+    return _apply_stopword_reduction_plain(text, stopwords=stopwords)
+
+
+def _apply_stopword_reduction_plain(text: str, *, stopwords: set[str]) -> str:
+    """Apply stopword reduction to plain text.
+
+    Args:
+        text: Text to process
+        stopwords: Pre-lowercased stopwords set for faster comparison
+    """
+    words = WORD_SPLIT_PATTERN.findall(text)
+    if not words:
+        return ""
+
+    filtered_words = []
+
+    for word in words:
+        if len(word) <= 3 and word.isalpha():
+            if word.lower() not in stopwords or word.isupper() or len(word) == 1:
+                filtered_words.append(word)
+            continue
+
+        match = WORD_BOUNDARY_PATTERN.match(word)
+        if not match:
+            filtered_words.append(word)
+            continue
+
+        _prefix_punct, core_word, suffix_punct = match.groups()
+
+        if not core_word:
+            filtered_words.append(word)
+            continue
+
+        clean_word = core_word.lower() if core_word.isalpha() else WORD_CLEAN_PATTERN.sub("", core_word).lower()
+
+        if not clean_word:
+            filtered_words.append(word)
+            continue
+
+        is_stopword = clean_word in stopwords
+        should_keep = (
+            not is_stopword
+            or len(clean_word) <= 1
+            or (len(core_word) > 1 and core_word.isupper())
+            or any(c.isdigit() for c in core_word)
+        )
+
+        if should_keep:
+            filtered_words.append(word)
+        elif (
+            suffix_punct
+            and suffix_punct in ".,;:!?"
+            and filtered_words
+            and not filtered_words[-1].endswith(suffix_punct)
+        ):
+            filtered_words[-1] += suffix_punct
+
+    return " ".join(filtered_words) if filtered_words else ""
+
+
+def _apply_stopword_reduction_markdown_aware(text: str, *, stopwords: set[str]) -> str:
+    """Apply stopword reduction preserving markdown structure."""
+    lines = text.split("\n")
+    processed_lines = []
+    in_code_block = False
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            processed_lines.append(line)
+            continue
+
+        if _is_markdown_structural_line(line, in_code_block):
+            processed_lines.append(line)
+            continue
+
+        if line.strip():
+            reduced = _apply_stopword_reduction_plain(line, stopwords=stopwords)
+            processed_lines.append(reduced)
+        else:
+            processed_lines.append(line)
+
+    result = "\n".join(processed_lines)
+    return _normalize_newlines(result).strip()
+
+
+def get_reduction_stats(original: str, reduced: str) -> ReductionStats:
+    """Get detailed statistics about the reduction.
+
+    Args:
+        original: The original text.
+        reduced: The reduced text.
+
+    Returns:
+        Statistics about the reduction.
+
+    Raises:
+        ValidationError: If inputs are invalid.
+    """
+    if original is None:
+        raise ValidationError("Original text cannot be None")
+
+    if reduced is None:
+        raise ValidationError("Reduced text cannot be None")
+
+    if not isinstance(original, str):
+        raise ValidationError(f"Original text must be a string, got {type(original).__name__}")
+
+    if not isinstance(reduced, str):
+        raise ValidationError(f"Reduced text must be a string, got {type(reduced).__name__}")
+
+    original_chars = len(original)
+    reduced_chars = len(reduced)
+    original_tokens = len(original.split()) if original else 0
+    reduced_tokens = len(reduced.split()) if reduced else 0
+
+    char_reduction = (original_chars - reduced_chars) / original_chars if original_chars > 0 else 0.0
+    token_reduction = (original_tokens - reduced_tokens) / original_tokens if original_tokens > 0 else 0.0
+
+    return ReductionStats(
+        character_reduction_ratio=char_reduction,
+        token_reduction_ratio=token_reduction,
+        original_characters=original_chars,
+        reduced_characters=reduced_chars,
+        original_tokens=original_tokens,
+        reduced_tokens=reduced_tokens,
+    )
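get_reduction_stats is pure ratio arithmetic over character counts and whitespace-split tokens, so its output is easy to verify by hand. A small worked example (the reduced string here is supplied manually for illustration, not produced by reduce_tokens):

# Worked example of the ratio arithmetic in get_reduction_stats.
from kreuzberg._token_reduction import get_reduction_stats

original = "the quick brown fox jumps over the lazy dog"  # 43 characters, 9 tokens
reduced = "quick brown fox jumps lazy dog"  # 30 characters, 6 tokens

stats = get_reduction_stats(original, reduced)
assert round(stats["character_reduction_ratio"], 3) == 0.302  # (43 - 30) / 43
assert round(stats["token_reduction_ratio"], 3) == 0.333  # (9 - 6) / 9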