kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +17 -8
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +6 -7
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +7 -2
- kreuzberg/_mcp/server.py +1 -22
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +47 -20
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +27 -26
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +146 -43
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +27 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.15.0.dist-info/RECORD +0 -60
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -29,6 +29,7 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
@@ -214,7 +215,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

         try:
             await run_sync(save_image.save, str(image_path), format="PNG")
-        except OSError as e:
+        except OSError as e:  # pragma: no cover
             if "cannot write mode" not in str(e):
                 raise
             save_image = image.convert("RGB")
@@ -356,7 +357,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         try:
             stat = path.stat()
             file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
-        except OSError:
+        except OSError:  # pragma: no cover
             file_info = {"path": str(path), "size": 0, "mtime": 0}

         cache_kwargs = {
@@ -398,7 +399,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             await ocr_cache.aset(extraction_result, **final_cache_kwargs)

             return extraction_result
-        except (RuntimeError, OSError) as e:
+        except (RuntimeError, OSError) as e:  # pragma: no cover
             raise OCRError(f"Failed to OCR using tesseract: {e}") from e
         finally:
             await unlink()
@@ -431,7 +432,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

             try:
                 df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-            except (ImportError, IndexError):
+            except (ImportError, IndexError):  # pragma: no cover
                 df = None

             table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]
@@ -443,7 +444,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 tables=[table],
                 chunks=text_result.chunks,
             )
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass

         return text_result
@@ -506,12 +507,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         table_min_confidence: float = 30.0,
         **_kwargs: Any,
     ) -> ExtractionResult:
-        config = html_to_markdown_config or HTMLToMarkdownConfig(
-            escape_asterisks=False,
-            escape_underscores=False,
-            extract_metadata=False,
-            strip="meta title",
-        )
+        config = html_to_markdown_config or HTMLToMarkdownConfig()

         tables: list[TableData] = []
         if enable_table_detection:
@@ -532,6 +528,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         config_dict = config.to_dict()
         config_dict["custom_converters"] = all_converters

+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         try:
             markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
             markdown_content = normalize_spaces(markdown_content)
@@ -673,15 +673,17 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

         html_config = HTMLToMarkdownConfig(
             custom_converters=converters,
-            escape_asterisks=False,
-            escape_underscores=False,
-            extract_metadata=False,
-            strip="meta title",
         )

+        config_dict = html_config.to_dict()
+
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         markdown_content = html_to_markdown.convert_to_markdown(
             hocr_content,
-            **
+            **config_dict,
         )

         markdown_content = normalize_spaces(markdown_content)
@@ -750,7 +752,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

             try:
                 df = pl.DataFrame(table_data[1:], schema=table_data[0])
-            except (ImportError, IndexError):
+            except (ImportError, IndexError):  # pragma: no cover
                 df = None

             table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]
@@ -762,7 +764,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 tables=[table],
                 chunks=text_result.chunks,
             )
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass

         return text_result
@@ -799,7 +801,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):

                 try:
                     df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-                except (ImportError, IndexError):
+                except (ImportError, IndexError):  # pragma: no cover
                     df = None

                 dummy_image = Image.new("RGB", (1, 1), "white")
@@ -812,7 +814,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
                 }  # type: ignore[typeddict-unknown-key]
                 tables.append(table)
-            except (ValueError, KeyError, ImportError):
+            except (ValueError, KeyError, ImportError):  # pragma: no cover
                 pass

         return tables
@@ -868,7 +870,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
         try:
             result = await run_process(command, env=env)
-        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -879,7 +881,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             )

             cls._version_checked = True
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -1076,7 +1078,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 "size": stat.st_size,
                 "mtime": stat.st_mtime,
             }
-        except OSError:
+        except OSError:  # pragma: no cover
             return {
                 "path": str(path),
                 "size": 0,
@@ -1084,7 +1086,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             }

     def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
-        """Convert a worker result dict to ExtractionResult."""
         if result_dict.get("success"):
             return ExtractionResult(
                 content=str(result_dict.get("text", "")),
@@ -1178,7 +1179,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         command = ["tesseract", "--version"]
         try:
             result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
-        except (subprocess.CalledProcessError, FileNotFoundError) as e:
+        except (subprocess.CalledProcessError, FileNotFoundError) as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -1189,7 +1190,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             )

             cls._version_checked = True
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
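The hOCR-to-markdown paths above now ask should_use_streaming for a (use_streaming, chunk_size) pair and feed both into html_to_markdown through the config dict. The helper itself lives in the new kreuzberg/_utils/_html_streaming.py (20 added lines, not shown in this diff), so the sketch below is only an illustration of a size-gated helper with the same call shape; the threshold and chunk-size heuristic are assumptions, not the library's actual values.

# Hypothetical sketch of a size-gated streaming decision with the call shape used above.
# The real kreuzberg/_utils/_html_streaming.py is not shown in this diff; the threshold
# and chunk-size heuristic below are illustrative assumptions only.

_ASSUMED_STREAMING_THRESHOLD = 1024 * 1024  # stream anything larger than ~1 MiB


def should_use_streaming(content_size_bytes: int) -> tuple[bool, int]:
    """Return (use_streaming, chunk_size) for a given HTML/hOCR payload size in bytes."""
    if content_size_bytes <= _ASSUMED_STREAMING_THRESHOLD:
        return False, 0
    # Scale the chunk size with the payload, clamped to a sane range (illustrative heuristic).
    chunk_size = min(max(content_size_bytes // 100, 64 * 1024), 1024 * 1024)
    return True, chunk_size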
kreuzberg/_token_reduction/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from kreuzberg._token_reduction._reducer import ReductionStats, get_reduction_stats, reduce_tokens
+from kreuzberg._token_reduction._stopwords import StopwordsManager
+
+__all__ = [
+    "ReductionStats",
+    "StopwordsManager",
+    "get_reduction_stats",
+    "reduce_tokens",
+]
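Together with the kreuzberg/extraction.py and kreuzberg/_types.py changes listed above, this new package is the public surface of token reduction. A usage sketch based only on the signatures visible in this diff (the reducer follows below); the TokenReductionConfig keyword arguments are assumptions inferred from the attributes the reducer reads (mode, preserve_markdown, language_hint, custom_stopwords), since that class's definition is not shown here.

# Usage sketch; TokenReductionConfig's constructor is assumed from the fields read in
# _reducer.py (mode, preserve_markdown, language_hint, custom_stopwords).
from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
from kreuzberg._types import TokenReductionConfig

original = "This is, , a rather verbose   sentence with repeated punctuation!!!"
config = TokenReductionConfig(mode="moderate", preserve_markdown=False)  # assumed kwargs

reduced = reduce_tokens(original, config=config, language="en")
stats = get_reduction_stats(original, reduced)
print(f"tokens: {stats['original_tokens']} -> {stats['reduced_tokens']}")
print(f"characters: {stats['original_characters']} -> {stats['reduced_characters']}")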
kreuzberg/_token_reduction/_reducer.py
ADDED
@@ -0,0 +1,439 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, TypedDict
+
+from kreuzberg._token_reduction._stopwords import get_default_stopwords_manager
+from kreuzberg.exceptions import ValidationError
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from kreuzberg._types import TokenReductionConfig
+
+
+class ReductionStats(TypedDict):
+    """Statistics about token reduction operation."""
+
+    character_reduction_ratio: float
+    token_reduction_ratio: float
+    original_characters: int
+    reduced_characters: int
+    original_tokens: int
+    reduced_tokens: int
+
+
+HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+
+PUNCTUATION_CLEANUP_PATTERN = re.compile(
+    r"([!?.])(?:\1)+"
+    r"|(,)(?:,)+"
+    r"|[!?]+\.+[!?]*|[?!]{3,}"
+)
+
+WHITESPACE_CLEANUP_PATTERN = re.compile(r"\n{3,}|[ \t]+")
+
+MARKDOWN_LIST_PATTERNS = (
+    re.compile(r"^\s*[-*+]\s"),
+    re.compile(r"^\s*\d+\.\s"),
+)
+
+WORD_CLEAN_PATTERN = re.compile(r"[^\w]", re.UNICODE)
+LANGUAGE_CODE_PATTERN = re.compile(r"^[a-zA-Z0-9-]+$")
+
+WORD_SPLIT_PATTERN = re.compile(r"\S+")
+WORD_BOUNDARY_PATTERN = re.compile(r"^(\W*)(.*?)(\W*)$", re.UNICODE)
+
+STREAMING_THRESHOLD = 1_000_000
+
+
+def _normalize_unicode(text: str) -> str:
+    """Normalize Unicode text to NFC form for consistent processing."""
+    return unicodedata.normalize("NFC", text)
+
+
+def _normalize_newlines(text: str) -> str:
+    """Remove excessive newlines, keeping at most double newlines."""
+    return WHITESPACE_CLEANUP_PATTERN.sub(lambda m: "\n\n" if m.group().startswith("\n") else " ", text)
+
+
+def _process_text_streaming(
+    text: str, processor_func: Callable[..., str], chunk_size: int = 100_000, **kwargs: Any
+) -> str:
+    """Process large text in chunks to optimize memory usage."""
+    if len(text) <= chunk_size:
+        return processor_func(text, **kwargs)
+
+    chunks = []
+    start = 0
+
+    while start < len(text):
+        end = min(start + chunk_size, len(text))
+
+        if end < len(text):
+            search_start = max(start, end - 1000)
+            for i in range(end - 1, search_start - 1, -1):
+                if text[i] in ".!?\n":
+                    end = i + 1
+                    break
+
+        chunk = text[start:end]
+        processed_chunk = processor_func(chunk, **kwargs)
+        chunks.append(processed_chunk)
+        start = end
+
+    return " ".join(chunks).strip()
+
+
+def _is_markdown_structural_line(line: str, in_code_block: bool) -> bool:
+    """Check if a line contains markdown structural elements that should be preserved."""
+    if in_code_block:
+        return True
+
+    stripped = line.strip()
+
+    if stripped.startswith("#"):
+        return True
+
+    if "|" in line:
+        pipe_count = line.count("|")
+        if pipe_count >= 2 and (line.strip().startswith("|") or line.strip().endswith("|") or " | " in line):
+            return True
+
+    return MARKDOWN_LIST_PATTERNS[0].match(line) is not None or MARKDOWN_LIST_PATTERNS[1].match(line) is not None
+
+
+@lru_cache(maxsize=64)
+def _get_stopwords_with_custom(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+    """Get stopwords for a language, optionally with custom additions."""
+    manager = get_default_stopwords_manager()
+    base_stopwords = manager.get_stopwords(language)
+
+    if custom_words_tuple:
+        return base_stopwords | set(custom_words_tuple)
+    return base_stopwords
+
+
+@lru_cache(maxsize=64)
+def _get_lowercase_stopwords(language: str, custom_words_tuple: tuple[str, ...] | None = None) -> set[str]:
+    """Get pre-lowercased stopwords for faster comparison."""
+    stopwords = _get_stopwords_with_custom(language, custom_words_tuple)
+    return {sw.lower() for sw in stopwords}
+
+
+def reduce_tokens(
+    text: str,
+    *,
+    config: TokenReductionConfig,
+    language: str | None = None,
+) -> str:
+    """Reduce tokens in text based on the specified configuration.
+
+    Args:
+        text: The text to reduce.
+        config: Configuration for token reduction.
+        language: Optional language code for stopword selection.
+
+    Returns:
+        The reduced text.
+
+    Raises:
+        ValidationError: If inputs are invalid.
+    """
+    if config is None:
+        raise ValidationError("Config cannot be None")
+
+    if text is None:
+        raise ValidationError("Text cannot be None")
+
+    if not isinstance(text, str):
+        raise ValidationError(f"Text must be a string, got {type(text).__name__}")
+
+    if language is not None and not isinstance(language, str):
+        raise ValidationError(f"Language must be a string or None, got {type(language).__name__}")
+
+    if language is not None and len(language.strip()) == 0:
+        raise ValidationError("Language cannot be empty or whitespace-only")
+
+    if config.mode == "off":
+        return text
+
+    use_streaming = len(text) > STREAMING_THRESHOLD
+
+    if language and not LANGUAGE_CODE_PATTERN.match(language):
+        raise ValidationError(f"Invalid language code format: {language}")
+
+    if not text or not text.strip():
+        return ""
+
+    text = _normalize_unicode(text)
+
+    if config.mode == "light":
+        return _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+    if config.mode == "moderate":
+        return _apply_moderate_reduction(
+            text,
+            config=config,
+            language=language,
+            use_streaming=use_streaming,
+        )
+
+    return text
+
+
+def _apply_light_reduction(text: str, *, preserve_markdown: bool, use_streaming: bool = False) -> str:
+    """Apply light reduction (formatting only)."""
+    if use_streaming:
+        if preserve_markdown:
+            return str(_process_text_streaming(text, _apply_light_reduction_markdown_aware))
+        return str(_process_text_streaming(text, _apply_light_reduction_plain))
+
+    if preserve_markdown:
+        return _apply_light_reduction_markdown_aware(text)
+    return _apply_light_reduction_plain(text)
+
+
+def _apply_light_reduction_plain(text: str) -> str:
+    """Apply light reduction to plain text."""
+    text = HTML_COMMENT_PATTERN.sub("", text)
+
+    def punctuation_replacer(match: re.Match[str]) -> str:
+        if match.group(1):
+            return match.group(1)
+        if match.group(2):
+            return ","
+        return "?"
+
+    text = PUNCTUATION_CLEANUP_PATTERN.sub(punctuation_replacer, text)
+
+    def whitespace_replacer(match: re.Match[str]) -> str:
+        if match.group().startswith("\n"):
+            return "\n\n"
+        return " "
+
+    text = WHITESPACE_CLEANUP_PATTERN.sub(whitespace_replacer, text)
+
+    return text.strip()
+
+
+def _apply_light_reduction_markdown_aware(text: str) -> str:
+    """Apply light reduction preserving markdown structure."""
+    lines = text.split("\n")
+    processed_lines = []
+    in_code_block = False
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            processed_lines.append(line)
+            continue
+
+        if _is_markdown_structural_line(line, in_code_block) or in_code_block:
+            processed_lines.append(line)
+            continue
+
+        if line.strip():
+            reduced = _apply_light_reduction_plain(line)
+            processed_lines.append(reduced)
+        else:
+            processed_lines.append(line)
+
+    result = "\n".join(processed_lines)
+
+    lines = result.split("\n")
+    normalized_lines = []
+    in_code_block = False
+    consecutive_empty = 0
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            normalized_lines.append(line)
+            consecutive_empty = 0
+            continue
+
+        if in_code_block:
+            normalized_lines.append(line)
+            consecutive_empty = 0
+        elif not line.strip():
+            consecutive_empty += 1
+            if consecutive_empty <= 2:
+                normalized_lines.append(line)
+        else:
+            normalized_lines.append(line)
+            consecutive_empty = 0
+
+    return "\n".join(normalized_lines).strip()
+
+
+def _apply_moderate_reduction(
+    text: str,
+    *,
+    config: TokenReductionConfig,
+    language: str | None = None,
+    use_streaming: bool = False,
+) -> str:
+    """Apply moderate reduction (formatting + stopwords)."""
+    text = _apply_light_reduction(text, preserve_markdown=config.preserve_markdown, use_streaming=use_streaming)
+
+    lang = language or config.language_hint or "en"
+
+    manager = get_default_stopwords_manager()
+    if not manager.has_language(lang):
+        lang = "en"
+        if not manager.has_language("en"):
+            return text
+
+    custom_words_tuple = None
+    if config.custom_stopwords and lang in config.custom_stopwords:
+        custom_words_tuple = tuple(sorted(config.custom_stopwords[lang]))
+
+    if use_streaming:
+        if config.preserve_markdown:
+            return str(
+                _process_text_streaming(
+                    text,
+                    _apply_stopword_reduction_markdown_aware,
+                    stopwords=_get_lowercase_stopwords(lang, custom_words_tuple),
+                )
+            )
+        return str(
+            _process_text_streaming(
+                text, _apply_stopword_reduction_plain, stopwords=_get_lowercase_stopwords(lang, custom_words_tuple)
+            )
+        )
+
+    stopwords = _get_lowercase_stopwords(lang, custom_words_tuple)
+
+    if config.preserve_markdown:
+        return _apply_stopword_reduction_markdown_aware(text, stopwords=stopwords)
+    return _apply_stopword_reduction_plain(text, stopwords=stopwords)
+
+
+def _apply_stopword_reduction_plain(text: str, *, stopwords: set[str]) -> str:
+    """Apply stopword reduction to plain text.
+
+    Args:
+        text: Text to process
+        stopwords: Pre-lowercased stopwords set for faster comparison
+    """
+    words = WORD_SPLIT_PATTERN.findall(text)
+    if not words:
+        return ""
+
+    filtered_words = []
+
+    for word in words:
+        if len(word) <= 3 and word.isalpha():
+            if word.lower() not in stopwords or word.isupper() or len(word) == 1:
+                filtered_words.append(word)
+            continue
+
+        match = WORD_BOUNDARY_PATTERN.match(word)
+        if not match:
+            filtered_words.append(word)
+            continue
+
+        _prefix_punct, core_word, suffix_punct = match.groups()
+
+        if not core_word:
+            filtered_words.append(word)
+            continue
+
+        clean_word = core_word.lower() if core_word.isalpha() else WORD_CLEAN_PATTERN.sub("", core_word).lower()
+
+        if not clean_word:
+            filtered_words.append(word)
+            continue
+
+        is_stopword = clean_word in stopwords
+        should_keep = (
+            not is_stopword
+            or len(clean_word) <= 1
+            or (len(core_word) > 1 and core_word.isupper())
+            or any(c.isdigit() for c in core_word)
+        )
+
+        if should_keep:
+            filtered_words.append(word)
+        elif (
+            suffix_punct
+            and suffix_punct in ".,;:!?"
+            and filtered_words
+            and not filtered_words[-1].endswith(suffix_punct)
+        ):
+            filtered_words[-1] += suffix_punct
+
+    return " ".join(filtered_words) if filtered_words else ""
+
+
+def _apply_stopword_reduction_markdown_aware(text: str, *, stopwords: set[str]) -> str:
+    """Apply stopword reduction preserving markdown structure."""
+    lines = text.split("\n")
+    processed_lines = []
+    in_code_block = False
+
+    for line in lines:
+        if line.strip().startswith("```"):
+            in_code_block = not in_code_block
+            processed_lines.append(line)
+            continue
+
+        if _is_markdown_structural_line(line, in_code_block):
+            processed_lines.append(line)
+            continue
+
+        if line.strip():
+            reduced = _apply_stopword_reduction_plain(line, stopwords=stopwords)
+            processed_lines.append(reduced)
+        else:
+            processed_lines.append(line)
+
+    result = "\n".join(processed_lines)
+    return _normalize_newlines(result).strip()
+
+
+def get_reduction_stats(original: str, reduced: str) -> ReductionStats:
+    """Get detailed statistics about the reduction.
+
+    Args:
+        original: The original text.
+        reduced: The reduced text.
+
+    Returns:
+        Statistics about the reduction.
+
+    Raises:
+        ValidationError: If inputs are invalid.
+    """
+    if original is None:
+        raise ValidationError("Original text cannot be None")
+
+    if reduced is None:
+        raise ValidationError("Reduced text cannot be None")
+
+    if not isinstance(original, str):
+        raise ValidationError(f"Original text must be a string, got {type(original).__name__}")
+
+    if not isinstance(reduced, str):
+        raise ValidationError(f"Reduced text must be a string, got {type(reduced).__name__}")
+
+    original_chars = len(original)
+    reduced_chars = len(reduced)
+    original_tokens = len(original.split()) if original else 0
+    reduced_tokens = len(reduced.split()) if reduced else 0
+
+    char_reduction = (original_chars - reduced_chars) / original_chars if original_chars > 0 else 0.0
+    token_reduction = (original_tokens - reduced_tokens) / original_tokens if original_tokens > 0 else 0.0
+
+    return ReductionStats(
+        character_reduction_ratio=char_reduction,
+        token_reduction_ratio=token_reduction,
+        original_characters=original_chars,
+        reduced_characters=reduced_chars,
+        original_tokens=original_tokens,
+        reduced_tokens=reduced_tokens,
+    )
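get_reduction_stats is pure ratio arithmetic over character counts and whitespace-split tokens, so its output is easy to verify by hand. A small worked example (the reduced string here is supplied manually for illustration, not produced by reduce_tokens):

# Worked example of the ratio arithmetic in get_reduction_stats.
from kreuzberg._token_reduction import get_reduction_stats

original = "the quick brown fox jumps over the lazy dog"  # 43 characters, 9 tokens
reduced = "quick brown fox jumps lazy dog"  # 30 characters, 6 tokens

stats = get_reduction_stats(original, reduced)
assert round(stats["character_reduction_ratio"], 3) == 0.302  # (43 - 30) / 43
assert round(stats["token_reduction_ratio"], 3) == 0.333  # (9 - 6) / 9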