PyPI - kreuzberg - Versions diffs - 3.19.0__py3-none-any.whl → 3.20.0__py3-none-any.whl - Mend

kreuzberg 3.19.0-py3-none-any.whl → 3.20.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

kreuzberg/_extractors/_html.py +93 -103
kreuzberg/_ocr/_tesseract.py +52 -295
kreuzberg/_types.py +125 -47
{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/METADATA +23 -22
{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/RECORD +8 -9
kreuzberg/_utils/_html_streaming.py +0 -20
{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_extractors/_html.py CHANGED Viewed

@@ -1,20 +1,21 @@
 from __future__ import annotations
-import base64
-import binascii
-import io
 import logging
-from typing import TYPE_CHECKING, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
-import html_to_markdown
 from anyio import Path as AsyncPath
-from bs4 import BeautifulSoup
-from PIL import Image
+from html_to_markdown import HtmlToMarkdownError
+from html_to_markdown._html_to_markdown import (
+    InlineImageConfig,
+    convert_with_inline_images,
+)
+from html_to_markdown._html_to_markdown import (
+    convert as rust_convert,
+)
 from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
-from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_async, run_sync
@@ -41,27 +42,59 @@ class HTMLExtractor(Extractor):
         return result
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        config = self.config.html_to_markdown_config if self.config else None
-        if config is None:
-            config = HTMLToMarkdownConfig()
-        config_dict = config.to_dict()
+        extraction_config = self.config
         html_content = safe_decode(content)
-        use_streaming, chunk_size = should_use_streaming(len(content))
-        config_dict["stream_processing"] = use_streaming
-        config_dict["chunk_size"] = chunk_size
-        result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
-        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
-        if self.config.extract_images:
-            extraction_result.images = self._extract_images_from_html(html_content)
-            if self.config.ocr_extracted_images and extraction_result.images:
+        if extraction_config and extraction_config.html_to_markdown_config is not None:
+            html_config = extraction_config.html_to_markdown_config
+        else:
+            html_config = HTMLToMarkdownConfig()
+        conversion_options, _ = html_config.to_options()
+        extract_inline_images = bool(extraction_config and extraction_config.extract_images)
+        run_ocr_on_images = bool(
+            extraction_config and extraction_config.extract_images and extraction_config.ocr_extracted_images
+        )
+        inline_image_config = None
+        if extract_inline_images:
+            inline_image_config = InlineImageConfig(
+                max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
+                filename_prefix=None,
+                capture_svg=True,
+                infer_dimensions=True,
+            )
+        try:
+            if extract_inline_images:
+                markdown, images_payload, warnings = convert_with_inline_images(
+                    html_content,
+                    options=conversion_options,
+                    image_config=inline_image_config,
+                )
+            else:
+                markdown = rust_convert(
+                    html_content,
+                    conversion_options,
+                )
+                images_payload = []
+                warnings = []
+        except (HtmlToMarkdownError, ValueError) as exc:
+            logger.exception("Failed to convert HTML to Markdown: %s", exc)
+            markdown = ""
+            images_payload = []
+            warnings = []
+        for warning in warnings:
+            self._log_inline_warning(warning)
+        extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})
+        inline_images = [self._build_extracted_image(image) for image in images_payload]
+        if inline_images:
+            extraction_result.images = inline_images
+            if run_ocr_on_images:
                 extraction_result.image_ocr_results = run_maybe_async(
-                    self._process_images_with_ocr, extraction_result.images
+                    self._process_images_with_ocr,
+                    inline_images,
                 )
         return self._apply_quality_processing(extraction_result)
@@ -70,79 +103,36 @@ class HTMLExtractor(Extractor):
         content = path.read_bytes()
         return self.extract_bytes_sync(content)
-    def _extract_images_from_html(self, html_content: str) -> list[ExtractedImage]:
-        images: list[ExtractedImage] = []
-        soup = BeautifulSoup(html_content, "xml")
-        for img in soup.find_all("img"):
-            src_val = img.get("src")
-            if isinstance(src_val, str) and src_val.startswith("data:image/"):
-                try:
-                    header, data = src_val.split(",", 1)
-                    mime_type = header.split(";")[0].split(":")[1]
-                    format_name = mime_type.split("/")[1]
-                    if not data or len(data) < 4:
-                        logger.debug("Skipping empty or too small base64 data")
-                        continue
-                    if len(data) > 67 * 1024 * 1024:
-                        logger.warning("Skipping base64 image larger than 67MB")
-                        continue
-                    image_data = base64.b64decode(data)
-                    if len(image_data) > MAX_SINGLE_IMAGE_SIZE:
-                        logger.warning(
-                            "Skipping decoded image larger than %dMB", MAX_SINGLE_IMAGE_SIZE // (1024 * 1024)
-                        )
-                        continue
-                    dimensions = None
-                    try:
-                        with Image.open(io.BytesIO(image_data)) as pil_img:
-                            dimensions = pil_img.size
-                    except (OSError, ValueError) as e:  # pragma: no cover
-                        logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
-                    alt_val = img.get("alt")
-                    desc = alt_val if isinstance(alt_val, str) else None
-                    images.append(
-                        ExtractedImage(
-                            data=image_data,
-                            format=format_name,
-                            filename=f"embedded_image_{len(images) + 1}.{format_name}",
-                            description=desc,
-                            dimensions=dimensions,
-                        )
-                    )
-                except (ValueError, binascii.Error) as e:
-                    logger.warning("Failed to extract base64 image: %s", e)
-        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
-            try:
-                svg_content = str(svg_element).encode("utf-8")
-                def _get_attr_safe(obj: object, attr: str) -> str | None:
-                    get_method = getattr(obj, "get", None)
-                    if callable(get_method):
-                        result = get_method(attr)
-                        return result if isinstance(result, str) else None
-                    return None
-                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
-                desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
-                return ExtractedImage(
-                    data=svg_content,
-                    format="svg",
-                    filename=f"inline_svg_{len(images) + 1}.svg",
-                    description=desc_svg,
-                )
-            except (UnicodeEncodeError, AttributeError) as e:
-                logger.warning("Failed to extract SVG: %s", e)
-                return None
-        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
-        images.extend(img for img in svg_images if img is not None)
-        return images
+    @staticmethod
+    def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
+        dimensions_value = image.get("dimensions")
+        dimensions = tuple(dimensions_value) if dimensions_value else None
+        return ExtractedImage(
+            data=image["data"],
+            format=image["format"],
+            filename=image.get("filename"),
+            description=image.get("description"),
+            dimensions=dimensions,
+        )
+    @staticmethod
+    def _log_inline_warning(warning: Any) -> None:
+        if isinstance(warning, dict):
+            index = warning.get("index")
+            message = warning.get("message")
+            if index is not None and message:
+                logger.warning("Inline image %s: %s", index, message)
+            elif message:
+                logger.warning("Inline image warning: %s", message)
+            else:
+                logger.warning("Inline image warning received with no message")
+            return
+        message = getattr(warning, "message", None)
+        index = getattr(warning, "index", None)
+        if message and index is not None:
+            logger.warning("Inline image %s: %s", index, message)
+        elif message:
+            logger.warning("Inline image warning: %s", message)
+        else:
+            logger.warning("Inline image warning received with no message")

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -3,6 +3,7 @@ from __future__ import annotations
 import csv
 import hashlib
 import io
+import logging
 import os
 import re
 import subprocess
@@ -14,12 +15,11 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final
 import anyio
-import html_to_markdown
 import polars as pl
 from anyio import Path as AsyncPath
 from anyio import run_process
-from bs4 import BeautifulSoup
-from bs4.element import Tag
+from html_to_markdown import HtmlToMarkdownError
+from html_to_markdown._html_to_markdown import convert as rust_convert
 from PIL import Image
 from PIL.Image import Image as PILImage
 from typing_extensions import Self
@@ -29,15 +29,15 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
-from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
 from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
 from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+logger = logging.getLogger(__name__)
 if TYPE_CHECKING:
-    from bs4.element import Tag
     from PIL.Image import Image as PILImage
 try:  # pragma: no cover
@@ -301,8 +301,15 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             "OFF",
         ]
-        if run_config["tesseract_format"] != "text":
-            command.append(run_config["tesseract_format"])
+        # Handle output format - use config option for HOCR to ensure Windows compatibility
+        # Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
+        tesseract_format = run_config["tesseract_format"]
+        if tesseract_format == "hocr":
+            command.extend(["-c", "tessedit_create_hocr=1"])
+        elif tesseract_format == "tsv":
+            command.append("tsv")
+        elif tesseract_format != "text":
+            command.append(tesseract_format)
         for kwarg, value in run_config["remaining_kwargs"].items():
             if kwarg.startswith("table_"):
@@ -507,220 +514,56 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         table_min_confidence: float = 30.0,
         **_kwargs: Any,
     ) -> ExtractionResult:
-        config = html_to_markdown_config or HTMLToMarkdownConfig()
-        tables: list[TableData] = []
-        if enable_table_detection:
-            soup = BeautifulSoup(hocr_content, "xml")
-            tables = await self._extract_tables_from_hocr(
-                soup,
-                table_column_threshold,
-                table_row_threshold_ratio,
-                table_min_confidence,
-            )
+        _ = (
+            enable_table_detection,
+            table_column_threshold,
+            table_row_threshold_ratio,
+            table_min_confidence,
+        )  # parameters retained for compatibility but handled internally by html-to-markdown
-        hocr_converters = self._create_hocr_converters(tables)
-        all_converters = dict(hocr_converters)
-        if config.custom_converters:
-            all_converters.update(config.custom_converters)
-        config_dict = config.to_dict()
-        config_dict["custom_converters"] = all_converters
-        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
-        config_dict["stream_processing"] = use_streaming
-        config_dict["chunk_size"] = chunk_size
+        config = html_to_markdown_config or HTMLToMarkdownConfig()
+        conversion_options, _ = config.to_options()
         try:
-            markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
+            markdown_content = rust_convert(
+                hocr_content,
+                conversion_options,
+            )
             markdown_content = normalize_spaces(markdown_content)
-        except (ValueError, TypeError, AttributeError):
-            try:
-                soup = BeautifulSoup(hocr_content, "xml")
-                words = soup.find_all("span", class_="ocrx_word")
-                text_parts = []
-                for word in words:
-                    text = word.get_text().strip()
-                    if text:
-                        text_parts.append(text)
-                if text_parts:
-                    markdown_content = " ".join(text_parts)
-                else:
-                    markdown_content = soup.get_text().strip() or "[No text detected]"
-                markdown_content = normalize_spaces(markdown_content)
-            except (ValueError, TypeError, AttributeError):
-                markdown_content = "[OCR processing failed]"
-        if tables:
-            table_sections = []
-            for i, table in enumerate(tables):
-                table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
-            if markdown_content.strip():
-                final_content = f"{markdown_content}\n{''.join(table_sections)}"
-            else:
-                final_content = "".join(table_sections).strip()
-        else:
-            final_content = markdown_content
+        except (HtmlToMarkdownError, ValueError) as exc:
+            logger.exception("Failed to convert hOCR to Markdown: %s", exc)
+            markdown_content = "[OCR processing failed]"
+        tables: list[TableData] = []
         return ExtractionResult(
-            content=final_content,
+            content=markdown_content,
             mime_type=MARKDOWN_MIME_TYPE,
             metadata={"source_format": "hocr", "tables_detected": len(tables)},
             chunks=[],
             tables=tables,
         )
-    def _create_basic_converters(self) -> dict[str, Any]:
-        def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del tag
-            return f"{text.strip()} "
-        def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del tag
-            return f"{text.strip()}\n"
-        def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del tag
-            content = text.strip()
-            if not content:
-                return ""
-            return f"{content}\n\n"
-        def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del tag
-            content = text.strip()
-            if not content:
-                return ""
-            return f"{content}\n\n"
-        def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del tag
-            return text.strip()
-        def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del tag, text
-            return "---\n"
-        def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            del text
-            title = tag.get("title", "")
-            if isinstance(title, str):
-                bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
-                if bbox_match:
-                    x0, y0, x1, y1 = bbox_match.groups()
-                    width = int(x1) - int(x0)
-                    height = int(y1) - int(y0)
-                    return f"*[Image region: {width}x{height} pixels]*\n\n"
-            return "*[Image detected]*\n\n"
-        return {
-            "ocrx_word": ocrx_word_converter,
-            "ocr_line": ocr_line_converter,
-            "ocr_par": ocr_par_converter,
-            "ocr_carea": ocr_carea_converter,
-            "ocr_page": ocr_page_converter,
-            "ocr_separator": ocr_separator_converter,
-            "ocr_photo": ocr_photo_converter,
-        }
-    def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
-        basic_converters = self._create_basic_converters()
-        def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            class_attr = tag.get("class", "")
-            if isinstance(class_attr, list):
-                class_attr = " ".join(class_attr)
-            elif not isinstance(class_attr, str):
-                class_attr = ""
-            for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
-                if class_name in class_attr:
-                    converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
-                    return str(converter_result)
-            return text
-        def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
-            class_attr = tag.get("class", "")
-            if isinstance(class_attr, list):
-                class_attr = " ".join(class_attr)
-            elif not isinstance(class_attr, str):
-                class_attr = ""
-            for class_name in ["ocrx_word", "ocr_line"]:
-                if class_name in class_attr:
-                    converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
-                    return str(converter_result)
-            return f"{text.strip()} "
-        return {
-            "span": generic_span_converter,
-            "div": generic_div_converter,
-            "p": basic_converters["ocr_par"],
-        }
     def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
-        tables: list[TableData] = []
+        _ = config  # retained for interface compatibility
-        if config.enable_table_detection:
-            pass
+        html_config = HTMLToMarkdownConfig()
+        conversion_options, _ = html_config.to_options()
         try:
-            converters = self._create_hocr_converters(tables)
-            html_config = HTMLToMarkdownConfig(
-                custom_converters=converters,
-            )
-            config_dict = html_config.to_dict()
-            use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
-            config_dict["stream_processing"] = use_streaming
-            config_dict["chunk_size"] = chunk_size
-            markdown_content = html_to_markdown.convert_to_markdown(
+            markdown_content = rust_convert(
                 hocr_content,
-                **config_dict,
+                conversion_options,
             )
             markdown_content = normalize_spaces(markdown_content)
+        except (HtmlToMarkdownError, ValueError) as exc:
+            logger.exception("Failed to convert hOCR to Markdown (sync path): %s", exc)
+            markdown_content = "[OCR processing failed]"
-        except (ValueError, TypeError, AttributeError):
-            try:
-                soup = BeautifulSoup(hocr_content, "xml")
-                words = soup.find_all("span", class_="ocrx_word")
-                text_parts = []
-                for word in words:
-                    text = word.get_text().strip()
-                    if text:
-                        text_parts.append(text)
-                if text_parts:
-                    markdown_content = " ".join(text_parts)
-                else:
-                    markdown_content = soup.get_text().strip() or "[No text detected]"
-                markdown_content = normalize_spaces(markdown_content)
-            except (ValueError, TypeError, AttributeError):
-                markdown_content = "[OCR processing failed]"
-        if tables:
-            table_sections = []
-            for i, table in enumerate(tables):
-                table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
-            if markdown_content.strip():
-                final_content = f"{markdown_content}\n{''.join(table_sections)}"
-            else:
-                final_content = "".join(table_sections).strip()
-        else:
-            final_content = markdown_content
+        tables: list[TableData] = []
         return ExtractionResult(
-            content=final_content,
+            content=markdown_content,
             mime_type=MARKDOWN_MIME_TYPE,
             metadata={"source_format": "hocr", "tables_detected": len(tables)},
             chunks=[],
@@ -769,97 +612,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         return text_result
-    async def _extract_tables_from_hocr(
-        self,
-        soup: Any,
-        column_threshold: int = 20,
-        row_threshold_ratio: float = 0.5,
-        min_confidence: float = 30.0,
-    ) -> list[TableData]:
-        tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
-        if not tsv_data:
-            return []
-        if not (words := extract_words(tsv_data, min_confidence=min_confidence)):
-            return []
-        tables: list[TableData] = []
-        try:
-            table_data = reconstruct_table(
-                words,
-                column_threshold=column_threshold,
-                row_threshold_ratio=row_threshold_ratio,
-            )
-            if table_data and len(table_data) > 1:  # ~keep At least header + one data row
-                markdown = to_markdown(table_data)
-                min_x = min(w["left"] for w in words)
-                max_x = max(w["left"] + w["width"] for w in words)
-                min_y = min(w["top"] for w in words)
-                max_y = max(w["top"] + w["height"] for w in words)
-                try:
-                    df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-                except (ImportError, IndexError):  # pragma: no cover
-                    df = None
-                dummy_image = Image.new("RGB", (1, 1), "white")
-                table: TableData = {
-                    "text": markdown,
-                    "df": df,
-                    "page_number": 1,
-                    "cropped_image": dummy_image,
-                    "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
-                }  # type: ignore[typeddict-unknown-key]
-                tables.append(table)
-        except (ValueError, KeyError, ImportError):  # pragma: no cover
-            pass
-        return tables
-    async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
-        tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
-        words = soup.find_all("span", class_="ocrx_word")
-        word_num = 1
-        for word in words:
-            title = word.get("title", "")
-            text = word.get_text().strip()
-            if not text:
-                continue
-            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
-            if not bbox_match:
-                continue
-            x0, y0, x1, y1 = map(int, bbox_match.groups())
-            conf_match = re.search(r"x_wconf (\d+)", title)
-            confidence = float(conf_match.group(1)) if conf_match else 100.0
-            if confidence < min_confidence:
-                continue
-            line = word.find_parent(class_="ocr_line")
-            par = word.find_parent(class_="ocr_par")
-            block = word.find_parent(class_="ocr_carea")
-            tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
-            tsv_lines.append(tsv_line)
-            word_num += 1
-        return "\n".join(tsv_lines)
-    def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
-        if not words:
-            return []
-        return [words]
     @classmethod
     async def _validate_tesseract_version(cls) -> None:
         try:
@@ -1162,7 +914,13 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             "OFF",
         ]
-        if output_format != "text":
+        # Handle output format - use config option for HOCR to ensure Windows compatibility
+        # Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
+        if output_format == "hocr":
+            command.extend(["-c", "tessedit_create_hocr=1"])
+        elif output_format == "tsv":
+            command.append("tsv")
+        elif output_format != "text":
             command.append(output_format)
         for kwarg, value in kwargs.items():
@@ -1296,10 +1054,9 @@ def _process_image_with_tesseract(
             # Process based on output format
             if output_format == "markdown" and tesseract_format == "hocr":
-                # Import here to avoid circular dependency ~keep
-                from html_to_markdown import convert_to_markdown  # noqa: PLC0415
-                text = convert_to_markdown(text, heading_style="atx")
+                html_config = HTMLToMarkdownConfig(heading_style="atx")
+                options, _ = html_config.to_options()
+                text = rust_convert(text, options)
             text = normalize_spaces(text)

kreuzberg/_types.py CHANGED Viewed

@@ -9,6 +9,12 @@ from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
 import langcodes
 import msgspec
+from html_to_markdown._html_to_markdown import (
+    ConversionOptions as HTMLToMarkdownConversionOptions,
+)
+from html_to_markdown._html_to_markdown import (
+    PreprocessingOptions as HTMLToMarkdownPreprocessingOptions,
+)
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
 from kreuzberg._utils._table import (
@@ -1166,71 +1172,143 @@ class ExtractionConfig(ConfigDict):
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class HTMLToMarkdownConfig:
-    autolinks: bool = True
-    """Automatically convert valid URLs to Markdown links."""
-    br_in_tables: bool = False
-    """Use <br> tags for line breaks in table cells instead of spaces."""
+    heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
+    """Style for markdown headings."""
+    list_indent_type: Literal["spaces", "tabs"] = "spaces"
+    """Type of indentation to use for lists."""
+    list_indent_width: int = 4
+    """Number of spaces per indentation level (use 2 for Discord/Slack)."""
     bullets: str = "*+-"
     """Characters to use for unordered list bullets."""
-    code_language: str = ""
-    """Default language identifier for fenced code blocks."""
-    code_language_callback: Callable[[Any], str] | None = None
-    """Function to dynamically determine code block language."""
-    convert: list[str] | None = None
-    """List of HTML tags to convert (None = all supported tags)."""
-    convert_as_inline: bool = False
-    """Treat content as inline elements only."""
-    custom_converters: Mapping[str, Callable[..., str]] | None = None
-    """Mapping of HTML tag names to custom converter functions."""
-    default_title: bool = False
-    """Use default titles for elements like links."""
+    strong_em_symbol: Literal["*", "_"] = "*"
+    """Symbol to use for strong/emphasis formatting."""
     escape_asterisks: bool = False
     """Escape * characters to prevent unintended formatting."""
-    escape_misc: bool = False
-    """Escape miscellaneous characters to prevent Markdown conflicts."""
     escape_underscores: bool = False
     """Escape _ characters to prevent unintended formatting."""
+    escape_misc: bool = False
+    """Escape miscellaneous characters to prevent Markdown conflicts."""
+    escape_ascii: bool = False
+    """Escape all ASCII punctuation."""
+    code_language: str = ""
+    """Default language identifier for fenced code blocks."""
+    code_language_callback: Callable[[Any], str] | None = field(default=None, compare=False, hash=False)
+    """Legacy language callback (no longer used by v2 converter)."""
+    autolinks: bool = True
+    """Automatically convert valid URLs to Markdown links."""
+    default_title: bool = False
+    """Use default titles for elements like links."""
+    keep_inline_images_in: tuple[str, ...] | None = None
+    """Tags where inline images should be preserved."""
+    br_in_tables: bool = False
+    """Use <br> tags for line breaks in table cells instead of spaces."""
+    highlight_style: Literal["double-equal", "html", "bold", "none"] = "double-equal"
+    """Style for highlighting text."""
     extract_metadata: bool = True
     """Extract document metadata as comment header."""
-    heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
-    """Style for markdown headings."""
-    highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
-    """Style for highlighting text."""
-    keep_inline_images_in: list[str] | None = None
-    """Tags where inline images should be preserved."""
-    list_indent_type: Literal["spaces", "tabs"] = "spaces"
-    """Type of indentation to use for lists."""
-    list_indent_width: int = 4
-    """Number of spaces per indentation level (use 2 for Discord/Slack)."""
+    whitespace_mode: Literal["normalized", "strict"] = "normalized"
+    """Whitespace handling mode."""
+    strip_newlines: bool = False
+    """Remove newlines from HTML input before processing."""
+    wrap: bool = False
+    """Enable text wrapping."""
+    wrap_width: int = 80
+    """Width for text wrapping."""
+    convert_as_inline: bool = False
+    """Treat content as inline elements only."""
+    sub_symbol: str = ""
+    """Symbol to use for subscript text."""
+    sup_symbol: str = ""
+    """Symbol to use for superscript text."""
     newline_style: Literal["spaces", "backslash"] = "spaces"
     """Style for line breaks in markdown."""
+    code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
+    """Style for fenced code blocks."""
+    strip_tags: tuple[str, ...] | None = None
+    """List of HTML tags to remove from output."""
+    convert: tuple[str, ...] | None = None
+    """Legacy list of tags to convert (no longer used by v2 converter)."""
+    custom_converters: Mapping[str, Callable[..., str]] | None = field(default=None, compare=False, hash=False)
+    """Legacy mapping of custom converters (ignored by v2 converter)."""
     preprocess_html: bool = False
     """Enable HTML preprocessing to clean messy HTML."""
     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
     """Preprocessing level for cleaning HTML."""
-    remove_forms: bool = True
-    """Remove form elements during preprocessing."""
     remove_navigation: bool = True
     """Remove navigation elements during preprocessing."""
-    strip: list[str] | None = None
-    """List of HTML tags to remove from output."""
-    strip_newlines: bool = False
-    """Remove newlines from HTML input before processing."""
-    strong_em_symbol: Literal["*", "_"] = "*"
-    """Symbol to use for strong/emphasis formatting."""
-    sub_symbol: str = ""
-    """Symbol to use for subscript text."""
-    sup_symbol: str = ""
-    """Symbol to use for superscript text."""
-    whitespace_mode: Literal["normalized", "strict"] = "normalized"
-    """Whitespace handling mode."""
-    wrap: bool = False
-    """Enable text wrapping."""
-    wrap_width: int = 80
-    """Width for text wrapping."""
+    remove_forms: bool = True
+    """Remove form elements during preprocessing."""
+    encoding: str = "utf-8"
+    """Expected character encoding for the HTML input."""
+    debug: bool = False
+    """Enable debug diagnostics in the converter."""
-    def to_dict(self) -> dict[str, Any]:
+    def __post_init__(self) -> None:
+        if self.keep_inline_images_in is not None and not isinstance(self.keep_inline_images_in, tuple):
+            object.__setattr__(self, "keep_inline_images_in", tuple(self.keep_inline_images_in))
+        if self.strip_tags is not None and not isinstance(self.strip_tags, tuple):
+            object.__setattr__(self, "strip_tags", tuple(self.strip_tags))
+        if self.convert is not None and not isinstance(self.convert, tuple):
+            object.__setattr__(self, "convert", tuple(self.convert))
+    def to_options(self) -> tuple[HTMLToMarkdownConversionOptions, HTMLToMarkdownPreprocessingOptions]:
+        """Build html_to_markdown ConversionOptions and PreprocessingOptions instances."""
+        preprocessing = HTMLToMarkdownPreprocessingOptions(
+            enabled=self.preprocess_html,
+            preset=self.preprocessing_preset,
+            remove_navigation=self.remove_navigation,
+            remove_forms=self.remove_forms,
+        )
+        keep_inline_images_in = list(self.keep_inline_images_in) if self.keep_inline_images_in else []
+        strip_tags = list(self.strip_tags) if self.strip_tags else []
+        options = HTMLToMarkdownConversionOptions(
+            heading_style=self.heading_style,
+            list_indent_type=self.list_indent_type,
+            list_indent_width=self.list_indent_width,
+            bullets=self.bullets,
+            strong_em_symbol=self.strong_em_symbol,
+            escape_asterisks=self.escape_asterisks,
+            escape_underscores=self.escape_underscores,
+            escape_misc=self.escape_misc,
+            escape_ascii=self.escape_ascii,
+            code_language=self.code_language,
+            autolinks=self.autolinks,
+            default_title=self.default_title,
+            keep_inline_images_in=keep_inline_images_in,
+            br_in_tables=self.br_in_tables,
+            highlight_style=self.highlight_style,
+            extract_metadata=self.extract_metadata,
+            whitespace_mode=self.whitespace_mode,
+            strip_newlines=self.strip_newlines,
+            wrap=self.wrap,
+            wrap_width=self.wrap_width,
+            convert_as_inline=self.convert_as_inline,
+            sub_symbol=self.sub_symbol,
+            sup_symbol=self.sup_symbol,
+            newline_style=self.newline_style,
+            code_block_style=self.code_block_style,
+            strip_tags=strip_tags,
+            debug=self.debug,
+            encoding=self.encoding,
+        )
+        options.preprocessing = preprocessing
+        return options, preprocessing
+    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
         result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
+        if result.get("keep_inline_images_in") is not None:
+            result["keep_inline_images_in"] = list(result["keep_inline_images_in"])
+        if result.get("strip_tags") is not None:
+            result["strip_tags"] = list(result["strip_tags"])
+        if result.get("convert") is not None:
+            result["convert"] = list(result["convert"])
+        if include_none:
+            return result  # type: ignore[no-any-return]
         return {k: v for k, v in result.items() if v is not None}

{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.19.0
+Version: 3.20.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Database
 Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
 Classifier: Topic :: Office/Business :: Office Suites
@@ -27,69 +28,69 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing :: General
 Classifier: Typing :: Typed
-Requires-Python: >=3.10
+Requires-Python: <3.15,>=3.10
 Requires-Dist: anyio>=4.11.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.16.0
+Requires-Dist: html-to-markdown>=2.1.0
 Requires-Dist: langcodes>=3.5.0
-Requires-Dist: mcp>=1.15.0
+Requires-Dist: mcp>=1.17.0
 Requires-Dist: msgspec>=0.18.0
 Requires-Dist: numpy>=2.0.0
 Requires-Dist: playa-pdf>=0.7.0
-Requires-Dist: polars>=1.33.1
+Requires-Dist: polars>=1.34.0
 Requires-Dist: psutil>=7.1.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.5.3
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: transformers>=4.30.0
+Requires-Dist: transformers>=4.57.0
 Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
 Provides-Extra: additional-extensions
 Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
 Provides-Extra: all
-Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: click>=8.3.0; extra == 'all'
 Requires-Dist: deep-translator>=1.11.4; extra == 'all'
-Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: easyocr>=1.7.2; (python_version < '3.14') and extra == 'all'
 Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
 Requires-Dist: keybert>=0.9.0; extra == 'all'
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0; extra == 'all'
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
-Requires-Dist: paddleocr>=3.2.0; extra == 'all'
-Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
+Requires-Dist: paddleocr>=3.2.0; (python_version < '3.14') and extra == 'all'
+Requires-Dist: paddlepaddle>=3.2.0; (python_version < '3.14') and extra == 'all'
 Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
-Requires-Dist: rich>=14.1.0; extra == 'all'
+Requires-Dist: rich>=14.2.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
-Requires-Dist: spacy>=3.8.7; extra == 'all'
+Requires-Dist: spacy>=3.8.7; (python_version < '3.14') and extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
-Requires-Dist: transformers>=4.25.0; extra == 'all'
+Requires-Dist: transformers>=4.57.0; extra == 'all'
 Provides-Extra: api
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0; extra == 'api'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
 Provides-Extra: cli
-Requires-Dist: click>=8.2.1; extra == 'cli'
-Requires-Dist: rich>=14.1.0; extra == 'cli'
+Requires-Dist: click>=8.3.0; extra == 'cli'
+Requires-Dist: rich>=14.2.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: crypto
 Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
 Provides-Extra: document-classification
 Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
 Provides-Extra: easyocr
-Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
+Requires-Dist: easyocr>=1.7.2; (python_version < '3.14') and extra == 'easyocr'
 Provides-Extra: entity-extraction
 Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
-Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
+Requires-Dist: spacy>=3.8.7; (python_version < '3.14') and extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
-Requires-Dist: transformers>=4.25.0; extra == 'gmft'
+Requires-Dist: transformers>=4.57.0; extra == 'gmft'
 Provides-Extra: langdetect
 Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
 Provides-Extra: paddleocr
-Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
-Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
+Requires-Dist: paddleocr>=3.2.0; (python_version < '3.14') and extra == 'paddleocr'
+Requires-Dist: paddlepaddle>=3.2.0; (python_version < '3.14') and extra == 'paddleocr'
 Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
 Description-Content-Type: text/markdown

{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/RECORD RENAMED Viewed

@@ -11,7 +11,7 @@ kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD
 kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
 kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
 kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
-kreuzberg/_types.py,sha256=6oBsmUUihVr4hJJrYeuWoUVzCP_-eciCrBVvGQHQTDI,49920
+kreuzberg/_types.py,sha256=eh4bZFG3jIw5GhfC3u4R0aa_y9niKZDI4O93j0MCZGw,53672
 kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
 kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
 kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
@@ -22,7 +22,7 @@ kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
 kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
-kreuzberg/_extractors/_html.py,sha256=vNAgBrfok-16SOkhhsy10unqVwAczlTL_2KEn2X6S98,6315
+kreuzberg/_extractors/_html.py,sha256=9AH95f7Lt-agYSOpCv5qRyugn3MdQtX0CNm_pOjovJc,5492
 kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
 kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
 kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
@@ -36,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
 kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
 kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
 kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
-kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
+kreuzberg/_ocr/_tesseract.py,sha256=9F6V72WGi9ExruSNESjz8WGHCXuTYq1M1ctbayhQO0Y,43358
 kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
 kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
 kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
@@ -109,7 +109,6 @@ kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14
 kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
 kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
 kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
-kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
 kreuzberg/_utils/_image_preprocessing.py,sha256=f7ioWQyARnhzj0am0Y1_eteJwWomdPy7AnbXqw2xWBs,10954
 kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
 kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
@@ -122,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
 kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
 kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
 kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
-kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
-kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
-kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-3.19.0.dist-info/RECORD,,
+kreuzberg-3.20.0.dist-info/METADATA,sha256=pmBB6mlIuuD5tYx0_aOWNdHM00gd6nbxgDrXo1gEc6Y,12782
+kreuzberg-3.20.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.20.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.20.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.20.0.dist-info/RECORD,,

kreuzberg/_utils/_html_streaming.py DELETED Viewed

@@ -1,20 +0,0 @@
-from __future__ import annotations
-_STREAMING_THRESHOLD_KB = 10
-_LARGE_FILE_THRESHOLD_MB = 1
-_DEFAULT_CHUNK_SIZE = 2048
-_LARGE_FILE_CHUNK_SIZE = 4096
-_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
-_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
-def should_use_streaming(content_size: int) -> tuple[bool, int]:
-    if content_size < 0:
-        return False, _DEFAULT_CHUNK_SIZE
-    if content_size > _STREAMING_THRESHOLD_BYTES:
-        if content_size > _LARGE_FILE_THRESHOLD_BYTES:
-            return True, _LARGE_FILE_CHUNK_SIZE
-        return True, _DEFAULT_CHUNK_SIZE
-    return False, _DEFAULT_CHUNK_SIZE

{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{kreuzberg-3.19.0.dist-info → kreuzberg-3.20.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

kreuzberg 3.19.0__py3-none-any.whl → 3.20.0__py3-none-any.whl

kreuzberg 3.19.0-py3-none-any.whl → 3.20.0-py3-none-any.whl