kreuzberg 3.19.0__py3-none-any.whl → 3.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,20 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- import base64
4
- import binascii
5
- import io
6
3
  import logging
7
- from typing import TYPE_CHECKING, ClassVar
4
+ from typing import TYPE_CHECKING, Any, ClassVar
8
5
 
9
- import html_to_markdown
10
6
  from anyio import Path as AsyncPath
11
- from bs4 import BeautifulSoup
12
- from PIL import Image
7
+ from html_to_markdown import HtmlToMarkdownError
8
+ from html_to_markdown._html_to_markdown import (
9
+ InlineImageConfig,
10
+ convert_with_inline_images,
11
+ )
12
+ from html_to_markdown._html_to_markdown import (
13
+ convert as rust_convert,
14
+ )
13
15
 
14
16
  from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
15
17
  from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
16
18
  from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
17
- from kreuzberg._utils._html_streaming import should_use_streaming
18
19
  from kreuzberg._utils._string import safe_decode
19
20
  from kreuzberg._utils._sync import run_maybe_async, run_sync
20
21
 
@@ -41,27 +42,59 @@ class HTMLExtractor(Extractor):
41
42
  return result
42
43
 
43
44
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
44
- config = self.config.html_to_markdown_config if self.config else None
45
- if config is None:
46
- config = HTMLToMarkdownConfig()
47
-
48
- config_dict = config.to_dict()
49
-
45
+ extraction_config = self.config
50
46
  html_content = safe_decode(content)
51
-
52
- use_streaming, chunk_size = should_use_streaming(len(content))
53
- config_dict["stream_processing"] = use_streaming
54
- config_dict["chunk_size"] = chunk_size
55
-
56
- result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
57
-
58
- extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
59
-
60
- if self.config.extract_images:
61
- extraction_result.images = self._extract_images_from_html(html_content)
62
- if self.config.ocr_extracted_images and extraction_result.images:
47
+ if extraction_config and extraction_config.html_to_markdown_config is not None:
48
+ html_config = extraction_config.html_to_markdown_config
49
+ else:
50
+ html_config = HTMLToMarkdownConfig()
51
+ conversion_options, _ = html_config.to_options()
52
+
53
+ extract_inline_images = bool(extraction_config and extraction_config.extract_images)
54
+ run_ocr_on_images = bool(
55
+ extraction_config and extraction_config.extract_images and extraction_config.ocr_extracted_images
56
+ )
57
+ inline_image_config = None
58
+ if extract_inline_images:
59
+ inline_image_config = InlineImageConfig(
60
+ max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
61
+ filename_prefix=None,
62
+ capture_svg=True,
63
+ infer_dimensions=True,
64
+ )
65
+
66
+ try:
67
+ if extract_inline_images:
68
+ markdown, images_payload, warnings = convert_with_inline_images(
69
+ html_content,
70
+ options=conversion_options,
71
+ image_config=inline_image_config,
72
+ )
73
+ else:
74
+ markdown = rust_convert(
75
+ html_content,
76
+ conversion_options,
77
+ )
78
+ images_payload = []
79
+ warnings = []
80
+ except (HtmlToMarkdownError, ValueError) as exc:
81
+ logger.exception("Failed to convert HTML to Markdown: %s", exc)
82
+ markdown = ""
83
+ images_payload = []
84
+ warnings = []
85
+
86
+ for warning in warnings:
87
+ self._log_inline_warning(warning)
88
+
89
+ extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})
90
+
91
+ inline_images = [self._build_extracted_image(image) for image in images_payload]
92
+ if inline_images:
93
+ extraction_result.images = inline_images
94
+ if run_ocr_on_images:
63
95
  extraction_result.image_ocr_results = run_maybe_async(
64
- self._process_images_with_ocr, extraction_result.images
96
+ self._process_images_with_ocr,
97
+ inline_images,
65
98
  )
66
99
 
67
100
  return self._apply_quality_processing(extraction_result)
@@ -70,79 +103,36 @@ class HTMLExtractor(Extractor):
70
103
  content = path.read_bytes()
71
104
  return self.extract_bytes_sync(content)
72
105
 
73
- def _extract_images_from_html(self, html_content: str) -> list[ExtractedImage]:
74
- images: list[ExtractedImage] = []
75
- soup = BeautifulSoup(html_content, "xml")
76
-
77
- for img in soup.find_all("img"):
78
- src_val = img.get("src")
79
- if isinstance(src_val, str) and src_val.startswith("data:image/"):
80
- try:
81
- header, data = src_val.split(",", 1)
82
- mime_type = header.split(";")[0].split(":")[1]
83
- format_name = mime_type.split("/")[1]
84
-
85
- if not data or len(data) < 4:
86
- logger.debug("Skipping empty or too small base64 data")
87
- continue
88
-
89
- if len(data) > 67 * 1024 * 1024:
90
- logger.warning("Skipping base64 image larger than 67MB")
91
- continue
92
-
93
- image_data = base64.b64decode(data)
94
-
95
- if len(image_data) > MAX_SINGLE_IMAGE_SIZE:
96
- logger.warning(
97
- "Skipping decoded image larger than %dMB", MAX_SINGLE_IMAGE_SIZE // (1024 * 1024)
98
- )
99
- continue
100
-
101
- dimensions = None
102
- try:
103
- with Image.open(io.BytesIO(image_data)) as pil_img:
104
- dimensions = pil_img.size
105
- except (OSError, ValueError) as e: # pragma: no cover
106
- logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
107
-
108
- alt_val = img.get("alt")
109
- desc = alt_val if isinstance(alt_val, str) else None
110
- images.append(
111
- ExtractedImage(
112
- data=image_data,
113
- format=format_name,
114
- filename=f"embedded_image_{len(images) + 1}.{format_name}",
115
- description=desc,
116
- dimensions=dimensions,
117
- )
118
- )
119
- except (ValueError, binascii.Error) as e:
120
- logger.warning("Failed to extract base64 image: %s", e)
121
-
122
- def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
123
- try:
124
- svg_content = str(svg_element).encode("utf-8")
125
-
126
- def _get_attr_safe(obj: object, attr: str) -> str | None:
127
- get_method = getattr(obj, "get", None)
128
- if callable(get_method):
129
- result = get_method(attr)
130
- return result if isinstance(result, str) else None
131
- return None
132
-
133
- title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
134
- desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
135
- return ExtractedImage(
136
- data=svg_content,
137
- format="svg",
138
- filename=f"inline_svg_{len(images) + 1}.svg",
139
- description=desc_svg,
140
- )
141
- except (UnicodeEncodeError, AttributeError) as e:
142
- logger.warning("Failed to extract SVG: %s", e)
143
- return None
144
-
145
- svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
146
- images.extend(img for img in svg_images if img is not None)
147
-
148
- return images
106
+ @staticmethod
107
+ def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
108
+ dimensions_value = image.get("dimensions")
109
+ dimensions = tuple(dimensions_value) if dimensions_value else None
110
+ return ExtractedImage(
111
+ data=image["data"],
112
+ format=image["format"],
113
+ filename=image.get("filename"),
114
+ description=image.get("description"),
115
+ dimensions=dimensions,
116
+ )
117
+
118
+ @staticmethod
119
+ def _log_inline_warning(warning: Any) -> None:
120
+ if isinstance(warning, dict):
121
+ index = warning.get("index")
122
+ message = warning.get("message")
123
+ if index is not None and message:
124
+ logger.warning("Inline image %s: %s", index, message)
125
+ elif message:
126
+ logger.warning("Inline image warning: %s", message)
127
+ else:
128
+ logger.warning("Inline image warning received with no message")
129
+ return
130
+
131
+ message = getattr(warning, "message", None)
132
+ index = getattr(warning, "index", None)
133
+ if message and index is not None:
134
+ logger.warning("Inline image %s: %s", index, message)
135
+ elif message:
136
+ logger.warning("Inline image warning: %s", message)
137
+ else:
138
+ logger.warning("Inline image warning received with no message")
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  import csv
4
4
  import hashlib
5
5
  import io
6
+ import logging
6
7
  import os
7
8
  import re
8
9
  import subprocess
@@ -14,12 +15,11 @@ from pathlib import Path
14
15
  from typing import TYPE_CHECKING, Any, ClassVar, Final
15
16
 
16
17
  import anyio
17
- import html_to_markdown
18
18
  import polars as pl
19
19
  from anyio import Path as AsyncPath
20
20
  from anyio import run_process
21
- from bs4 import BeautifulSoup
22
- from bs4.element import Tag
21
+ from html_to_markdown import HtmlToMarkdownError
22
+ from html_to_markdown._html_to_markdown import convert as rust_convert
23
23
  from PIL import Image
24
24
  from PIL.Image import Image as PILImage
25
25
  from typing_extensions import Self
@@ -29,15 +29,15 @@ from kreuzberg._ocr._base import OCRBackend
29
29
  from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
30
30
  from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
31
31
  from kreuzberg._utils._cache import get_ocr_cache
32
- from kreuzberg._utils._html_streaming import should_use_streaming
33
32
  from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
34
33
  from kreuzberg._utils._string import normalize_spaces
35
34
  from kreuzberg._utils._sync import run_sync
36
35
  from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
37
36
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
38
37
 
38
+ logger = logging.getLogger(__name__)
39
+
39
40
  if TYPE_CHECKING:
40
- from bs4.element import Tag
41
41
  from PIL.Image import Image as PILImage
42
42
 
43
43
  try: # pragma: no cover
@@ -301,8 +301,15 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
301
301
  "OFF",
302
302
  ]
303
303
 
304
- if run_config["tesseract_format"] != "text":
305
- command.append(run_config["tesseract_format"])
304
+ # Handle output format - use config option for HOCR to ensure Windows compatibility
305
+ # Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
306
+ tesseract_format = run_config["tesseract_format"]
307
+ if tesseract_format == "hocr":
308
+ command.extend(["-c", "tessedit_create_hocr=1"])
309
+ elif tesseract_format == "tsv":
310
+ command.append("tsv")
311
+ elif tesseract_format != "text":
312
+ command.append(tesseract_format)
306
313
 
307
314
  for kwarg, value in run_config["remaining_kwargs"].items():
308
315
  if kwarg.startswith("table_"):
@@ -507,220 +514,56 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
507
514
  table_min_confidence: float = 30.0,
508
515
  **_kwargs: Any,
509
516
  ) -> ExtractionResult:
510
- config = html_to_markdown_config or HTMLToMarkdownConfig()
511
-
512
- tables: list[TableData] = []
513
- if enable_table_detection:
514
- soup = BeautifulSoup(hocr_content, "xml")
515
- tables = await self._extract_tables_from_hocr(
516
- soup,
517
- table_column_threshold,
518
- table_row_threshold_ratio,
519
- table_min_confidence,
520
- )
517
+ _ = (
518
+ enable_table_detection,
519
+ table_column_threshold,
520
+ table_row_threshold_ratio,
521
+ table_min_confidence,
522
+ ) # parameters retained for compatibility but handled internally by html-to-markdown
521
523
 
522
- hocr_converters = self._create_hocr_converters(tables)
523
-
524
- all_converters = dict(hocr_converters)
525
- if config.custom_converters:
526
- all_converters.update(config.custom_converters)
527
-
528
- config_dict = config.to_dict()
529
- config_dict["custom_converters"] = all_converters
530
-
531
- use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
532
- config_dict["stream_processing"] = use_streaming
533
- config_dict["chunk_size"] = chunk_size
524
+ config = html_to_markdown_config or HTMLToMarkdownConfig()
525
+ conversion_options, _ = config.to_options()
534
526
 
535
527
  try:
536
- markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
528
+ markdown_content = rust_convert(
529
+ hocr_content,
530
+ conversion_options,
531
+ )
537
532
  markdown_content = normalize_spaces(markdown_content)
538
- except (ValueError, TypeError, AttributeError):
539
- try:
540
- soup = BeautifulSoup(hocr_content, "xml")
541
- words = soup.find_all("span", class_="ocrx_word")
542
- text_parts = []
543
- for word in words:
544
- text = word.get_text().strip()
545
- if text:
546
- text_parts.append(text)
547
-
548
- if text_parts:
549
- markdown_content = " ".join(text_parts)
550
- else:
551
- markdown_content = soup.get_text().strip() or "[No text detected]"
552
-
553
- markdown_content = normalize_spaces(markdown_content)
554
- except (ValueError, TypeError, AttributeError):
555
- markdown_content = "[OCR processing failed]"
556
-
557
- if tables:
558
- table_sections = []
559
- for i, table in enumerate(tables):
560
- table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
561
-
562
- if markdown_content.strip():
563
- final_content = f"{markdown_content}\n{''.join(table_sections)}"
564
- else:
565
- final_content = "".join(table_sections).strip()
566
- else:
567
- final_content = markdown_content
533
+ except (HtmlToMarkdownError, ValueError) as exc:
534
+ logger.exception("Failed to convert hOCR to Markdown: %s", exc)
535
+ markdown_content = "[OCR processing failed]"
536
+
537
+ tables: list[TableData] = []
568
538
 
569
539
  return ExtractionResult(
570
- content=final_content,
540
+ content=markdown_content,
571
541
  mime_type=MARKDOWN_MIME_TYPE,
572
542
  metadata={"source_format": "hocr", "tables_detected": len(tables)},
573
543
  chunks=[],
574
544
  tables=tables,
575
545
  )
576
546
 
577
- def _create_basic_converters(self) -> dict[str, Any]:
578
- def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
579
- del tag
580
- return f"{text.strip()} "
581
-
582
- def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
583
- del tag
584
- return f"{text.strip()}\n"
585
-
586
- def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
587
- del tag
588
- content = text.strip()
589
- if not content:
590
- return ""
591
- return f"{content}\n\n"
592
-
593
- def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
594
- del tag
595
- content = text.strip()
596
- if not content:
597
- return ""
598
- return f"{content}\n\n"
599
-
600
- def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
601
- del tag
602
- return text.strip()
603
-
604
- def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
605
- del tag, text
606
- return "---\n"
607
-
608
- def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
609
- del text
610
- title = tag.get("title", "")
611
- if isinstance(title, str):
612
- bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
613
- if bbox_match:
614
- x0, y0, x1, y1 = bbox_match.groups()
615
- width = int(x1) - int(x0)
616
- height = int(y1) - int(y0)
617
- return f"*[Image region: {width}x{height} pixels]*\n\n"
618
- return "*[Image detected]*\n\n"
619
-
620
- return {
621
- "ocrx_word": ocrx_word_converter,
622
- "ocr_line": ocr_line_converter,
623
- "ocr_par": ocr_par_converter,
624
- "ocr_carea": ocr_carea_converter,
625
- "ocr_page": ocr_page_converter,
626
- "ocr_separator": ocr_separator_converter,
627
- "ocr_photo": ocr_photo_converter,
628
- }
629
-
630
- def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
631
- basic_converters = self._create_basic_converters()
632
-
633
- def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
634
- class_attr = tag.get("class", "")
635
- if isinstance(class_attr, list):
636
- class_attr = " ".join(class_attr)
637
- elif not isinstance(class_attr, str):
638
- class_attr = ""
639
-
640
- for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
641
- if class_name in class_attr:
642
- converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
643
- return str(converter_result)
644
- return text
645
-
646
- def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
647
- class_attr = tag.get("class", "")
648
- if isinstance(class_attr, list):
649
- class_attr = " ".join(class_attr)
650
- elif not isinstance(class_attr, str):
651
- class_attr = ""
652
-
653
- for class_name in ["ocrx_word", "ocr_line"]:
654
- if class_name in class_attr:
655
- converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
656
- return str(converter_result)
657
- return f"{text.strip()} "
658
-
659
- return {
660
- "span": generic_span_converter,
661
- "div": generic_div_converter,
662
- "p": basic_converters["ocr_par"],
663
- }
664
-
665
547
  def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
666
- tables: list[TableData] = []
548
+ _ = config # retained for interface compatibility
667
549
 
668
- if config.enable_table_detection:
669
- pass
550
+ html_config = HTMLToMarkdownConfig()
551
+ conversion_options, _ = html_config.to_options()
670
552
 
671
553
  try:
672
- converters = self._create_hocr_converters(tables)
673
-
674
- html_config = HTMLToMarkdownConfig(
675
- custom_converters=converters,
676
- )
677
-
678
- config_dict = html_config.to_dict()
679
-
680
- use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
681
- config_dict["stream_processing"] = use_streaming
682
- config_dict["chunk_size"] = chunk_size
683
-
684
- markdown_content = html_to_markdown.convert_to_markdown(
554
+ markdown_content = rust_convert(
685
555
  hocr_content,
686
- **config_dict,
556
+ conversion_options,
687
557
  )
688
-
689
558
  markdown_content = normalize_spaces(markdown_content)
559
+ except (HtmlToMarkdownError, ValueError) as exc:
560
+ logger.exception("Failed to convert hOCR to Markdown (sync path): %s", exc)
561
+ markdown_content = "[OCR processing failed]"
690
562
 
691
- except (ValueError, TypeError, AttributeError):
692
- try:
693
- soup = BeautifulSoup(hocr_content, "xml")
694
- words = soup.find_all("span", class_="ocrx_word")
695
- text_parts = []
696
- for word in words:
697
- text = word.get_text().strip()
698
- if text:
699
- text_parts.append(text)
700
-
701
- if text_parts:
702
- markdown_content = " ".join(text_parts)
703
- else:
704
- markdown_content = soup.get_text().strip() or "[No text detected]"
705
-
706
- markdown_content = normalize_spaces(markdown_content)
707
- except (ValueError, TypeError, AttributeError):
708
- markdown_content = "[OCR processing failed]"
709
-
710
- if tables:
711
- table_sections = []
712
- for i, table in enumerate(tables):
713
- table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
714
-
715
- if markdown_content.strip():
716
- final_content = f"{markdown_content}\n{''.join(table_sections)}"
717
- else:
718
- final_content = "".join(table_sections).strip()
719
- else:
720
- final_content = markdown_content
563
+ tables: list[TableData] = []
721
564
 
722
565
  return ExtractionResult(
723
- content=final_content,
566
+ content=markdown_content,
724
567
  mime_type=MARKDOWN_MIME_TYPE,
725
568
  metadata={"source_format": "hocr", "tables_detected": len(tables)},
726
569
  chunks=[],
@@ -769,97 +612,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
769
612
 
770
613
  return text_result
771
614
 
772
- async def _extract_tables_from_hocr(
773
- self,
774
- soup: Any,
775
- column_threshold: int = 20,
776
- row_threshold_ratio: float = 0.5,
777
- min_confidence: float = 30.0,
778
- ) -> list[TableData]:
779
- tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
780
-
781
- if not tsv_data:
782
- return []
783
-
784
- if not (words := extract_words(tsv_data, min_confidence=min_confidence)):
785
- return []
786
-
787
- tables: list[TableData] = []
788
- try:
789
- table_data = reconstruct_table(
790
- words,
791
- column_threshold=column_threshold,
792
- row_threshold_ratio=row_threshold_ratio,
793
- )
794
- if table_data and len(table_data) > 1: # ~keep At least header + one data row
795
- markdown = to_markdown(table_data)
796
-
797
- min_x = min(w["left"] for w in words)
798
- max_x = max(w["left"] + w["width"] for w in words)
799
- min_y = min(w["top"] for w in words)
800
- max_y = max(w["top"] + w["height"] for w in words)
801
-
802
- try:
803
- df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
804
- except (ImportError, IndexError): # pragma: no cover
805
- df = None
806
-
807
- dummy_image = Image.new("RGB", (1, 1), "white")
808
-
809
- table: TableData = {
810
- "text": markdown,
811
- "df": df,
812
- "page_number": 1,
813
- "cropped_image": dummy_image,
814
- "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
815
- } # type: ignore[typeddict-unknown-key]
816
- tables.append(table)
817
- except (ValueError, KeyError, ImportError): # pragma: no cover
818
- pass
819
-
820
- return tables
821
-
822
- async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
823
- tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
824
-
825
- words = soup.find_all("span", class_="ocrx_word")
826
- word_num = 1
827
-
828
- for word in words:
829
- title = word.get("title", "")
830
- text = word.get_text().strip()
831
-
832
- if not text:
833
- continue
834
-
835
- bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
836
- if not bbox_match:
837
- continue
838
-
839
- x0, y0, x1, y1 = map(int, bbox_match.groups())
840
-
841
- conf_match = re.search(r"x_wconf (\d+)", title)
842
- confidence = float(conf_match.group(1)) if conf_match else 100.0
843
-
844
- if confidence < min_confidence:
845
- continue
846
-
847
- line = word.find_parent(class_="ocr_line")
848
- par = word.find_parent(class_="ocr_par")
849
- block = word.find_parent(class_="ocr_carea")
850
-
851
- tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
852
- tsv_lines.append(tsv_line)
853
- word_num += 1
854
-
855
- return "\n".join(tsv_lines)
856
-
857
- def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
858
- if not words:
859
- return []
860
-
861
- return [words]
862
-
863
615
  @classmethod
864
616
  async def _validate_tesseract_version(cls) -> None:
865
617
  try:
@@ -1162,7 +914,13 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1162
914
  "OFF",
1163
915
  ]
1164
916
 
1165
- if output_format != "text":
917
+ # Handle output format - use config option for HOCR to ensure Windows compatibility
918
+ # Windows Tesseract 5.5.0 doesn't respect 'hocr' configfile, needs explicit config
919
+ if output_format == "hocr":
920
+ command.extend(["-c", "tessedit_create_hocr=1"])
921
+ elif output_format == "tsv":
922
+ command.append("tsv")
923
+ elif output_format != "text":
1166
924
  command.append(output_format)
1167
925
 
1168
926
  for kwarg, value in kwargs.items():
@@ -1296,10 +1054,9 @@ def _process_image_with_tesseract(
1296
1054
 
1297
1055
  # Process based on output format
1298
1056
  if output_format == "markdown" and tesseract_format == "hocr":
1299
- # Import here to avoid circular dependency ~keep
1300
- from html_to_markdown import convert_to_markdown # noqa: PLC0415
1301
-
1302
- text = convert_to_markdown(text, heading_style="atx")
1057
+ html_config = HTMLToMarkdownConfig(heading_style="atx")
1058
+ options, _ = html_config.to_options()
1059
+ text = rust_convert(text, options)
1303
1060
 
1304
1061
  text = normalize_spaces(text)
1305
1062
 
kreuzberg/_types.py CHANGED
@@ -9,6 +9,12 @@ from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
9
9
 
10
10
  import langcodes
11
11
  import msgspec
12
+ from html_to_markdown._html_to_markdown import (
13
+ ConversionOptions as HTMLToMarkdownConversionOptions,
14
+ )
15
+ from html_to_markdown._html_to_markdown import (
16
+ PreprocessingOptions as HTMLToMarkdownPreprocessingOptions,
17
+ )
12
18
 
13
19
  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
14
20
  from kreuzberg._utils._table import (
@@ -1166,71 +1172,143 @@ class ExtractionConfig(ConfigDict):
1166
1172
 
1167
1173
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
1168
1174
  class HTMLToMarkdownConfig:
1169
- autolinks: bool = True
1170
- """Automatically convert valid URLs to Markdown links."""
1171
- br_in_tables: bool = False
1172
- """Use <br> tags for line breaks in table cells instead of spaces."""
1175
+ heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
1176
+ """Style for markdown headings."""
1177
+ list_indent_type: Literal["spaces", "tabs"] = "spaces"
1178
+ """Type of indentation to use for lists."""
1179
+ list_indent_width: int = 4
1180
+ """Number of spaces per indentation level (use 2 for Discord/Slack)."""
1173
1181
  bullets: str = "*+-"
1174
1182
  """Characters to use for unordered list bullets."""
1175
- code_language: str = ""
1176
- """Default language identifier for fenced code blocks."""
1177
- code_language_callback: Callable[[Any], str] | None = None
1178
- """Function to dynamically determine code block language."""
1179
- convert: list[str] | None = None
1180
- """List of HTML tags to convert (None = all supported tags)."""
1181
- convert_as_inline: bool = False
1182
- """Treat content as inline elements only."""
1183
- custom_converters: Mapping[str, Callable[..., str]] | None = None
1184
- """Mapping of HTML tag names to custom converter functions."""
1185
- default_title: bool = False
1186
- """Use default titles for elements like links."""
1183
+ strong_em_symbol: Literal["*", "_"] = "*"
1184
+ """Symbol to use for strong/emphasis formatting."""
1187
1185
  escape_asterisks: bool = False
1188
1186
  """Escape * characters to prevent unintended formatting."""
1189
- escape_misc: bool = False
1190
- """Escape miscellaneous characters to prevent Markdown conflicts."""
1191
1187
  escape_underscores: bool = False
1192
1188
  """Escape _ characters to prevent unintended formatting."""
1189
+ escape_misc: bool = False
1190
+ """Escape miscellaneous characters to prevent Markdown conflicts."""
1191
+ escape_ascii: bool = False
1192
+ """Escape all ASCII punctuation."""
1193
+ code_language: str = ""
1194
+ """Default language identifier for fenced code blocks."""
1195
+ code_language_callback: Callable[[Any], str] | None = field(default=None, compare=False, hash=False)
1196
+ """Legacy language callback (no longer used by v2 converter)."""
1197
+ autolinks: bool = True
1198
+ """Automatically convert valid URLs to Markdown links."""
1199
+ default_title: bool = False
1200
+ """Use default titles for elements like links."""
1201
+ keep_inline_images_in: tuple[str, ...] | None = None
1202
+ """Tags where inline images should be preserved."""
1203
+ br_in_tables: bool = False
1204
+ """Use <br> tags for line breaks in table cells instead of spaces."""
1205
+ highlight_style: Literal["double-equal", "html", "bold", "none"] = "double-equal"
1206
+ """Style for highlighting text."""
1193
1207
  extract_metadata: bool = True
1194
1208
  """Extract document metadata as comment header."""
1195
- heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
1196
- """Style for markdown headings."""
1197
- highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
1198
- """Style for highlighting text."""
1199
- keep_inline_images_in: list[str] | None = None
1200
- """Tags where inline images should be preserved."""
1201
- list_indent_type: Literal["spaces", "tabs"] = "spaces"
1202
- """Type of indentation to use for lists."""
1203
- list_indent_width: int = 4
1204
- """Number of spaces per indentation level (use 2 for Discord/Slack)."""
1209
+ whitespace_mode: Literal["normalized", "strict"] = "normalized"
1210
+ """Whitespace handling mode."""
1211
+ strip_newlines: bool = False
1212
+ """Remove newlines from HTML input before processing."""
1213
+ wrap: bool = False
1214
+ """Enable text wrapping."""
1215
+ wrap_width: int = 80
1216
+ """Width for text wrapping."""
1217
+ convert_as_inline: bool = False
1218
+ """Treat content as inline elements only."""
1219
+ sub_symbol: str = ""
1220
+ """Symbol to use for subscript text."""
1221
+ sup_symbol: str = ""
1222
+ """Symbol to use for superscript text."""
1205
1223
  newline_style: Literal["spaces", "backslash"] = "spaces"
1206
1224
  """Style for line breaks in markdown."""
1225
+ code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
1226
+ """Style for fenced code blocks."""
1227
+ strip_tags: tuple[str, ...] | None = None
1228
+ """List of HTML tags to remove from output."""
1229
+ convert: tuple[str, ...] | None = None
1230
+ """Legacy list of tags to convert (no longer used by v2 converter)."""
1231
+ custom_converters: Mapping[str, Callable[..., str]] | None = field(default=None, compare=False, hash=False)
1232
+ """Legacy mapping of custom converters (ignored by v2 converter)."""
1207
1233
  preprocess_html: bool = False
1208
1234
  """Enable HTML preprocessing to clean messy HTML."""
1209
1235
  preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
1210
1236
  """Preprocessing level for cleaning HTML."""
1211
- remove_forms: bool = True
1212
- """Remove form elements during preprocessing."""
1213
1237
  remove_navigation: bool = True
1214
1238
  """Remove navigation elements during preprocessing."""
1215
- strip: list[str] | None = None
1216
- """List of HTML tags to remove from output."""
1217
- strip_newlines: bool = False
1218
- """Remove newlines from HTML input before processing."""
1219
- strong_em_symbol: Literal["*", "_"] = "*"
1220
- """Symbol to use for strong/emphasis formatting."""
1221
- sub_symbol: str = ""
1222
- """Symbol to use for subscript text."""
1223
- sup_symbol: str = ""
1224
- """Symbol to use for superscript text."""
1225
- whitespace_mode: Literal["normalized", "strict"] = "normalized"
1226
- """Whitespace handling mode."""
1227
- wrap: bool = False
1228
- """Enable text wrapping."""
1229
- wrap_width: int = 80
1230
- """Width for text wrapping."""
1239
+ remove_forms: bool = True
1240
+ """Remove form elements during preprocessing."""
1241
+ encoding: str = "utf-8"
1242
+ """Expected character encoding for the HTML input."""
1243
+ debug: bool = False
1244
+ """Enable debug diagnostics in the converter."""
1231
1245
 
1232
- def to_dict(self) -> dict[str, Any]:
1246
+ def __post_init__(self) -> None:
1247
+ if self.keep_inline_images_in is not None and not isinstance(self.keep_inline_images_in, tuple):
1248
+ object.__setattr__(self, "keep_inline_images_in", tuple(self.keep_inline_images_in))
1249
+ if self.strip_tags is not None and not isinstance(self.strip_tags, tuple):
1250
+ object.__setattr__(self, "strip_tags", tuple(self.strip_tags))
1251
+ if self.convert is not None and not isinstance(self.convert, tuple):
1252
+ object.__setattr__(self, "convert", tuple(self.convert))
1253
+
1254
+ def to_options(self) -> tuple[HTMLToMarkdownConversionOptions, HTMLToMarkdownPreprocessingOptions]:
1255
+ """Build html_to_markdown ConversionOptions and PreprocessingOptions instances."""
1256
+ preprocessing = HTMLToMarkdownPreprocessingOptions(
1257
+ enabled=self.preprocess_html,
1258
+ preset=self.preprocessing_preset,
1259
+ remove_navigation=self.remove_navigation,
1260
+ remove_forms=self.remove_forms,
1261
+ )
1262
+
1263
+ keep_inline_images_in = list(self.keep_inline_images_in) if self.keep_inline_images_in else []
1264
+ strip_tags = list(self.strip_tags) if self.strip_tags else []
1265
+
1266
+ options = HTMLToMarkdownConversionOptions(
1267
+ heading_style=self.heading_style,
1268
+ list_indent_type=self.list_indent_type,
1269
+ list_indent_width=self.list_indent_width,
1270
+ bullets=self.bullets,
1271
+ strong_em_symbol=self.strong_em_symbol,
1272
+ escape_asterisks=self.escape_asterisks,
1273
+ escape_underscores=self.escape_underscores,
1274
+ escape_misc=self.escape_misc,
1275
+ escape_ascii=self.escape_ascii,
1276
+ code_language=self.code_language,
1277
+ autolinks=self.autolinks,
1278
+ default_title=self.default_title,
1279
+ keep_inline_images_in=keep_inline_images_in,
1280
+ br_in_tables=self.br_in_tables,
1281
+ highlight_style=self.highlight_style,
1282
+ extract_metadata=self.extract_metadata,
1283
+ whitespace_mode=self.whitespace_mode,
1284
+ strip_newlines=self.strip_newlines,
1285
+ wrap=self.wrap,
1286
+ wrap_width=self.wrap_width,
1287
+ convert_as_inline=self.convert_as_inline,
1288
+ sub_symbol=self.sub_symbol,
1289
+ sup_symbol=self.sup_symbol,
1290
+ newline_style=self.newline_style,
1291
+ code_block_style=self.code_block_style,
1292
+ strip_tags=strip_tags,
1293
+ debug=self.debug,
1294
+ encoding=self.encoding,
1295
+ )
1296
+
1297
+ options.preprocessing = preprocessing
1298
+ return options, preprocessing
1299
+
1300
+ def to_dict(self, include_none: bool = False) -> dict[str, Any]:
1233
1301
  result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
1302
+ if result.get("keep_inline_images_in") is not None:
1303
+ result["keep_inline_images_in"] = list(result["keep_inline_images_in"])
1304
+ if result.get("strip_tags") is not None:
1305
+ result["strip_tags"] = list(result["strip_tags"])
1306
+ if result.get("convert") is not None:
1307
+ result["convert"] = list(result["convert"])
1308
+
1309
+ if include_none:
1310
+ return result # type: ignore[no-any-return]
1311
+
1234
1312
  return {k: v for k, v in result.items() if v is not None}
1235
1313
 
1236
1314
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.19.0
3
+ Version: 3.20.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
22
23
  Classifier: Topic :: Database
23
24
  Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
24
25
  Classifier: Topic :: Office/Business :: Office Suites
@@ -27,69 +28,69 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
28
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
29
  Classifier: Topic :: Text Processing :: General
29
30
  Classifier: Typing :: Typed
30
- Requires-Python: >=3.10
31
+ Requires-Python: <3.15,>=3.10
31
32
  Requires-Dist: anyio>=4.11.0
32
33
  Requires-Dist: chardetng-py>=0.3.5
33
34
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.16.0
35
+ Requires-Dist: html-to-markdown>=2.1.0
35
36
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.15.0
37
+ Requires-Dist: mcp>=1.17.0
37
38
  Requires-Dist: msgspec>=0.18.0
38
39
  Requires-Dist: numpy>=2.0.0
39
40
  Requires-Dist: playa-pdf>=0.7.0
40
- Requires-Dist: polars>=1.33.1
41
+ Requires-Dist: polars>=1.34.0
41
42
  Requires-Dist: psutil>=7.1.0
42
43
  Requires-Dist: pypdfium2==4.30.0
43
44
  Requires-Dist: python-calamine>=0.5.3
44
45
  Requires-Dist: python-pptx>=1.0.2
45
- Requires-Dist: transformers>=4.30.0
46
+ Requires-Dist: transformers>=4.57.0
46
47
  Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
47
48
  Provides-Extra: additional-extensions
48
49
  Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
49
50
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
50
51
  Provides-Extra: all
51
- Requires-Dist: click>=8.2.1; extra == 'all'
52
+ Requires-Dist: click>=8.3.0; extra == 'all'
52
53
  Requires-Dist: deep-translator>=1.11.4; extra == 'all'
53
- Requires-Dist: easyocr>=1.7.2; extra == 'all'
54
+ Requires-Dist: easyocr>=1.7.2; (python_version < '3.14') and extra == 'all'
54
55
  Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
55
56
  Requires-Dist: gmft>=0.4.2; extra == 'all'
56
57
  Requires-Dist: keybert>=0.9.0; extra == 'all'
57
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
58
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0; extra == 'all'
58
59
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
59
- Requires-Dist: paddleocr>=3.2.0; extra == 'all'
60
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
60
+ Requires-Dist: paddleocr>=3.2.0; (python_version < '3.14') and extra == 'all'
61
+ Requires-Dist: paddlepaddle>=3.2.0; (python_version < '3.14') and extra == 'all'
61
62
  Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
62
- Requires-Dist: rich>=14.1.0; extra == 'all'
63
+ Requires-Dist: rich>=14.2.0; extra == 'all'
63
64
  Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
64
65
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
65
- Requires-Dist: spacy>=3.8.7; extra == 'all'
66
+ Requires-Dist: spacy>=3.8.7; (python_version < '3.14') and extra == 'all'
66
67
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
67
- Requires-Dist: transformers>=4.25.0; extra == 'all'
68
+ Requires-Dist: transformers>=4.57.0; extra == 'all'
68
69
  Provides-Extra: api
69
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
70
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0; extra == 'api'
70
71
  Provides-Extra: chunking
71
72
  Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
72
73
  Provides-Extra: cli
73
- Requires-Dist: click>=8.2.1; extra == 'cli'
74
- Requires-Dist: rich>=14.1.0; extra == 'cli'
74
+ Requires-Dist: click>=8.3.0; extra == 'cli'
75
+ Requires-Dist: rich>=14.2.0; extra == 'cli'
75
76
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
76
77
  Provides-Extra: crypto
77
78
  Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
78
79
  Provides-Extra: document-classification
79
80
  Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
80
81
  Provides-Extra: easyocr
81
- Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
82
+ Requires-Dist: easyocr>=1.7.2; (python_version < '3.14') and extra == 'easyocr'
82
83
  Provides-Extra: entity-extraction
83
84
  Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
84
- Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
85
+ Requires-Dist: spacy>=3.8.7; (python_version < '3.14') and extra == 'entity-extraction'
85
86
  Provides-Extra: gmft
86
87
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
87
- Requires-Dist: transformers>=4.25.0; extra == 'gmft'
88
+ Requires-Dist: transformers>=4.57.0; extra == 'gmft'
88
89
  Provides-Extra: langdetect
89
90
  Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
90
91
  Provides-Extra: paddleocr
91
- Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
92
- Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
92
+ Requires-Dist: paddleocr>=3.2.0; (python_version < '3.14') and extra == 'paddleocr'
93
+ Requires-Dist: paddlepaddle>=3.2.0; (python_version < '3.14') and extra == 'paddleocr'
93
94
  Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
94
95
  Description-Content-Type: text/markdown
95
96
 
@@ -11,7 +11,7 @@ kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD
11
11
  kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
12
12
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
13
13
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
14
- kreuzberg/_types.py,sha256=6oBsmUUihVr4hJJrYeuWoUVzCP_-eciCrBVvGQHQTDI,49920
14
+ kreuzberg/_types.py,sha256=eh4bZFG3jIw5GhfC3u4R0aa_y9niKZDI4O93j0MCZGw,53672
15
15
  kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
16
16
  kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
17
17
  kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
@@ -22,7 +22,7 @@ kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
22
22
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
24
24
  kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
25
- kreuzberg/_extractors/_html.py,sha256=vNAgBrfok-16SOkhhsy10unqVwAczlTL_2KEn2X6S98,6315
25
+ kreuzberg/_extractors/_html.py,sha256=9AH95f7Lt-agYSOpCv5qRyugn3MdQtX0CNm_pOjovJc,5492
26
26
  kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
27
27
  kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
28
28
  kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
@@ -36,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
36
36
  kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
37
37
  kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
38
38
  kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
39
- kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
39
+ kreuzberg/_ocr/_tesseract.py,sha256=9F6V72WGi9ExruSNESjz8WGHCXuTYq1M1ctbayhQO0Y,43358
40
40
  kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
41
41
  kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
42
42
  kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
@@ -109,7 +109,6 @@ kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14
109
109
  kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
110
110
  kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
111
111
  kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
112
- kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
113
112
  kreuzberg/_utils/_image_preprocessing.py,sha256=f7ioWQyARnhzj0am0Y1_eteJwWomdPy7AnbXqw2xWBs,10954
114
113
  kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
115
114
  kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
@@ -122,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
122
121
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
123
122
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
124
123
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
125
- kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
126
- kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
127
- kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
128
- kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
129
- kreuzberg-3.19.0.dist-info/RECORD,,
124
+ kreuzberg-3.20.0.dist-info/METADATA,sha256=pmBB6mlIuuD5tYx0_aOWNdHM00gd6nbxgDrXo1gEc6Y,12782
125
+ kreuzberg-3.20.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
+ kreuzberg-3.20.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
+ kreuzberg-3.20.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
+ kreuzberg-3.20.0.dist-info/RECORD,,
@@ -1,20 +0,0 @@
1
- from __future__ import annotations
2
-
3
- _STREAMING_THRESHOLD_KB = 10
4
- _LARGE_FILE_THRESHOLD_MB = 1
5
- _DEFAULT_CHUNK_SIZE = 2048
6
- _LARGE_FILE_CHUNK_SIZE = 4096
7
-
8
- _STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
9
- _LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
10
-
11
-
12
- def should_use_streaming(content_size: int) -> tuple[bool, int]:
13
- if content_size < 0:
14
- return False, _DEFAULT_CHUNK_SIZE
15
-
16
- if content_size > _STREAMING_THRESHOLD_BYTES:
17
- if content_size > _LARGE_FILE_THRESHOLD_BYTES:
18
- return True, _LARGE_FILE_CHUNK_SIZE
19
- return True, _DEFAULT_CHUNK_SIZE
20
- return False, _DEFAULT_CHUNK_SIZE