kreuzberg 3.19.1__py3-none-any.whl → 3.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_html.py +93 -103
- kreuzberg/_ocr/_tesseract.py +36 -292
- kreuzberg/_types.py +125 -47
- {kreuzberg-3.19.1.dist-info → kreuzberg-3.20.0.dist-info}/METADATA +23 -22
- {kreuzberg-3.19.1.dist-info → kreuzberg-3.20.0.dist-info}/RECORD +8 -9
- kreuzberg/_utils/_html_streaming.py +0 -20
- {kreuzberg-3.19.1.dist-info → kreuzberg-3.20.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.19.1.dist-info → kreuzberg-3.20.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.19.1.dist-info → kreuzberg-3.20.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_html.py
CHANGED
@@ -1,20 +1,21 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import base64
|
4
|
-
import binascii
|
5
|
-
import io
|
6
3
|
import logging
|
7
|
-
from typing import TYPE_CHECKING, ClassVar
|
4
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
8
5
|
|
9
|
-
import html_to_markdown
|
10
6
|
from anyio import Path as AsyncPath
|
11
|
-
from
|
12
|
-
from
|
7
|
+
from html_to_markdown import HtmlToMarkdownError
|
8
|
+
from html_to_markdown._html_to_markdown import (
|
9
|
+
InlineImageConfig,
|
10
|
+
convert_with_inline_images,
|
11
|
+
)
|
12
|
+
from html_to_markdown._html_to_markdown import (
|
13
|
+
convert as rust_convert,
|
14
|
+
)
|
13
15
|
|
14
16
|
from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
|
15
17
|
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
16
18
|
from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
|
17
|
-
from kreuzberg._utils._html_streaming import should_use_streaming
|
18
19
|
from kreuzberg._utils._string import safe_decode
|
19
20
|
from kreuzberg._utils._sync import run_maybe_async, run_sync
|
20
21
|
|
@@ -41,27 +42,59 @@ class HTMLExtractor(Extractor):
|
|
41
42
|
return result
|
42
43
|
|
43
44
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
44
|
-
|
45
|
-
if config is None:
|
46
|
-
config = HTMLToMarkdownConfig()
|
47
|
-
|
48
|
-
config_dict = config.to_dict()
|
49
|
-
|
45
|
+
extraction_config = self.config
|
50
46
|
html_content = safe_decode(content)
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
47
|
+
if extraction_config and extraction_config.html_to_markdown_config is not None:
|
48
|
+
html_config = extraction_config.html_to_markdown_config
|
49
|
+
else:
|
50
|
+
html_config = HTMLToMarkdownConfig()
|
51
|
+
conversion_options, _ = html_config.to_options()
|
52
|
+
|
53
|
+
extract_inline_images = bool(extraction_config and extraction_config.extract_images)
|
54
|
+
run_ocr_on_images = bool(
|
55
|
+
extraction_config and extraction_config.extract_images and extraction_config.ocr_extracted_images
|
56
|
+
)
|
57
|
+
inline_image_config = None
|
58
|
+
if extract_inline_images:
|
59
|
+
inline_image_config = InlineImageConfig(
|
60
|
+
max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
|
61
|
+
filename_prefix=None,
|
62
|
+
capture_svg=True,
|
63
|
+
infer_dimensions=True,
|
64
|
+
)
|
65
|
+
|
66
|
+
try:
|
67
|
+
if extract_inline_images:
|
68
|
+
markdown, images_payload, warnings = convert_with_inline_images(
|
69
|
+
html_content,
|
70
|
+
options=conversion_options,
|
71
|
+
image_config=inline_image_config,
|
72
|
+
)
|
73
|
+
else:
|
74
|
+
markdown = rust_convert(
|
75
|
+
html_content,
|
76
|
+
conversion_options,
|
77
|
+
)
|
78
|
+
images_payload = []
|
79
|
+
warnings = []
|
80
|
+
except (HtmlToMarkdownError, ValueError) as exc:
|
81
|
+
logger.exception("Failed to convert HTML to Markdown: %s", exc)
|
82
|
+
markdown = ""
|
83
|
+
images_payload = []
|
84
|
+
warnings = []
|
85
|
+
|
86
|
+
for warning in warnings:
|
87
|
+
self._log_inline_warning(warning)
|
88
|
+
|
89
|
+
extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
90
|
+
|
91
|
+
inline_images = [self._build_extracted_image(image) for image in images_payload]
|
92
|
+
if inline_images:
|
93
|
+
extraction_result.images = inline_images
|
94
|
+
if run_ocr_on_images:
|
63
95
|
extraction_result.image_ocr_results = run_maybe_async(
|
64
|
-
self._process_images_with_ocr,
|
96
|
+
self._process_images_with_ocr,
|
97
|
+
inline_images,
|
65
98
|
)
|
66
99
|
|
67
100
|
return self._apply_quality_processing(extraction_result)
|
@@ -70,79 +103,36 @@ class HTMLExtractor(Extractor):
|
|
70
103
|
content = path.read_bytes()
|
71
104
|
return self.extract_bytes_sync(content)
|
72
105
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
|
107
|
-
|
108
|
-
alt_val = img.get("alt")
|
109
|
-
desc = alt_val if isinstance(alt_val, str) else None
|
110
|
-
images.append(
|
111
|
-
ExtractedImage(
|
112
|
-
data=image_data,
|
113
|
-
format=format_name,
|
114
|
-
filename=f"embedded_image_{len(images) + 1}.{format_name}",
|
115
|
-
description=desc,
|
116
|
-
dimensions=dimensions,
|
117
|
-
)
|
118
|
-
)
|
119
|
-
except (ValueError, binascii.Error) as e:
|
120
|
-
logger.warning("Failed to extract base64 image: %s", e)
|
121
|
-
|
122
|
-
def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
|
123
|
-
try:
|
124
|
-
svg_content = str(svg_element).encode("utf-8")
|
125
|
-
|
126
|
-
def _get_attr_safe(obj: object, attr: str) -> str | None:
|
127
|
-
get_method = getattr(obj, "get", None)
|
128
|
-
if callable(get_method):
|
129
|
-
result = get_method(attr)
|
130
|
-
return result if isinstance(result, str) else None
|
131
|
-
return None
|
132
|
-
|
133
|
-
title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
|
134
|
-
desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
|
135
|
-
return ExtractedImage(
|
136
|
-
data=svg_content,
|
137
|
-
format="svg",
|
138
|
-
filename=f"inline_svg_{len(images) + 1}.svg",
|
139
|
-
description=desc_svg,
|
140
|
-
)
|
141
|
-
except (UnicodeEncodeError, AttributeError) as e:
|
142
|
-
logger.warning("Failed to extract SVG: %s", e)
|
143
|
-
return None
|
144
|
-
|
145
|
-
svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
|
146
|
-
images.extend(img for img in svg_images if img is not None)
|
147
|
-
|
148
|
-
return images
|
106
|
+
@staticmethod
|
107
|
+
def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
|
108
|
+
dimensions_value = image.get("dimensions")
|
109
|
+
dimensions = tuple(dimensions_value) if dimensions_value else None
|
110
|
+
return ExtractedImage(
|
111
|
+
data=image["data"],
|
112
|
+
format=image["format"],
|
113
|
+
filename=image.get("filename"),
|
114
|
+
description=image.get("description"),
|
115
|
+
dimensions=dimensions,
|
116
|
+
)
|
117
|
+
|
118
|
+
@staticmethod
|
119
|
+
def _log_inline_warning(warning: Any) -> None:
|
120
|
+
if isinstance(warning, dict):
|
121
|
+
index = warning.get("index")
|
122
|
+
message = warning.get("message")
|
123
|
+
if index is not None and message:
|
124
|
+
logger.warning("Inline image %s: %s", index, message)
|
125
|
+
elif message:
|
126
|
+
logger.warning("Inline image warning: %s", message)
|
127
|
+
else:
|
128
|
+
logger.warning("Inline image warning received with no message")
|
129
|
+
return
|
130
|
+
|
131
|
+
message = getattr(warning, "message", None)
|
132
|
+
index = getattr(warning, "index", None)
|
133
|
+
if message and index is not None:
|
134
|
+
logger.warning("Inline image %s: %s", index, message)
|
135
|
+
elif message:
|
136
|
+
logger.warning("Inline image warning: %s", message)
|
137
|
+
else:
|
138
|
+
logger.warning("Inline image warning received with no message")
|
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
import csv
|
4
4
|
import hashlib
|
5
5
|
import io
|
6
|
+
import logging
|
6
7
|
import os
|
7
8
|
import re
|
8
9
|
import subprocess
|
@@ -14,12 +15,11 @@ from pathlib import Path
|
|
14
15
|
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
15
16
|
|
16
17
|
import anyio
|
17
|
-
import html_to_markdown
|
18
18
|
import polars as pl
|
19
19
|
from anyio import Path as AsyncPath
|
20
20
|
from anyio import run_process
|
21
|
-
from
|
22
|
-
from
|
21
|
+
from html_to_markdown import HtmlToMarkdownError
|
22
|
+
from html_to_markdown._html_to_markdown import convert as rust_convert
|
23
23
|
from PIL import Image
|
24
24
|
from PIL.Image import Image as PILImage
|
25
25
|
from typing_extensions import Self
|
@@ -29,15 +29,15 @@ from kreuzberg._ocr._base import OCRBackend
|
|
29
29
|
from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
|
30
30
|
from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
|
31
31
|
from kreuzberg._utils._cache import get_ocr_cache
|
32
|
-
from kreuzberg._utils._html_streaming import should_use_streaming
|
33
32
|
from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
|
34
33
|
from kreuzberg._utils._string import normalize_spaces
|
35
34
|
from kreuzberg._utils._sync import run_sync
|
36
35
|
from kreuzberg._utils._tmp import create_temp_file, temporary_file_sync
|
37
36
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
38
37
|
|
38
|
+
logger = logging.getLogger(__name__)
|
39
|
+
|
39
40
|
if TYPE_CHECKING:
|
40
|
-
from bs4.element import Tag
|
41
41
|
from PIL.Image import Image as PILImage
|
42
42
|
|
43
43
|
try: # pragma: no cover
|
@@ -514,220 +514,56 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
514
514
|
table_min_confidence: float = 30.0,
|
515
515
|
**_kwargs: Any,
|
516
516
|
) -> ExtractionResult:
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
soup,
|
524
|
-
table_column_threshold,
|
525
|
-
table_row_threshold_ratio,
|
526
|
-
table_min_confidence,
|
527
|
-
)
|
517
|
+
_ = (
|
518
|
+
enable_table_detection,
|
519
|
+
table_column_threshold,
|
520
|
+
table_row_threshold_ratio,
|
521
|
+
table_min_confidence,
|
522
|
+
) # parameters retained for compatibility but handled internally by html-to-markdown
|
528
523
|
|
529
|
-
|
530
|
-
|
531
|
-
all_converters = dict(hocr_converters)
|
532
|
-
if config.custom_converters:
|
533
|
-
all_converters.update(config.custom_converters)
|
534
|
-
|
535
|
-
config_dict = config.to_dict()
|
536
|
-
config_dict["custom_converters"] = all_converters
|
537
|
-
|
538
|
-
use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
|
539
|
-
config_dict["stream_processing"] = use_streaming
|
540
|
-
config_dict["chunk_size"] = chunk_size
|
524
|
+
config = html_to_markdown_config or HTMLToMarkdownConfig()
|
525
|
+
conversion_options, _ = config.to_options()
|
541
526
|
|
542
527
|
try:
|
543
|
-
markdown_content =
|
528
|
+
markdown_content = rust_convert(
|
529
|
+
hocr_content,
|
530
|
+
conversion_options,
|
531
|
+
)
|
544
532
|
markdown_content = normalize_spaces(markdown_content)
|
545
|
-
except (
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
for word in words:
|
551
|
-
text = word.get_text().strip()
|
552
|
-
if text:
|
553
|
-
text_parts.append(text)
|
554
|
-
|
555
|
-
if text_parts:
|
556
|
-
markdown_content = " ".join(text_parts)
|
557
|
-
else:
|
558
|
-
markdown_content = soup.get_text().strip() or "[No text detected]"
|
559
|
-
|
560
|
-
markdown_content = normalize_spaces(markdown_content)
|
561
|
-
except (ValueError, TypeError, AttributeError):
|
562
|
-
markdown_content = "[OCR processing failed]"
|
563
|
-
|
564
|
-
if tables:
|
565
|
-
table_sections = []
|
566
|
-
for i, table in enumerate(tables):
|
567
|
-
table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
|
568
|
-
|
569
|
-
if markdown_content.strip():
|
570
|
-
final_content = f"{markdown_content}\n{''.join(table_sections)}"
|
571
|
-
else:
|
572
|
-
final_content = "".join(table_sections).strip()
|
573
|
-
else:
|
574
|
-
final_content = markdown_content
|
533
|
+
except (HtmlToMarkdownError, ValueError) as exc:
|
534
|
+
logger.exception("Failed to convert hOCR to Markdown: %s", exc)
|
535
|
+
markdown_content = "[OCR processing failed]"
|
536
|
+
|
537
|
+
tables: list[TableData] = []
|
575
538
|
|
576
539
|
return ExtractionResult(
|
577
|
-
content=
|
540
|
+
content=markdown_content,
|
578
541
|
mime_type=MARKDOWN_MIME_TYPE,
|
579
542
|
metadata={"source_format": "hocr", "tables_detected": len(tables)},
|
580
543
|
chunks=[],
|
581
544
|
tables=tables,
|
582
545
|
)
|
583
546
|
|
584
|
-
def _create_basic_converters(self) -> dict[str, Any]:
|
585
|
-
def ocrx_word_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
586
|
-
del tag
|
587
|
-
return f"{text.strip()} "
|
588
|
-
|
589
|
-
def ocr_line_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
590
|
-
del tag
|
591
|
-
return f"{text.strip()}\n"
|
592
|
-
|
593
|
-
def ocr_par_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
594
|
-
del tag
|
595
|
-
content = text.strip()
|
596
|
-
if not content:
|
597
|
-
return ""
|
598
|
-
return f"{content}\n\n"
|
599
|
-
|
600
|
-
def ocr_carea_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
601
|
-
del tag
|
602
|
-
content = text.strip()
|
603
|
-
if not content:
|
604
|
-
return ""
|
605
|
-
return f"{content}\n\n"
|
606
|
-
|
607
|
-
def ocr_page_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
608
|
-
del tag
|
609
|
-
return text.strip()
|
610
|
-
|
611
|
-
def ocr_separator_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
612
|
-
del tag, text
|
613
|
-
return "---\n"
|
614
|
-
|
615
|
-
def ocr_photo_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
616
|
-
del text
|
617
|
-
title = tag.get("title", "")
|
618
|
-
if isinstance(title, str):
|
619
|
-
bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
|
620
|
-
if bbox_match:
|
621
|
-
x0, y0, x1, y1 = bbox_match.groups()
|
622
|
-
width = int(x1) - int(x0)
|
623
|
-
height = int(y1) - int(y0)
|
624
|
-
return f"*[Image region: {width}x{height} pixels]*\n\n"
|
625
|
-
return "*[Image detected]*\n\n"
|
626
|
-
|
627
|
-
return {
|
628
|
-
"ocrx_word": ocrx_word_converter,
|
629
|
-
"ocr_line": ocr_line_converter,
|
630
|
-
"ocr_par": ocr_par_converter,
|
631
|
-
"ocr_carea": ocr_carea_converter,
|
632
|
-
"ocr_page": ocr_page_converter,
|
633
|
-
"ocr_separator": ocr_separator_converter,
|
634
|
-
"ocr_photo": ocr_photo_converter,
|
635
|
-
}
|
636
|
-
|
637
|
-
def _create_hocr_converters(self, _tables: list[TableData]) -> dict[str, Any]:
|
638
|
-
basic_converters = self._create_basic_converters()
|
639
|
-
|
640
|
-
def generic_div_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
641
|
-
class_attr = tag.get("class", "")
|
642
|
-
if isinstance(class_attr, list):
|
643
|
-
class_attr = " ".join(class_attr)
|
644
|
-
elif not isinstance(class_attr, str):
|
645
|
-
class_attr = ""
|
646
|
-
|
647
|
-
for class_name in ["ocr_separator", "ocr_photo", "ocr_page", "ocr_carea"]:
|
648
|
-
if class_name in class_attr:
|
649
|
-
converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
|
650
|
-
return str(converter_result)
|
651
|
-
return text
|
652
|
-
|
653
|
-
def generic_span_converter(*, tag: Tag, text: str, **_conv_kwargs: Any) -> str:
|
654
|
-
class_attr = tag.get("class", "")
|
655
|
-
if isinstance(class_attr, list):
|
656
|
-
class_attr = " ".join(class_attr)
|
657
|
-
elif not isinstance(class_attr, str):
|
658
|
-
class_attr = ""
|
659
|
-
|
660
|
-
for class_name in ["ocrx_word", "ocr_line"]:
|
661
|
-
if class_name in class_attr:
|
662
|
-
converter_result = basic_converters[class_name](tag=tag, text=text, **_conv_kwargs)
|
663
|
-
return str(converter_result)
|
664
|
-
return f"{text.strip()} "
|
665
|
-
|
666
|
-
return {
|
667
|
-
"span": generic_span_converter,
|
668
|
-
"div": generic_div_converter,
|
669
|
-
"p": basic_converters["ocr_par"],
|
670
|
-
}
|
671
|
-
|
672
547
|
def _process_hocr_to_markdown_sync(self, hocr_content: str, config: TesseractConfig) -> ExtractionResult:
|
673
|
-
|
548
|
+
_ = config # retained for interface compatibility
|
674
549
|
|
675
|
-
|
676
|
-
|
550
|
+
html_config = HTMLToMarkdownConfig()
|
551
|
+
conversion_options, _ = html_config.to_options()
|
677
552
|
|
678
553
|
try:
|
679
|
-
|
680
|
-
|
681
|
-
html_config = HTMLToMarkdownConfig(
|
682
|
-
custom_converters=converters,
|
683
|
-
)
|
684
|
-
|
685
|
-
config_dict = html_config.to_dict()
|
686
|
-
|
687
|
-
use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
|
688
|
-
config_dict["stream_processing"] = use_streaming
|
689
|
-
config_dict["chunk_size"] = chunk_size
|
690
|
-
|
691
|
-
markdown_content = html_to_markdown.convert_to_markdown(
|
554
|
+
markdown_content = rust_convert(
|
692
555
|
hocr_content,
|
693
|
-
|
556
|
+
conversion_options,
|
694
557
|
)
|
695
|
-
|
696
558
|
markdown_content = normalize_spaces(markdown_content)
|
559
|
+
except (HtmlToMarkdownError, ValueError) as exc:
|
560
|
+
logger.exception("Failed to convert hOCR to Markdown (sync path): %s", exc)
|
561
|
+
markdown_content = "[OCR processing failed]"
|
697
562
|
|
698
|
-
|
699
|
-
try:
|
700
|
-
soup = BeautifulSoup(hocr_content, "xml")
|
701
|
-
words = soup.find_all("span", class_="ocrx_word")
|
702
|
-
text_parts = []
|
703
|
-
for word in words:
|
704
|
-
text = word.get_text().strip()
|
705
|
-
if text:
|
706
|
-
text_parts.append(text)
|
707
|
-
|
708
|
-
if text_parts:
|
709
|
-
markdown_content = " ".join(text_parts)
|
710
|
-
else:
|
711
|
-
markdown_content = soup.get_text().strip() or "[No text detected]"
|
712
|
-
|
713
|
-
markdown_content = normalize_spaces(markdown_content)
|
714
|
-
except (ValueError, TypeError, AttributeError):
|
715
|
-
markdown_content = "[OCR processing failed]"
|
716
|
-
|
717
|
-
if tables:
|
718
|
-
table_sections = []
|
719
|
-
for i, table in enumerate(tables):
|
720
|
-
table_sections.append(f"\n## Table {i + 1}\n\n{table['text']}\n")
|
721
|
-
|
722
|
-
if markdown_content.strip():
|
723
|
-
final_content = f"{markdown_content}\n{''.join(table_sections)}"
|
724
|
-
else:
|
725
|
-
final_content = "".join(table_sections).strip()
|
726
|
-
else:
|
727
|
-
final_content = markdown_content
|
563
|
+
tables: list[TableData] = []
|
728
564
|
|
729
565
|
return ExtractionResult(
|
730
|
-
content=
|
566
|
+
content=markdown_content,
|
731
567
|
mime_type=MARKDOWN_MIME_TYPE,
|
732
568
|
metadata={"source_format": "hocr", "tables_detected": len(tables)},
|
733
569
|
chunks=[],
|
@@ -776,97 +612,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
776
612
|
|
777
613
|
return text_result
|
778
614
|
|
779
|
-
async def _extract_tables_from_hocr(
|
780
|
-
self,
|
781
|
-
soup: Any,
|
782
|
-
column_threshold: int = 20,
|
783
|
-
row_threshold_ratio: float = 0.5,
|
784
|
-
min_confidence: float = 30.0,
|
785
|
-
) -> list[TableData]:
|
786
|
-
tsv_data = await self._hocr_to_tsv_data(soup, min_confidence)
|
787
|
-
|
788
|
-
if not tsv_data:
|
789
|
-
return []
|
790
|
-
|
791
|
-
if not (words := extract_words(tsv_data, min_confidence=min_confidence)):
|
792
|
-
return []
|
793
|
-
|
794
|
-
tables: list[TableData] = []
|
795
|
-
try:
|
796
|
-
table_data = reconstruct_table(
|
797
|
-
words,
|
798
|
-
column_threshold=column_threshold,
|
799
|
-
row_threshold_ratio=row_threshold_ratio,
|
800
|
-
)
|
801
|
-
if table_data and len(table_data) > 1: # ~keep At least header + one data row
|
802
|
-
markdown = to_markdown(table_data)
|
803
|
-
|
804
|
-
min_x = min(w["left"] for w in words)
|
805
|
-
max_x = max(w["left"] + w["width"] for w in words)
|
806
|
-
min_y = min(w["top"] for w in words)
|
807
|
-
max_y = max(w["top"] + w["height"] for w in words)
|
808
|
-
|
809
|
-
try:
|
810
|
-
df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
|
811
|
-
except (ImportError, IndexError): # pragma: no cover
|
812
|
-
df = None
|
813
|
-
|
814
|
-
dummy_image = Image.new("RGB", (1, 1), "white")
|
815
|
-
|
816
|
-
table: TableData = {
|
817
|
-
"text": markdown,
|
818
|
-
"df": df,
|
819
|
-
"page_number": 1,
|
820
|
-
"cropped_image": dummy_image,
|
821
|
-
"metadata": {"bbox": (min_x, min_y, max_x, max_y)},
|
822
|
-
} # type: ignore[typeddict-unknown-key]
|
823
|
-
tables.append(table)
|
824
|
-
except (ValueError, KeyError, ImportError): # pragma: no cover
|
825
|
-
pass
|
826
|
-
|
827
|
-
return tables
|
828
|
-
|
829
|
-
async def _hocr_to_tsv_data(self, soup: Any, min_confidence: float) -> str:
|
830
|
-
tsv_lines = ["level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext"]
|
831
|
-
|
832
|
-
words = soup.find_all("span", class_="ocrx_word")
|
833
|
-
word_num = 1
|
834
|
-
|
835
|
-
for word in words:
|
836
|
-
title = word.get("title", "")
|
837
|
-
text = word.get_text().strip()
|
838
|
-
|
839
|
-
if not text:
|
840
|
-
continue
|
841
|
-
|
842
|
-
bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", title)
|
843
|
-
if not bbox_match:
|
844
|
-
continue
|
845
|
-
|
846
|
-
x0, y0, x1, y1 = map(int, bbox_match.groups())
|
847
|
-
|
848
|
-
conf_match = re.search(r"x_wconf (\d+)", title)
|
849
|
-
confidence = float(conf_match.group(1)) if conf_match else 100.0
|
850
|
-
|
851
|
-
if confidence < min_confidence:
|
852
|
-
continue
|
853
|
-
|
854
|
-
line = word.find_parent(class_="ocr_line")
|
855
|
-
par = word.find_parent(class_="ocr_par")
|
856
|
-
block = word.find_parent(class_="ocr_carea")
|
857
|
-
|
858
|
-
tsv_line = f"5\t1\t{block.get('id', '1').split('_')[-1] if block else 1}\t{par.get('id', '1').split('_')[-1] if par else 1}\t{line.get('id', '1').split('_')[-1] if line else 1}\t{word_num}\t{x0}\t{y0}\t{x1 - x0}\t{y1 - y0}\t{confidence}\t{text}"
|
859
|
-
tsv_lines.append(tsv_line)
|
860
|
-
word_num += 1
|
861
|
-
|
862
|
-
return "\n".join(tsv_lines)
|
863
|
-
|
864
|
-
def _identify_table_regions(self, words: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
|
865
|
-
if not words:
|
866
|
-
return []
|
867
|
-
|
868
|
-
return [words]
|
869
|
-
|
870
615
|
@classmethod
|
871
616
|
async def _validate_tesseract_version(cls) -> None:
|
872
617
|
try:
|
@@ -1309,10 +1054,9 @@ def _process_image_with_tesseract(
|
|
1309
1054
|
|
1310
1055
|
# Process based on output format
|
1311
1056
|
if output_format == "markdown" and tesseract_format == "hocr":
|
1312
|
-
|
1313
|
-
|
1314
|
-
|
1315
|
-
text = convert_to_markdown(text, heading_style="atx")
|
1057
|
+
html_config = HTMLToMarkdownConfig(heading_style="atx")
|
1058
|
+
options, _ = html_config.to_options()
|
1059
|
+
text = rust_convert(text, options)
|
1316
1060
|
|
1317
1061
|
text = normalize_spaces(text)
|
1318
1062
|
|
kreuzberg/_types.py
CHANGED
@@ -9,6 +9,12 @@ from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
|
|
9
9
|
|
10
10
|
import langcodes
|
11
11
|
import msgspec
|
12
|
+
from html_to_markdown._html_to_markdown import (
|
13
|
+
ConversionOptions as HTMLToMarkdownConversionOptions,
|
14
|
+
)
|
15
|
+
from html_to_markdown._html_to_markdown import (
|
16
|
+
PreprocessingOptions as HTMLToMarkdownPreprocessingOptions,
|
17
|
+
)
|
12
18
|
|
13
19
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
14
20
|
from kreuzberg._utils._table import (
|
@@ -1166,71 +1172,143 @@ class ExtractionConfig(ConfigDict):
|
|
1166
1172
|
|
1167
1173
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
1168
1174
|
class HTMLToMarkdownConfig:
|
1169
|
-
|
1170
|
-
"""
|
1171
|
-
|
1172
|
-
"""
|
1175
|
+
heading_style: Literal["underlined", "atx", "atx_closed"] = "atx"
|
1176
|
+
"""Style for markdown headings."""
|
1177
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces"
|
1178
|
+
"""Type of indentation to use for lists."""
|
1179
|
+
list_indent_width: int = 4
|
1180
|
+
"""Number of spaces per indentation level (use 2 for Discord/Slack)."""
|
1173
1181
|
bullets: str = "*+-"
|
1174
1182
|
"""Characters to use for unordered list bullets."""
|
1175
|
-
|
1176
|
-
"""
|
1177
|
-
code_language_callback: Callable[[Any], str] | None = None
|
1178
|
-
"""Function to dynamically determine code block language."""
|
1179
|
-
convert: list[str] | None = None
|
1180
|
-
"""List of HTML tags to convert (None = all supported tags)."""
|
1181
|
-
convert_as_inline: bool = False
|
1182
|
-
"""Treat content as inline elements only."""
|
1183
|
-
custom_converters: Mapping[str, Callable[..., str]] | None = None
|
1184
|
-
"""Mapping of HTML tag names to custom converter functions."""
|
1185
|
-
default_title: bool = False
|
1186
|
-
"""Use default titles for elements like links."""
|
1183
|
+
strong_em_symbol: Literal["*", "_"] = "*"
|
1184
|
+
"""Symbol to use for strong/emphasis formatting."""
|
1187
1185
|
escape_asterisks: bool = False
|
1188
1186
|
"""Escape * characters to prevent unintended formatting."""
|
1189
|
-
escape_misc: bool = False
|
1190
|
-
"""Escape miscellaneous characters to prevent Markdown conflicts."""
|
1191
1187
|
escape_underscores: bool = False
|
1192
1188
|
"""Escape _ characters to prevent unintended formatting."""
|
1189
|
+
escape_misc: bool = False
|
1190
|
+
"""Escape miscellaneous characters to prevent Markdown conflicts."""
|
1191
|
+
escape_ascii: bool = False
|
1192
|
+
"""Escape all ASCII punctuation."""
|
1193
|
+
code_language: str = ""
|
1194
|
+
"""Default language identifier for fenced code blocks."""
|
1195
|
+
code_language_callback: Callable[[Any], str] | None = field(default=None, compare=False, hash=False)
|
1196
|
+
"""Legacy language callback (no longer used by v2 converter)."""
|
1197
|
+
autolinks: bool = True
|
1198
|
+
"""Automatically convert valid URLs to Markdown links."""
|
1199
|
+
default_title: bool = False
|
1200
|
+
"""Use default titles for elements like links."""
|
1201
|
+
keep_inline_images_in: tuple[str, ...] | None = None
|
1202
|
+
"""Tags where inline images should be preserved."""
|
1203
|
+
br_in_tables: bool = False
|
1204
|
+
"""Use <br> tags for line breaks in table cells instead of spaces."""
|
1205
|
+
highlight_style: Literal["double-equal", "html", "bold", "none"] = "double-equal"
|
1206
|
+
"""Style for highlighting text."""
|
1193
1207
|
extract_metadata: bool = True
|
1194
1208
|
"""Extract document metadata as comment header."""
|
1195
|
-
|
1196
|
-
"""
|
1197
|
-
|
1198
|
-
"""
|
1199
|
-
|
1200
|
-
"""
|
1201
|
-
|
1202
|
-
"""
|
1203
|
-
|
1204
|
-
"""
|
1209
|
+
whitespace_mode: Literal["normalized", "strict"] = "normalized"
|
1210
|
+
"""Whitespace handling mode."""
|
1211
|
+
strip_newlines: bool = False
|
1212
|
+
"""Remove newlines from HTML input before processing."""
|
1213
|
+
wrap: bool = False
|
1214
|
+
"""Enable text wrapping."""
|
1215
|
+
wrap_width: int = 80
|
1216
|
+
"""Width for text wrapping."""
|
1217
|
+
convert_as_inline: bool = False
|
1218
|
+
"""Treat content as inline elements only."""
|
1219
|
+
sub_symbol: str = ""
|
1220
|
+
"""Symbol to use for subscript text."""
|
1221
|
+
sup_symbol: str = ""
|
1222
|
+
"""Symbol to use for superscript text."""
|
1205
1223
|
newline_style: Literal["spaces", "backslash"] = "spaces"
|
1206
1224
|
"""Style for line breaks in markdown."""
|
1225
|
+
code_block_style: Literal["indented", "backticks", "tildes"] = "backticks"
|
1226
|
+
"""Style for fenced code blocks."""
|
1227
|
+
strip_tags: tuple[str, ...] | None = None
|
1228
|
+
"""List of HTML tags to remove from output."""
|
1229
|
+
convert: tuple[str, ...] | None = None
|
1230
|
+
"""Legacy list of tags to convert (no longer used by v2 converter)."""
|
1231
|
+
custom_converters: Mapping[str, Callable[..., str]] | None = field(default=None, compare=False, hash=False)
|
1232
|
+
"""Legacy mapping of custom converters (ignored by v2 converter)."""
|
1207
1233
|
preprocess_html: bool = False
|
1208
1234
|
"""Enable HTML preprocessing to clean messy HTML."""
|
1209
1235
|
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
|
1210
1236
|
"""Preprocessing level for cleaning HTML."""
|
1211
|
-
remove_forms: bool = True
|
1212
|
-
"""Remove form elements during preprocessing."""
|
1213
1237
|
remove_navigation: bool = True
|
1214
1238
|
"""Remove navigation elements during preprocessing."""
|
1215
|
-
|
1216
|
-
"""
|
1217
|
-
|
1218
|
-
"""
|
1219
|
-
|
1220
|
-
"""
|
1221
|
-
sub_symbol: str = ""
|
1222
|
-
"""Symbol to use for subscript text."""
|
1223
|
-
sup_symbol: str = ""
|
1224
|
-
"""Symbol to use for superscript text."""
|
1225
|
-
whitespace_mode: Literal["normalized", "strict"] = "normalized"
|
1226
|
-
"""Whitespace handling mode."""
|
1227
|
-
wrap: bool = False
|
1228
|
-
"""Enable text wrapping."""
|
1229
|
-
wrap_width: int = 80
|
1230
|
-
"""Width for text wrapping."""
|
1239
|
+
remove_forms: bool = True
|
1240
|
+
"""Remove form elements during preprocessing."""
|
1241
|
+
encoding: str = "utf-8"
|
1242
|
+
"""Expected character encoding for the HTML input."""
|
1243
|
+
debug: bool = False
|
1244
|
+
"""Enable debug diagnostics in the converter."""
|
1231
1245
|
|
1232
|
-
def
|
1246
|
+
def __post_init__(self) -> None:
|
1247
|
+
if self.keep_inline_images_in is not None and not isinstance(self.keep_inline_images_in, tuple):
|
1248
|
+
object.__setattr__(self, "keep_inline_images_in", tuple(self.keep_inline_images_in))
|
1249
|
+
if self.strip_tags is not None and not isinstance(self.strip_tags, tuple):
|
1250
|
+
object.__setattr__(self, "strip_tags", tuple(self.strip_tags))
|
1251
|
+
if self.convert is not None and not isinstance(self.convert, tuple):
|
1252
|
+
object.__setattr__(self, "convert", tuple(self.convert))
|
1253
|
+
|
1254
|
+
def to_options(self) -> tuple[HTMLToMarkdownConversionOptions, HTMLToMarkdownPreprocessingOptions]:
|
1255
|
+
"""Build html_to_markdown ConversionOptions and PreprocessingOptions instances."""
|
1256
|
+
preprocessing = HTMLToMarkdownPreprocessingOptions(
|
1257
|
+
enabled=self.preprocess_html,
|
1258
|
+
preset=self.preprocessing_preset,
|
1259
|
+
remove_navigation=self.remove_navigation,
|
1260
|
+
remove_forms=self.remove_forms,
|
1261
|
+
)
|
1262
|
+
|
1263
|
+
keep_inline_images_in = list(self.keep_inline_images_in) if self.keep_inline_images_in else []
|
1264
|
+
strip_tags = list(self.strip_tags) if self.strip_tags else []
|
1265
|
+
|
1266
|
+
options = HTMLToMarkdownConversionOptions(
|
1267
|
+
heading_style=self.heading_style,
|
1268
|
+
list_indent_type=self.list_indent_type,
|
1269
|
+
list_indent_width=self.list_indent_width,
|
1270
|
+
bullets=self.bullets,
|
1271
|
+
strong_em_symbol=self.strong_em_symbol,
|
1272
|
+
escape_asterisks=self.escape_asterisks,
|
1273
|
+
escape_underscores=self.escape_underscores,
|
1274
|
+
escape_misc=self.escape_misc,
|
1275
|
+
escape_ascii=self.escape_ascii,
|
1276
|
+
code_language=self.code_language,
|
1277
|
+
autolinks=self.autolinks,
|
1278
|
+
default_title=self.default_title,
|
1279
|
+
keep_inline_images_in=keep_inline_images_in,
|
1280
|
+
br_in_tables=self.br_in_tables,
|
1281
|
+
highlight_style=self.highlight_style,
|
1282
|
+
extract_metadata=self.extract_metadata,
|
1283
|
+
whitespace_mode=self.whitespace_mode,
|
1284
|
+
strip_newlines=self.strip_newlines,
|
1285
|
+
wrap=self.wrap,
|
1286
|
+
wrap_width=self.wrap_width,
|
1287
|
+
convert_as_inline=self.convert_as_inline,
|
1288
|
+
sub_symbol=self.sub_symbol,
|
1289
|
+
sup_symbol=self.sup_symbol,
|
1290
|
+
newline_style=self.newline_style,
|
1291
|
+
code_block_style=self.code_block_style,
|
1292
|
+
strip_tags=strip_tags,
|
1293
|
+
debug=self.debug,
|
1294
|
+
encoding=self.encoding,
|
1295
|
+
)
|
1296
|
+
|
1297
|
+
options.preprocessing = preprocessing
|
1298
|
+
return options, preprocessing
|
1299
|
+
|
1300
|
+
def to_dict(self, include_none: bool = False) -> dict[str, Any]:
    """Serialize this config to a plain dict via msgspec.

    Args:
        include_none: When True, keep keys whose value is None;
            otherwise such keys are dropped from the result.

    Returns:
        A builtin dict with deterministic key ordering; tuple-valued
        fields are normalized to lists.
    """
    serialized = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")

    # These fields may round-trip as tuples; plain-dict consumers expect lists.
    for field_name in ("keep_inline_images_in", "strip_tags", "convert"):
        if serialized.get(field_name) is not None:
            serialized[field_name] = list(serialized[field_name])

    if not include_none:
        return {key: value for key, value in serialized.items() if value is not None}

    return serialized  # type: ignore[no-any-return]
|
1235
1313
|
|
1236
1314
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.20.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Programming Language :: Python :: 3.13
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
22
23
|
Classifier: Topic :: Database
|
23
24
|
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
24
25
|
Classifier: Topic :: Office/Business :: Office Suites
|
@@ -27,69 +28,69 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
27
28
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
29
|
Classifier: Topic :: Text Processing :: General
|
29
30
|
Classifier: Typing :: Typed
|
30
|
-
Requires-Python:
|
31
|
+
Requires-Python: <3.15,>=3.10
|
31
32
|
Requires-Dist: anyio>=4.11.0
|
32
33
|
Requires-Dist: chardetng-py>=0.3.5
|
33
34
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown
|
35
|
+
Requires-Dist: html-to-markdown>=2.1.0
|
35
36
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.
|
37
|
+
Requires-Dist: mcp>=1.17.0
|
37
38
|
Requires-Dist: msgspec>=0.18.0
|
38
39
|
Requires-Dist: numpy>=2.0.0
|
39
40
|
Requires-Dist: playa-pdf>=0.7.0
|
40
|
-
Requires-Dist: polars>=1.
|
41
|
+
Requires-Dist: polars>=1.34.0
|
41
42
|
Requires-Dist: psutil>=7.1.0
|
42
43
|
Requires-Dist: pypdfium2==4.30.0
|
43
44
|
Requires-Dist: python-calamine>=0.5.3
|
44
45
|
Requires-Dist: python-pptx>=1.0.2
|
45
|
-
Requires-Dist: transformers>=4.
|
46
|
+
Requires-Dist: transformers>=4.57.0
|
46
47
|
Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
|
47
48
|
Provides-Extra: additional-extensions
|
48
49
|
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
49
50
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
50
51
|
Provides-Extra: all
|
51
|
-
Requires-Dist: click>=8.
|
52
|
+
Requires-Dist: click>=8.3.0; extra == 'all'
|
52
53
|
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
53
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
54
|
+
Requires-Dist: easyocr>=1.7.2; (python_version < '3.14') and extra == 'all'
|
54
55
|
Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
|
55
56
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
56
57
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
57
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
58
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0; extra == 'all'
|
58
59
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
59
|
-
Requires-Dist: paddleocr>=3.2.0; extra == 'all'
|
60
|
-
Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
|
60
|
+
Requires-Dist: paddleocr>=3.2.0; (python_version < '3.14') and extra == 'all'
|
61
|
+
Requires-Dist: paddlepaddle>=3.2.0; (python_version < '3.14') and extra == 'all'
|
61
62
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
62
|
-
Requires-Dist: rich>=14.
|
63
|
+
Requires-Dist: rich>=14.2.0; extra == 'all'
|
63
64
|
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
|
64
65
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
65
|
-
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
66
|
+
Requires-Dist: spacy>=3.8.7; (python_version < '3.14') and extra == 'all'
|
66
67
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
67
|
-
Requires-Dist: transformers>=4.
|
68
|
+
Requires-Dist: transformers>=4.57.0; extra == 'all'
|
68
69
|
Provides-Extra: api
|
69
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
70
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0; extra == 'api'
|
70
71
|
Provides-Extra: chunking
|
71
72
|
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
|
72
73
|
Provides-Extra: cli
|
73
|
-
Requires-Dist: click>=8.
|
74
|
-
Requires-Dist: rich>=14.
|
74
|
+
Requires-Dist: click>=8.3.0; extra == 'cli'
|
75
|
+
Requires-Dist: rich>=14.2.0; extra == 'cli'
|
75
76
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
76
77
|
Provides-Extra: crypto
|
77
78
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
78
79
|
Provides-Extra: document-classification
|
79
80
|
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
80
81
|
Provides-Extra: easyocr
|
81
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
82
|
+
Requires-Dist: easyocr>=1.7.2; (python_version < '3.14') and extra == 'easyocr'
|
82
83
|
Provides-Extra: entity-extraction
|
83
84
|
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
84
|
-
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
85
|
+
Requires-Dist: spacy>=3.8.7; (python_version < '3.14') and extra == 'entity-extraction'
|
85
86
|
Provides-Extra: gmft
|
86
87
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
87
|
-
Requires-Dist: transformers>=4.
|
88
|
+
Requires-Dist: transformers>=4.57.0; extra == 'gmft'
|
88
89
|
Provides-Extra: langdetect
|
89
90
|
Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
|
90
91
|
Provides-Extra: paddleocr
|
91
|
-
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
92
|
-
Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
|
92
|
+
Requires-Dist: paddleocr>=3.2.0; (python_version < '3.14') and extra == 'paddleocr'
|
93
|
+
Requires-Dist: paddlepaddle>=3.2.0; (python_version < '3.14') and extra == 'paddleocr'
|
93
94
|
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
94
95
|
Description-Content-Type: text/markdown
|
95
96
|
|
@@ -11,7 +11,7 @@ kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD
|
|
11
11
|
kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
|
12
12
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
13
13
|
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
14
|
-
kreuzberg/_types.py,sha256=
|
14
|
+
kreuzberg/_types.py,sha256=eh4bZFG3jIw5GhfC3u4R0aa_y9niKZDI4O93j0MCZGw,53672
|
15
15
|
kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
|
16
16
|
kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
|
17
17
|
kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
|
@@ -22,7 +22,7 @@ kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
|
|
22
22
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
23
|
kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
|
24
24
|
kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
|
25
|
-
kreuzberg/_extractors/_html.py,sha256=
|
25
|
+
kreuzberg/_extractors/_html.py,sha256=9AH95f7Lt-agYSOpCv5qRyugn3MdQtX0CNm_pOjovJc,5492
|
26
26
|
kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
|
27
27
|
kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
|
28
28
|
kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
|
@@ -36,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
|
|
36
36
|
kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
|
37
37
|
kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
|
38
38
|
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
39
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
39
|
+
kreuzberg/_ocr/_tesseract.py,sha256=9F6V72WGi9ExruSNESjz8WGHCXuTYq1M1ctbayhQO0Y,43358
|
40
40
|
kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
|
41
41
|
kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
|
42
42
|
kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
|
@@ -109,7 +109,6 @@ kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14
|
|
109
109
|
kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
|
110
110
|
kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
|
111
111
|
kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
|
112
|
-
kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
|
113
112
|
kreuzberg/_utils/_image_preprocessing.py,sha256=f7ioWQyARnhzj0am0Y1_eteJwWomdPy7AnbXqw2xWBs,10954
|
114
113
|
kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
|
115
114
|
kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
|
@@ -122,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
122
121
|
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
123
122
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
124
123
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
125
|
-
kreuzberg-3.
|
126
|
-
kreuzberg-3.
|
127
|
-
kreuzberg-3.
|
128
|
-
kreuzberg-3.
|
129
|
-
kreuzberg-3.
|
124
|
+
kreuzberg-3.20.0.dist-info/METADATA,sha256=pmBB6mlIuuD5tYx0_aOWNdHM00gd6nbxgDrXo1gEc6Y,12782
|
125
|
+
kreuzberg-3.20.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
126
|
+
kreuzberg-3.20.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
127
|
+
kreuzberg-3.20.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
128
|
+
kreuzberg-3.20.0.dist-info/RECORD,,
|
@@ -1,20 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
# Streaming kicks in above 10 KB; very large inputs (>1 MB) use a bigger chunk.
_STREAMING_THRESHOLD_KB = 10
_LARGE_FILE_THRESHOLD_MB = 1
_DEFAULT_CHUNK_SIZE = 2048
_LARGE_FILE_CHUNK_SIZE = 4096

_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024


def should_use_streaming(content_size: int) -> tuple[bool, int]:
    """Decide whether content of ``content_size`` bytes should be streamed.

    Args:
        content_size: Size of the content in bytes; negative values are
            treated as "unknown" and disable streaming.

    Returns:
        A ``(use_streaming, chunk_size)`` pair. Streaming is enabled only
        above the 10 KB threshold; inputs above 1 MB get the larger chunk.
    """
    # Guard: unknown/invalid sizes never stream.
    if content_size < 0:
        return False, _DEFAULT_CHUNK_SIZE

    # At or below the streaming threshold, process in one shot.
    if content_size <= _STREAMING_THRESHOLD_BYTES:
        return False, _DEFAULT_CHUNK_SIZE

    is_large = content_size > _LARGE_FILE_THRESHOLD_BYTES
    return True, _LARGE_FILE_CHUNK_SIZE if is_large else _DEFAULT_CHUNK_SIZE
File without changes
|
File without changes
|
File without changes
|