kreuzberg 3.8.2__py3-none-any.whl → 3.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_chunker.py +3 -3
- kreuzberg/_config.py +1 -1
- kreuzberg/_document_classification.py +156 -0
- kreuzberg/_entity_extraction.py +3 -3
- kreuzberg/_extractors/_image.py +4 -3
- kreuzberg/_extractors/_pdf.py +18 -10
- kreuzberg/_extractors/_spread_sheet.py +4 -5
- kreuzberg/_extractors/_structured.py +24 -18
- kreuzberg/_gmft.py +25 -31
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_base.py +1 -1
- kreuzberg/_ocr/_easyocr.py +4 -4
- kreuzberg/_ocr/_paddleocr.py +3 -3
- kreuzberg/_ocr/_tesseract.py +10 -14
- kreuzberg/_types.py +23 -7
- kreuzberg/_utils/_cache.py +2 -3
- kreuzberg/_utils/_device.py +7 -7
- kreuzberg/cli.py +2 -2
- kreuzberg/extraction.py +18 -9
- {kreuzberg-3.8.2.dist-info → kreuzberg-3.9.0.dist-info}/METADATA +7 -3
- {kreuzberg-3.8.2.dist-info → kreuzberg-3.9.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.8.2.dist-info → kreuzberg-3.9.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.2.dist-info → kreuzberg-3.9.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.2.dist-info → kreuzberg-3.9.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_chunker.py
CHANGED
@@ -2,9 +2,9 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING
|
4
4
|
|
5
|
-
from kreuzberg import MissingDependencyError
|
6
5
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
7
6
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
7
|
+
from kreuzberg.exceptions import MissingDependencyError
|
8
8
|
|
9
9
|
if TYPE_CHECKING:
|
10
10
|
from semantic_text_splitter import MarkdownSplitter, TextSplitter
|
@@ -36,11 +36,11 @@ def get_chunker(
|
|
36
36
|
if key not in _chunkers:
|
37
37
|
try:
|
38
38
|
if mime_type == MARKDOWN_MIME_TYPE:
|
39
|
-
from semantic_text_splitter import MarkdownSplitter
|
39
|
+
from semantic_text_splitter import MarkdownSplitter # noqa: PLC0415
|
40
40
|
|
41
41
|
_chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
|
42
42
|
else:
|
43
|
-
from semantic_text_splitter import TextSplitter
|
43
|
+
from semantic_text_splitter import TextSplitter # noqa: PLC0415
|
44
44
|
|
45
45
|
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
46
46
|
except ImportError as e:
|
kreuzberg/_config.py
CHANGED
@@ -95,7 +95,7 @@ def parse_ocr_backend_config(
|
|
95
95
|
# Convert psm integer to PSMMode enum if needed
|
96
96
|
processed_config = backend_config.copy()
|
97
97
|
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
98
|
-
from kreuzberg._ocr._tesseract import PSMMode
|
98
|
+
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
99
99
|
|
100
100
|
processed_config["psm"] = PSMMode(processed_config["psm"])
|
101
101
|
return TesseractConfig(**processed_config)
|
kreuzberg/_document_classification.py
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
from typing import TYPE_CHECKING
|
5
|
+
|
6
|
+
from kreuzberg._ocr import get_ocr_backend
|
7
|
+
from kreuzberg.exceptions import MissingDependencyError
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from pathlib import Path
|
11
|
+
|
12
|
+
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
13
|
+
|
14
|
+
|
15
|
+
DOCUMENT_CLASSIFIERS = {
|
16
|
+
"invoice": [
|
17
|
+
r"invoice",
|
18
|
+
r"bill to",
|
19
|
+
r"invoice number",
|
20
|
+
r"total amount",
|
21
|
+
r"tax id",
|
22
|
+
],
|
23
|
+
"receipt": [
|
24
|
+
r"receipt",
|
25
|
+
r"cash receipt",
|
26
|
+
r"payment",
|
27
|
+
r"subtotal",
|
28
|
+
r"total due",
|
29
|
+
],
|
30
|
+
"contract": [
|
31
|
+
r"agreement",
|
32
|
+
r"contract",
|
33
|
+
r"party a",
|
34
|
+
r"party b",
|
35
|
+
r"terms and conditions",
|
36
|
+
r"signature",
|
37
|
+
],
|
38
|
+
"report": [r"report", r"summary", r"analysis", r"findings", r"conclusion"],
|
39
|
+
"form": [r"form", r"fill out", r"signature", r"date", r"submit"],
|
40
|
+
}
|
41
|
+
|
42
|
+
|
43
|
+
def _get_translated_text(result: ExtractionResult) -> str:
|
44
|
+
"""Translate extracted text to English using Google Translate API.
|
45
|
+
|
46
|
+
Args:
|
47
|
+
result: ExtractionResult containing the text to be translated
|
48
|
+
|
49
|
+
Returns:
|
50
|
+
str: The translated text in lowercase English
|
51
|
+
|
52
|
+
Raises:
|
53
|
+
MissingDependencyError: If the deep-translator package is not installed
|
54
|
+
"""
|
55
|
+
try:
|
56
|
+
from deep_translator import GoogleTranslator # noqa: PLC0415
|
57
|
+
except ImportError as e:
|
58
|
+
raise MissingDependencyError(
|
59
|
+
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
|
60
|
+
) from e
|
61
|
+
|
62
|
+
return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
|
63
|
+
|
64
|
+
|
65
|
+
def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
|
66
|
+
"""Classifies the document type based on keywords and patterns.
|
67
|
+
|
68
|
+
Args:
|
69
|
+
result: The extraction result containing the content.
|
70
|
+
config: The extraction configuration.
|
71
|
+
|
72
|
+
Returns:
|
73
|
+
A tuple containing the detected document type and the confidence score,
|
74
|
+
or (None, None) if no type is detected with sufficient confidence.
|
75
|
+
"""
|
76
|
+
translated_text = _get_translated_text(result)
|
77
|
+
scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
|
78
|
+
|
79
|
+
for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
|
80
|
+
for pattern in patterns:
|
81
|
+
if re.search(pattern, translated_text):
|
82
|
+
scores[doc_type] += 1
|
83
|
+
|
84
|
+
total_score = sum(scores.values())
|
85
|
+
if total_score == 0:
|
86
|
+
return None, None
|
87
|
+
|
88
|
+
confidences = {doc_type: score / total_score for doc_type, score in scores.items()}
|
89
|
+
|
90
|
+
best_type, best_confidence = max(confidences.items(), key=lambda item: item[1])
|
91
|
+
|
92
|
+
if best_confidence >= config.document_type_confidence_threshold:
|
93
|
+
return best_type, best_confidence
|
94
|
+
|
95
|
+
return None, None
|
96
|
+
|
97
|
+
|
98
|
+
def classify_document_from_layout(
|
99
|
+
result: ExtractionResult, config: ExtractionConfig
|
100
|
+
) -> tuple[str | None, float | None]:
|
101
|
+
"""Classifies the document type based on layout information from OCR.
|
102
|
+
|
103
|
+
Args:
|
104
|
+
result: The extraction result containing the layout data.
|
105
|
+
config: The extraction configuration.
|
106
|
+
|
107
|
+
Returns:
|
108
|
+
A tuple containing the detected document type and the confidence score,
|
109
|
+
or (None, None) if no type is detected with sufficient confidence.
|
110
|
+
"""
|
111
|
+
translated_text = _get_translated_text(result)
|
112
|
+
|
113
|
+
if result.layout is None or result.layout.empty:
|
114
|
+
return None, None
|
115
|
+
|
116
|
+
layout_df = result.layout
|
117
|
+
if not all(col in layout_df.columns for col in ["text", "top", "height"]):
|
118
|
+
return None, None
|
119
|
+
|
120
|
+
layout_df["translated_text"] = translated_text
|
121
|
+
|
122
|
+
page_height = layout_df["top"].max() + layout_df["height"].max()
|
123
|
+
scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
|
124
|
+
|
125
|
+
for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
|
126
|
+
for pattern in patterns:
|
127
|
+
found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
|
128
|
+
if not found_words.empty:
|
129
|
+
scores[doc_type] += 1.0
|
130
|
+
word_top = found_words.iloc[0]["top"]
|
131
|
+
if word_top < page_height * 0.3:
|
132
|
+
scores[doc_type] += 0.5
|
133
|
+
|
134
|
+
total_score = sum(scores.values())
|
135
|
+
if total_score == 0:
|
136
|
+
return None, None
|
137
|
+
|
138
|
+
confidences = {doc_type: score / total_score for doc_type, score in scores.items()}
|
139
|
+
|
140
|
+
best_type, best_confidence = max(confidences.items(), key=lambda item: item[1])
|
141
|
+
|
142
|
+
if best_confidence >= config.document_type_confidence_threshold:
|
143
|
+
return best_type, best_confidence
|
144
|
+
|
145
|
+
return None, None
|
146
|
+
|
147
|
+
|
148
|
+
def auto_detect_document_type(
|
149
|
+
result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
|
150
|
+
) -> ExtractionResult:
|
151
|
+
if config.document_classification_mode == "vision" and file_path:
|
152
|
+
layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
|
153
|
+
result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
|
154
|
+
else:
|
155
|
+
result.document_type, result.document_type_confidence = classify_document(result, config)
|
156
|
+
return result
|
kreuzberg/_entity_extraction.py
CHANGED
@@ -138,7 +138,7 @@ def extract_entities(
|
|
138
138
|
spacy_config = SpacyEntityExtractionConfig()
|
139
139
|
|
140
140
|
try:
|
141
|
-
import spacy # noqa: F401
|
141
|
+
import spacy # noqa: F401, PLC0415
|
142
142
|
except ImportError as e:
|
143
143
|
raise MissingDependencyError.create_for_package(
|
144
144
|
package_name="spacy",
|
@@ -179,7 +179,7 @@ def extract_entities(
|
|
179
179
|
def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
180
180
|
"""Load a spaCy model with caching."""
|
181
181
|
try:
|
182
|
-
import spacy
|
182
|
+
import spacy # noqa: PLC0415
|
183
183
|
|
184
184
|
if spacy_config.model_cache_dir:
|
185
185
|
os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
|
@@ -223,7 +223,7 @@ def extract_keywords(
|
|
223
223
|
MissingDependencyError: If `keybert` is not installed.
|
224
224
|
"""
|
225
225
|
try:
|
226
|
-
from keybert import KeyBERT
|
226
|
+
from keybert import KeyBERT # noqa: PLC0415
|
227
227
|
|
228
228
|
kw_model = KeyBERT()
|
229
229
|
keywords = kw_model.extract_keywords(text, top_n=keyword_count)
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
import contextlib
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from dataclasses import asdict
|
6
7
|
from pathlib import Path
|
7
8
|
from typing import TYPE_CHECKING, ClassVar
|
8
9
|
|
@@ -88,17 +89,17 @@ class ImageExtractor(Extractor):
|
|
88
89
|
config = (
|
89
90
|
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
90
91
|
)
|
91
|
-
result = backend.process_file_sync(path, **config
|
92
|
+
result = backend.process_file_sync(path, **asdict(config))
|
92
93
|
elif self.config.ocr_backend == "paddleocr":
|
93
94
|
paddle_config = (
|
94
95
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
95
96
|
)
|
96
|
-
result = backend.process_file_sync(path, **paddle_config
|
97
|
+
result = backend.process_file_sync(path, **asdict(paddle_config))
|
97
98
|
elif self.config.ocr_backend == "easyocr":
|
98
99
|
easy_config = (
|
99
100
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
100
101
|
)
|
101
|
-
result = backend.process_file_sync(path, **easy_config
|
102
|
+
result = backend.process_file_sync(path, **asdict(easy_config))
|
102
103
|
else:
|
103
104
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
104
105
|
return self._apply_quality_processing(result)
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
import contextlib
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from dataclasses import asdict
|
6
7
|
from multiprocessing import cpu_count
|
7
8
|
from pathlib import Path
|
8
9
|
from re import Pattern
|
@@ -58,9 +59,13 @@ class PDFExtractor(Extractor):
|
|
58
59
|
result: ExtractionResult | None = None
|
59
60
|
|
60
61
|
if not self.config.force_ocr:
|
61
|
-
|
62
|
-
|
63
|
-
|
62
|
+
try:
|
63
|
+
content = await self._extract_pdf_searchable_text(path)
|
64
|
+
if self._validate_extracted_text(content):
|
65
|
+
result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
66
|
+
except ParsingError:
|
67
|
+
# If searchable text extraction fails, continue to OCR or empty result
|
68
|
+
pass
|
64
69
|
|
65
70
|
if not result and self.config.ocr_backend is not None:
|
66
71
|
result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
|
@@ -73,7 +78,7 @@ class PDFExtractor(Extractor):
|
|
73
78
|
if self.config.extract_tables:
|
74
79
|
# GMFT is optional dependency
|
75
80
|
try:
|
76
|
-
from kreuzberg._gmft import extract_tables
|
81
|
+
from kreuzberg._gmft import extract_tables # noqa: PLC0415
|
77
82
|
|
78
83
|
result.tables = await extract_tables(path, self.config.gmft_config)
|
79
84
|
except ImportError:
|
@@ -112,16 +117,19 @@ class PDFExtractor(Extractor):
|
|
112
117
|
|
113
118
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
114
119
|
"""Pure sync implementation of PDF extraction from path."""
|
115
|
-
|
120
|
+
try:
|
121
|
+
text = self._extract_pdf_searchable_text_sync(path)
|
122
|
+
except ParsingError:
|
123
|
+
text = ""
|
116
124
|
|
117
|
-
if self.config.force_ocr or not self._validate_extracted_text(text):
|
125
|
+
if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
|
118
126
|
text = self._extract_pdf_with_ocr_sync(path)
|
119
127
|
|
120
128
|
tables = []
|
121
129
|
if self.config.extract_tables:
|
122
130
|
# GMFT is optional dependency
|
123
131
|
try:
|
124
|
-
from kreuzberg._gmft import extract_tables_sync
|
132
|
+
from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
|
125
133
|
|
126
134
|
tables = extract_tables_sync(path)
|
127
135
|
except ImportError:
|
@@ -381,17 +389,17 @@ class PDFExtractor(Extractor):
|
|
381
389
|
config = (
|
382
390
|
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
383
391
|
)
|
384
|
-
results = backend.process_batch_sync(paths, **config
|
392
|
+
results = backend.process_batch_sync(paths, **asdict(config))
|
385
393
|
elif self.config.ocr_backend == "paddleocr":
|
386
394
|
paddle_config = (
|
387
395
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
388
396
|
)
|
389
|
-
results = backend.process_batch_sync(paths, **paddle_config
|
397
|
+
results = backend.process_batch_sync(paths, **asdict(paddle_config))
|
390
398
|
elif self.config.ocr_backend == "easyocr":
|
391
399
|
easy_config = (
|
392
400
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
393
401
|
)
|
394
|
-
results = backend.process_batch_sync(paths, **easy_config
|
402
|
+
results = backend.process_batch_sync(paths, **asdict(easy_config))
|
395
403
|
else:
|
396
404
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
397
405
|
|
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
|
|
11
11
|
from typing import Any
|
12
12
|
|
13
13
|
from anyio import Path as AsyncPath
|
14
|
+
from PIL import Image
|
14
15
|
from python_calamine import CalamineWorkbook
|
15
16
|
|
16
17
|
from kreuzberg._extractors._base import Extractor
|
@@ -197,9 +198,9 @@ class SpreadSheetExtractor(Extractor):
|
|
197
198
|
"""Enhanced sheet processing with better table structure preservation."""
|
198
199
|
try:
|
199
200
|
# pandas is optional dependency
|
200
|
-
import pandas as pd
|
201
|
+
import pandas as pd # noqa: PLC0415
|
201
202
|
|
202
|
-
from kreuzberg._utils._table import enhance_table_markdown
|
203
|
+
from kreuzberg._utils._table import enhance_table_markdown # noqa: PLC0415
|
203
204
|
|
204
205
|
sheet = workbook.get_sheet_by_name(sheet_name)
|
205
206
|
data = sheet.to_python()
|
@@ -217,9 +218,7 @@ class SpreadSheetExtractor(Extractor):
|
|
217
218
|
return f"## {sheet_name}\n\n*No data*"
|
218
219
|
|
219
220
|
# Create a mock TableData for enhanced formatting
|
220
|
-
from
|
221
|
-
|
222
|
-
from kreuzberg._types import TableData
|
221
|
+
from kreuzberg._types import TableData # noqa: PLC0415
|
223
222
|
|
224
223
|
# Create a 1x1 transparent image as placeholder
|
225
224
|
placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
|
kreuzberg/_extractors/_structured.py
CHANGED
@@ -1,8 +1,22 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import json
|
4
|
+
import sys
|
4
5
|
from typing import TYPE_CHECKING, Any, ClassVar
|
5
6
|
|
7
|
+
if sys.version_info >= (3, 11):
|
8
|
+
import tomllib
|
9
|
+
else:
|
10
|
+
try:
|
11
|
+
import tomli as tomllib # type: ignore[import-not-found]
|
12
|
+
except ImportError:
|
13
|
+
tomllib = None
|
14
|
+
|
15
|
+
try:
|
16
|
+
import yaml
|
17
|
+
except ImportError:
|
18
|
+
yaml = None
|
19
|
+
|
6
20
|
from anyio import Path as AsyncPath
|
7
21
|
|
8
22
|
from kreuzberg._extractors._base import Extractor
|
@@ -44,31 +58,23 @@ class StructuredDataExtractor(Extractor):
|
|
44
58
|
if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
|
45
59
|
data = json.loads(text_content)
|
46
60
|
elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
content=normalize_spaces(text_content),
|
55
|
-
mime_type=PLAIN_TEXT_MIME_TYPE,
|
56
|
-
metadata={"warning": "tomllib/tomli not available, returning raw text"},
|
57
|
-
chunks=[],
|
58
|
-
)
|
61
|
+
if tomllib is None:
|
62
|
+
return ExtractionResult(
|
63
|
+
content=normalize_spaces(text_content),
|
64
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
65
|
+
metadata={"warning": "tomllib/tomli not available, returning raw text"},
|
66
|
+
chunks=[],
|
67
|
+
)
|
59
68
|
data = tomllib.loads(text_content)
|
60
69
|
else:
|
61
|
-
|
62
|
-
import yaml
|
63
|
-
|
64
|
-
data = yaml.safe_load(text_content)
|
65
|
-
except ImportError:
|
70
|
+
if yaml is None:
|
66
71
|
return ExtractionResult(
|
67
72
|
content=normalize_spaces(text_content),
|
68
73
|
mime_type=PLAIN_TEXT_MIME_TYPE,
|
69
74
|
metadata={"warning": "PyYAML not available, returning raw text"},
|
70
75
|
chunks=[],
|
71
76
|
)
|
77
|
+
data = yaml.safe_load(text_content)
|
72
78
|
|
73
79
|
text_parts: list[str] = []
|
74
80
|
metadata: dict[str, Any] = {}
|
@@ -90,7 +96,7 @@ class StructuredDataExtractor(Extractor):
|
|
90
96
|
chunks=[],
|
91
97
|
)
|
92
98
|
|
93
|
-
except (ValueError, TypeError
|
99
|
+
except (json.JSONDecodeError, ValueError, TypeError) as e:
|
94
100
|
return ExtractionResult(
|
95
101
|
content=normalize_spaces(text_content),
|
96
102
|
mime_type=PLAIN_TEXT_MIME_TYPE,
|
kreuzberg/_gmft.py
CHANGED
@@ -5,12 +5,16 @@ import multiprocessing as mp
|
|
5
5
|
import os
|
6
6
|
import queue
|
7
7
|
import signal
|
8
|
+
import time
|
8
9
|
import traceback
|
9
10
|
from dataclasses import dataclass, field
|
10
11
|
from io import StringIO
|
12
|
+
from pathlib import Path
|
11
13
|
from typing import TYPE_CHECKING, Any, Literal
|
12
14
|
|
15
|
+
import anyio
|
13
16
|
import msgspec
|
17
|
+
from PIL import Image
|
14
18
|
|
15
19
|
from kreuzberg._types import TableData
|
16
20
|
from kreuzberg._utils._sync import run_sync
|
@@ -134,7 +138,7 @@ class GMFTConfig:
|
|
134
138
|
"""
|
135
139
|
|
136
140
|
|
137
|
-
async def extract_tables(
|
141
|
+
async def extract_tables(
|
138
142
|
file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
|
139
143
|
) -> list[TableData]:
|
140
144
|
"""Extracts tables from a PDF file.
|
@@ -154,9 +158,7 @@ async def extract_tables( # noqa: PLR0915
|
|
154
158
|
Returns:
|
155
159
|
A list of table data dictionaries.
|
156
160
|
"""
|
157
|
-
from
|
158
|
-
|
159
|
-
from kreuzberg._utils._cache import get_table_cache
|
161
|
+
from kreuzberg._utils._cache import get_table_cache # noqa: PLC0415
|
160
162
|
|
161
163
|
# Determine if we should use isolated process # ~keep
|
162
164
|
if use_isolated_process is None:
|
@@ -190,8 +192,6 @@ async def extract_tables( # noqa: PLR0915
|
|
190
192
|
return cached_result # type: ignore[no-any-return]
|
191
193
|
|
192
194
|
if table_cache.is_processing(**cache_kwargs):
|
193
|
-
import anyio
|
194
|
-
|
195
195
|
event = table_cache.mark_processing(**cache_kwargs)
|
196
196
|
await anyio.to_thread.run_sync(event.wait)
|
197
197
|
|
@@ -211,10 +211,13 @@ async def extract_tables( # noqa: PLR0915
|
|
211
211
|
return result
|
212
212
|
|
213
213
|
try:
|
214
|
-
from gmft.auto import
|
215
|
-
|
216
|
-
|
217
|
-
|
214
|
+
from gmft.auto import ( # type: ignore[attr-defined] # noqa: PLC0415 # noqa: PLC0415
|
215
|
+
AutoTableDetector,
|
216
|
+
AutoTableFormatter,
|
217
|
+
)
|
218
|
+
from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined] # noqa: PLC0415
|
219
|
+
from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415 # noqa: PLC0415
|
220
|
+
from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415 # noqa: PLC0415
|
218
221
|
|
219
222
|
formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
|
220
223
|
config=TATRFormatConfig(
|
@@ -284,9 +287,7 @@ def extract_tables_sync(
|
|
284
287
|
Returns:
|
285
288
|
A list of table data dictionaries.
|
286
289
|
"""
|
287
|
-
from
|
288
|
-
|
289
|
-
from kreuzberg._utils._cache import get_table_cache
|
290
|
+
from kreuzberg._utils._cache import get_table_cache # noqa: PLC0415
|
290
291
|
|
291
292
|
# Determine if we should use isolated process # ~keep
|
292
293
|
if use_isolated_process is None:
|
@@ -327,10 +328,10 @@ def extract_tables_sync(
|
|
327
328
|
return result
|
328
329
|
|
329
330
|
try:
|
330
|
-
from gmft.auto import AutoTableDetector, AutoTableFormatter # type: ignore[attr-defined]
|
331
|
-
from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined]
|
332
|
-
from gmft.formatters.tatr import TATRFormatConfig
|
333
|
-
from gmft.pdf_bindings.pdfium import PyPDFium2Document
|
331
|
+
from gmft.auto import AutoTableDetector, AutoTableFormatter # type: ignore[attr-defined] # noqa: PLC0415
|
332
|
+
from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined] # noqa: PLC0415
|
333
|
+
from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415
|
334
|
+
from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415
|
334
335
|
|
335
336
|
formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
|
336
337
|
config=TATRFormatConfig(
|
@@ -399,10 +400,10 @@ def _extract_tables_in_process(
|
|
399
400
|
signal.signal(signal.SIGINT, signal.SIG_IGN)
|
400
401
|
|
401
402
|
try:
|
402
|
-
from gmft.auto import AutoTableDetector, AutoTableFormatter # type: ignore[attr-defined]
|
403
|
-
from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined]
|
404
|
-
from gmft.formatters.tatr import TATRFormatConfig
|
405
|
-
from gmft.pdf_bindings.pdfium import PyPDFium2Document
|
403
|
+
from gmft.auto import AutoTableDetector, AutoTableFormatter # type: ignore[attr-defined] # noqa: PLC0415
|
404
|
+
from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined] # noqa: PLC0415
|
405
|
+
from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415
|
406
|
+
from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415
|
406
407
|
|
407
408
|
config = GMFTConfig(**config_dict)
|
408
409
|
|
@@ -495,7 +496,6 @@ def _extract_tables_isolated(
|
|
495
496
|
|
496
497
|
try:
|
497
498
|
# Wait for result with timeout, checking for process death # ~keep
|
498
|
-
import time
|
499
499
|
|
500
500
|
start_time = time.time()
|
501
501
|
while True:
|
@@ -529,10 +529,8 @@ def _extract_tables_isolated(
|
|
529
529
|
if success:
|
530
530
|
tables = []
|
531
531
|
for table_dict in result:
|
532
|
-
from PIL import Image
|
533
|
-
|
534
532
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
535
|
-
import pandas as pd
|
533
|
+
import pandas as pd # noqa: PLC0415
|
536
534
|
|
537
535
|
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
538
536
|
|
@@ -577,7 +575,7 @@ def _extract_tables_isolated(
|
|
577
575
|
async def _extract_tables_isolated_async(
|
578
576
|
file_path: str | PathLike[str],
|
579
577
|
config: GMFTConfig | None = None,
|
580
|
-
timeout: float = 300.0,
|
578
|
+
timeout: float = 300.0, # noqa: ASYNC109
|
581
579
|
) -> list[TableData]:
|
582
580
|
"""Async version of extract_tables_isolated using asyncio.
|
583
581
|
|
@@ -592,8 +590,6 @@ async def _extract_tables_isolated_async(
|
|
592
590
|
Raises:
|
593
591
|
RuntimeError: If extraction fails or times out
|
594
592
|
"""
|
595
|
-
import anyio
|
596
|
-
|
597
593
|
config = config or GMFTConfig()
|
598
594
|
config_dict = msgspec.to_builtins(config)
|
599
595
|
|
@@ -639,10 +635,8 @@ async def _extract_tables_isolated_async(
|
|
639
635
|
if success:
|
640
636
|
tables = []
|
641
637
|
for table_dict in result:
|
642
|
-
from PIL import Image
|
643
|
-
|
644
638
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
645
|
-
import pandas as pd
|
639
|
+
import pandas as pd # noqa: PLC0415
|
646
640
|
|
647
641
|
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
648
642
|
|
kreuzberg/_mime_types.py
CHANGED
@@ -191,7 +191,7 @@ def validate_mime_type(
|
|
191
191
|
return _validate_explicit_mime_type(mime_type)
|
192
192
|
|
193
193
|
if file_path:
|
194
|
-
from kreuzberg._utils._cache import get_mime_cache
|
194
|
+
from kreuzberg._utils._cache import get_mime_cache # noqa: PLC0415
|
195
195
|
|
196
196
|
path = Path(file_path)
|
197
197
|
|
kreuzberg/_ocr/_base.py
CHANGED
@@ -103,7 +103,7 @@ class OCRBackend(ABC, Generic[T]):
|
|
103
103
|
Returns:
|
104
104
|
List of extraction result objects in the same order as input paths
|
105
105
|
"""
|
106
|
-
from kreuzberg._utils._sync import run_taskgroup
|
106
|
+
from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
|
107
107
|
|
108
108
|
tasks = [self.process_file(path, **kwargs) for path in paths]
|
109
109
|
return await run_taskgroup(*tasks)
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -180,7 +180,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
180
180
|
Raises:
|
181
181
|
OCRError: If OCR processing fails.
|
182
182
|
"""
|
183
|
-
import numpy as np
|
183
|
+
import numpy as np # noqa: PLC0415
|
184
184
|
|
185
185
|
await self._init_easyocr(**kwargs)
|
186
186
|
|
@@ -318,7 +318,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
318
318
|
bool: True if GPU support is available.
|
319
319
|
"""
|
320
320
|
try:
|
321
|
-
import torch
|
321
|
+
import torch # noqa: PLC0415
|
322
322
|
|
323
323
|
return bool(torch.cuda.is_available())
|
324
324
|
except ImportError:
|
@@ -339,7 +339,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
339
339
|
return
|
340
340
|
|
341
341
|
try:
|
342
|
-
import easyocr
|
342
|
+
import easyocr # noqa: PLC0415
|
343
343
|
except ImportError as e:
|
344
344
|
raise MissingDependencyError.create_for_package(
|
345
345
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
@@ -507,7 +507,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
507
507
|
return
|
508
508
|
|
509
509
|
try:
|
510
|
-
import easyocr
|
510
|
+
import easyocr # noqa: PLC0415
|
511
511
|
except ImportError as e:
|
512
512
|
raise MissingDependencyError.create_for_package(
|
513
513
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -124,7 +124,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
124
124
|
Raises:
|
125
125
|
OCRError: If OCR processing fails.
|
126
126
|
"""
|
127
|
-
import numpy as np
|
127
|
+
import numpy as np # noqa: PLC0415
|
128
128
|
|
129
129
|
await self._init_paddle_ocr(**kwargs)
|
130
130
|
|
@@ -260,7 +260,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
260
260
|
return
|
261
261
|
|
262
262
|
try:
|
263
|
-
from paddleocr import PaddleOCR
|
263
|
+
from paddleocr import PaddleOCR # noqa: PLC0415
|
264
264
|
except ImportError as e:
|
265
265
|
raise MissingDependencyError.create_for_package(
|
266
266
|
dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
|
@@ -427,7 +427,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
427
427
|
return
|
428
428
|
|
429
429
|
try:
|
430
|
-
from paddleocr import PaddleOCR
|
430
|
+
from paddleocr import PaddleOCR # noqa: PLC0415
|
431
431
|
except ImportError as e:
|
432
432
|
raise MissingDependencyError.create_for_package(
|
433
433
|
dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
|
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -12,8 +12,10 @@ from enum import Enum
|
|
12
12
|
from pathlib import Path
|
13
13
|
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
14
14
|
|
15
|
+
import anyio
|
15
16
|
from anyio import Path as AsyncPath
|
16
17
|
from anyio import run_process
|
18
|
+
from PIL import Image
|
17
19
|
from typing_extensions import Self
|
18
20
|
|
19
21
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
@@ -25,7 +27,7 @@ from kreuzberg._utils._tmp import create_temp_file
|
|
25
27
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
26
28
|
|
27
29
|
if TYPE_CHECKING:
|
28
|
-
from PIL.Image import Image
|
30
|
+
from PIL.Image import Image as PILImage
|
29
31
|
|
30
32
|
try: # pragma: no cover
|
31
33
|
from typing import Unpack # type: ignore[attr-defined]
|
@@ -233,10 +235,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
233
235
|
|
234
236
|
async def process_image(
|
235
237
|
self,
|
236
|
-
image:
|
238
|
+
image: PILImage,
|
237
239
|
**kwargs: Unpack[TesseractConfig],
|
238
240
|
) -> ExtractionResult:
|
239
|
-
from kreuzberg._utils._cache import get_ocr_cache
|
241
|
+
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
240
242
|
|
241
243
|
image_buffer = io.BytesIO()
|
242
244
|
await run_sync(image.save, image_buffer, format="PNG")
|
@@ -254,8 +256,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
254
256
|
return cached_result
|
255
257
|
|
256
258
|
if ocr_cache.is_processing(**cache_kwargs):
|
257
|
-
import anyio
|
258
|
-
|
259
259
|
event = ocr_cache.mark_processing(**cache_kwargs)
|
260
260
|
await anyio.to_thread.run_sync(event.wait)
|
261
261
|
|
@@ -286,7 +286,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
286
286
|
path: Path,
|
287
287
|
**kwargs: Unpack[TesseractConfig],
|
288
288
|
) -> ExtractionResult:
|
289
|
-
from kreuzberg._utils._cache import get_ocr_cache
|
289
|
+
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
290
290
|
|
291
291
|
try:
|
292
292
|
stat = path.stat()
|
@@ -314,8 +314,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
314
314
|
return cached_result
|
315
315
|
|
316
316
|
if ocr_cache.is_processing(**cache_kwargs):
|
317
|
-
import anyio
|
318
|
-
|
319
317
|
event = ocr_cache.mark_processing(**cache_kwargs)
|
320
318
|
await anyio.to_thread.run_sync(event.wait)
|
321
319
|
|
@@ -411,7 +409,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
411
409
|
|
412
410
|
def process_image_sync(
|
413
411
|
self,
|
414
|
-
image:
|
412
|
+
image: PILImage,
|
415
413
|
**kwargs: Unpack[TesseractConfig],
|
416
414
|
) -> ExtractionResult:
|
417
415
|
"""Synchronously process an image and extract its text and metadata.
|
@@ -423,7 +421,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
423
421
|
Returns:
|
424
422
|
The extraction result object
|
425
423
|
"""
|
426
|
-
from kreuzberg._utils._cache import get_ocr_cache
|
424
|
+
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
427
425
|
|
428
426
|
image_buffer = io.BytesIO()
|
429
427
|
image.save(image_buffer, format="PNG")
|
@@ -482,7 +480,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
482
480
|
Returns:
|
483
481
|
The extraction result object
|
484
482
|
"""
|
485
|
-
from kreuzberg._utils._cache import get_ocr_cache
|
483
|
+
from kreuzberg._utils._cache import get_ocr_cache # noqa: PLC0415
|
486
484
|
|
487
485
|
file_info = self._get_file_info(path)
|
488
486
|
|
@@ -771,8 +769,6 @@ def _process_image_bytes_with_tesseract(
|
|
771
769
|
OCR result as dictionary.
|
772
770
|
"""
|
773
771
|
try:
|
774
|
-
from PIL import Image
|
775
|
-
|
776
772
|
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
|
777
773
|
with Image.open(io.BytesIO(image_bytes)) as image:
|
778
774
|
image.save(tmp_image.name, format="PNG")
|
@@ -810,7 +806,7 @@ class TesseractProcessPool:
|
|
810
806
|
max_processes: Maximum number of processes.
|
811
807
|
memory_limit_gb: Memory limit in GB.
|
812
808
|
"""
|
813
|
-
from kreuzberg._utils._process_pool import ProcessPoolManager
|
809
|
+
from kreuzberg._utils._process_pool import ProcessPoolManager # noqa: PLC0415
|
814
810
|
|
815
811
|
self.config = config or TesseractConfig()
|
816
812
|
self.process_manager = ProcessPoolManager(
|
kreuzberg/_types.py
CHANGED
@@ -8,7 +8,11 @@ from typing import TYPE_CHECKING, Any, Literal, TypedDict
|
|
8
8
|
import msgspec
|
9
9
|
|
10
10
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
11
|
-
from kreuzberg._utils._table import
|
11
|
+
from kreuzberg._utils._table import (
|
12
|
+
export_table_to_csv,
|
13
|
+
export_table_to_tsv,
|
14
|
+
extract_table_structure_info,
|
15
|
+
)
|
12
16
|
from kreuzberg.exceptions import ValidationError
|
13
17
|
|
14
18
|
if sys.version_info < (3, 11): # pragma: no cover
|
@@ -228,6 +232,12 @@ class ExtractionResult:
|
|
228
232
|
"""Extracted keywords and their scores, if keyword extraction is enabled."""
|
229
233
|
detected_languages: list[str] | None = None
|
230
234
|
"""Languages detected in the extracted content, if language detection is enabled."""
|
235
|
+
document_type: str | None = None
|
236
|
+
"""Detected document type, if document type detection is enabled."""
|
237
|
+
document_type_confidence: float | None = None
|
238
|
+
"""Confidence of the detected document type."""
|
239
|
+
layout: DataFrame | None = field(default=None, repr=False, hash=False)
|
240
|
+
"""Internal layout data from OCR, not for public use."""
|
231
241
|
|
232
242
|
def to_dict(self, include_none: bool = False) -> dict[str, Any]:
|
233
243
|
"""Converts the ExtractionResult to a dictionary.
|
@@ -339,6 +349,12 @@ class ExtractionConfig:
|
|
339
349
|
"""Configuration for language detection. If None, uses default settings."""
|
340
350
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
341
351
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
352
|
+
auto_detect_document_type: bool = False
|
353
|
+
"""Whether to automatically detect the document type."""
|
354
|
+
document_type_confidence_threshold: float = 0.7
|
355
|
+
"""Confidence threshold for document type detection."""
|
356
|
+
document_classification_mode: Literal["text", "vision"] = "text"
|
357
|
+
"""The mode to use for document classification."""
|
342
358
|
enable_quality_processing: bool = True
|
343
359
|
"""Whether to apply quality post-processing to improve extraction results."""
|
344
360
|
|
@@ -349,9 +365,9 @@ class ExtractionConfig:
|
|
349
365
|
object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
|
350
366
|
if self.validators is not None and isinstance(self.validators, list):
|
351
367
|
object.__setattr__(self, "validators", tuple(self.validators))
|
352
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
353
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
354
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
368
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
|
369
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
370
|
+
from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
|
355
371
|
|
356
372
|
if self.ocr_backend is None and self.ocr_config is not None:
|
357
373
|
raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
|
@@ -381,14 +397,14 @@ class ExtractionConfig:
|
|
381
397
|
|
382
398
|
# Lazy load and cache default configs instead of creating new instances
|
383
399
|
if self.ocr_backend == "tesseract":
|
384
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
400
|
+
from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
|
385
401
|
|
386
402
|
return asdict(TesseractConfig())
|
387
403
|
if self.ocr_backend == "easyocr":
|
388
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
404
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
|
389
405
|
|
390
406
|
return asdict(EasyOCRConfig())
|
391
407
|
# paddleocr
|
392
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
408
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
393
409
|
|
394
410
|
return asdict(PaddleOCRConfig())
|
kreuzberg/_utils/_cache.py
CHANGED
@@ -7,6 +7,7 @@ import os
|
|
7
7
|
import threading
|
8
8
|
import time
|
9
9
|
from contextlib import suppress
|
10
|
+
from io import StringIO
|
10
11
|
from pathlib import Path
|
11
12
|
from typing import Any, Generic, TypeVar
|
12
13
|
|
@@ -126,9 +127,7 @@ class KreuzbergCache(Generic[T]):
|
|
126
127
|
data = cached_data["data"]
|
127
128
|
|
128
129
|
if cached_data.get("type") == "TableDataList" and isinstance(data, list):
|
129
|
-
|
130
|
-
|
131
|
-
import pandas as pd
|
130
|
+
import pandas as pd # noqa: PLC0415
|
132
131
|
|
133
132
|
deserialized_data = []
|
134
133
|
for item in data:
|
kreuzberg/_utils/_device.py
CHANGED
@@ -141,7 +141,7 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | No
|
|
141
141
|
def _is_cuda_available() -> bool:
|
142
142
|
"""Check if CUDA is available."""
|
143
143
|
try:
|
144
|
-
import torch # type: ignore[import-not-found,unused-ignore]
|
144
|
+
import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
|
145
145
|
|
146
146
|
return bool(torch.cuda.is_available())
|
147
147
|
except ImportError:
|
@@ -151,7 +151,7 @@ def _is_cuda_available() -> bool:
|
|
151
151
|
def _is_mps_available() -> bool:
|
152
152
|
"""Check if MPS (Apple Silicon) is available."""
|
153
153
|
try:
|
154
|
-
import torch # type: ignore[import-not-found,unused-ignore]
|
154
|
+
import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
|
155
155
|
|
156
156
|
return bool(torch.backends.mps.is_available())
|
157
157
|
except ImportError:
|
@@ -163,7 +163,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
|
|
163
163
|
devices: list[DeviceInfo] = []
|
164
164
|
|
165
165
|
try:
|
166
|
-
import torch
|
166
|
+
import torch # noqa: PLC0415
|
167
167
|
|
168
168
|
if not torch.cuda.is_available():
|
169
169
|
return devices
|
@@ -199,7 +199,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
|
|
199
199
|
def _get_mps_device() -> DeviceInfo | None:
|
200
200
|
"""Get information about the MPS device."""
|
201
201
|
try:
|
202
|
-
import torch
|
202
|
+
import torch # noqa: PLC0415
|
203
203
|
|
204
204
|
if not torch.backends.mps.is_available():
|
205
205
|
return None
|
@@ -216,7 +216,7 @@ def _get_mps_device() -> DeviceInfo | None:
|
|
216
216
|
def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
|
217
217
|
"""Get CUDA memory information for a specific device."""
|
218
218
|
try:
|
219
|
-
import torch
|
219
|
+
import torch # noqa: PLC0415
|
220
220
|
|
221
221
|
if not torch.cuda.is_available():
|
222
222
|
return None, None
|
@@ -329,7 +329,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
|
|
329
329
|
"""
|
330
330
|
if device.device_type == "cuda":
|
331
331
|
try:
|
332
|
-
import torch
|
332
|
+
import torch # noqa: PLC0415
|
333
333
|
|
334
334
|
if torch.cuda.is_available():
|
335
335
|
torch.cuda.empty_cache()
|
@@ -338,7 +338,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
|
|
338
338
|
|
339
339
|
elif device.device_type == "mps":
|
340
340
|
try:
|
341
|
-
import torch
|
341
|
+
import torch # noqa: PLC0415
|
342
342
|
|
343
343
|
if torch.backends.mps.is_available():
|
344
344
|
torch.mps.empty_cache()
|
kreuzberg/cli.py
CHANGED
@@ -160,7 +160,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
|
|
160
160
|
progress.add_task("Extracting text...", total=None)
|
161
161
|
|
162
162
|
try:
|
163
|
-
import magic # type: ignore[import-not-found]
|
163
|
+
import magic # type: ignore[import-not-found] # noqa: PLC0415
|
164
164
|
|
165
165
|
mime_type = magic.from_buffer(input_bytes, mime=True)
|
166
166
|
except ImportError:
|
@@ -260,7 +260,7 @@ def cli(ctx: click.Context) -> None:
|
|
260
260
|
@click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
|
261
261
|
@click.pass_context
|
262
262
|
def extract( # noqa: PLR0913
|
263
|
-
|
263
|
+
_: click.Context,
|
264
264
|
file: Path | None,
|
265
265
|
output: Path | None,
|
266
266
|
force_ocr: bool,
|
kreuzberg/extraction.py
CHANGED
@@ -7,15 +7,15 @@ from typing import TYPE_CHECKING, Any, Final, cast
|
|
7
7
|
|
8
8
|
import anyio
|
9
9
|
|
10
|
-
from kreuzberg import ExtractionResult
|
11
10
|
from kreuzberg._chunker import get_chunker
|
11
|
+
from kreuzberg._document_classification import auto_detect_document_type
|
12
12
|
from kreuzberg._entity_extraction import extract_entities, extract_keywords
|
13
13
|
from kreuzberg._language_detection import detect_languages
|
14
14
|
from kreuzberg._mime_types import (
|
15
15
|
validate_mime_type,
|
16
16
|
)
|
17
17
|
from kreuzberg._registry import ExtractorRegistry
|
18
|
-
from kreuzberg._types import ExtractionConfig
|
18
|
+
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
19
19
|
from kreuzberg._utils._document_cache import get_document_cache
|
20
20
|
from kreuzberg._utils._errors import create_error_context
|
21
21
|
from kreuzberg._utils._string import safe_decode
|
@@ -30,7 +30,9 @@ if TYPE_CHECKING:
|
|
30
30
|
DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
|
31
31
|
|
32
32
|
|
33
|
-
def _validate_and_post_process_helper(
|
33
|
+
def _validate_and_post_process_helper(
|
34
|
+
result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
|
35
|
+
) -> ExtractionResult:
|
34
36
|
if config.chunk_content:
|
35
37
|
result.chunks = _handle_chunk_content(
|
36
38
|
mime_type=result.mime_type,
|
@@ -62,14 +64,19 @@ def _validate_and_post_process_helper(result: ExtractionResult, config: Extracti
|
|
62
64
|
config=config.language_detection_config,
|
63
65
|
)
|
64
66
|
|
67
|
+
if config.auto_detect_document_type:
|
68
|
+
result = auto_detect_document_type(result, config, file_path=file_path)
|
69
|
+
|
65
70
|
return result
|
66
71
|
|
67
72
|
|
68
|
-
async def _validate_and_post_process_async(
|
73
|
+
async def _validate_and_post_process_async(
|
74
|
+
result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
|
75
|
+
) -> ExtractionResult:
|
69
76
|
for validator in config.validators or []:
|
70
77
|
await run_maybe_sync(validator, result)
|
71
78
|
|
72
|
-
result = _validate_and_post_process_helper(result, config)
|
79
|
+
result = _validate_and_post_process_helper(result, config, file_path)
|
73
80
|
|
74
81
|
for post_processor in config.post_processing_hooks or []:
|
75
82
|
result = await run_maybe_sync(post_processor, result)
|
@@ -77,11 +84,13 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
|
|
77
84
|
return result
|
78
85
|
|
79
86
|
|
80
|
-
def _validate_and_post_process_sync(
|
87
|
+
def _validate_and_post_process_sync(
|
88
|
+
result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
|
89
|
+
) -> ExtractionResult:
|
81
90
|
for validator in config.validators or []:
|
82
91
|
run_sync_only(validator, result)
|
83
92
|
|
84
|
-
result = _validate_and_post_process_helper(result, config)
|
93
|
+
result = _validate_and_post_process_helper(result, config, file_path)
|
85
94
|
|
86
95
|
for post_processor in config.post_processing_hooks or []:
|
87
96
|
result = run_sync_only(post_processor, result)
|
@@ -172,7 +181,7 @@ async def extract_file(
|
|
172
181
|
metadata={},
|
173
182
|
)
|
174
183
|
|
175
|
-
result = await _validate_and_post_process_async(result=result, config=config)
|
184
|
+
result = await _validate_and_post_process_async(result=result, config=config, file_path=path)
|
176
185
|
|
177
186
|
cache.set(path, config, result)
|
178
187
|
|
@@ -357,7 +366,7 @@ def extract_file_sync(
|
|
357
366
|
metadata={},
|
358
367
|
)
|
359
368
|
|
360
|
-
result = _validate_and_post_process_sync(result=result, config=config)
|
369
|
+
result = _validate_and_post_process_sync(result=result, config=config, file_path=path)
|
361
370
|
|
362
371
|
cache.set(path, config, result)
|
363
372
|
|
@@ -1,13 +1,13 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.8.2
|
3
|
+
Version: 3.9.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
7
7
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
8
|
License: MIT
|
9
9
|
License-File: LICENSE
|
10
|
-
Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
10
|
+
Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
11
|
Classifier: Development Status :: 5 - Production/Stable
|
12
12
|
Classifier: Intended Audience :: Developers
|
13
13
|
Classifier: Intended Audience :: Information Technology
|
@@ -60,6 +60,9 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
|
|
60
60
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
61
61
|
Provides-Extra: api
|
62
62
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
63
|
+
Provides-Extra: auto-classify-document-type
|
64
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
|
65
|
+
Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
|
63
66
|
Provides-Extra: chunking
|
64
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
65
68
|
Provides-Extra: cli
|
@@ -88,7 +91,7 @@ Description-Content-Type: text/markdown
|
|
88
91
|
[](https://kreuzberg.dev/)
|
89
92
|
[](https://benchmarks.kreuzberg.dev/)
|
90
93
|
[](https://opensource.org/licenses/MIT)
|
91
|
-
[](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
|
92
95
|
|
93
96
|
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
94
97
|
|
@@ -103,6 +106,7 @@ Description-Content-Type: text/markdown
|
|
103
106
|
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
104
107
|
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
105
108
|
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
109
|
+
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
106
110
|
|
107
111
|
### Technical Architecture
|
108
112
|
|
@@ -1,18 +1,19 @@
|
|
1
1
|
kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
|
2
2
|
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
|
-
kreuzberg/_chunker.py,sha256=
|
4
|
-
kreuzberg/_config.py,sha256=
|
3
|
+
kreuzberg/_chunker.py,sha256=QmYbPHPE36ztMT70xPwg_Y4NIftCDl0wyufg5X9lmTo,1932
|
4
|
+
kreuzberg/_config.py,sha256=EvrBFAawjfKgXu49tACi4CuMmmoIRt_EzbHayZqM_jU,12983
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
-
kreuzberg/
|
7
|
-
kreuzberg/
|
6
|
+
kreuzberg/_document_classification.py,sha256=8XVTKh8ohsb4mbKw2gPFr5OB6v4dWuzXhFE_63vHLrw,5189
|
7
|
+
kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR0c,7862
|
8
|
+
kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
|
8
9
|
kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
|
9
|
-
kreuzberg/_mime_types.py,sha256=
|
10
|
+
kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
|
10
11
|
kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
|
11
12
|
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
12
|
-
kreuzberg/_types.py,sha256=
|
13
|
-
kreuzberg/cli.py,sha256=
|
13
|
+
kreuzberg/_types.py,sha256=Si-Kb58HgE4ckGyZnJFqbWRbCNbdyC_Y0-p75aQP838,15065
|
14
|
+
kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
|
14
15
|
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
15
|
-
kreuzberg/extraction.py,sha256=
|
16
|
+
kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
|
16
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
19
|
kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
|
@@ -20,22 +21,22 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
20
21
|
kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
|
21
22
|
kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
|
22
23
|
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
23
|
-
kreuzberg/_extractors/_image.py,sha256=
|
24
|
+
kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
|
24
25
|
kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
|
25
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
26
|
+
kreuzberg/_extractors/_pdf.py,sha256=UlliWggWHuVwwJE-bRa7H9-_cieSa8kdrQP3x_GOxxY,17018
|
26
27
|
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
27
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=
|
28
|
-
kreuzberg/_extractors/_structured.py,sha256=
|
28
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
|
29
|
+
kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
|
29
30
|
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
30
31
|
kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
|
31
32
|
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
32
|
-
kreuzberg/_ocr/_base.py,sha256=
|
33
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
34
|
-
kreuzberg/_ocr/_paddleocr.py,sha256=
|
35
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
33
|
+
kreuzberg/_ocr/_base.py,sha256=urvsLRgOmVYHjxil_IsSL69FmMnboklC4CHAjdBQLKQ,3893
|
34
|
+
kreuzberg/_ocr/_easyocr.py,sha256=pw2uDmULuMQ9T1Gl4axP_ev7-qwjLt1mJHHyZ34P_FI,17178
|
35
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=s75aQJILXm1ZbacyZiLPXh6jEAg9tk2NYnwPnfSDrRU,17543
|
36
|
+
kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
|
36
37
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
-
kreuzberg/_utils/_cache.py,sha256=
|
38
|
-
kreuzberg/_utils/_device.py,sha256=
|
38
|
+
kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
|
39
|
+
kreuzberg/_utils/_device.py,sha256=arVrJOSp_2LbbN6lu_rMEUOezzRogdWdkF8d5q5Bg8U,10345
|
39
40
|
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
40
41
|
kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
|
41
42
|
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
@@ -46,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
|
|
46
47
|
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
47
48
|
kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
|
48
49
|
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
49
|
-
kreuzberg-3.
|
50
|
-
kreuzberg-3.
|
51
|
-
kreuzberg-3.
|
52
|
-
kreuzberg-3.
|
53
|
-
kreuzberg-3.
|
50
|
+
kreuzberg-3.9.0.dist-info/METADATA,sha256=C83JYzqxhGHhrqWDUmo0eJwK_2szx9ZQt3cnkocgwBY,11876
|
51
|
+
kreuzberg-3.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
52
|
+
kreuzberg-3.9.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
53
|
+
kreuzberg-3.9.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
54
|
+
kreuzberg-3.9.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|