kreuzberg 3.10.0__py3-none-any.whl → 3.11.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- kreuzberg/_api/main.py +1 -1
- kreuzberg/_chunker.py +1 -1
- kreuzberg/_config.py +41 -16
- kreuzberg/_document_classification.py +41 -6
- kreuzberg/_entity_extraction.py +2 -2
- kreuzberg/_extractors/_base.py +1 -2
- kreuzberg/_extractors/_email.py +31 -8
- kreuzberg/_extractors/_image.py +18 -17
- kreuzberg/_extractors/_pdf.py +31 -34
- kreuzberg/_extractors/_structured.py +3 -3
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +2 -2
- kreuzberg/_ocr/_base.py +3 -3
- kreuzberg/_ocr/_easyocr.py +3 -3
- kreuzberg/_ocr/_paddleocr.py +2 -2
- kreuzberg/_playa.py +3 -1
- kreuzberg/_types.py +14 -13
- kreuzberg/_utils/_device.py +6 -6
- kreuzberg/_utils/_document_cache.py +1 -0
- kreuzberg/cli.py +6 -6
- {kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/METADATA +7 -5
- {kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/RECORD +26 -26
- {kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED
@@ -30,7 +30,7 @@ try:
         HTTP_422_UNPROCESSABLE_ENTITY,
         HTTP_500_INTERNAL_SERVER_ERROR,
     )
-except ImportError as e:
+except ImportError as e:  # pragma: no cover
     raise MissingDependencyError.create_for_package(
         dependency_group="litestar",
         functionality="Litestar API and docker container",
kreuzberg/_chunker.py
CHANGED
@@ -43,7 +43,7 @@ def get_chunker(
                from semantic_text_splitter import TextSplitter  # noqa: PLC0415

                _chunkers[key] = TextSplitter(max_characters, overlap_characters)
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
            raise MissingDependencyError.create_for_package(
                dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
            ) from e
kreuzberg/_config.py
CHANGED
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
 
 if sys.version_info >= (3, 11):
     import tomllib
-else:
+else:  # pragma: no cover
     import tomli as tomllib  # type: ignore[import-not-found]
 
 from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
     if config_path.name == "kreuzberg.toml":
         return data  # type: ignore[no-any-return]
-
+
+    # For other files, check if they have [tool.kreuzberg] section
+    if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
+        return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+
+    # Otherwise assume root-level configuration
+    return data  # type: ignore[no-any-return]
 
 
 def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -91,19 +97,21 @@ def parse_ocr_backend_config(
     if not isinstance(backend_config, dict):
         return None
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    match backend:
+        case "tesseract":
+            # Convert psm integer to PSMMode enum if needed
+            processed_config = backend_config.copy()
+            if "psm" in processed_config and isinstance(processed_config["psm"], int):
+                from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
+
+                processed_config["psm"] = PSMMode(processed_config["psm"])
+            return TesseractConfig(**processed_config)
+        case "easyocr":
+            return EasyOCRConfig(**backend_config)
+        case "paddleocr":
+            return PaddleOCRConfig(**backend_config)
+        case _:
+            return None
 
 
 def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -129,12 +137,25 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
         "extract_keywords",
         "auto_detect_language",
         "enable_quality_processing",
+        "auto_detect_document_type",
+        "document_type_confidence_threshold",
+        "document_classification_mode",
+        "keyword_count",
+    }
+    extraction_config = extraction_config | {
+        field: config_dict[field] for field in basic_fields if field in config_dict
     }
-    extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
 
     # Handle OCR backend configuration
     ocr_backend = extraction_config.get("ocr_backend")
     if ocr_backend and ocr_backend != "none":
+        # Validate OCR backend
+        valid_backends = {"tesseract", "easyocr", "paddleocr"}
+        if ocr_backend not in valid_backends:
+            raise ValidationError(
+                f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
+                context={"provided": ocr_backend, "valid": sorted(valid_backends)},
+            )
         ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
         if ocr_config:
             extraction_config["ocr_config"] = ocr_config
@@ -286,6 +307,10 @@ _CONFIG_FIELDS = [
     "extract_keywords",
     "auto_detect_language",
     "enable_quality_processing",
+    "auto_detect_document_type",
+    "document_type_confidence_threshold",
+    "document_classification_mode",
+    "keyword_count",
 ]
 
 
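Taken together, the _config.py changes mean kreuzberg.toml, pyproject.toml, and any other TOML file carrying a [tool.kreuzberg] table now resolve to the same dictionary, the four new classification/keyword fields are forwarded, and unknown OCR backends fail fast with a ValidationError. A minimal sketch of driving the builder with such a dictionary, using only names visible in this diff; the per-backend "tesseract" table passed through to parse_ocr_backend_config is an assumption about its expected shape:

from kreuzberg._config import build_extraction_config_from_dict

config_dict = {
    "ocr_backend": "tesseract",
    "auto_detect_document_type": True,
    "document_type_confidence_threshold": 0.6,
    "document_classification_mode": "text",
    "keyword_count": 5,
    # Assumed shape: parse_ocr_backend_config(config_dict, "tesseract") reads this table,
    # and the new match arm converts the plain int psm into a PSMMode member.
    "tesseract": {"psm": 6},
}

extraction_config = build_extraction_config_from_dict(config_dict)
print(extraction_config.ocr_backend, extraction_config.auto_detect_document_type)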
kreuzberg/_document_classification.py
CHANGED
@@ -4,13 +4,12 @@ import re
 from typing import TYPE_CHECKING
 
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from kreuzberg._types import ExtractionConfig, ExtractionResult
-
 
 DOCUMENT_CLASSIFIERS = {
     "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
     Raises:
         MissingDependencyError: If the deep-translator package is not installed
     """
+    # Combine content with metadata for classification
+    text_to_classify = result.content
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
+
     try:
         from deep_translator import GoogleTranslator  # noqa: PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError(
-            "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[
+            "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
         ) from e
 
-
+    try:
+        return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        return text_to_classify.lower()
 
 
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
+    if not config.auto_detect_document_type:
+        return None, None
+
     translated_text = _get_translated_text(result)
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
 
@@ -108,7 +121,8 @@ def classify_document_from_layout(
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
-
+    if not config.auto_detect_document_type:
+        return None, None
 
     if result.layout is None or result.layout.empty:
         return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
 
+    # Use layout text for classification, not the content
+    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+
+    # Translate layout text directly for classification
+    text_to_classify = layout_text
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
+
+    try:
+        from deep_translator import GoogleTranslator  # noqa: PLC0415
+
+        translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        translated_text = text_to_classify.lower()
+
     layout_df["translated_text"] = translated_text
 
     page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
+    elif result.layout is not None and not result.layout.empty:
+        # Use layout-based classification if layout data is available
+        result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
     return result
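With the new guards, classification is strictly opt-in: both classify_document and classify_document_from_layout return (None, None) unless the config enables it, and translation failures now degrade to lowercased source text instead of raising. A short usage sketch; extract_file_sync and its config keyword are assumed from kreuzberg's public API rather than shown in this diff:

from kreuzberg import ExtractionConfig, extract_file_sync  # top-level helper assumed

config = ExtractionConfig(
    auto_detect_document_type=True,          # without this, the new early returns yield (None, None)
    document_type_confidence_threshold=0.5,  # matches the new default in the _types.py diff below
    document_classification_mode="text",     # "vision" routes through the tesseract layout path
)

result = extract_file_sync("invoice.pdf", config=config)
print(result.document_type, result.document_type_confidence)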
kreuzberg/_entity_extraction.py
CHANGED
@@ -139,7 +139,7 @@ def extract_entities(
 
     try:
         import spacy  # noqa: F401, PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="spacy",
             dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
         return [(kw, float(score)) for kw, score in keywords]
     except (RuntimeError, OSError, ValueError):
         return []
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="keybert",
             dependency_group="entity-extraction",
kreuzberg/_extractors/_base.py
CHANGED
@@ -116,8 +116,7 @@ class Extractor(ABC):
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
 
         # Add quality metadata
-        enhanced_metadata = dict(result.metadata) if result.metadata else {}
-        enhanced_metadata["quality_score"] = quality_score
+        enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
         # Return enhanced result
         return ExtractionResult(
kreuzberg/_extractors/_email.py
CHANGED
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 # Import optional dependencies at module level with proper error handling
 try:
     import mailparse
-except ImportError:
+except ImportError:  # pragma: no cover
     mailparse = None
 
 try:
     import html2text  # type: ignore[import-not-found]
-except ImportError:
+except ImportError:  # pragma: no cover
     html2text = None
 
 # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
 
         to_info = parsed_email.get("to")
         if to_info:
+            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
+                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+                metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
-
+                metadata["email_to"] = to_info.get("email", "")
             else:
-
-
-
+                metadata["email_to"] = str(to_info)
+
+            # For display, format all recipients
+            to_formatted = self._format_email_field(to_info)
+            text_parts.append(f"To: {to_formatted}")
 
         date = parsed_email.get("date")
         if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
         cc = parsed_email.get("cc")
         if cc:
             metadata["email_cc"] = cc
-
+            cc_formatted = self._format_email_field(cc)
+            text_parts.append(f"CC: {cc_formatted}")
 
         bcc = parsed_email.get("bcc")
         if bcc:
             metadata["email_bcc"] = bcc
-
+            bcc_formatted = self._format_email_field(bcc)
+            text_parts.append(f"BCC: {bcc_formatted}")
+
+    def _format_email_field(self, field: Any) -> str:
+        """Format email field (to, cc, bcc) for display."""
+        if isinstance(field, list):
+            emails = []
+            for item in field:
+                if isinstance(item, dict):
+                    email = item.get("email", "")
+                    if email:
+                        emails.append(email)
+                else:
+                    emails.append(str(item))
+            return ", ".join(emails)
+        if isinstance(field, dict):
+            return str(field.get("email", ""))
+        return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
         """Extract and process email body content."""
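The new _format_email_field helper is what turns mailparse's mixed recipient shapes (list of dicts, single dict, plain string) into the To/CC/BCC lines appended above. A standalone copy of that logic for illustration, with a hypothetical input:

from typing import Any


def format_email_field(field: Any) -> str:
    """Free-function mirror of EmailExtractor._format_email_field, for illustration only."""
    if isinstance(field, list):
        emails = []
        for item in field:
            if isinstance(item, dict):
                email = item.get("email", "")
                if email:
                    emails.append(email)
            else:
                emails.append(str(item))
        return ", ".join(emails)
    if isinstance(field, dict):
        return str(field.get("email", ""))
    return str(field)


print(format_email_field([{"email": "a@example.com", "name": "A"}, "b@example.com"]))  # a@example.com, b@example.com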
kreuzberg/_extractors/_image.py
CHANGED
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
 
         backend = get_ocr_backend(self.config.ocr_backend)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                result = backend.process_file_sync(path, **asdict(config))
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                result = backend.process_file_sync(path, **asdict(paddle_config))
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                result = backend.process_file_sync(path, **asdict(easy_config))
+            case _:
+                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
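The match block dispatches on config.ocr_backend and only reuses self.config.ocr_config when its type matches the selected backend; otherwise it falls back to that backend's default config. A small pairing sketch using the internal import paths shown in this diff; PSMMode(6) assumes value 6 is a member of that enum:

from kreuzberg._ocr._tesseract import PSMMode, TesseractConfig
from kreuzberg._types import ExtractionConfig

# Matching backend and config type: the tesseract arm passes this config straight to
# process_file_sync. A PaddleOCRConfig here would be ignored in favor of defaults.
config = ExtractionConfig(
    ocr_backend="tesseract",
    ocr_config=TesseractConfig(psm=PSMMode(6)),
)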
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -82,20 +82,18 @@ class PDFExtractor(Extractor):
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
 
                 result.tables = await extract_tables(path, self.config.gmft_config)
-            except ImportError:
+            except ImportError:  # pragma: no cover
                 result.tables = []
 
         # Enhance metadata with table information
         if result.tables:
             table_summary = generate_table_summary(result.tables)
-            result.metadata.
-
-
-
-
-
-            }
-            )
+            result.metadata = result.metadata | {
+                "table_count": table_summary["table_count"],
+                "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                f"across {table_summary['pages_with_tables']} pages with "
+                f"{table_summary['total_rows']} total rows",
+            }
 
         return self._apply_quality_processing(result)
 
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
         # Enhance metadata with table information
         if tables:
             table_summary = generate_table_summary(tables)
-            result.metadata.
-
-
-
-
-
-            }
-            )
+            result.metadata = result.metadata | {
+                "table_count": table_summary["table_count"],
+                "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                f"across {table_summary['pages_with_tables']} pages with "
+                f"{table_summary['total_rows']} total rows",
+            }
 
         # Apply quality processing
         return self._apply_quality_processing(result)
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
         backend = get_ocr_backend(self.config.ocr_backend)
         paths = [Path(p) for p in image_paths]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                results = backend.process_batch_sync(paths, **asdict(config))
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                results = backend.process_batch_sync(paths, **asdict(paddle_config))
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                results = backend.process_batch_sync(paths, **asdict(easy_config))
+            case _:
+                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
 
         # Use list comprehension and join for efficient string building
         return "\n\n".join(result.content for result in results)
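When GMFT table extraction finds tables, the dict-union blocks above add table_count and tables_summary to the result metadata. A usage sketch; extract_file_sync and the extract_tables flag are assumptions about the public API, not shown in this diff:

from kreuzberg import ExtractionConfig, extract_file_sync  # assumed public helpers

result = extract_file_sync("report.pdf", config=ExtractionConfig(extract_tables=True))
# Keys written by the merged metadata above; absent when no tables were found.
print(result.metadata.get("table_count"))
print(result.metadata.get("tables_summary"))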
kreuzberg/_extractors/_structured.py
CHANGED
@@ -6,15 +6,15 @@ from typing import TYPE_CHECKING, Any, ClassVar
 
 if sys.version_info >= (3, 11):
     import tomllib
-else:
+else:  # pragma: no cover
     try:
         import tomli as tomllib  # type: ignore[import-not-found]
-    except ImportError:
+    except ImportError:  # pragma: no cover
         tomllib = None
 
 try:
     import yaml
-except ImportError:
+except ImportError:  # pragma: no cover
     yaml = None
 
 from anyio import Path as AsyncPath
kreuzberg/_gmft.py
CHANGED
@@ -265,7 +265,7 @@ async def extract_tables(
         finally:
             await run_sync(doc.close)
 
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
@@ -379,7 +379,7 @@ def extract_tables_sync(
         finally:
             doc.close()  # type: ignore[no-untyped-call]
 
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
kreuzberg/_language_detection.py
CHANGED
kreuzberg/_mcp/server.py
CHANGED
@@ -51,7 +51,7 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     }
 
     # Override with provided parameters
-    config_dict
+    config_dict = config_dict | kwargs
 
     return ExtractionConfig(**config_dict)
 
@@ -268,7 +268,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
     return [TextContent(type="text", text=content)]
 
 
-def main() -> None:
+def main() -> None:  # pragma: no cover
     """Main entry point for the MCP server."""
     mcp.run()
 
kreuzberg/_ocr/_base.py
CHANGED
@@ -88,7 +88,7 @@ class OCRBackend(ABC, Generic[T]):
         Returns:
             List of extraction result objects in the same order as input paths
         """
-        return [self.process_file_sync(path, **kwargs) for path in paths]
+        return [self.process_file_sync(path, **kwargs) for path in paths]  # pragma: no cover
 
     async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
         """Asynchronously process a batch of files and extract their text and metadata.
@@ -106,8 +106,8 @@ class OCRBackend(ABC, Generic[T]):
         from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415
 
         tasks = [self.process_file(path, **kwargs) for path in paths]
-        return await run_taskgroup(*tasks)
+        return await run_taskgroup(*tasks)  # pragma: no cover
 
     def __hash__(self) -> int:
         """Hash function for allowing caching."""
-        return hash(type(self).__name__)
+        return hash(type(self).__name__)  # pragma: no cover
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -321,7 +321,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
             import torch  # noqa: PLC0415
 
             return bool(torch.cuda.is_available())
-        except ImportError:
+        except ImportError:  # pragma: no cover
             return False
 
     @classmethod
@@ -340,7 +340,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
         try:
             import easyocr  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             ) from e
@@ -508,7 +508,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
         try:
             import easyocr  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             ) from e
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -261,7 +261,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
 
         try:
             from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
             ) from e
@@ -428,7 +428,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
 
         try:
             from paddleocr import PaddleOCR  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
             ) from e
kreuzberg/_playa.py
CHANGED
@@ -143,7 +143,9 @@ def _parse_date_string(date_str: str) -> str:
             minute = date_str[10:12]
             second = date_str[12:14]
             time_part = f"T{hour}:{minute}:{second}"
-
+        if time_part:
+            return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat()  # noqa: DTZ007
+        return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat()  # noqa: DTZ007
     return date_str
 
 
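The restored return statements mean a 14-digit PDF date yields a full ISO timestamp while a date-only value still parses cleanly. The same two strptime round-trips in isolation:

from datetime import datetime

# Date plus time, as assembled from the YYYYMMDDHHMMSS slices in _parse_date_string
print(datetime.strptime("2024-01-15T10:30:45", "%Y-%m-%dT%H:%M:%S").isoformat())  # 2024-01-15T10:30:45
# Date-only fallback when no time part was built
print(datetime.strptime("2024-01-15", "%Y-%m-%d").isoformat())                    # 2024-01-15T00:00:00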
kreuzberg/_types.py
CHANGED
@@ -269,7 +269,7 @@ class ExtractionResult:
         Returns:
             List of CSV strings, one per table
         """
-        if not self.tables:
+        if not self.tables:  # pragma: no cover
             return []
 
         return [export_table_to_csv(table) for table in self.tables]
@@ -280,7 +280,7 @@ class ExtractionResult:
         Returns:
             List of TSV strings, one per table
         """
-        if not self.tables:
+        if not self.tables:  # pragma: no cover
             return []
 
         return [export_table_to_tsv(table) for table in self.tables]
@@ -291,7 +291,7 @@ class ExtractionResult:
         Returns:
             List of table structure dictionaries
         """
-        if not self.tables:
+        if not self.tables:  # pragma: no cover
             return []
 
         return [extract_table_structure_info(table) for table in self.tables]
@@ -351,7 +351,7 @@ class ExtractionConfig:
     """Configuration for spaCy entity extraction. If None, uses default settings."""
     auto_detect_document_type: bool = False
     """Whether to automatically detect the document type."""
-    document_type_confidence_threshold: float = 0.
+    document_type_confidence_threshold: float = 0.5
     """Confidence threshold for document type detection."""
     document_classification_mode: Literal["text", "vision"] = "text"
     """The mode to use for document classification."""
@@ -398,15 +398,16 @@ class ExtractionConfig:
             return asdict(self.ocr_config)
 
         # Lazy load and cache default configs instead of creating new instances
-
-
+        match self.ocr_backend:
+            case "tesseract":
+                from kreuzberg._ocr._tesseract import TesseractConfig  # noqa: PLC0415
 
-
-
-
+                return asdict(TesseractConfig())
+            case "easyocr":
+                from kreuzberg._ocr._easyocr import EasyOCRConfig  # noqa: PLC0415
 
-
-
-
+                return asdict(EasyOCRConfig())
+            case _:  # paddleocr or any other backend
+                from kreuzberg._ocr._paddleocr import PaddleOCRConfig  # noqa: PLC0415
 
-
+                return asdict(PaddleOCRConfig())
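get_config_dict() now resolves backend defaults lazily through a match on ocr_backend; with an explicit ocr_config it still returns asdict(self.ocr_config). A short sketch using the internal import path already seen in this diff:

from kreuzberg._types import ExtractionConfig

cfg = ExtractionConfig(ocr_backend="easyocr")  # no explicit ocr_config
print(cfg.get_config_dict())                   # asdict(EasyOCRConfig()) via the easyocr arm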
kreuzberg/_utils/_device.py
CHANGED
@@ -144,7 +144,7 @@ def _is_cuda_available() -> bool:
         import torch  # type: ignore[import-not-found,unused-ignore]  # noqa: PLC0415
 
         return bool(torch.cuda.is_available())
-    except ImportError:
+    except ImportError:  # pragma: no cover
         return False
 
 
@@ -154,7 +154,7 @@ def _is_mps_available() -> bool:
         import torch  # type: ignore[import-not-found,unused-ignore]  # noqa: PLC0415
 
         return bool(torch.backends.mps.is_available())
-    except ImportError:
+    except ImportError:  # pragma: no cover
         return False
 
 
@@ -190,7 +190,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
                 )
             )
 
-    except ImportError:
+    except ImportError:  # pragma: no cover
         pass
 
     return devices
@@ -209,7 +209,7 @@ def _get_mps_device() -> DeviceInfo | None:
             name="Apple Silicon GPU (MPS)",
         )
 
-    except ImportError:
+    except ImportError:  # pragma: no cover
         return None
 
 
@@ -232,7 +232,7 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
 
         return total_memory, available_memory
 
-    except ImportError:
+    except ImportError:  # pragma: no cover
         return None, None
 
 
@@ -333,7 +333,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
 
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
-        except ImportError:
+        except ImportError:  # pragma: no cover  # pragma: no cover
             pass
 
     elif device.device_type == "mps":
kreuzberg/cli.py
CHANGED
@@ -12,7 +12,7 @@ try:
     import click
     from rich.console import Console
     from rich.progress import Progress, SpinnerColumn, TextColumn
-except ImportError as e:
+except ImportError as e:  # pragma: no cover
     raise ImportError(
         "CLI dependencies are not installed. Please install kreuzberg with the 'cli' extra: pip install kreuzberg[cli]"
     ) from e
@@ -163,7 +163,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
             import magic  # type: ignore[import-not-found]  # noqa: PLC0415
 
             mime_type = magic.from_buffer(input_bytes, mime=True)
-        except ImportError:
+        except ImportError:  # pragma: no cover
             content_str = input_bytes.decode("utf-8", errors="ignore").lower()
             mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
 
@@ -193,7 +193,7 @@ def _write_output(
         click.echo(formatted_output)
 
 
-def handle_error(error: Exception, verbose: bool) -> None:
+def handle_error(error: Exception, verbose: bool) -> None:  # pragma: no cover
     """Handle and display errors.
 
     Args:
@@ -202,19 +202,19 @@ def handle_error(error: Exception, verbose: bool) -> None:
     """
     if isinstance(error, MissingDependencyError):
         console.print(f"[red]Missing dependency:[/red] {error}", style="bold")
-        sys.exit(2)
+        sys.exit(2)  # pragma: no cover
     elif isinstance(error, KreuzbergError):
         console.print(f"[red]Error:[/red] {error}", style="bold")
         if verbose and error.context:
             console.print("\n[dim]Context:[/dim]")
             console.print(json.dumps(error.context, indent=2))
-        sys.exit(1)
+        sys.exit(1)  # pragma: no cover
     else:
         console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
         if verbose:
             console.print("\n[dim]Traceback:[/dim]")
             traceback.print_exc()
-        sys.exit(1)
+        sys.exit(1)  # pragma: no cover
 
 
 @click.group(invoke_without_command=True)
{kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.10.0
+Version: 3.11.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
 Requires-Dist: html-to-markdown[lxml]>=1.9.0
-Requires-Dist: mcp>=1.12.
+Requires-Dist: mcp>=1.12.3
 Requires-Dist: msgspec>=0.18.0
 Requires-Dist: playa-pdf>=0.6.4
 Requires-Dist: psutil>=7.0.0
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
 Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: deep-translator>=1.11.4; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
 Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
+Requires-Dist: pandas>=2.3.1; extra == 'all'
 Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
 Requires-Dist: rich>=14.1.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
-Provides-Extra: auto-classify-document-type
-Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
-Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: crypto
 Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
+Provides-Extra: document-classification
+Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
+Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: entity-extraction
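In short, the auto-classify-document-type extra is gone: deep-translator and pandas now ship under the renamed document-classification extra (matching the install hint in the _document_classification.py error message above, pip install 'kreuzberg[document-classification]') and are also pulled in by the all extra.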
{kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/RECORD
CHANGED
@@ -1,43 +1,43 @@
 kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
 kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
-kreuzberg/_chunker.py,sha256=
-kreuzberg/_config.py,sha256=
+kreuzberg/_chunker.py,sha256=y4-dX6ILjjBkkC1gkCzXb7v7vbi8844m7vz1gIzbmv4,1952
+kreuzberg/_config.py,sha256=Au521UiR7vcQs_8_hhoWIfmDDMJIrDM3XZUB_qHfCmo,14035
 kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
-kreuzberg/_document_classification.py,sha256=
-kreuzberg/_entity_extraction.py,sha256=
-kreuzberg/_gmft.py,sha256=
-kreuzberg/_language_detection.py,sha256=
+kreuzberg/_document_classification.py,sha256=qFGmwvUMhnNAvNNJO7E-huPx-Ps-_DWxdNxsozIzgaw,6870
+kreuzberg/_entity_extraction.py,sha256=Oa1T-9mptimpOHtcda-GtrVYH9PFy7DSJj3thJZUD7k,7902
+kreuzberg/_gmft.py,sha256=HdQ7Xpuixxl2Y0jY8C3KfyQEU0mN4yQdqErWCv4TnFY,25573
+kreuzberg/_language_detection.py,sha256=_Ng2aHgPxOHFgd507gVNiIGVmnxxbpgYwsO0bD0yTzg,3315
 kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
-kreuzberg/_playa.py,sha256=
+kreuzberg/_playa.py,sha256=_IPrUSWwSfDQlWXOpKlauV0D9MhGrujGP5kmQ0U3L0g,12188
 kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
-kreuzberg/_types.py,sha256=
-kreuzberg/cli.py,sha256=
+kreuzberg/_types.py,sha256=bMaU6VuoqwOpW6ufshA-DWpNw6t9EokjEDEfFsznvdo,15389
+kreuzberg/cli.py,sha256=rJMdHg7FhUxefCrx-sf4c2qVGRXr8Xrpjgfx_DQSKMg,12558
 kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
 kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_api/main.py,sha256=
+kreuzberg/_api/main.py,sha256=8VwxRlIXwnPs7ZYm0saUZsNOjevEAWJQpNreG-X7ZpE,3273
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_extractors/_base.py,sha256=
-kreuzberg/_extractors/_email.py,sha256=
+kreuzberg/_extractors/_base.py,sha256=H_nwynBX3fozncVjV13c329x5eCLl5r7nyVTLQyDAzI,4396
+kreuzberg/_extractors/_email.py,sha256=Jpr4NFef640uVgNFkR1or-omy8RVt-NOHUYgWRDjyBo,6753
 kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
-kreuzberg/_extractors/_image.py,sha256=
+kreuzberg/_extractors/_image.py,sha256=Iz1JpvGqcYyh9g4zO_bMZG3E9S39KNHFu8PrXDRXeOk,4513
 kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
-kreuzberg/_extractors/_pdf.py,sha256=
+kreuzberg/_extractors/_pdf.py,sha256=OflyvwEkuFLmw8E3si35MCGH31fvd5o50VdMmu5QRVs,19884
 kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
 kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
-kreuzberg/_extractors/_structured.py,sha256=
+kreuzberg/_extractors/_structured.py,sha256=PbNaXd-_PUPsE0yZkISod_vLBokbWdVTKEPpEmqaEMM,5787
 kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
-kreuzberg/_mcp/server.py,sha256=
+kreuzberg/_mcp/server.py,sha256=Dxed80MqZsYCFyYo0QdArpKE4H8DhpKY34fijdzV5uw,8731
 kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
-kreuzberg/_ocr/_base.py,sha256=
-kreuzberg/_ocr/_easyocr.py,sha256=
-kreuzberg/_ocr/_paddleocr.py,sha256=
+kreuzberg/_ocr/_base.py,sha256=IkONqwG6zxZoVMni1JlYugBoyONahlRny7J2_7Dy69c,3953
+kreuzberg/_ocr/_easyocr.py,sha256=dWfoj5fPIGqJPGTVeZ0W59TrW3DpNwF0bcfgt6FwQUw,17238
+kreuzberg/_ocr/_paddleocr.py,sha256=Is_iJQaSUeCMfCvg5RnuG_pmBRjBt0b3dCBPY1IAc3A,17583
 kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
 kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
-kreuzberg/_utils/_device.py,sha256=
-kreuzberg/_utils/_document_cache.py,sha256=
+kreuzberg/_utils/_device.py,sha256=JI9p9TGSfQHEi2SL-ovOXMr9RUnVq-RrEly89OvmQ5w,10485
+kreuzberg/_utils/_document_cache.py,sha256=ka90JIT-FXUMOv8z2u3fztQgZZb2XQDHTMnBi32mySA,7005
 kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
 kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
 kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
 kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
 kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
 kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
+kreuzberg-3.11.0.dist-info/METADATA,sha256=pvyRM3TAmXE3TnYaNOZ1chD_IQTgWn254wxnqDsy6EM,12135
+kreuzberg-3.11.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.11.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.11.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.11.0.dist-info/RECORD,,
{kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/WHEEL
File without changes

{kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/entry_points.txt
File without changes

{kreuzberg-3.10.0.dist-info → kreuzberg-3.11.0.dist-info}/licenses/LICENSE
File without changes