kreuzberg 3.10.1__py3-none-any.whl → 3.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_config.py +18 -14
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +1 -2
- kreuzberg/_extractors/_image.py +18 -17
- kreuzberg/_extractors/_pdf.py +30 -33
- kreuzberg/_mcp/server.py +1 -1
- kreuzberg/_types.py +11 -10
- {kreuzberg-3.10.1.dist-info → kreuzberg-3.11.0.dist-info}/METADATA +7 -5
- {kreuzberg-3.10.1.dist-info → kreuzberg-3.11.0.dist-info}/RECORD +12 -12
- {kreuzberg-3.10.1.dist-info → kreuzberg-3.11.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.10.1.dist-info → kreuzberg-3.11.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.10.1.dist-info → kreuzberg-3.11.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_config.py
CHANGED
@@ -97,19 +97,21 @@ def parse_ocr_backend_config(
|
|
97
97
|
if not isinstance(backend_config, dict):
|
98
98
|
return None
|
99
99
|
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
100
|
+
match backend:
|
101
|
+
case "tesseract":
|
102
|
+
# Convert psm integer to PSMMode enum if needed
|
103
|
+
processed_config = backend_config.copy()
|
104
|
+
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
105
|
+
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
106
|
+
|
107
|
+
processed_config["psm"] = PSMMode(processed_config["psm"])
|
108
|
+
return TesseractConfig(**processed_config)
|
109
|
+
case "easyocr":
|
110
|
+
return EasyOCRConfig(**backend_config)
|
111
|
+
case "paddleocr":
|
112
|
+
return PaddleOCRConfig(**backend_config)
|
113
|
+
case _:
|
114
|
+
return None
|
113
115
|
|
114
116
|
|
115
117
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
@@ -140,7 +142,9 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
140
142
|
"document_classification_mode",
|
141
143
|
"keyword_count",
|
142
144
|
}
|
143
|
-
extraction_config
|
145
|
+
extraction_config = extraction_config | {
|
146
|
+
field: config_dict[field] for field in basic_fields if field in config_dict
|
147
|
+
}
|
144
148
|
|
145
149
|
# Handle OCR backend configuration
|
146
150
|
ocr_backend = extraction_config.get("ocr_backend")
|
@@ -62,7 +62,7 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
62
62
|
from deep_translator import GoogleTranslator # noqa: PLC0415
|
63
63
|
except ImportError as e: # pragma: no cover
|
64
64
|
raise MissingDependencyError(
|
65
|
-
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[
|
65
|
+
"The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
|
66
66
|
) from e
|
67
67
|
|
68
68
|
try:
|
kreuzberg/_extractors/_base.py
CHANGED
@@ -116,8 +116,7 @@ class Extractor(ABC):
|
|
116
116
|
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
117
|
|
118
118
|
# Add quality metadata
|
119
|
-
enhanced_metadata = dict(result.metadata) if result.metadata else {}
|
120
|
-
enhanced_metadata["quality_score"] = quality_score
|
119
|
+
enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
|
121
120
|
|
122
121
|
# Return enhanced result
|
123
122
|
return ExtractionResult(
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
|
|
85
85
|
|
86
86
|
backend = get_ocr_backend(self.config.ocr_backend)
|
87
87
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
88
|
+
match self.config.ocr_backend:
|
89
|
+
case "tesseract":
|
90
|
+
config = (
|
91
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
92
|
+
)
|
93
|
+
result = backend.process_file_sync(path, **asdict(config))
|
94
|
+
case "paddleocr":
|
95
|
+
paddle_config = (
|
96
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
97
|
+
)
|
98
|
+
result = backend.process_file_sync(path, **asdict(paddle_config))
|
99
|
+
case "easyocr":
|
100
|
+
easy_config = (
|
101
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
102
|
+
)
|
103
|
+
result = backend.process_file_sync(path, **asdict(easy_config))
|
104
|
+
case _:
|
105
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
105
106
|
return self._apply_quality_processing(result)
|
106
107
|
|
107
108
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -88,14 +88,12 @@ class PDFExtractor(Extractor):
|
|
88
88
|
# Enhance metadata with table information
|
89
89
|
if result.tables:
|
90
90
|
table_summary = generate_table_summary(result.tables)
|
91
|
-
result.metadata.
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
}
|
98
|
-
)
|
91
|
+
result.metadata = result.metadata | {
|
92
|
+
"table_count": table_summary["table_count"],
|
93
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
94
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
95
|
+
f"{table_summary['total_rows']} total rows",
|
96
|
+
}
|
99
97
|
|
100
98
|
return self._apply_quality_processing(result)
|
101
99
|
|
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
|
|
153
151
|
# Enhance metadata with table information
|
154
152
|
if tables:
|
155
153
|
table_summary = generate_table_summary(tables)
|
156
|
-
result.metadata.
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
}
|
163
|
-
)
|
154
|
+
result.metadata = result.metadata | {
|
155
|
+
"table_count": table_summary["table_count"],
|
156
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
157
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
158
|
+
f"{table_summary['total_rows']} total rows",
|
159
|
+
}
|
164
160
|
|
165
161
|
# Apply quality processing
|
166
162
|
return self._apply_quality_processing(result)
|
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
|
|
386
382
|
backend = get_ocr_backend(self.config.ocr_backend)
|
387
383
|
paths = [Path(p) for p in image_paths]
|
388
384
|
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
385
|
+
match self.config.ocr_backend:
|
386
|
+
case "tesseract":
|
387
|
+
config = (
|
388
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
389
|
+
)
|
390
|
+
results = backend.process_batch_sync(paths, **asdict(config))
|
391
|
+
case "paddleocr":
|
392
|
+
paddle_config = (
|
393
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
394
|
+
)
|
395
|
+
results = backend.process_batch_sync(paths, **asdict(paddle_config))
|
396
|
+
case "easyocr":
|
397
|
+
easy_config = (
|
398
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
399
|
+
)
|
400
|
+
results = backend.process_batch_sync(paths, **asdict(easy_config))
|
401
|
+
case _:
|
402
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
406
403
|
|
407
404
|
# Use list comprehension and join for efficient string building
|
408
405
|
return "\n\n".join(result.content for result in results)
|
kreuzberg/_mcp/server.py
CHANGED
kreuzberg/_types.py
CHANGED
@@ -349,7 +349,7 @@ class ExtractionConfig:
|
|
349
349
|
"""Configuration for language detection. If None, uses default settings."""
|
350
350
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
351
351
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
352
|
-
auto_detect_document_type: bool =
|
352
|
+
auto_detect_document_type: bool = False
|
353
353
|
"""Whether to automatically detect the document type."""
|
354
354
|
document_type_confidence_threshold: float = 0.5
|
355
355
|
"""Confidence threshold for document type detection."""
|
@@ -398,15 +398,16 @@ class ExtractionConfig:
|
|
398
398
|
return asdict(self.ocr_config)
|
399
399
|
|
400
400
|
# Lazy load and cache default configs instead of creating new instances
|
401
|
-
|
402
|
-
|
401
|
+
match self.ocr_backend:
|
402
|
+
case "tesseract":
|
403
|
+
from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
|
403
404
|
|
404
|
-
|
405
|
-
|
406
|
-
|
405
|
+
return asdict(TesseractConfig())
|
406
|
+
case "easyocr":
|
407
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
|
407
408
|
|
408
|
-
|
409
|
-
|
410
|
-
|
409
|
+
return asdict(EasyOCRConfig())
|
410
|
+
case _: # paddleocr or any other backend
|
411
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
|
411
412
|
|
412
|
-
|
413
|
+
return asdict(PaddleOCRConfig())
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.11.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.12.
|
35
|
+
Requires-Dist: mcp>=1.12.3
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: playa-pdf>=0.6.4
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
|
45
45
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
46
|
Provides-Extra: all
|
47
47
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
48
49
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
50
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
51
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
55
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
56
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
57
|
+
Requires-Dist: pandas>=2.3.1; extra == 'all'
|
56
58
|
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
|
57
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
58
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
|
|
61
63
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
62
64
|
Provides-Extra: api
|
63
65
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
64
|
-
Provides-Extra: auto-classify-document-type
|
65
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
|
66
|
-
Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
|
67
66
|
Provides-Extra: chunking
|
68
67
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
69
68
|
Provides-Extra: cli
|
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
|
|
72
71
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
72
|
Provides-Extra: crypto
|
74
73
|
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
|
74
|
+
Provides-Extra: document-classification
|
75
|
+
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
|
+
Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
|
75
77
|
Provides-Extra: easyocr
|
76
78
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
77
79
|
Provides-Extra: entity-extraction
|
@@ -1,16 +1,16 @@
|
|
1
1
|
kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
|
2
2
|
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
3
|
kreuzberg/_chunker.py,sha256=y4-dX6ILjjBkkC1gkCzXb7v7vbi8844m7vz1gIzbmv4,1952
|
4
|
-
kreuzberg/_config.py,sha256=
|
4
|
+
kreuzberg/_config.py,sha256=Au521UiR7vcQs_8_hhoWIfmDDMJIrDM3XZUB_qHfCmo,14035
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
-
kreuzberg/_document_classification.py,sha256=
|
6
|
+
kreuzberg/_document_classification.py,sha256=qFGmwvUMhnNAvNNJO7E-huPx-Ps-_DWxdNxsozIzgaw,6870
|
7
7
|
kreuzberg/_entity_extraction.py,sha256=Oa1T-9mptimpOHtcda-GtrVYH9PFy7DSJj3thJZUD7k,7902
|
8
8
|
kreuzberg/_gmft.py,sha256=HdQ7Xpuixxl2Y0jY8C3KfyQEU0mN4yQdqErWCv4TnFY,25573
|
9
9
|
kreuzberg/_language_detection.py,sha256=_Ng2aHgPxOHFgd507gVNiIGVmnxxbpgYwsO0bD0yTzg,3315
|
10
10
|
kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
|
11
11
|
kreuzberg/_playa.py,sha256=_IPrUSWwSfDQlWXOpKlauV0D9MhGrujGP5kmQ0U3L0g,12188
|
12
12
|
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
13
|
-
kreuzberg/_types.py,sha256=
|
13
|
+
kreuzberg/_types.py,sha256=bMaU6VuoqwOpW6ufshA-DWpNw6t9EokjEDEfFsznvdo,15389
|
14
14
|
kreuzberg/cli.py,sha256=rJMdHg7FhUxefCrx-sf4c2qVGRXr8Xrpjgfx_DQSKMg,12558
|
15
15
|
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
16
16
|
kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
|
@@ -18,17 +18,17 @@ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
kreuzberg/_api/main.py,sha256=8VwxRlIXwnPs7ZYm0saUZsNOjevEAWJQpNreG-X7ZpE,3273
|
20
20
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
|
-
kreuzberg/_extractors/_base.py,sha256=
|
21
|
+
kreuzberg/_extractors/_base.py,sha256=H_nwynBX3fozncVjV13c329x5eCLl5r7nyVTLQyDAzI,4396
|
22
22
|
kreuzberg/_extractors/_email.py,sha256=Jpr4NFef640uVgNFkR1or-omy8RVt-NOHUYgWRDjyBo,6753
|
23
23
|
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
24
|
-
kreuzberg/_extractors/_image.py,sha256=
|
24
|
+
kreuzberg/_extractors/_image.py,sha256=Iz1JpvGqcYyh9g4zO_bMZG3E9S39KNHFu8PrXDRXeOk,4513
|
25
25
|
kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
|
26
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
26
|
+
kreuzberg/_extractors/_pdf.py,sha256=OflyvwEkuFLmw8E3si35MCGH31fvd5o50VdMmu5QRVs,19884
|
27
27
|
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
28
28
|
kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
|
29
29
|
kreuzberg/_extractors/_structured.py,sha256=PbNaXd-_PUPsE0yZkISod_vLBokbWdVTKEPpEmqaEMM,5787
|
30
30
|
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
31
|
-
kreuzberg/_mcp/server.py,sha256=
|
31
|
+
kreuzberg/_mcp/server.py,sha256=Dxed80MqZsYCFyYo0QdArpKE4H8DhpKY34fijdzV5uw,8731
|
32
32
|
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
33
33
|
kreuzberg/_ocr/_base.py,sha256=IkONqwG6zxZoVMni1JlYugBoyONahlRny7J2_7Dy69c,3953
|
34
34
|
kreuzberg/_ocr/_easyocr.py,sha256=dWfoj5fPIGqJPGTVeZ0W59TrW3DpNwF0bcfgt6FwQUw,17238
|
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
|
|
47
47
|
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
48
48
|
kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
|
49
49
|
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
50
|
-
kreuzberg-3.
|
51
|
-
kreuzberg-3.
|
52
|
-
kreuzberg-3.
|
53
|
-
kreuzberg-3.
|
54
|
-
kreuzberg-3.
|
50
|
+
kreuzberg-3.11.0.dist-info/METADATA,sha256=pvyRM3TAmXE3TnYaNOZ1chD_IQTgWn254wxnqDsy6EM,12135
|
51
|
+
kreuzberg-3.11.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
52
|
+
kreuzberg-3.11.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
53
|
+
kreuzberg-3.11.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
54
|
+
kreuzberg-3.11.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|