kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -4,9 +4,14 @@ from ._registry import ExtractorRegistry
|
|
4
4
|
from ._types import (
|
5
5
|
EasyOCRConfig,
|
6
6
|
Entity,
|
7
|
+
ExtractedImage,
|
7
8
|
ExtractionConfig,
|
8
9
|
ExtractionResult,
|
9
10
|
GMFTConfig,
|
11
|
+
HTMLToMarkdownConfig,
|
12
|
+
ImageOCRConfig,
|
13
|
+
ImageOCRResult,
|
14
|
+
JSONExtractionConfig,
|
10
15
|
LanguageDetectionConfig,
|
11
16
|
Metadata,
|
12
17
|
PaddleOCRConfig,
|
@@ -32,10 +37,15 @@ __version__ = version("kreuzberg")
|
|
32
37
|
__all__ = [
|
33
38
|
"EasyOCRConfig",
|
34
39
|
"Entity",
|
40
|
+
"ExtractedImage",
|
35
41
|
"ExtractionConfig",
|
36
42
|
"ExtractionResult",
|
37
43
|
"ExtractorRegistry",
|
38
44
|
"GMFTConfig",
|
45
|
+
"HTMLToMarkdownConfig",
|
46
|
+
"ImageOCRConfig",
|
47
|
+
"ImageOCRResult",
|
48
|
+
"JSONExtractionConfig",
|
39
49
|
"KreuzbergError",
|
40
50
|
"LanguageDetectionConfig",
|
41
51
|
"Metadata",
|
@@ -0,0 +1,247 @@
|
|
1
|
+
"""API Configuration Caching Module.
|
2
|
+
|
3
|
+
This module provides LRU cached functions for API config operations to improve performance
|
4
|
+
by avoiding repeated file system operations and object creation.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from __future__ import annotations
|
8
|
+
|
9
|
+
import hashlib
|
10
|
+
import json
|
11
|
+
from functools import lru_cache
|
12
|
+
from pathlib import Path
|
13
|
+
from typing import Any
|
14
|
+
|
15
|
+
from kreuzberg._config import discover_config
|
16
|
+
from kreuzberg._types import (
|
17
|
+
EasyOCRConfig,
|
18
|
+
ExtractionConfig,
|
19
|
+
GMFTConfig,
|
20
|
+
HTMLToMarkdownConfig,
|
21
|
+
LanguageDetectionConfig,
|
22
|
+
PaddleOCRConfig,
|
23
|
+
SpacyEntityExtractionConfig,
|
24
|
+
TesseractConfig,
|
25
|
+
)
|
26
|
+
|
27
|
+
|
28
|
+
@lru_cache(maxsize=16)
|
29
|
+
def _cached_discover_config(
|
30
|
+
search_path: str,
|
31
|
+
config_file_mtime: float, # noqa: ARG001
|
32
|
+
config_file_size: int, # noqa: ARG001
|
33
|
+
) -> ExtractionConfig | None:
|
34
|
+
"""Cache config discovery with file modification time validation."""
|
35
|
+
return discover_config(Path(search_path))
|
36
|
+
|
37
|
+
|
38
|
+
def discover_config_cached(search_path: Path | str | None = None) -> ExtractionConfig | None:
|
39
|
+
"""Cached version of discover_config with automatic invalidation.
|
40
|
+
|
41
|
+
This function caches the result of discover_config() and automatically invalidates
|
42
|
+
the cache when config files are modified.
|
43
|
+
|
44
|
+
Args:
|
45
|
+
search_path: Path to start searching for config files from
|
46
|
+
|
47
|
+
Returns:
|
48
|
+
ExtractionConfig if found, None otherwise
|
49
|
+
"""
|
50
|
+
search_path = Path.cwd() if search_path is None else Path(search_path)
|
51
|
+
|
52
|
+
config_files = ["kreuzberg.toml", "pyproject.toml"]
|
53
|
+
for config_file_name in config_files:
|
54
|
+
config_path = search_path / config_file_name
|
55
|
+
if config_path.exists():
|
56
|
+
try:
|
57
|
+
stat = config_path.stat()
|
58
|
+
return _cached_discover_config(
|
59
|
+
str(search_path),
|
60
|
+
stat.st_mtime,
|
61
|
+
stat.st_size,
|
62
|
+
)
|
63
|
+
except OSError:
|
64
|
+
return discover_config(search_path)
|
65
|
+
|
66
|
+
return _cached_discover_config(str(search_path), 0.0, 0)
|
67
|
+
|
68
|
+
|
69
|
+
@lru_cache(maxsize=128)
|
70
|
+
def _cached_create_ocr_config(
|
71
|
+
config_type: str,
|
72
|
+
config_json: str,
|
73
|
+
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
|
74
|
+
"""Cache OCR config object creation."""
|
75
|
+
config_dict = json.loads(config_json)
|
76
|
+
|
77
|
+
if config_type == "tesseract":
|
78
|
+
return TesseractConfig(**config_dict)
|
79
|
+
if config_type == "easyocr":
|
80
|
+
return EasyOCRConfig(**config_dict)
|
81
|
+
if config_type == "paddleocr":
|
82
|
+
return PaddleOCRConfig(**config_dict)
|
83
|
+
msg = f"Unknown OCR config type: {config_type}"
|
84
|
+
raise ValueError(msg)
|
85
|
+
|
86
|
+
|
87
|
+
@lru_cache(maxsize=64)
|
88
|
+
def _cached_create_gmft_config(config_json: str) -> GMFTConfig:
|
89
|
+
"""Cache GMFT config creation."""
|
90
|
+
return GMFTConfig(**json.loads(config_json))
|
91
|
+
|
92
|
+
|
93
|
+
@lru_cache(maxsize=64)
|
94
|
+
def _cached_create_language_detection_config(config_json: str) -> LanguageDetectionConfig:
|
95
|
+
"""Cache language detection config creation."""
|
96
|
+
return LanguageDetectionConfig(**json.loads(config_json))
|
97
|
+
|
98
|
+
|
99
|
+
@lru_cache(maxsize=64)
|
100
|
+
def _cached_create_spacy_config(config_json: str) -> SpacyEntityExtractionConfig:
|
101
|
+
"""Cache spaCy entity extraction config creation."""
|
102
|
+
return SpacyEntityExtractionConfig(**json.loads(config_json))
|
103
|
+
|
104
|
+
|
105
|
+
@lru_cache(maxsize=64)
|
106
|
+
def _cached_create_html_markdown_config(config_json: str) -> HTMLToMarkdownConfig:
|
107
|
+
"""Cache HTML to Markdown config creation."""
|
108
|
+
return HTMLToMarkdownConfig(**json.loads(config_json))
|
109
|
+
|
110
|
+
|
111
|
+
@lru_cache(maxsize=256)
|
112
|
+
def _cached_parse_header_config(header_value: str) -> dict[str, Any]:
|
113
|
+
"""Cache parsed header configurations."""
|
114
|
+
parsed_config: dict[str, Any] = json.loads(header_value)
|
115
|
+
return parsed_config
|
116
|
+
|
117
|
+
|
118
|
+
def create_ocr_config_cached(
|
119
|
+
ocr_backend: str | None, config_dict: dict[str, Any]
|
120
|
+
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
|
121
|
+
"""Cached version of OCR config creation.
|
122
|
+
|
123
|
+
Args:
|
124
|
+
ocr_backend: The OCR backend type
|
125
|
+
config_dict: Configuration dictionary
|
126
|
+
|
127
|
+
Returns:
|
128
|
+
Configured OCR config object
|
129
|
+
"""
|
130
|
+
if not ocr_backend:
|
131
|
+
return TesseractConfig()
|
132
|
+
|
133
|
+
config_json = json.dumps(config_dict, sort_keys=True)
|
134
|
+
return _cached_create_ocr_config(ocr_backend, config_json)
|
135
|
+
|
136
|
+
|
137
|
+
def create_gmft_config_cached(config_dict: dict[str, Any]) -> GMFTConfig:
|
138
|
+
"""Cached version of GMFT config creation."""
|
139
|
+
config_json = json.dumps(config_dict, sort_keys=True)
|
140
|
+
return _cached_create_gmft_config(config_json)
|
141
|
+
|
142
|
+
|
143
|
+
def create_language_detection_config_cached(config_dict: dict[str, Any]) -> LanguageDetectionConfig:
|
144
|
+
"""Cached version of language detection config creation."""
|
145
|
+
config_json = json.dumps(config_dict, sort_keys=True)
|
146
|
+
return _cached_create_language_detection_config(config_json)
|
147
|
+
|
148
|
+
|
149
|
+
def create_spacy_config_cached(config_dict: dict[str, Any]) -> SpacyEntityExtractionConfig:
|
150
|
+
"""Cached version of spaCy config creation."""
|
151
|
+
config_json = json.dumps(config_dict, sort_keys=True)
|
152
|
+
return _cached_create_spacy_config(config_json)
|
153
|
+
|
154
|
+
|
155
|
+
def create_html_markdown_config_cached(config_dict: dict[str, Any]) -> HTMLToMarkdownConfig:
|
156
|
+
"""Cached version of HTML to Markdown config creation."""
|
157
|
+
config_json = json.dumps(config_dict, sort_keys=True)
|
158
|
+
return _cached_create_html_markdown_config(config_json)
|
159
|
+
|
160
|
+
|
161
|
+
def parse_header_config_cached(header_value: str) -> dict[str, Any]:
|
162
|
+
"""Cached version of header config parsing.
|
163
|
+
|
164
|
+
Args:
|
165
|
+
header_value: JSON string from X-Extraction-Config header
|
166
|
+
|
167
|
+
Returns:
|
168
|
+
Parsed configuration dictionary
|
169
|
+
"""
|
170
|
+
return _cached_parse_header_config(header_value)
|
171
|
+
|
172
|
+
|
173
|
+
@lru_cache(maxsize=512)
|
174
|
+
def _cached_merge_configs(
|
175
|
+
static_config_hash: str,
|
176
|
+
query_params_hash: str,
|
177
|
+
header_config_hash: str,
|
178
|
+
) -> ExtractionConfig:
|
179
|
+
"""Cache the complete config merging process.
|
180
|
+
|
181
|
+
This is the ultimate optimization - cache the entire result of merge_configs()
|
182
|
+
based on content hashes of all inputs.
|
183
|
+
"""
|
184
|
+
msg = "Not implemented yet - use individual component caching"
|
185
|
+
raise NotImplementedError(msg)
|
186
|
+
|
187
|
+
|
188
|
+
def _hash_dict(data: dict[str, Any] | None) -> str:
|
189
|
+
"""Create a hash string from a dictionary for cache keys."""
|
190
|
+
if data is None:
|
191
|
+
return "none"
|
192
|
+
|
193
|
+
json_str = json.dumps(data, sort_keys=True, default=str)
|
194
|
+
return hashlib.sha256(json_str.encode()).hexdigest()[:16]
|
195
|
+
|
196
|
+
|
197
|
+
def get_cache_stats() -> dict[str, Any]:
|
198
|
+
"""Get cache statistics for monitoring performance."""
|
199
|
+
return {
|
200
|
+
"discover_config": {
|
201
|
+
"hits": _cached_discover_config.cache_info().hits,
|
202
|
+
"misses": _cached_discover_config.cache_info().misses,
|
203
|
+
"size": _cached_discover_config.cache_info().currsize,
|
204
|
+
"max_size": _cached_discover_config.cache_info().maxsize,
|
205
|
+
},
|
206
|
+
"ocr_config": {
|
207
|
+
"hits": _cached_create_ocr_config.cache_info().hits,
|
208
|
+
"misses": _cached_create_ocr_config.cache_info().misses,
|
209
|
+
"size": _cached_create_ocr_config.cache_info().currsize,
|
210
|
+
"max_size": _cached_create_ocr_config.cache_info().maxsize,
|
211
|
+
},
|
212
|
+
"header_parsing": {
|
213
|
+
"hits": _cached_parse_header_config.cache_info().hits,
|
214
|
+
"misses": _cached_parse_header_config.cache_info().misses,
|
215
|
+
"size": _cached_parse_header_config.cache_info().currsize,
|
216
|
+
"max_size": _cached_parse_header_config.cache_info().maxsize,
|
217
|
+
},
|
218
|
+
"gmft_config": {
|
219
|
+
"hits": _cached_create_gmft_config.cache_info().hits,
|
220
|
+
"misses": _cached_create_gmft_config.cache_info().misses,
|
221
|
+
"size": _cached_create_gmft_config.cache_info().currsize,
|
222
|
+
"max_size": _cached_create_gmft_config.cache_info().maxsize,
|
223
|
+
},
|
224
|
+
"language_detection_config": {
|
225
|
+
"hits": _cached_create_language_detection_config.cache_info().hits,
|
226
|
+
"misses": _cached_create_language_detection_config.cache_info().misses,
|
227
|
+
"size": _cached_create_language_detection_config.cache_info().currsize,
|
228
|
+
"max_size": _cached_create_language_detection_config.cache_info().maxsize,
|
229
|
+
},
|
230
|
+
"spacy_config": {
|
231
|
+
"hits": _cached_create_spacy_config.cache_info().hits,
|
232
|
+
"misses": _cached_create_spacy_config.cache_info().misses,
|
233
|
+
"size": _cached_create_spacy_config.cache_info().currsize,
|
234
|
+
"max_size": _cached_create_spacy_config.cache_info().maxsize,
|
235
|
+
},
|
236
|
+
}
|
237
|
+
|
238
|
+
|
239
|
+
def clear_all_caches() -> None:
|
240
|
+
"""Clear all API configuration caches."""
|
241
|
+
_cached_discover_config.cache_clear()
|
242
|
+
_cached_create_ocr_config.cache_clear()
|
243
|
+
_cached_create_gmft_config.cache_clear()
|
244
|
+
_cached_create_language_detection_config.cache_clear()
|
245
|
+
_cached_create_spacy_config.cache_clear()
|
246
|
+
_cached_create_html_markdown_config.cache_clear()
|
247
|
+
_cached_parse_header_config.cache_clear()
|
kreuzberg/_api/main.py
CHANGED
@@ -3,8 +3,7 @@ from __future__ import annotations
|
|
3
3
|
import base64
|
4
4
|
import io
|
5
5
|
import traceback
|
6
|
-
from
|
7
|
-
from json import dumps, loads
|
6
|
+
from json import dumps
|
8
7
|
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
9
8
|
|
10
9
|
import msgspec
|
@@ -16,19 +15,24 @@ from kreuzberg import (
|
|
16
15
|
EasyOCRConfig,
|
17
16
|
ExtractionConfig,
|
18
17
|
ExtractionResult,
|
19
|
-
GMFTConfig,
|
20
18
|
KreuzbergError,
|
21
|
-
LanguageDetectionConfig,
|
22
19
|
MissingDependencyError,
|
23
20
|
PaddleOCRConfig,
|
24
21
|
ParsingError,
|
25
|
-
SpacyEntityExtractionConfig,
|
26
22
|
TesseractConfig,
|
27
23
|
ValidationError,
|
28
24
|
batch_extract_bytes,
|
29
25
|
)
|
26
|
+
from kreuzberg._api._config_cache import (
|
27
|
+
create_gmft_config_cached,
|
28
|
+
create_html_markdown_config_cached,
|
29
|
+
create_language_detection_config_cached,
|
30
|
+
create_ocr_config_cached,
|
31
|
+
create_spacy_config_cached,
|
32
|
+
discover_config_cached,
|
33
|
+
parse_header_config_cached,
|
34
|
+
)
|
30
35
|
from kreuzberg._config import discover_config
|
31
|
-
from kreuzberg._types import HTMLToMarkdownConfig
|
32
36
|
|
33
37
|
if TYPE_CHECKING:
|
34
38
|
from litestar.datastructures import UploadFile
|
@@ -146,68 +150,65 @@ def _create_ocr_config(
|
|
146
150
|
return config_dict
|
147
151
|
|
148
152
|
|
149
|
-
|
150
|
-
|
153
|
+
def _create_dimension_tuple(width: int | None, height: int | None) -> tuple[int, int] | None:
|
154
|
+
"""Create a dimension tuple from width and height values.
|
155
|
+
|
156
|
+
Args:
|
157
|
+
width: Width value or None
|
158
|
+
height: Height value or None
|
159
|
+
|
160
|
+
Returns:
|
161
|
+
Tuple of (width, height) if both values are not None, otherwise None
|
162
|
+
"""
|
163
|
+
if width is not None and height is not None:
|
164
|
+
return (width, height)
|
165
|
+
return None
|
166
|
+
|
167
|
+
|
168
|
+
def merge_configs(
|
151
169
|
static_config: ExtractionConfig | None,
|
152
|
-
query_params:
|
153
|
-
header_config:
|
170
|
+
query_params: dict[str, Any],
|
171
|
+
header_config: dict[str, Any] | None,
|
154
172
|
) -> ExtractionConfig:
|
155
173
|
base_config = static_config or ExtractionConfig()
|
156
174
|
config_dict = base_config.to_dict()
|
157
175
|
|
158
|
-
|
159
|
-
for key, value in query_dict.items():
|
176
|
+
for key, value in query_params.items():
|
160
177
|
if value is not None and key in config_dict:
|
161
178
|
config_dict[key] = _convert_value_type(config_dict[key], value)
|
162
179
|
|
163
180
|
if header_config:
|
164
|
-
|
165
|
-
for key, value in header_dict.items():
|
181
|
+
for key, value in header_config.items():
|
166
182
|
if key in config_dict:
|
167
183
|
config_dict[key] = value
|
168
184
|
|
169
185
|
if "ocr_config" in config_dict and isinstance(config_dict["ocr_config"], dict):
|
170
186
|
ocr_backend = config_dict.get("ocr_backend")
|
171
|
-
config_dict["ocr_config"] =
|
187
|
+
config_dict["ocr_config"] = create_ocr_config_cached(ocr_backend, config_dict["ocr_config"])
|
172
188
|
|
173
189
|
if "gmft_config" in config_dict and isinstance(config_dict["gmft_config"], dict):
|
174
|
-
config_dict["gmft_config"] =
|
190
|
+
config_dict["gmft_config"] = create_gmft_config_cached(config_dict["gmft_config"])
|
175
191
|
|
176
192
|
if "language_detection_config" in config_dict and isinstance(config_dict["language_detection_config"], dict):
|
177
|
-
config_dict["language_detection_config"] =
|
193
|
+
config_dict["language_detection_config"] = create_language_detection_config_cached(
|
194
|
+
config_dict["language_detection_config"]
|
195
|
+
)
|
178
196
|
|
179
197
|
if "spacy_entity_extraction_config" in config_dict and isinstance(
|
180
198
|
config_dict["spacy_entity_extraction_config"], dict
|
181
199
|
):
|
182
|
-
config_dict["spacy_entity_extraction_config"] =
|
183
|
-
|
200
|
+
config_dict["spacy_entity_extraction_config"] = create_spacy_config_cached(
|
201
|
+
config_dict["spacy_entity_extraction_config"]
|
184
202
|
)
|
185
203
|
|
186
204
|
if "html_to_markdown_config" in config_dict and isinstance(config_dict["html_to_markdown_config"], dict):
|
187
|
-
config_dict["html_to_markdown_config"] =
|
205
|
+
config_dict["html_to_markdown_config"] = create_html_markdown_config_cached(
|
206
|
+
config_dict["html_to_markdown_config"]
|
207
|
+
)
|
188
208
|
|
189
209
|
return ExtractionConfig(**config_dict)
|
190
210
|
|
191
211
|
|
192
|
-
def _make_hashable(obj: Any) -> Any:
|
193
|
-
if isinstance(obj, dict):
|
194
|
-
return tuple(sorted((k, _make_hashable(v)) for k, v in obj.items()))
|
195
|
-
if isinstance(obj, list):
|
196
|
-
return tuple(_make_hashable(item) for item in obj)
|
197
|
-
return obj
|
198
|
-
|
199
|
-
|
200
|
-
def merge_configs(
|
201
|
-
static_config: ExtractionConfig | None,
|
202
|
-
query_params: dict[str, Any],
|
203
|
-
header_config: dict[str, Any] | None,
|
204
|
-
) -> ExtractionConfig:
|
205
|
-
query_tuple = tuple(sorted(query_params.items())) if query_params else ()
|
206
|
-
header_tuple = _make_hashable(header_config) if header_config else None
|
207
|
-
|
208
|
-
return _merge_configs_cached(static_config, query_tuple, header_tuple)
|
209
|
-
|
210
|
-
|
211
212
|
@post("/extract", operation_id="ExtractFiles")
|
212
213
|
async def handle_files_upload( # noqa: PLR0913
|
213
214
|
request: Request[Any, Any, Any],
|
@@ -223,6 +224,13 @@ async def handle_files_upload( # noqa: PLR0913
|
|
223
224
|
ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None = None,
|
224
225
|
auto_detect_language: str | bool | None = None,
|
225
226
|
pdf_password: str | None = None,
|
227
|
+
extract_images: str | bool | None = None,
|
228
|
+
ocr_extracted_images: str | bool | None = None,
|
229
|
+
image_ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None = None,
|
230
|
+
image_ocr_min_width: int | None = None,
|
231
|
+
image_ocr_min_height: int | None = None,
|
232
|
+
image_ocr_max_width: int | None = None,
|
233
|
+
image_ocr_max_height: int | None = None,
|
226
234
|
) -> list[ExtractionResult]:
|
227
235
|
"""Extract text, metadata, and structured data from uploaded documents.
|
228
236
|
|
@@ -250,11 +258,30 @@ async def handle_files_upload( # noqa: PLR0913
|
|
250
258
|
ocr_backend: OCR engine to use (tesseract, easyocr, paddleocr)
|
251
259
|
auto_detect_language: Enable automatic language detection
|
252
260
|
pdf_password: Password for encrypted PDF files
|
261
|
+
extract_images: Enable image extraction for supported formats
|
262
|
+
ocr_extracted_images: Run OCR over extracted images
|
263
|
+
image_ocr_backend: Optional backend override for image OCR
|
264
|
+
image_ocr_min_width: Minimum image width for OCR eligibility
|
265
|
+
image_ocr_min_height: Minimum image height for OCR eligibility
|
266
|
+
image_ocr_max_width: Maximum image width for OCR eligibility
|
267
|
+
image_ocr_max_height: Maximum image height for OCR eligibility
|
253
268
|
|
254
269
|
Returns:
|
255
270
|
List of extraction results, one per uploaded file
|
271
|
+
|
272
|
+
Additional query parameters:
|
273
|
+
extract_images: Enable image extraction for supported formats
|
274
|
+
ocr_extracted_images: Run OCR over extracted images
|
275
|
+
image_ocr_backend: Optional backend override for image OCR
|
276
|
+
image_ocr_min_width: Minimum image width for OCR eligibility
|
277
|
+
image_ocr_min_height: Minimum image height for OCR eligibility
|
278
|
+
image_ocr_max_width: Maximum image width for OCR eligibility
|
279
|
+
image_ocr_max_height: Maximum image height for OCR eligibility
|
256
280
|
"""
|
257
|
-
static_config =
|
281
|
+
static_config = discover_config_cached()
|
282
|
+
|
283
|
+
min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
|
284
|
+
max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
|
258
285
|
|
259
286
|
query_params = {
|
260
287
|
"chunk_content": chunk_content,
|
@@ -268,12 +295,17 @@ async def handle_files_upload( # noqa: PLR0913
|
|
268
295
|
"ocr_backend": ocr_backend,
|
269
296
|
"auto_detect_language": auto_detect_language,
|
270
297
|
"pdf_password": pdf_password,
|
298
|
+
"extract_images": extract_images,
|
299
|
+
"ocr_extracted_images": ocr_extracted_images,
|
300
|
+
"image_ocr_backend": image_ocr_backend,
|
301
|
+
"image_ocr_min_dimensions": min_dims,
|
302
|
+
"image_ocr_max_dimensions": max_dims,
|
271
303
|
}
|
272
304
|
|
273
305
|
header_config = None
|
274
306
|
if config_header := request.headers.get("X-Extraction-Config"):
|
275
307
|
try:
|
276
|
-
header_config =
|
308
|
+
header_config = parse_header_config_cached(config_header)
|
277
309
|
except Exception as e:
|
278
310
|
raise ValidationError(f"Invalid JSON in X-Extraction-Config header: {e}", context={"error": str(e)}) from e
|
279
311
|
|
@@ -316,12 +348,10 @@ async def get_configuration() -> ConfigurationResponse:
|
|
316
348
|
|
317
349
|
|
318
350
|
def _polars_dataframe_encoder(obj: Any) -> Any:
|
319
|
-
"""Convert polars DataFrame to dict for JSON serialization."""
|
320
351
|
return obj.to_dicts()
|
321
352
|
|
322
353
|
|
323
354
|
def _pil_image_encoder(obj: Any) -> str:
|
324
|
-
"""Convert PIL Image to base64 string for JSON serialization."""
|
325
355
|
buffer = io.BytesIO()
|
326
356
|
obj.save(buffer, format="PNG")
|
327
357
|
img_str = base64.b64encode(buffer.getvalue()).decode()
|
@@ -344,7 +374,6 @@ openapi_config = OpenAPIConfig(
|
|
344
374
|
create_examples=True,
|
345
375
|
)
|
346
376
|
|
347
|
-
# Type encoders for custom serialization
|
348
377
|
type_encoders = {
|
349
378
|
pl.DataFrame: _polars_dataframe_encoder,
|
350
379
|
Image.Image: _pil_image_encoder,
|
@@ -360,5 +389,5 @@ app = Litestar(
|
|
360
389
|
Exception: general_exception_handler,
|
361
390
|
},
|
362
391
|
type_encoders=type_encoders,
|
363
|
-
request_max_body_size=1024 * 1024 * 1024,
|
392
|
+
request_max_body_size=1024 * 1024 * 1024,
|
364
393
|
)
|
kreuzberg/_chunker.py
CHANGED
@@ -20,14 +20,15 @@ def get_chunker(
|
|
20
20
|
key = (max_characters, overlap_characters, mime_type)
|
21
21
|
if key not in _chunkers:
|
22
22
|
try:
|
23
|
-
|
24
|
-
|
23
|
+
match mime_type:
|
24
|
+
case x if x == MARKDOWN_MIME_TYPE:
|
25
|
+
from semantic_text_splitter import MarkdownSplitter # noqa: PLC0415
|
25
26
|
|
26
|
-
|
27
|
-
|
28
|
-
|
27
|
+
_chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
|
28
|
+
case _:
|
29
|
+
from semantic_text_splitter import TextSplitter # noqa: PLC0415
|
29
30
|
|
30
|
-
|
31
|
+
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
31
32
|
except ImportError as e: # pragma: no cover
|
32
33
|
raise MissingDependencyError.create_for_package(
|
33
34
|
dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
|
kreuzberg/_config.py
CHANGED
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
|
|
69
69
|
try:
|
70
70
|
match ocr_backend:
|
71
71
|
case "tesseract":
|
72
|
-
|
72
|
+
# Handle PSM mode conversion from int to enum
|
73
|
+
processed_args = backend_args.copy()
|
74
|
+
if "psm" in processed_args and isinstance(processed_args["psm"], int):
|
75
|
+
try:
|
76
|
+
processed_args["psm"] = PSMMode(processed_args["psm"])
|
77
|
+
except ValueError as e:
|
78
|
+
raise ValidationError(
|
79
|
+
f"Invalid PSM mode value: {processed_args['psm']}",
|
80
|
+
context={"psm_value": processed_args["psm"], "error": str(e)},
|
81
|
+
) from e
|
82
|
+
return TesseractConfig(**processed_args)
|
73
83
|
case "easyocr":
|
74
84
|
return EasyOCRConfig(**backend_args)
|
75
85
|
case "paddleocr":
|
kreuzberg/_constants.py
CHANGED
@@ -65,12 +65,10 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
|
|
65
65
|
return None, None
|
66
66
|
|
67
67
|
translated_text = _get_translated_text(result)
|
68
|
-
scores =
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
if re.search(pattern, translated_text):
|
73
|
-
scores[doc_type] += 1
|
68
|
+
scores = {
|
69
|
+
doc_type: sum(1 for pattern in patterns if re.search(pattern, translated_text))
|
70
|
+
for doc_type, patterns in DOCUMENT_CLASSIFIERS.items()
|
71
|
+
}
|
74
72
|
|
75
73
|
total_score = sum(scores.values())
|
76
74
|
if total_score == 0:
|
@@ -134,7 +132,7 @@ def classify_document_from_layout(
|
|
134
132
|
if not found_words.is_empty():
|
135
133
|
scores[doc_type] += 1.0
|
136
134
|
word_top = found_words[0, "top"]
|
137
|
-
if word_top < page_height * 0.3:
|
135
|
+
if word_top is not None and word_top < page_height * 0.3:
|
138
136
|
scores[doc_type] += 0.5
|
139
137
|
|
140
138
|
total_score = sum(scores.values())
|
kreuzberg/_entity_extraction.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
3
3
|
import os
|
4
4
|
import re
|
5
5
|
from functools import lru_cache
|
6
|
+
from itertools import chain
|
6
7
|
from typing import TYPE_CHECKING, Any
|
7
8
|
|
8
9
|
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
@@ -21,11 +22,15 @@ def extract_entities(
|
|
21
22
|
) -> list[Entity]:
|
22
23
|
entities: list[Entity] = []
|
23
24
|
if custom_patterns:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
25
|
+
entities.extend(
|
26
|
+
chain.from_iterable(
|
27
|
+
(
|
28
|
+
Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
|
29
|
+
for match in re.finditer(pattern, text)
|
30
|
+
)
|
31
|
+
for ent_type, pattern in custom_patterns
|
28
32
|
)
|
33
|
+
)
|
29
34
|
|
30
35
|
if spacy_config is None:
|
31
36
|
spacy_config = SpacyEntityExtractionConfig()
|