kreuzberg 3.11.3__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +55 -77
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.3.dist-info/RECORD +0 -54
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
from importlib.metadata import version
|
2
2
|
|
3
|
-
from kreuzberg._config import discover_and_load_config, load_config_from_path, try_discover_config
|
4
|
-
from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
|
5
|
-
from kreuzberg._gmft import GMFTConfig
|
6
|
-
from kreuzberg._language_detection import LanguageDetectionConfig
|
7
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
8
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
9
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
10
|
-
|
11
|
-
from ._ocr._tesseract import PSMMode
|
12
3
|
from ._registry import ExtractorRegistry
|
13
|
-
from ._types import
|
4
|
+
from ._types import (
|
5
|
+
EasyOCRConfig,
|
6
|
+
Entity,
|
7
|
+
ExtractionConfig,
|
8
|
+
ExtractionResult,
|
9
|
+
GMFTConfig,
|
10
|
+
LanguageDetectionConfig,
|
11
|
+
Metadata,
|
12
|
+
PaddleOCRConfig,
|
13
|
+
PSMMode,
|
14
|
+
SpacyEntityExtractionConfig,
|
15
|
+
TableData,
|
16
|
+
TesseractConfig,
|
17
|
+
)
|
14
18
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
15
19
|
from .extraction import (
|
16
20
|
batch_extract_bytes,
|
@@ -49,11 +53,8 @@ __all__ = [
|
|
49
53
|
"batch_extract_bytes_sync",
|
50
54
|
"batch_extract_file",
|
51
55
|
"batch_extract_file_sync",
|
52
|
-
"discover_and_load_config",
|
53
56
|
"extract_bytes",
|
54
57
|
"extract_bytes_sync",
|
55
58
|
"extract_file",
|
56
59
|
"extract_file_sync",
|
57
|
-
"load_config_from_path",
|
58
|
-
"try_discover_config",
|
59
60
|
]
|
kreuzberg/__main__.py
CHANGED
kreuzberg/_api/main.py
CHANGED
@@ -1,20 +1,24 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from
|
4
|
-
from
|
3
|
+
from functools import lru_cache
|
4
|
+
from json import dumps, loads
|
5
|
+
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
5
6
|
|
6
7
|
import msgspec
|
7
8
|
|
8
9
|
from kreuzberg import (
|
10
|
+
EasyOCRConfig,
|
9
11
|
ExtractionConfig,
|
10
12
|
ExtractionResult,
|
11
13
|
KreuzbergError,
|
12
14
|
MissingDependencyError,
|
15
|
+
PaddleOCRConfig,
|
13
16
|
ParsingError,
|
17
|
+
TesseractConfig,
|
14
18
|
ValidationError,
|
15
19
|
batch_extract_bytes,
|
16
20
|
)
|
17
|
-
from kreuzberg._config import
|
21
|
+
from kreuzberg._config import discover_config
|
18
22
|
|
19
23
|
if TYPE_CHECKING:
|
20
24
|
from litestar.datastructures import UploadFile
|
@@ -65,17 +69,123 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
|
|
65
69
|
)
|
66
70
|
|
67
71
|
|
72
|
+
def _convert_value_type(current_value: Any, new_value: Any) -> Any:
|
73
|
+
if isinstance(current_value, bool):
|
74
|
+
if isinstance(new_value, str):
|
75
|
+
return str(new_value).lower() in ("true", "1", "yes", "on")
|
76
|
+
return bool(new_value)
|
77
|
+
if isinstance(current_value, int) and not isinstance(new_value, bool):
|
78
|
+
return int(new_value) if new_value is not None else current_value
|
79
|
+
if isinstance(current_value, float):
|
80
|
+
return float(new_value) if new_value is not None else current_value
|
81
|
+
return new_value
|
82
|
+
|
83
|
+
|
84
|
+
def _create_ocr_config(
|
85
|
+
ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None, config_dict: dict[str, Any]
|
86
|
+
) -> Any:
|
87
|
+
if ocr_backend == "tesseract":
|
88
|
+
return TesseractConfig(**config_dict)
|
89
|
+
if ocr_backend == "easyocr":
|
90
|
+
return EasyOCRConfig(**config_dict)
|
91
|
+
if ocr_backend == "paddleocr":
|
92
|
+
return PaddleOCRConfig(**config_dict)
|
93
|
+
return config_dict
|
94
|
+
|
95
|
+
|
96
|
+
@lru_cache(maxsize=128)
|
97
|
+
def _merge_configs_cached(
|
98
|
+
static_config: ExtractionConfig | None,
|
99
|
+
query_params: tuple[tuple[str, Any], ...],
|
100
|
+
header_config: tuple[tuple[str, Any], ...] | None,
|
101
|
+
) -> ExtractionConfig:
|
102
|
+
"""Cached implementation of merge_configs with hashable parameters."""
|
103
|
+
base_config = static_config or ExtractionConfig()
|
104
|
+
config_dict = base_config.to_dict()
|
105
|
+
|
106
|
+
query_dict = dict(query_params) if query_params else {}
|
107
|
+
for key, value in query_dict.items():
|
108
|
+
if value is not None and key in config_dict:
|
109
|
+
config_dict[key] = _convert_value_type(config_dict[key], value)
|
110
|
+
|
111
|
+
if header_config:
|
112
|
+
header_dict = dict(header_config)
|
113
|
+
for key, value in header_dict.items():
|
114
|
+
if key in config_dict:
|
115
|
+
config_dict[key] = value
|
116
|
+
|
117
|
+
if "ocr_config" in config_dict and isinstance(config_dict["ocr_config"], dict):
|
118
|
+
ocr_backend = config_dict.get("ocr_backend")
|
119
|
+
config_dict["ocr_config"] = _create_ocr_config(ocr_backend, config_dict["ocr_config"])
|
120
|
+
|
121
|
+
return ExtractionConfig(**config_dict)
|
122
|
+
|
123
|
+
|
124
|
+
def merge_configs(
|
125
|
+
static_config: ExtractionConfig | None,
|
126
|
+
query_params: dict[str, Any],
|
127
|
+
header_config: dict[str, Any] | None,
|
128
|
+
) -> ExtractionConfig:
|
129
|
+
"""Merge configurations with precedence: header > query > static > default."""
|
130
|
+
query_tuple = tuple(sorted(query_params.items())) if query_params else ()
|
131
|
+
header_tuple = tuple(sorted(header_config.items())) if header_config else None
|
132
|
+
|
133
|
+
return _merge_configs_cached(static_config, query_tuple, header_tuple)
|
134
|
+
|
135
|
+
|
68
136
|
@post("/extract", operation_id="ExtractFiles")
|
69
|
-
async def handle_files_upload(
|
137
|
+
async def handle_files_upload( # noqa: PLR0913
|
138
|
+
request: Request[Any, Any, Any],
|
70
139
|
data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
|
140
|
+
chunk_content: str | bool | None = None,
|
141
|
+
max_chars: int | None = None,
|
142
|
+
max_overlap: int | None = None,
|
143
|
+
extract_tables: str | bool | None = None,
|
144
|
+
extract_entities: str | bool | None = None,
|
145
|
+
extract_keywords: str | bool | None = None,
|
146
|
+
keyword_count: int | None = None,
|
147
|
+
force_ocr: str | bool | None = None,
|
148
|
+
ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None = None,
|
149
|
+
auto_detect_language: str | bool | None = None,
|
150
|
+
pdf_password: str | None = None,
|
71
151
|
) -> list[ExtractionResult]:
|
72
|
-
"""Extracts text content from
|
73
|
-
|
74
|
-
|
152
|
+
"""Extracts text content from uploaded files with optional runtime configuration.
|
153
|
+
|
154
|
+
Configuration can be provided via:
|
155
|
+
1. Query parameters for common settings
|
156
|
+
2. X-Extraction-Config header for complex nested configurations (JSON format)
|
157
|
+
3. Static configuration file (kreuzberg.toml or pyproject.toml)
|
158
|
+
|
159
|
+
Precedence: Header config > Query params > Static config > Defaults
|
160
|
+
"""
|
161
|
+
static_config = discover_config()
|
162
|
+
|
163
|
+
query_params = {
|
164
|
+
"chunk_content": chunk_content,
|
165
|
+
"max_chars": max_chars,
|
166
|
+
"max_overlap": max_overlap,
|
167
|
+
"extract_tables": extract_tables,
|
168
|
+
"extract_entities": extract_entities,
|
169
|
+
"extract_keywords": extract_keywords,
|
170
|
+
"keyword_count": keyword_count,
|
171
|
+
"force_ocr": force_ocr,
|
172
|
+
"ocr_backend": ocr_backend,
|
173
|
+
"auto_detect_language": auto_detect_language,
|
174
|
+
"pdf_password": pdf_password,
|
175
|
+
}
|
176
|
+
|
177
|
+
header_config = None
|
178
|
+
if config_header := request.headers.get("X-Extraction-Config"):
|
179
|
+
try:
|
180
|
+
header_config = loads(config_header)
|
181
|
+
except Exception as e:
|
182
|
+
raise ValidationError(f"Invalid JSON in X-Extraction-Config header: {e}", context={"error": str(e)}) from e
|
183
|
+
|
184
|
+
final_config = merge_configs(static_config, query_params, header_config)
|
75
185
|
|
76
186
|
return await batch_extract_bytes(
|
77
187
|
[(await file.read(), file.content_type) for file in data],
|
78
|
-
config=
|
188
|
+
config=final_config,
|
79
189
|
)
|
80
190
|
|
81
191
|
|
@@ -88,7 +198,7 @@ async def health_check() -> dict[str, str]:
|
|
88
198
|
@get("/config", operation_id="GetConfiguration")
|
89
199
|
async def get_configuration() -> dict[str, Any]:
|
90
200
|
"""Get the current configuration."""
|
91
|
-
config =
|
201
|
+
config = discover_config()
|
92
202
|
if config is None:
|
93
203
|
return {"message": "No configuration file found", "config": None}
|
94
204
|
|