kreuzberg 3.13.3__py3-none-any.whl → 3.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +82 -18
- kreuzberg/_config.py +0 -1
- kreuzberg/_extractors/_image.py +20 -2
- kreuzberg/_extractors/_pdf.py +21 -1
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_gmft.py +79 -33
- kreuzberg/_mcp/server.py +0 -76
- kreuzberg/_ocr/_base.py +1 -2
- kreuzberg/_ocr/_paddleocr.py +39 -13
- kreuzberg/_ocr/_tesseract.py +2 -3
- kreuzberg/_registry.py +26 -0
- kreuzberg/_types.py +64 -1
- kreuzberg/_utils/_cache.py +34 -12
- kreuzberg/_utils/_image_preprocessing.py +346 -0
- kreuzberg/_utils/_ocr_cache.py +2 -5
- kreuzberg/_utils/_process_pool.py +3 -3
- kreuzberg/_utils/_table.py +4 -1
- kreuzberg/cli.py +19 -2
- kreuzberg/extraction.py +4 -4
- {kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/METADATA +4 -4
- {kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.3.dist-info → kreuzberg-3.14.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED
@@ -6,6 +6,7 @@ from json import dumps, loads
 from typing import TYPE_CHECKING, Annotated, Any, Literal
 
 import msgspec
+from typing_extensions import TypedDict
 
 from kreuzberg import (
     EasyOCRConfig,
@@ -24,11 +25,28 @@ from kreuzberg._config import discover_config
 if TYPE_CHECKING:
     from litestar.datastructures import UploadFile
 
+
+class HealthResponse(TypedDict):
+    """Response model for health check endpoint."""
+
+    status: str
+
+
+class ConfigurationResponse(TypedDict):
+    """Response model for configuration endpoint."""
+
+    message: str
+    config: dict[str, Any] | None
+
+
 try:
     from litestar import Litestar, Request, Response, get, post
     from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
     from litestar.enums import RequestEncodingType
     from litestar.logging import StructLoggingConfig
+    from litestar.openapi.config import OpenAPIConfig
+    from litestar.openapi.spec.contact import Contact
+    from litestar.openapi.spec.license import License
     from litestar.params import Body
     from litestar.status_codes import (
         HTTP_400_BAD_REQUEST,
@@ -71,7 +89,6 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
 
 
 def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
-    """Temporary handler to catch ALL exceptions for debugging."""
    error_type = type(exception).__name__
    error_message = str(exception)
    traceback_str = traceback.format_exc()
@@ -127,7 +144,6 @@ def _merge_configs_cached(
     query_params: tuple[tuple[str, Any], ...],
     header_config: tuple[tuple[str, Any], ...] | None,
 ) -> ExtractionConfig:
-    """Cached implementation of merge_configs with hashable parameters."""
     base_config = static_config or ExtractionConfig()
     config_dict = base_config.to_dict()
 
@@ -150,7 +166,6 @@ def _merge_configs_cached(
 
 
 def _make_hashable(obj: Any) -> Any:
-    """Convert nested dicts/lists to hashable tuples."""
     if isinstance(obj, dict):
         return tuple(sorted((k, _make_hashable(v)) for k, v in obj.items()))
     if isinstance(obj, list):
@@ -163,7 +178,6 @@ def merge_configs(
     query_params: dict[str, Any],
     header_config: dict[str, Any] | None,
 ) -> ExtractionConfig:
-    """Merge configurations with precedence: header > query > static > default."""
     query_tuple = tuple(sorted(query_params.items())) if query_params else ()
     header_tuple = _make_hashable(header_config) if header_config else None
 
@@ -186,14 +200,35 @@ async def handle_files_upload(  # noqa: PLR0913
     auto_detect_language: str | bool | None = None,
     pdf_password: str | None = None,
 ) -> list[ExtractionResult]:
-    """
-
-
-
-
-
-
-
+    """Extract text, metadata, and structured data from uploaded documents.
+
+    This endpoint processes multiple file uploads and extracts comprehensive information including:
+    - Text content with metadata
+    - Tables (if enabled)
+    - Named entities (if enabled)
+    - Keywords (if enabled)
+    - Language detection (if enabled)
+
+    Supports various file formats including PDF, Office documents, images, and more.
+    Maximum file size: 1GB per file.
+
+    Args:
+        request: The HTTP request object
+        data: List of files to process (multipart form data)
+        chunk_content: Enable text chunking for large documents
+        max_chars: Maximum characters per chunk (default: 1000)
+        max_overlap: Character overlap between chunks (default: 200)
+        extract_tables: Extract tables from documents
+        extract_entities: Extract named entities from text
+        extract_keywords: Extract keywords from text
+        keyword_count: Number of keywords to extract (default: 10)
+        force_ocr: Force OCR processing even for text-based documents
+        ocr_backend: OCR engine to use (tesseract, easyocr, paddleocr)
+        auto_detect_language: Enable automatic language detection
+        pdf_password: Password for encrypted PDF files
+
+    Returns:
+        List of extraction results, one per uploaded file
     """
     static_config = discover_config()
 
@@ -227,14 +262,25 @@ async def handle_files_upload(  # noqa: PLR0913
 
 
 @get("/health", operation_id="HealthCheck")
-async def health_check() -> dict[str, str]:
-    """
+async def health_check() -> HealthResponse:
+    """Check the health status of the API.
+
+    Returns:
+        Simple status response indicating the API is operational
+    """
     return {"status": "ok"}
 
 
 @get("/config", operation_id="GetConfiguration")
-async def get_configuration() -> dict[str, Any]:
-    """Get the current configuration."""
+async def get_configuration() -> ConfigurationResponse:
+    """Get the current extraction configuration.
+
+    Returns the loaded configuration from kreuzberg.toml file if available,
+    or indicates that no configuration file was found.
+
+    Returns:
+        Configuration data with status message
+    """
     config = discover_config()
     if config is None:
         return {"message": "No configuration file found", "config": None}
@@ -245,12 +291,30 @@ async def get_configuration() -> dict[str, Any]:
     }
 
 
+openapi_config = OpenAPIConfig(
+    title="Kreuzberg API",
+    version="3.14.0",
+    description="Document intelligence framework API for extracting text, metadata, and structured data from diverse file formats",
+    contact=Contact(
+        name="Kreuzberg",
+        url="https://github.com/Goldziher/kreuzberg",
+    ),
+    license=License(
+        name="MIT",
+        identifier="MIT",
+    ),
+    use_handler_docstrings=True,
+    create_examples=True,
+)
+
 app = Litestar(
     route_handlers=[handle_files_upload, health_check, get_configuration],
     plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
-    logging_config=StructLoggingConfig(),
+    logging_config=StructLoggingConfig(),
+    openapi_config=openapi_config,
     exception_handlers={
         KreuzbergError: exception_handler,
-        Exception: general_exception_handler,
+        Exception: general_exception_handler,
     },
+    request_max_body_size=1024 * 1024 * 1024,  # 1GB limit for large file uploads
 )
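Note on the config-merging chain above: `merge_configs` flattens its inputs into hashable tuples so `_merge_configs_cached` can memoize the merge. The removed docstring stated the precedence rule (header > query > static > default), which the code still implements. A minimal illustrative sketch of that rule using plain dicts rather than kreuzberg's actual `ExtractionConfig` (the helper name is invented):

```python
from typing import Any


def merge_config_dicts(
    static: dict[str, Any] | None,
    query: dict[str, Any],
    header: dict[str, Any] | None,
) -> dict[str, Any]:
    """Later sources win: defaults < static file < query params < headers."""
    merged: dict[str, Any] = {}
    for source in (static or {}, query, header or {}):
        merged.update({k: v for k, v in source.items() if v is not None})
    return merged


# The header value overrides the query parameter, which overrides the
# static (file-based) configuration.
static = {"ocr_backend": "tesseract", "chunk_content": False}
query = {"ocr_backend": "easyocr", "max_chars": 1000}
header = {"ocr_backend": "paddleocr"}
assert merge_config_dicts(static, query, header)["ocr_backend"] == "paddleocr"
```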
kreuzberg/_config.py
CHANGED
@@ -162,7 +162,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     if config_path.name == "pyproject.toml":
         return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
 
-    # For any other TOML file, check if it has [tool.kreuzberg] section
     if "tool" in data and "kreuzberg" in data["tool"]:
         return data["tool"]["kreuzberg"]  # type: ignore[no-any-return]
 
kreuzberg/_extractors/_image.py
CHANGED
@@ -7,10 +7,13 @@ from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
 from anyio import Path as AsyncPath
+from PIL import Image
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._utils._image_preprocessing import normalize_image_dpi
+from kreuzberg._utils._sync import run_sync
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -57,7 +60,15 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-
+        image = await run_sync(Image.open, str(path))
+        normalized_image, preprocessing_metadata = normalize_image_dpi(image, self.config)
+
+        backend = get_ocr_backend(self.config.ocr_backend)
+        result = await backend.process_image(normalized_image, **self.config.get_config_dict())
+
+        if preprocessing_metadata:
+            result.metadata["image_preprocessing"] = preprocessing_metadata
+
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
@@ -77,8 +88,15 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
+        image = Image.open(str(path))
+        normalized_image, preprocessing_metadata = normalize_image_dpi(image, self.config)
+
         backend = get_ocr_backend(self.config.ocr_backend)
-        result = backend.process_file_sync(path, **self.config.get_config_dict())
+        result = backend.process_image_sync(normalized_image, **self.config.get_config_dict())
+
+        if preprocessing_metadata:
+            result.metadata["image_preprocessing"] = preprocessing_metadata
+
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
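Both call sites above now route images through `normalize_image_dpi` before OCR; the implementation lives in the new `kreuzberg/_utils/_image_preprocessing.py` (+346 lines, not shown in this diff). As a rough sketch of what such a step typically does, with invented parameter names since only the call site is visible here: downscale oversized images and report the adjustment in the `(image, metadata)` shape the call sites expect.

```python
from PIL import Image


def normalize_image_dpi_sketch(
    image: Image.Image,
    max_dimension: int = 4096,
) -> tuple[Image.Image, dict[str, object]]:
    """Shrink the image so its longest side fits max_dimension pixels."""
    width, height = image.size
    longest = max(width, height)
    if longest <= max_dimension:
        return image, {}  # nothing to do, empty metadata

    scale = max_dimension / longest
    resized = image.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
    return resized, {
        "original_size": (width, height),
        "new_size": resized.size,
        "scale_factor": scale,
    }
```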
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -21,6 +21,7 @@ from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
 from kreuzberg._utils._errors import create_error_context, should_retry
+from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
@@ -170,7 +171,26 @@ class PDFExtractor(Extractor):
         try:
             with pypdfium_file_lock(input_file):
                 document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-
+                images = []
+                for page in cast("pypdfium2.PdfDocument", document):
+                    width, height = page.get_size()
+
+                    if self.config.auto_adjust_dpi:
+                        optimal_dpi = calculate_optimal_dpi(
+                            page_width=width,
+                            page_height=height,
+                            target_dpi=self.config.target_dpi,
+                            max_dimension=self.config.max_image_dimension,
+                            min_dpi=self.config.min_dpi,
+                            max_dpi=self.config.max_dpi,
+                        )
+                    else:
+                        optimal_dpi = self.config.target_dpi
+
+                    scale = optimal_dpi / 72.0
+
+                    images.append(page.render(scale=scale).to_pil())
+                return images
         except pypdfium2.PdfiumError as e:  # noqa: PERF203
             last_error = e
             if not should_retry(e, attempt + 1):
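Only the call site of `calculate_optimal_dpi` appears in this diff. pypdfium2 reports page sizes in PDF points (1/72 inch), which is why the render scale is `optimal_dpi / 72.0`. A plausible sketch of the clamping the keyword arguments suggest; this is an assumption, not the actual implementation in `_image_preprocessing.py`:

```python
def calculate_optimal_dpi_sketch(
    page_width: float,
    page_height: float,
    target_dpi: int,
    max_dimension: int,
    min_dpi: int,
    max_dpi: int,
) -> int:
    """Pick a DPI that keeps the rendered page within max_dimension pixels."""
    longest_side_inches = max(page_width, page_height) / 72.0  # points -> inches
    fitting_dpi = int(max_dimension / longest_side_inches)  # highest DPI that fits
    dpi = min(target_dpi, fitting_dpi)
    return max(min_dpi, min(dpi, max_dpi))  # clamp to configured bounds


# US Letter (612 x 792 points) at target 300 DPI with a 4096 px budget:
# 792 / 72 = 11 in, 4096 / 11 ≈ 372 DPI, so the 300 DPI target fits as-is.
assert calculate_optimal_dpi_sketch(612, 792, 300, 4096, 72, 600) == 300
```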
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -35,7 +35,6 @@ class SpreadSheetExtractor(Extractor):
     SUPPORTED_MIME_TYPES = SPREADSHEET_MIME_TYPES
 
     def _get_file_extension(self) -> str:
-        """Get the appropriate file extension based on MIME type."""
         mime_to_ext = {
             "application/vnd.ms-excel": ".xls",
             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
kreuzberg/_gmft.py
CHANGED
@@ -9,11 +9,11 @@ import time
 import traceback
 from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 
 import anyio
 import msgspec
-import pandas as pd
+import polars as pl
 from PIL import Image
 
 from kreuzberg._types import GMFTConfig, TableData
@@ -25,7 +25,63 @@ if TYPE_CHECKING:
     from os import PathLike
 
     from gmft.detectors.base import CroppedTable
-
+
+
+def _pandas_to_polars(pandas_df: Any) -> pl.DataFrame:
+    if pandas_df is None:
+        return pl.DataFrame()
+
+    try:
+        return pl.from_pandas(pandas_df)
+    except (TypeError, ValueError, AttributeError):
+        if hasattr(pandas_df, "columns") and hasattr(pandas_df.columns, "duplicated"):
+            mask = ~pandas_df.columns.duplicated()
+            pandas_df = pandas_df.loc[:, mask]
+            return pl.from_pandas(pandas_df)
+        return pl.DataFrame()
+
+
+def _dataframe_to_markdown(df: Any) -> str:
+    if df is None:
+        return ""
+
+    if isinstance(df, pl.DataFrame):
+        if df.is_empty():
+            return ""
+        return str(df)
+
+    if hasattr(df, "to_markdown"):
+        return cast("str", df.to_markdown())
+
+    return str(df)
+
+
+def _dataframe_to_csv(df: Any) -> str:
+    if df is None:
+        return ""
+
+    if isinstance(df, pl.DataFrame):
+        if df.is_empty():
+            return ""
+        return df.write_csv()
+
+    if hasattr(df, "to_csv"):
+        return cast("str", df.to_csv(index=False))
+
+    return ""
+
+
+def _is_dataframe_empty(df: Any) -> bool:
+    if df is None:
+        return True
+
+    if isinstance(df, pl.DataFrame):
+        return df.is_empty()
+
+    if hasattr(df, "empty"):
+        return cast("bool", df.empty)
+
+    return True
 
 
 async def extract_tables(
@@ -111,7 +167,7 @@ async def extract_tables(
     )
     doc = await run_sync(PyPDFium2Document, str(file_path))
     cropped_tables: list[CroppedTable] = []
-    dataframes: list[DataFrame] = []
+    dataframes: list[pl.DataFrame] = []
     try:
         for page in doc:
             cropped_tables.extend(await run_sync(detector.extract, page))
@@ -124,8 +180,8 @@ async def extract_tables(
                 TableData(
                     cropped_image=cropped_table.image(),
                     page_number=cropped_table.page.page_number,
-                    text=data_frame.to_markdown(),
-                    df=data_frame,
+                    text=_dataframe_to_markdown(data_frame),
+                    df=_pandas_to_polars(data_frame),
                 )
                 for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
             ]
@@ -225,7 +281,7 @@ def extract_tables_sync(
                 TableData(
                     cropped_image=cropped_table.image(),
                     page_number=cropped_table.page.page_number,
-                    text=data_frame.to_markdown(),
+                    text=_dataframe_to_markdown(data_frame),
                     df=data_frame,
                 )
                 for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
@@ -295,26 +351,16 @@ def _extract_tables_in_process(
             cropped_image.save(img_bytes, format="PNG")
             img_bytes.seek(0)
 
-            if data_frame
-
-
-
-
-
-
-
-
-
-            else:
-                results.append(
-                    {
-                        "cropped_image_bytes": img_bytes.getvalue(),
-                        "page_number": cropped_table.page.page_number,
-                        "text": data_frame.to_markdown(),
-                        "df_columns": None,
-                        "df_csv": data_frame.to_csv(index=False),
-                    }
-                )
+            csv_data = _dataframe_to_csv(data_frame) if not _is_dataframe_empty(data_frame) else ""
+            results.append(
+                {
+                    "cropped_image_bytes": img_bytes.getvalue(),
+                    "page_number": cropped_table.page.page_number,
+                    "text": _dataframe_to_markdown(data_frame),
+                    "df_columns": data_frame.columns,
+                    "df_csv": csv_data if csv_data else None,
+                }
+            )
 
     result_queue.put((True, results))
 
@@ -381,10 +427,10 @@ def _extract_tables_isolated(
         for table_dict in result:
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
 
-            if table_dict["df_csv"] is None:
-                df = pd.DataFrame()
+            if table_dict["df_csv"] is None or table_dict["df_csv"] == "":
+                df = pl.DataFrame()
             else:
-                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+                df = pl.read_csv(StringIO(table_dict["df_csv"]), truncate_ragged_lines=True)
 
             tables.append(
                 TableData(
@@ -468,10 +514,10 @@ async def _extract_tables_isolated_async(
         for table_dict in result:
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
 
-            if table_dict["df_csv"] is None:
-                df = pd.DataFrame()
+            if table_dict["df_csv"] is None or table_dict["df_csv"] == "":
+                df = pl.DataFrame()
             else:
-                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+                df = pl.read_csv(StringIO(table_dict["df_csv"]), truncate_ragged_lines=True)
 
             tables.append(
                 TableData(
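The pandas-to-polars migration above also changes how tables cross the process boundary: the worker serializes each frame to CSV text, and the parent rebuilds it with `pl.read_csv(..., truncate_ragged_lines=True)`. A self-contained demonstration of that round-trip using only polars:

```python
from io import StringIO

import polars as pl

# Worker side: serialize the extracted table to CSV text, as
# _dataframe_to_csv does for polars frames via write_csv().
table = pl.DataFrame({"name": ["alpha", "beta"], "value": [1, 2]})
csv_text = table.write_csv()

# Parent side: rebuild the frame. truncate_ragged_lines tolerates rows
# with extra fields, which detected tables sometimes contain.
rebuilt = pl.read_csv(StringIO(csv_text), truncate_ragged_lines=True)
assert rebuilt.equals(table)
```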
kreuzberg/_mcp/server.py
CHANGED
@@ -16,14 +16,6 @@ mcp = FastMCP("Kreuzberg Text Extraction")
 
 
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
-    """Create ExtractionConfig with discovered config as base and tool parameters as overrides.
-
-    Args:
-        **kwargs: Tool parameters to override defaults/discovered config.
-
-    Returns:
-        ExtractionConfig instance.
-    """
     base_config = discover_config()
 
     if base_config is None:
@@ -64,25 +56,6 @@ def extract_document(  # noqa: PLR0913
     keyword_count: int = 10,
     auto_detect_language: bool = False,
 ) -> dict[str, Any]:
-    """Extract text content from a document file.
-
-    Args:
-        file_path: Path to the document file
-        mime_type: MIME type of the document (auto-detected if not provided)
-        force_ocr: Force OCR even for text-based documents
-        chunk_content: Split content into chunks
-        extract_tables: Extract tables from the document
-        extract_entities: Extract named entities
-        extract_keywords: Extract keywords
-        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
-        max_chars: Maximum characters per chunk
-        max_overlap: Character overlap between chunks
-        keyword_count: Number of keywords to extract
-        auto_detect_language: Auto-detect document language
-
-    Returns:
-        Extracted content with metadata, tables, chunks, entities, and keywords
-    """
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
         chunk_content=chunk_content,
@@ -115,25 +88,6 @@ def extract_bytes(  # noqa: PLR0913
     keyword_count: int = 10,
     auto_detect_language: bool = False,
 ) -> dict[str, Any]:
-    """Extract text content from document bytes.
-
-    Args:
-        content_base64: Base64-encoded document content
-        mime_type: MIME type of the document
-        force_ocr: Force OCR even for text-based documents
-        chunk_content: Split content into chunks
-        extract_tables: Extract tables from the document
-        extract_entities: Extract named entities
-        extract_keywords: Extract keywords
-        ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
-        max_chars: Maximum characters per chunk
-        max_overlap: Character overlap between chunks
-        keyword_count: Number of keywords to extract
-        auto_detect_language: Auto-detect document language
-
-    Returns:
-        Extracted content with metadata, tables, chunks, entities, and keywords
-    """
     content_bytes = base64.b64decode(content_base64)
 
     config = _create_config_with_overrides(
@@ -158,15 +112,6 @@ def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
-    """Simple text extraction from a document file.
-
-    Args:
-        file_path: Path to the document file
-        mime_type: MIME type of the document (auto-detected if not provided)
-
-    Returns:
-        Extracted text content as a string
-    """
     config = _create_config_with_overrides()
     result = extract_file_sync(file_path, mime_type, config)
     return result.content
@@ -174,14 +119,12 @@ def extract_simple(
 
 @mcp.resource("config://default")
 def get_default_config() -> str:
-    """Get the default extraction configuration."""
     config = ExtractionConfig()
     return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
 
 
 @mcp.resource("config://discovered")
 def get_discovered_config() -> str:
-    """Get the discovered configuration from config files."""
     config = discover_config()
     if config is None:
         return "No configuration file found"
@@ -190,13 +133,11 @@ def get_discovered_config() -> str:
 
 @mcp.resource("config://available-backends")
 def get_available_backends() -> str:
-    """Get available OCR backends."""
     return "tesseract, easyocr, paddleocr"
 
 
 @mcp.resource("extractors://supported-formats")
 def get_supported_formats() -> str:
-    """Get supported document formats."""
     return """
 Supported formats:
 - PDF documents
@@ -210,14 +151,6 @@ def get_supported_formats() -> str:
 
 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-    """Extract text from a document and provide a summary prompt.
-
-    Args:
-        file_path: Path to the document file
-
-    Returns:
-        Extracted content with summarization prompt
-    """
     result = extract_file_sync(file_path, None, _create_config_with_overrides())
 
     return [
@@ -230,14 +163,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 
 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
-    """Extract text with structured analysis prompt.
-
-    Args:
-        file_path: Path to the document file
-
-    Returns:
-        Extracted content with structured analysis prompt
-    """
     config = _create_config_with_overrides(
         extract_entities=True,
         extract_keywords=True,
@@ -262,7 +187,6 @@ def extract_structured(file_path: str) -> list[TextContent]:
 
 
 def main() -> None:  # pragma: no cover
-    """Main entry point for the MCP server."""
     mcp.run()
 
 
kreuzberg/_ocr/_base.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Generic, TypeVar
 from PIL.Image import Image
 
 from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._sync import run_taskgroup
 
 try:  # pragma: no cover
     from typing import Unpack  # type: ignore[attr-defined]
@@ -32,8 +33,6 @@ class OCRBackend(ABC, Generic[T]):
         return [self.process_file_sync(path, **kwargs) for path in paths]  # pragma: no cover
 
     async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415
-
         tasks = [self.process_file(path, **kwargs) for path in paths]
         return await run_taskgroup(*tasks)  # pragma: no cover
 
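The only change here hoists the `run_taskgroup` import to module level, removing the deferred import (and its `# noqa: PLC0415`) from `process_batch`. The helper itself is not part of this diff; a common shape for such a utility, sketched with anyio (the name suffix and the ordering guarantee are assumptions):

```python
from collections.abc import Coroutine
from typing import Any

import anyio


async def run_taskgroup_sketch(*coros: Coroutine[Any, Any, Any]) -> list[Any]:
    """Run coroutines concurrently; return results in submission order."""
    results: list[Any] = [None] * len(coros)

    async def _runner(index: int, coro: Coroutine[Any, Any, Any]) -> None:
        results[index] = await coro

    async with anyio.create_task_group() as tg:
        for i, coro in enumerate(coros):
            tg.start_soon(_runner, i, coro)
    return results
```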