kreuzberg-3.14.0-py3-none-any.whl → kreuzberg-3.14.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +44 -0
- kreuzberg/_types.py +2 -2
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.14.1.dist-info}/METADATA +1 -1
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.14.1.dist-info}/RECORD +7 -7
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.14.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.14.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.14.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED
@@ -1,26 +1,34 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import base64
|
4
|
+
import io
|
3
5
|
import traceback
|
4
6
|
from functools import lru_cache
|
5
7
|
from json import dumps, loads
|
6
8
|
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
7
9
|
|
8
10
|
import msgspec
|
11
|
+
import polars as pl
|
12
|
+
from PIL import Image
|
9
13
|
from typing_extensions import TypedDict
|
10
14
|
|
11
15
|
from kreuzberg import (
|
12
16
|
EasyOCRConfig,
|
13
17
|
ExtractionConfig,
|
14
18
|
ExtractionResult,
|
19
|
+
GMFTConfig,
|
15
20
|
KreuzbergError,
|
21
|
+
LanguageDetectionConfig,
|
16
22
|
MissingDependencyError,
|
17
23
|
PaddleOCRConfig,
|
18
24
|
ParsingError,
|
25
|
+
SpacyEntityExtractionConfig,
|
19
26
|
TesseractConfig,
|
20
27
|
ValidationError,
|
21
28
|
batch_extract_bytes,
|
22
29
|
)
|
23
30
|
from kreuzberg._config import discover_config
|
31
|
+
from kreuzberg._types import HTMLToMarkdownConfig
|
24
32
|
|
25
33
|
if TYPE_CHECKING:
|
26
34
|
from litestar.datastructures import UploadFile
|
@@ -162,6 +170,22 @@ def _merge_configs_cached(
|
|
162
170
|
ocr_backend = config_dict.get("ocr_backend")
|
163
171
|
config_dict["ocr_config"] = _create_ocr_config(ocr_backend, config_dict["ocr_config"])
|
164
172
|
|
173
|
+
if "gmft_config" in config_dict and isinstance(config_dict["gmft_config"], dict):
|
174
|
+
config_dict["gmft_config"] = GMFTConfig(**config_dict["gmft_config"])
|
175
|
+
|
176
|
+
if "language_detection_config" in config_dict and isinstance(config_dict["language_detection_config"], dict):
|
177
|
+
config_dict["language_detection_config"] = LanguageDetectionConfig(**config_dict["language_detection_config"])
|
178
|
+
|
179
|
+
if "spacy_entity_extraction_config" in config_dict and isinstance(
|
180
|
+
config_dict["spacy_entity_extraction_config"], dict
|
181
|
+
):
|
182
|
+
config_dict["spacy_entity_extraction_config"] = SpacyEntityExtractionConfig(
|
183
|
+
**config_dict["spacy_entity_extraction_config"]
|
184
|
+
)
|
185
|
+
|
186
|
+
if "html_to_markdown_config" in config_dict and isinstance(config_dict["html_to_markdown_config"], dict):
|
187
|
+
config_dict["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown_config"])
|
188
|
+
|
165
189
|
return ExtractionConfig(**config_dict)
|
166
190
|
|
167
191
|
|
@@ -291,6 +315,19 @@ async def get_configuration() -> ConfigurationResponse:
|
|
291
315
|
}
|
292
316
|
|
293
317
|
|
318
|
+
def _polars_dataframe_encoder(obj: Any) -> Any:
|
319
|
+
"""Convert polars DataFrame to dict for JSON serialization."""
|
320
|
+
return obj.to_dicts()
|
321
|
+
|
322
|
+
|
323
|
+
def _pil_image_encoder(obj: Any) -> str:
|
324
|
+
"""Convert PIL Image to base64 string for JSON serialization."""
|
325
|
+
buffer = io.BytesIO()
|
326
|
+
obj.save(buffer, format="PNG")
|
327
|
+
img_str = base64.b64encode(buffer.getvalue()).decode()
|
328
|
+
return f"data:image/png;base64,{img_str}"
|
329
|
+
|
330
|
+
|
294
331
|
openapi_config = OpenAPIConfig(
|
295
332
|
title="Kreuzberg API",
|
296
333
|
version="3.14.0",
|
@@ -307,6 +344,12 @@ openapi_config = OpenAPIConfig(
|
|
307
344
|
create_examples=True,
|
308
345
|
)
|
309
346
|
|
347
|
+
# Type encoders for custom serialization
|
348
|
+
type_encoders = {
|
349
|
+
pl.DataFrame: _polars_dataframe_encoder,
|
350
|
+
Image.Image: _pil_image_encoder,
|
351
|
+
}
|
352
|
+
|
310
353
|
app = Litestar(
|
311
354
|
route_handlers=[handle_files_upload, health_check, get_configuration],
|
312
355
|
plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
|
@@ -316,5 +359,6 @@ app = Litestar(
|
|
316
359
|
KreuzbergError: exception_handler,
|
317
360
|
Exception: general_exception_handler,
|
318
361
|
},
|
362
|
+
type_encoders=type_encoders,
|
319
363
|
request_max_body_size=1024 * 1024 * 1024, # 1GB limit for large file uploads
|
320
364
|
)
|
kreuzberg/_types.py
CHANGED
@@ -349,7 +349,7 @@ class GMFTConfig(ConfigDict):
|
|
349
349
|
"""
|
350
350
|
|
351
351
|
|
352
|
-
@dataclass(frozen=True, slots=True)
|
352
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
353
353
|
class LanguageDetectionConfig(ConfigDict):
|
354
354
|
low_memory: bool = True
|
355
355
|
"""If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
|
@@ -751,7 +751,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
|
|
751
751
|
ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
|
752
752
|
|
753
753
|
|
754
|
-
@dataclass(unsafe_hash=True, slots=True)
|
754
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
755
755
|
class ExtractionConfig(ConfigDict):
|
756
756
|
force_ocr: bool = False
|
757
757
|
"""Whether to force OCR."""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.14.0
|
3
|
+
Version: 3.14.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -10,13 +10,13 @@ kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWu
|
|
10
10
|
kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
|
11
11
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
12
|
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
|
-
kreuzberg/_types.py,sha256=
|
13
|
+
kreuzberg/_types.py,sha256=BEMTnA8fvHL0dDCnjq7g9Jjd2Ze8NFq988YkMH4zQ9g,39163
|
14
14
|
kreuzberg/cli.py,sha256=Ob0IfqWcaiM09pFdC6wTpdSeql0SGZDxBxfrEhJAGmo,13501
|
15
15
|
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
16
16
|
kreuzberg/extraction.py,sha256=qT-Ziw5FmMqcPT88VrglikL1RASSJCf5W7xP6L9Vi5s,17673
|
17
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
-
kreuzberg/_api/main.py,sha256=
|
19
|
+
kreuzberg/_api/main.py,sha256=8g_8j8Dp2e70_yYYUUrJNC5Ku9fuyNgyjUuIgJTRUW8,12500
|
20
20
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
|
22
22
|
kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
|
@@ -51,8 +51,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
51
51
|
kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
|
52
52
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
53
53
|
kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
|
54
|
-
kreuzberg-3.14.
|
55
|
-
kreuzberg-3.14.
|
56
|
-
kreuzberg-3.14.
|
57
|
-
kreuzberg-3.14.
|
58
|
-
kreuzberg-3.14.
|
54
|
+
kreuzberg-3.14.1.dist-info/METADATA,sha256=4sG9L9AtvBHFxjv84obrcaYNToc_sO0-AHnnpo1-ZGY,12127
|
55
|
+
kreuzberg-3.14.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
56
|
+
kreuzberg-3.14.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
57
|
+
kreuzberg-3.14.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
58
|
+
kreuzberg-3.14.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|