kreuzberg 3.14.0__py3-none-any.whl → 3.14.1__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -1,26 +1,34 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import base64
4
+ import io
3
5
  import traceback
4
6
  from functools import lru_cache
5
7
  from json import dumps, loads
6
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
7
9
 
8
10
  import msgspec
11
+ import polars as pl
12
+ from PIL import Image
9
13
  from typing_extensions import TypedDict
10
14
 
11
15
  from kreuzberg import (
12
16
  EasyOCRConfig,
13
17
  ExtractionConfig,
14
18
  ExtractionResult,
19
+ GMFTConfig,
15
20
  KreuzbergError,
21
+ LanguageDetectionConfig,
16
22
  MissingDependencyError,
17
23
  PaddleOCRConfig,
18
24
  ParsingError,
25
+ SpacyEntityExtractionConfig,
19
26
  TesseractConfig,
20
27
  ValidationError,
21
28
  batch_extract_bytes,
22
29
  )
23
30
  from kreuzberg._config import discover_config
31
+ from kreuzberg._types import HTMLToMarkdownConfig
24
32
 
25
33
  if TYPE_CHECKING:
26
34
  from litestar.datastructures import UploadFile
@@ -162,6 +170,22 @@ def _merge_configs_cached(
162
170
  ocr_backend = config_dict.get("ocr_backend")
163
171
  config_dict["ocr_config"] = _create_ocr_config(ocr_backend, config_dict["ocr_config"])
164
172
 
173
+ if "gmft_config" in config_dict and isinstance(config_dict["gmft_config"], dict):
174
+ config_dict["gmft_config"] = GMFTConfig(**config_dict["gmft_config"])
175
+
176
+ if "language_detection_config" in config_dict and isinstance(config_dict["language_detection_config"], dict):
177
+ config_dict["language_detection_config"] = LanguageDetectionConfig(**config_dict["language_detection_config"])
178
+
179
+ if "spacy_entity_extraction_config" in config_dict and isinstance(
180
+ config_dict["spacy_entity_extraction_config"], dict
181
+ ):
182
+ config_dict["spacy_entity_extraction_config"] = SpacyEntityExtractionConfig(
183
+ **config_dict["spacy_entity_extraction_config"]
184
+ )
185
+
186
+ if "html_to_markdown_config" in config_dict and isinstance(config_dict["html_to_markdown_config"], dict):
187
+ config_dict["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown_config"])
188
+
165
189
  return ExtractionConfig(**config_dict)
166
190
 
167
191
 
@@ -291,6 +315,19 @@ async def get_configuration() -> ConfigurationResponse:
291
315
  }
292
316
 
293
317
 
318
+ def _polars_dataframe_encoder(obj: Any) -> Any:
319
+ """Convert polars DataFrame to dict for JSON serialization."""
320
+ return obj.to_dicts()
321
+
322
+
323
+ def _pil_image_encoder(obj: Any) -> str:
324
+ """Convert PIL Image to base64 string for JSON serialization."""
325
+ buffer = io.BytesIO()
326
+ obj.save(buffer, format="PNG")
327
+ img_str = base64.b64encode(buffer.getvalue()).decode()
328
+ return f"data:image/png;base64,{img_str}"
329
+
330
+
294
331
  openapi_config = OpenAPIConfig(
295
332
  title="Kreuzberg API",
296
333
  version="3.14.0",
@@ -307,6 +344,12 @@ openapi_config = OpenAPIConfig(
307
344
  create_examples=True,
308
345
  )
309
346
 
347
+ # Type encoders for custom serialization
348
+ type_encoders = {
349
+ pl.DataFrame: _polars_dataframe_encoder,
350
+ Image.Image: _pil_image_encoder,
351
+ }
352
+
310
353
  app = Litestar(
311
354
  route_handlers=[handle_files_upload, health_check, get_configuration],
312
355
  plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
@@ -316,5 +359,6 @@ app = Litestar(
316
359
  KreuzbergError: exception_handler,
317
360
  Exception: general_exception_handler,
318
361
  },
362
+ type_encoders=type_encoders,
319
363
  request_max_body_size=1024 * 1024 * 1024, # 1GB limit for large file uploads
320
364
  )
kreuzberg/_types.py CHANGED
@@ -349,7 +349,7 @@ class GMFTConfig(ConfigDict):
349
349
  """
350
350
 
351
351
 
352
- @dataclass(frozen=True, slots=True)
352
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
353
353
  class LanguageDetectionConfig(ConfigDict):
354
354
  low_memory: bool = True
355
355
  """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
@@ -751,7 +751,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
751
751
  ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
752
752
 
753
753
 
754
- @dataclass(unsafe_hash=True, slots=True)
754
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
755
755
  class ExtractionConfig(ConfigDict):
756
756
  force_ocr: bool = False
757
757
  """Whether to force OCR."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.14.0
3
+ Version: 3.14.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -10,13 +10,13 @@ kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWu
10
10
  kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
11
11
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
12
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
- kreuzberg/_types.py,sha256=yw8ZzCgwp8T4byh00gdSlABDtRwro6H1pemQsO5IZMQ,39132
13
+ kreuzberg/_types.py,sha256=BEMTnA8fvHL0dDCnjq7g9Jjd2Ze8NFq988YkMH4zQ9g,39163
14
14
  kreuzberg/cli.py,sha256=Ob0IfqWcaiM09pFdC6wTpdSeql0SGZDxBxfrEhJAGmo,13501
15
15
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
16
  kreuzberg/extraction.py,sha256=qT-Ziw5FmMqcPT88VrglikL1RASSJCf5W7xP6L9Vi5s,17673
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- kreuzberg/_api/main.py,sha256=bZLaQpW8eoTFGvCGJgFodALy4rDfe9kuY1oj9OKPQpU,10792
19
+ kreuzberg/_api/main.py,sha256=8g_8j8Dp2e70_yYYUUrJNC5Ku9fuyNgyjUuIgJTRUW8,12500
20
20
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
22
22
  kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
@@ -51,8 +51,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
51
51
  kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
52
52
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
53
53
  kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
54
- kreuzberg-3.14.0.dist-info/METADATA,sha256=68rRivXnf8n_F9lqekOydDOd8sehWpHpbbKzRup7XDc,12127
55
- kreuzberg-3.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
- kreuzberg-3.14.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
57
- kreuzberg-3.14.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
58
- kreuzberg-3.14.0.dist-info/RECORD,,
54
+ kreuzberg-3.14.1.dist-info/METADATA,sha256=4sG9L9AtvBHFxjv84obrcaYNToc_sO0-AHnnpo1-ZGY,12127
55
+ kreuzberg-3.14.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
+ kreuzberg-3.14.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
57
+ kreuzberg-3.14.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
58
+ kreuzberg-3.14.1.dist-info/RECORD,,