kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py CHANGED
@@ -4,9 +4,14 @@ from ._registry import ExtractorRegistry
 from ._types import (
     EasyOCRConfig,
     Entity,
+    ExtractedImage,
     ExtractionConfig,
     ExtractionResult,
     GMFTConfig,
+    HTMLToMarkdownConfig,
+    ImageOCRConfig,
+    ImageOCRResult,
+    JSONExtractionConfig,
     LanguageDetectionConfig,
     Metadata,
     PaddleOCRConfig,
@@ -32,10 +37,15 @@ __version__ = version("kreuzberg")
 __all__ = [
     "EasyOCRConfig",
     "Entity",
+    "ExtractedImage",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
+    "HTMLToMarkdownConfig",
+    "ImageOCRConfig",
+    "ImageOCRResult",
+    "JSONExtractionConfig",
     "KreuzbergError",
     "LanguageDetectionConfig",
     "Metadata",
kreuzberg/_api/_config_cache.py ADDED
@@ -0,0 +1,247 @@
+"""API Configuration Caching Module.
+
+This module provides LRU cached functions for API config operations to improve performance
+by avoiding repeated file system operations and object creation.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+from kreuzberg._config import discover_config
+from kreuzberg._types import (
+    EasyOCRConfig,
+    ExtractionConfig,
+    GMFTConfig,
+    HTMLToMarkdownConfig,
+    LanguageDetectionConfig,
+    PaddleOCRConfig,
+    SpacyEntityExtractionConfig,
+    TesseractConfig,
+)
+
+
+@lru_cache(maxsize=16)
+def _cached_discover_config(
+    search_path: str,
+    config_file_mtime: float,  # noqa: ARG001
+    config_file_size: int,  # noqa: ARG001
+) -> ExtractionConfig | None:
+    """Cache config discovery with file modification time validation."""
+    return discover_config(Path(search_path))
+
+
+def discover_config_cached(search_path: Path | str | None = None) -> ExtractionConfig | None:
+    """Cached version of discover_config with automatic invalidation.
+
+    This function caches the result of discover_config() and automatically invalidates
+    the cache when config files are modified.
+
+    Args:
+        search_path: Path to start searching for config files from
+
+    Returns:
+        ExtractionConfig if found, None otherwise
+    """
+    search_path = Path.cwd() if search_path is None else Path(search_path)
+
+    config_files = ["kreuzberg.toml", "pyproject.toml"]
+    for config_file_name in config_files:
+        config_path = search_path / config_file_name
+        if config_path.exists():
+            try:
+                stat = config_path.stat()
+                return _cached_discover_config(
+                    str(search_path),
+                    stat.st_mtime,
+                    stat.st_size,
+                )
+            except OSError:
+                return discover_config(search_path)
+
+    return _cached_discover_config(str(search_path), 0.0, 0)
+
+
+@lru_cache(maxsize=128)
+def _cached_create_ocr_config(
+    config_type: str,
+    config_json: str,
+) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
+    """Cache OCR config object creation."""
+    config_dict = json.loads(config_json)
+
+    if config_type == "tesseract":
+        return TesseractConfig(**config_dict)
+    if config_type == "easyocr":
+        return EasyOCRConfig(**config_dict)
+    if config_type == "paddleocr":
+        return PaddleOCRConfig(**config_dict)
+    msg = f"Unknown OCR config type: {config_type}"
+    raise ValueError(msg)
+
+
+@lru_cache(maxsize=64)
+def _cached_create_gmft_config(config_json: str) -> GMFTConfig:
+    """Cache GMFT config creation."""
+    return GMFTConfig(**json.loads(config_json))
+
+
+@lru_cache(maxsize=64)
+def _cached_create_language_detection_config(config_json: str) -> LanguageDetectionConfig:
+    """Cache language detection config creation."""
+    return LanguageDetectionConfig(**json.loads(config_json))
+
+
+@lru_cache(maxsize=64)
+def _cached_create_spacy_config(config_json: str) -> SpacyEntityExtractionConfig:
+    """Cache spaCy entity extraction config creation."""
+    return SpacyEntityExtractionConfig(**json.loads(config_json))
+
+
+@lru_cache(maxsize=64)
+def _cached_create_html_markdown_config(config_json: str) -> HTMLToMarkdownConfig:
+    """Cache HTML to Markdown config creation."""
+    return HTMLToMarkdownConfig(**json.loads(config_json))
+
+
+@lru_cache(maxsize=256)
+def _cached_parse_header_config(header_value: str) -> dict[str, Any]:
+    """Cache parsed header configurations."""
+    parsed_config: dict[str, Any] = json.loads(header_value)
+    return parsed_config
+
+
+def create_ocr_config_cached(
+    ocr_backend: str | None, config_dict: dict[str, Any]
+) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
+    """Cached version of OCR config creation.
+
+    Args:
+        ocr_backend: The OCR backend type
+        config_dict: Configuration dictionary
+
+    Returns:
+        Configured OCR config object
+    """
+    if not ocr_backend:
+        return TesseractConfig()
+
+    config_json = json.dumps(config_dict, sort_keys=True)
+    return _cached_create_ocr_config(ocr_backend, config_json)
+
+
+def create_gmft_config_cached(config_dict: dict[str, Any]) -> GMFTConfig:
+    """Cached version of GMFT config creation."""
+    config_json = json.dumps(config_dict, sort_keys=True)
+    return _cached_create_gmft_config(config_json)
+
+
+def create_language_detection_config_cached(config_dict: dict[str, Any]) -> LanguageDetectionConfig:
+    """Cached version of language detection config creation."""
+    config_json = json.dumps(config_dict, sort_keys=True)
+    return _cached_create_language_detection_config(config_json)
+
+
+def create_spacy_config_cached(config_dict: dict[str, Any]) -> SpacyEntityExtractionConfig:
+    """Cached version of spaCy config creation."""
+    config_json = json.dumps(config_dict, sort_keys=True)
+    return _cached_create_spacy_config(config_json)
+
+
+def create_html_markdown_config_cached(config_dict: dict[str, Any]) -> HTMLToMarkdownConfig:
+    """Cached version of HTML to Markdown config creation."""
+    config_json = json.dumps(config_dict, sort_keys=True)
+    return _cached_create_html_markdown_config(config_json)
+
+
+def parse_header_config_cached(header_value: str) -> dict[str, Any]:
+    """Cached version of header config parsing.
+
+    Args:
+        header_value: JSON string from X-Extraction-Config header
+
+    Returns:
+        Parsed configuration dictionary
+    """
+    return _cached_parse_header_config(header_value)
+
+
+@lru_cache(maxsize=512)
+def _cached_merge_configs(
+    static_config_hash: str,
+    query_params_hash: str,
+    header_config_hash: str,
+) -> ExtractionConfig:
+    """Cache the complete config merging process.
+
+    This is the ultimate optimization - cache the entire result of merge_configs()
+    based on content hashes of all inputs.
+    """
+    msg = "Not implemented yet - use individual component caching"
+    raise NotImplementedError(msg)
+
+
+def _hash_dict(data: dict[str, Any] | None) -> str:
+    """Create a hash string from a dictionary for cache keys."""
+    if data is None:
+        return "none"
+
+    json_str = json.dumps(data, sort_keys=True, default=str)
+    return hashlib.sha256(json_str.encode()).hexdigest()[:16]
+
+
+def get_cache_stats() -> dict[str, Any]:
+    """Get cache statistics for monitoring performance."""
+    return {
+        "discover_config": {
+            "hits": _cached_discover_config.cache_info().hits,
+            "misses": _cached_discover_config.cache_info().misses,
+            "size": _cached_discover_config.cache_info().currsize,
+            "max_size": _cached_discover_config.cache_info().maxsize,
+        },
+        "ocr_config": {
+            "hits": _cached_create_ocr_config.cache_info().hits,
+            "misses": _cached_create_ocr_config.cache_info().misses,
+            "size": _cached_create_ocr_config.cache_info().currsize,
+            "max_size": _cached_create_ocr_config.cache_info().maxsize,
+        },
+        "header_parsing": {
+            "hits": _cached_parse_header_config.cache_info().hits,
+            "misses": _cached_parse_header_config.cache_info().misses,
+            "size": _cached_parse_header_config.cache_info().currsize,
+            "max_size": _cached_parse_header_config.cache_info().maxsize,
+        },
+        "gmft_config": {
+            "hits": _cached_create_gmft_config.cache_info().hits,
+            "misses": _cached_create_gmft_config.cache_info().misses,
+            "size": _cached_create_gmft_config.cache_info().currsize,
+            "max_size": _cached_create_gmft_config.cache_info().maxsize,
+        },
+        "language_detection_config": {
+            "hits": _cached_create_language_detection_config.cache_info().hits,
+            "misses": _cached_create_language_detection_config.cache_info().misses,
+            "size": _cached_create_language_detection_config.cache_info().currsize,
+            "max_size": _cached_create_language_detection_config.cache_info().maxsize,
+        },
+        "spacy_config": {
+            "hits": _cached_create_spacy_config.cache_info().hits,
+            "misses": _cached_create_spacy_config.cache_info().misses,
+            "size": _cached_create_spacy_config.cache_info().currsize,
+            "max_size": _cached_create_spacy_config.cache_info().maxsize,
+        },
+    }
+
+
+def clear_all_caches() -> None:
+    """Clear all API configuration caches."""
+    _cached_discover_config.cache_clear()
+    _cached_create_ocr_config.cache_clear()
+    _cached_create_gmft_config.cache_clear()
+    _cached_create_language_detection_config.cache_clear()
+    _cached_create_spacy_config.cache_clear()
+    _cached_create_html_markdown_config.cache_clear()
+    _cached_parse_header_config.cache_clear()
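A minimal usage sketch of the new cache module as defined above. The project path and the tesseract "language" field are illustrative assumptions; the functions and their behavior come from the hunk itself:

    from kreuzberg._api._config_cache import (
        clear_all_caches,
        create_ocr_config_cached,
        discover_config_cached,
        get_cache_stats,
    )

    # The first call stats kreuzberg.toml and keys the LRU entry on
    # (path, mtime, size); a repeat call with an unchanged file is a cache
    # hit, while editing the file changes the key and forces rediscovery.
    config = discover_config_cached("/path/to/project")
    config = discover_config_cached("/path/to/project")

    # Config dicts are canonicalized via json.dumps(..., sort_keys=True),
    # so equal dicts with different key order share one cache entry.
    ocr = create_ocr_config_cached("tesseract", {"language": "eng"})

    print(get_cache_stats()["discover_config"])  # e.g. {'hits': 1, 'misses': 1, ...}
    clear_all_caches()  # resets every LRU cache in the module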
kreuzberg/_api/main.py CHANGED
@@ -3,8 +3,7 @@ from __future__ import annotations
 import base64
 import io
 import traceback
-from functools import lru_cache
-from json import dumps, loads
+from json import dumps
 from typing import TYPE_CHECKING, Annotated, Any, Literal
 
 import msgspec
@@ -16,19 +15,24 @@ from kreuzberg import (
     EasyOCRConfig,
     ExtractionConfig,
     ExtractionResult,
-    GMFTConfig,
     KreuzbergError,
-    LanguageDetectionConfig,
     MissingDependencyError,
     PaddleOCRConfig,
     ParsingError,
-    SpacyEntityExtractionConfig,
     TesseractConfig,
     ValidationError,
     batch_extract_bytes,
 )
+from kreuzberg._api._config_cache import (
+    create_gmft_config_cached,
+    create_html_markdown_config_cached,
+    create_language_detection_config_cached,
+    create_ocr_config_cached,
+    create_spacy_config_cached,
+    discover_config_cached,
+    parse_header_config_cached,
+)
 from kreuzberg._config import discover_config
-from kreuzberg._types import HTMLToMarkdownConfig
 
 if TYPE_CHECKING:
     from litestar.datastructures import UploadFile
@@ -146,68 +150,65 @@ def _create_ocr_config(
     return config_dict
 
 
-@lru_cache(maxsize=128)
-def _merge_configs_cached(
+def _create_dimension_tuple(width: int | None, height: int | None) -> tuple[int, int] | None:
+    """Create a dimension tuple from width and height values.
+
+    Args:
+        width: Width value or None
+        height: Height value or None
+
+    Returns:
+        Tuple of (width, height) if both values are not None, otherwise None
+    """
+    if width is not None and height is not None:
+        return (width, height)
+    return None
+
+
+def merge_configs(
     static_config: ExtractionConfig | None,
-    query_params: tuple[tuple[str, Any], ...],
-    header_config: tuple[tuple[str, Any], ...] | None,
+    query_params: dict[str, Any],
+    header_config: dict[str, Any] | None,
 ) -> ExtractionConfig:
     base_config = static_config or ExtractionConfig()
     config_dict = base_config.to_dict()
 
-    query_dict = dict(query_params) if query_params else {}
-    for key, value in query_dict.items():
+    for key, value in query_params.items():
         if value is not None and key in config_dict:
             config_dict[key] = _convert_value_type(config_dict[key], value)
 
     if header_config:
-        header_dict = dict(header_config)
-        for key, value in header_dict.items():
+        for key, value in header_config.items():
            if key in config_dict:
                config_dict[key] = value
 
     if "ocr_config" in config_dict and isinstance(config_dict["ocr_config"], dict):
         ocr_backend = config_dict.get("ocr_backend")
-        config_dict["ocr_config"] = _create_ocr_config(ocr_backend, config_dict["ocr_config"])
+        config_dict["ocr_config"] = create_ocr_config_cached(ocr_backend, config_dict["ocr_config"])
 
     if "gmft_config" in config_dict and isinstance(config_dict["gmft_config"], dict):
-        config_dict["gmft_config"] = GMFTConfig(**config_dict["gmft_config"])
+        config_dict["gmft_config"] = create_gmft_config_cached(config_dict["gmft_config"])
 
     if "language_detection_config" in config_dict and isinstance(config_dict["language_detection_config"], dict):
-        config_dict["language_detection_config"] = LanguageDetectionConfig(**config_dict["language_detection_config"])
+        config_dict["language_detection_config"] = create_language_detection_config_cached(
+            config_dict["language_detection_config"]
+        )
 
     if "spacy_entity_extraction_config" in config_dict and isinstance(
         config_dict["spacy_entity_extraction_config"], dict
     ):
-        config_dict["spacy_entity_extraction_config"] = SpacyEntityExtractionConfig(
-            **config_dict["spacy_entity_extraction_config"]
+        config_dict["spacy_entity_extraction_config"] = create_spacy_config_cached(
+            config_dict["spacy_entity_extraction_config"]
        )
 
     if "html_to_markdown_config" in config_dict and isinstance(config_dict["html_to_markdown_config"], dict):
-        config_dict["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown_config"])
+        config_dict["html_to_markdown_config"] = create_html_markdown_config_cached(
+            config_dict["html_to_markdown_config"]
+        )
 
     return ExtractionConfig(**config_dict)
 
 
-def _make_hashable(obj: Any) -> Any:
-    if isinstance(obj, dict):
-        return tuple(sorted((k, _make_hashable(v)) for k, v in obj.items()))
-    if isinstance(obj, list):
-        return tuple(_make_hashable(item) for item in obj)
-    return obj
-
-
-def merge_configs(
-    static_config: ExtractionConfig | None,
-    query_params: dict[str, Any],
-    header_config: dict[str, Any] | None,
-) -> ExtractionConfig:
-    query_tuple = tuple(sorted(query_params.items())) if query_params else ()
-    header_tuple = _make_hashable(header_config) if header_config else None
-
-    return _merge_configs_cached(static_config, query_tuple, header_tuple)
-
-
 @post("/extract", operation_id="ExtractFiles")
 async def handle_files_upload(  # noqa: PLR0913
     request: Request[Any, Any, Any],
@@ -223,6 +224,13 @@ async def handle_files_upload(  # noqa: PLR0913
     ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None = None,
     auto_detect_language: str | bool | None = None,
     pdf_password: str | None = None,
+    extract_images: str | bool | None = None,
+    ocr_extracted_images: str | bool | None = None,
+    image_ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None = None,
+    image_ocr_min_width: int | None = None,
+    image_ocr_min_height: int | None = None,
+    image_ocr_max_width: int | None = None,
+    image_ocr_max_height: int | None = None,
 ) -> list[ExtractionResult]:
     """Extract text, metadata, and structured data from uploaded documents.
 
@@ -250,11 +258,30 @@ async def handle_files_upload(  # noqa: PLR0913
         ocr_backend: OCR engine to use (tesseract, easyocr, paddleocr)
         auto_detect_language: Enable automatic language detection
         pdf_password: Password for encrypted PDF files
+        extract_images: Enable image extraction for supported formats
+        ocr_extracted_images: Run OCR over extracted images
+        image_ocr_backend: Optional backend override for image OCR
+        image_ocr_min_width: Minimum image width for OCR eligibility
+        image_ocr_min_height: Minimum image height for OCR eligibility
+        image_ocr_max_width: Maximum image width for OCR eligibility
+        image_ocr_max_height: Maximum image height for OCR eligibility
 
     Returns:
         List of extraction results, one per uploaded file
+
+    Additional query parameters:
+        extract_images: Enable image extraction for supported formats
+        ocr_extracted_images: Run OCR over extracted images
+        image_ocr_backend: Optional backend override for image OCR
+        image_ocr_min_width: Minimum image width for OCR eligibility
+        image_ocr_min_height: Minimum image height for OCR eligibility
+        image_ocr_max_width: Maximum image width for OCR eligibility
+        image_ocr_max_height: Maximum image height for OCR eligibility
     """
-    static_config = discover_config()
+    static_config = discover_config_cached()
+
+    min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
+    max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
 
     query_params = {
         "chunk_content": chunk_content,
@@ -268,12 +295,17 @@ async def handle_files_upload(  # noqa: PLR0913
         "ocr_backend": ocr_backend,
         "auto_detect_language": auto_detect_language,
         "pdf_password": pdf_password,
+        "extract_images": extract_images,
+        "ocr_extracted_images": ocr_extracted_images,
+        "image_ocr_backend": image_ocr_backend,
+        "image_ocr_min_dimensions": min_dims,
+        "image_ocr_max_dimensions": max_dims,
     }
 
     header_config = None
     if config_header := request.headers.get("X-Extraction-Config"):
         try:
-            header_config = loads(config_header)
+            header_config = parse_header_config_cached(config_header)
         except Exception as e:
             raise ValidationError(f"Invalid JSON in X-Extraction-Config header: {e}", context={"error": str(e)}) from e
 
@@ -316,12 +348,10 @@ async def get_configuration() -> ConfigurationResponse:
 
 
 def _polars_dataframe_encoder(obj: Any) -> Any:
-    """Convert polars DataFrame to dict for JSON serialization."""
     return obj.to_dicts()
 
 
 def _pil_image_encoder(obj: Any) -> str:
-    """Convert PIL Image to base64 string for JSON serialization."""
     buffer = io.BytesIO()
     obj.save(buffer, format="PNG")
     img_str = base64.b64encode(buffer.getvalue()).decode()
@@ -344,7 +374,6 @@ openapi_config = OpenAPIConfig(
     create_examples=True,
 )
 
-# Type encoders for custom serialization
 type_encoders = {
     pl.DataFrame: _polars_dataframe_encoder,
     Image.Image: _pil_image_encoder,
@@ -360,5 +389,5 @@ app = Litestar(
         Exception: general_exception_handler,
     },
     type_encoders=type_encoders,
-    request_max_body_size=1024 * 1024 * 1024,  # 1GB limit for large file uploads
+    request_max_body_size=1024 * 1024 * 1024,
 )
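A client-side sketch of the new image-OCR query parameters and the X-Extraction-Config header. The parameter and header names come from the handler above; httpx, the host/port, the multipart field name, and the extract_tables key are assumptions for illustration:

    import httpx

    with open("report.pdf", "rb") as f:
        response = httpx.post(
            "http://localhost:8000/extract",  # host/port depend on deployment
            params={
                "extract_images": "true",
                "ocr_extracted_images": "true",
                "image_ocr_backend": "tesseract",
                # width/height pairs are folded into image_ocr_min_dimensions /
                # image_ocr_max_dimensions tuples by _create_dimension_tuple
                "image_ocr_min_width": 64,
                "image_ocr_min_height": 64,
            },
            # header values override query parameters in merge_configs
            headers={"X-Extraction-Config": '{"extract_tables": true}'},
            files={"data": ("report.pdf", f, "application/pdf")},
        )
    results = response.json()  # one extraction result per uploaded file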
kreuzberg/_chunker.py CHANGED
@@ -20,14 +20,15 @@ def get_chunker(
     key = (max_characters, overlap_characters, mime_type)
     if key not in _chunkers:
         try:
-            if mime_type == MARKDOWN_MIME_TYPE:
-                from semantic_text_splitter import MarkdownSplitter  # noqa: PLC0415
+            match mime_type:
+                case x if x == MARKDOWN_MIME_TYPE:
+                    from semantic_text_splitter import MarkdownSplitter  # noqa: PLC0415
 
-                _chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
-            else:
-                from semantic_text_splitter import TextSplitter  # noqa: PLC0415
+                    _chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
+                case _:
+                    from semantic_text_splitter import TextSplitter  # noqa: PLC0415
 
-                _chunkers[key] = TextSplitter(max_characters, overlap_characters)
+                    _chunkers[key] = TextSplitter(max_characters, overlap_characters)
         except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
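The guard in `case x if x == MARKDOWN_MIME_TYPE:` is deliberate: a bare `case MARKDOWN_MIME_TYPE:` would be a capture pattern that rebinds the name and matches everything, rather than comparing against the constant. A self-contained sketch of the same shape:

    MARKDOWN_MIME_TYPE = "text/markdown"

    def splitter_kind(mime_type: str) -> str:
        match mime_type:
            # a bare `case MARKDOWN_MIME_TYPE:` would capture, not compare
            case x if x == MARKDOWN_MIME_TYPE:
                return "markdown"
            case _:
                return "text"

    assert splitter_kind("text/markdown") == "markdown"
    assert splitter_kind("text/plain") == "text"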
kreuzberg/_config.py CHANGED
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
     try:
         match ocr_backend:
             case "tesseract":
-                return TesseractConfig(**backend_args)
+                # Handle PSM mode conversion from int to enum
+                processed_args = backend_args.copy()
+                if "psm" in processed_args and isinstance(processed_args["psm"], int):
+                    try:
+                        processed_args["psm"] = PSMMode(processed_args["psm"])
+                    except ValueError as e:
+                        raise ValidationError(
+                            f"Invalid PSM mode value: {processed_args['psm']}",
+                            context={"psm_value": processed_args["psm"], "error": str(e)},
+                        ) from e
+                return TesseractConfig(**processed_args)
             case "easyocr":
                 return EasyOCRConfig(**backend_args)
             case "paddleocr":
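A sketch of the int-to-enum coercion the tesseract branch now performs, with a stand-in PSMMode (the real enum and its members live in kreuzberg's Tesseract types and are not shown in this diff):

    from enum import IntEnum

    class PSMMode(IntEnum):  # stand-in members, for illustration only
        AUTO = 3
        SINGLE_BLOCK = 6

    def coerce_psm(value: int) -> PSMMode:
        try:
            return PSMMode(value)  # valid ints resolve to enum members
        except ValueError as e:   # e.g. PSMMode(99)
            raise ValueError(f"Invalid PSM mode value: {value}") from e

    assert coerce_psm(3) is PSMMode.AUTO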
kreuzberg/_constants.py CHANGED
@@ -5,3 +5,5 @@ from typing import Final
 MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
 DEFAULT_MAX_CHARACTERS: Final[int] = 2000
 DEFAULT_MAX_OVERLAP: Final[int] = 100
+
+PDF_POINTS_PER_INCH: Final[float] = 72.0  # Standard PDF unit conversion
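How the new constant is consumed is not shown in this diff; as a reminder of the underlying arithmetic (1 PDF point = 1/72 inch), a hypothetical conversion helper:

    PDF_POINTS_PER_INCH = 72.0

    def points_to_pixels(points: float, dpi: int = 150) -> int:
        # pixels = inches * dpi, and inches = points / 72
        return round(points / PDF_POINTS_PER_INCH * dpi)

    assert points_to_pixels(612, dpi=150) == 1275  # US Letter width: 8.5 in at 150 dpi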
kreuzberg/_document_classification.py CHANGED
@@ -65,12 +65,10 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
         return None, None
 
     translated_text = _get_translated_text(result)
-    scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
-
-    for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
-        for pattern in patterns:
-            if re.search(pattern, translated_text):
-                scores[doc_type] += 1
+    scores = {
+        doc_type: sum(1 for pattern in patterns if re.search(pattern, translated_text))
+        for doc_type, patterns in DOCUMENT_CLASSIFIERS.items()
+    }
 
     total_score = sum(scores.values())
     if total_score == 0:
@@ -134,7 +132,7 @@ def classify_document_from_layout(
         if not found_words.is_empty():
             scores[doc_type] += 1.0
             word_top = found_words[0, "top"]
-            if word_top < page_height * 0.3:
+            if word_top is not None and word_top < page_height * 0.3:
                 scores[doc_type] += 0.5
 
     total_score = sum(scores.values())
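The rewritten scoring is a single dict comprehension counting regex hits per document type; a toy sketch with made-up patterns (the real DOCUMENT_CLASSIFIERS table and the translated, normalized text come from elsewhere in the module):

    import re

    DOCUMENT_CLASSIFIERS = {  # illustrative patterns only
        "invoice": [r"\binvoice\b", r"\btotal due\b"],
        "contract": [r"\bagreement\b", r"\bparty\b"],
    }

    translated_text = "invoice #42: total due on receipt"
    scores = {
        doc_type: sum(1 for pattern in patterns if re.search(pattern, translated_text))
        for doc_type, patterns in DOCUMENT_CLASSIFIERS.items()
    }
    assert scores == {"invoice": 2, "contract": 0}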
kreuzberg/_entity_extraction.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import os
 import re
 from functools import lru_cache
+from itertools import chain
 from typing import TYPE_CHECKING, Any
 
 from kreuzberg._types import Entity, SpacyEntityExtractionConfig
@@ -21,11 +22,15 @@ def extract_entities(
 ) -> list[Entity]:
     entities: list[Entity] = []
     if custom_patterns:
-        for ent_type, pattern in custom_patterns:
-            entities.extend(
-                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
-                for match in re.finditer(pattern, text)
+        entities.extend(
+            chain.from_iterable(
+                (
+                    Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
+                    for match in re.finditer(pattern, text)
+                )
+                for ent_type, pattern in custom_patterns
            )
+        )
 
     if spacy_config is None:
         spacy_config = SpacyEntityExtractionConfig()
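The custom-pattern branch now builds one generator of Entity objects per (type, pattern) pair and flattens them lazily into a single extend() call via chain.from_iterable. The same shape on plain tuples:

    import re
    from itertools import chain

    patterns = [("EMAIL", r"\S+@\S+\.\w+"), ("PHONE", r"\d{3}-\d{4}")]
    text = "Reach me at jo@example.com or 555-0199."

    # one generator of matches per (type, pattern) pair, flattened lazily
    matches = chain.from_iterable(
        ((ent_type, m.group()) for m in re.finditer(pattern, text))
        for ent_type, pattern in patterns
    )
    assert list(matches) == [("EMAIL", "jo@example.com"), ("PHONE", "555-0199")]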