kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37):
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +156 -30
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_constants.py +2 -0
  6. kreuzberg/_document_classification.py +4 -6
  7. kreuzberg/_entity_extraction.py +9 -4
  8. kreuzberg/_extractors/_base.py +269 -3
  9. kreuzberg/_extractors/_email.py +95 -27
  10. kreuzberg/_extractors/_html.py +85 -7
  11. kreuzberg/_extractors/_image.py +23 -22
  12. kreuzberg/_extractors/_pandoc.py +106 -75
  13. kreuzberg/_extractors/_pdf.py +209 -99
  14. kreuzberg/_extractors/_presentation.py +72 -8
  15. kreuzberg/_extractors/_spread_sheet.py +25 -30
  16. kreuzberg/_mcp/server.py +345 -25
  17. kreuzberg/_mime_types.py +42 -0
  18. kreuzberg/_ocr/_easyocr.py +2 -2
  19. kreuzberg/_ocr/_paddleocr.py +1 -1
  20. kreuzberg/_ocr/_tesseract.py +74 -34
  21. kreuzberg/_types.py +182 -23
  22. kreuzberg/_utils/_cache.py +10 -4
  23. kreuzberg/_utils/_device.py +2 -4
  24. kreuzberg/_utils/_image_preprocessing.py +12 -39
  25. kreuzberg/_utils/_process_pool.py +29 -8
  26. kreuzberg/_utils/_quality.py +7 -2
  27. kreuzberg/_utils/_resource_managers.py +65 -0
  28. kreuzberg/_utils/_sync.py +36 -6
  29. kreuzberg/_utils/_tmp.py +37 -1
  30. kreuzberg/cli.py +34 -20
  31. kreuzberg/extraction.py +43 -27
  32. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
  33. kreuzberg-3.15.0.dist-info/RECORD +60 -0
  34. kreuzberg-3.14.0.dist-info/RECORD +0 -58
  35. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
  37. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,13 +2,10 @@ from __future__ import annotations
2
2
 
3
3
  import contextlib
4
4
  import csv
5
- import os
6
5
  import sys
7
- import tempfile
8
6
  from datetime import date, datetime, time, timedelta
9
7
  from io import StringIO
10
- from pathlib import Path
11
- from typing import Any
8
+ from typing import TYPE_CHECKING, Any
12
9
 
13
10
  import polars as pl
14
11
  from anyio import Path as AsyncPath
@@ -21,9 +18,12 @@ from kreuzberg._types import ExtractionResult, Metadata, TableData
21
18
  from kreuzberg._utils._string import normalize_spaces
22
19
  from kreuzberg._utils._sync import run_sync, run_taskgroup
23
20
  from kreuzberg._utils._table import enhance_table_markdown
24
- from kreuzberg._utils._tmp import create_temp_file
21
+ from kreuzberg._utils._tmp import create_temp_file, temporary_file, temporary_file_sync
25
22
  from kreuzberg.exceptions import ParsingError
26
23
 
24
+ if TYPE_CHECKING:
25
+ from pathlib import Path
26
+
27
27
  if sys.version_info < (3, 11): # pragma: no cover
28
28
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
29
29
 
@@ -48,12 +48,8 @@ class SpreadSheetExtractor(Extractor):
48
48
 
49
49
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
50
50
  file_extension = self._get_file_extension()
51
- xlsx_path, unlink = await create_temp_file(file_extension)
52
- await AsyncPath(xlsx_path).write_bytes(content)
53
- try:
51
+ async with temporary_file(file_extension, content) as xlsx_path:
54
52
  return await self.extract_path_async(xlsx_path)
55
- finally:
56
- await unlink()
57
53
 
58
54
  async def extract_path_async(self, path: Path) -> ExtractionResult:
59
55
  try:
@@ -86,16 +82,8 @@ class SpreadSheetExtractor(Extractor):
86
82
 
87
83
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
88
84
  file_extension = self._get_file_extension()
89
- fd, temp_path = tempfile.mkstemp(suffix=file_extension)
90
-
91
- try:
92
- with os.fdopen(fd, "wb") as f:
93
- f.write(content)
94
-
95
- return self.extract_path_sync(Path(temp_path))
96
- finally:
97
- with contextlib.suppress(OSError):
98
- Path(temp_path).unlink()
85
+ with temporary_file_sync(file_extension, content) as temp_path:
86
+ return self.extract_path_sync(temp_path)
99
87
 
100
88
  def extract_path_sync(self, path: Path) -> ExtractionResult:
101
89
  try:
@@ -122,15 +110,17 @@ class SpreadSheetExtractor(Extractor):
122
110
 
123
111
  @staticmethod
124
112
  def _convert_cell_to_str(value: Any) -> str:
125
- if value is None:
126
- return ""
127
- if isinstance(value, bool):
128
- return str(value).lower()
129
- if isinstance(value, (datetime, date, time)):
130
- return value.isoformat()
131
- if isinstance(value, timedelta):
132
- return f"{value.total_seconds()} seconds"
133
- return str(value)
113
+ match value:
114
+ case None:
115
+ return ""
116
+ case bool():
117
+ return str(value).lower()
118
+ case datetime() | date() | time():
119
+ return value.isoformat()
120
+ case timedelta():
121
+ return f"{value.total_seconds()} seconds"
122
+ case _:
123
+ return str(value)
134
124
 
135
125
  async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
136
126
  values = workbook.get_sheet_by_name(sheet_name).to_python()
@@ -207,7 +197,12 @@ class SpreadSheetExtractor(Extractor):
207
197
  if not data or not any(row for row in data):
208
198
  return f"## {sheet_name}\n\n*Empty sheet*"
209
199
 
210
- df = pl.DataFrame(data)
200
+ # Normalize row lengths to avoid polars ShapeError
201
+ if data:
202
+ max_cols = max(len(row) if row else 0 for row in data)
203
+ data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data] # type: ignore[list-item]
204
+
205
+ df = pl.DataFrame(data, strict=False)
211
206
 
212
207
  df = df.filter(~pl.all_horizontal(pl.all().is_null()))
213
208
  df = df.select([col for col in df.columns if not df[col].is_null().all()])
kreuzberg/_mcp/server.py CHANGED
@@ -1,7 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import base64
4
+ import binascii
4
5
  import json
6
+ from pathlib import Path
5
7
  from typing import Any
6
8
 
7
9
  import msgspec
@@ -9,34 +11,178 @@ from mcp.server import FastMCP
9
11
  from mcp.types import TextContent
10
12
 
11
13
  from kreuzberg._config import discover_config
12
- from kreuzberg._types import ExtractionConfig, OcrBackendType
13
- from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
14
+ from kreuzberg._types import ExtractionConfig, OcrBackendType, PSMMode, TesseractConfig
15
+ from kreuzberg.exceptions import ValidationError
16
+ from kreuzberg.extraction import (
17
+ batch_extract_bytes_sync,
18
+ batch_extract_file_sync,
19
+ extract_bytes_sync,
20
+ extract_file_sync,
21
+ )
14
22
 
15
23
  mcp = FastMCP("Kreuzberg Text Extraction")
16
24
 
25
+ # Security and performance limits
26
+ MAX_BATCH_SIZE = 100
27
+
28
+
29
+ def _validate_file_path(file_path: str) -> Path:
30
+ """Validate file path to prevent path traversal attacks.
31
+
32
+ Args:
33
+ file_path: The file path to validate
34
+
35
+ Returns:
36
+ Path: The validated Path object
37
+
38
+ Raises:
39
+ ValidationError: If path traversal is detected or path is invalid
40
+ """
41
+ try:
42
+ path = Path(file_path).resolve()
43
+ except (OSError, ValueError) as e:
44
+ raise ValidationError(
45
+ f"Invalid file path: {file_path}",
46
+ context={"file_path": file_path, "error": str(e)},
47
+ ) from e
48
+
49
+ # Check for path traversal attempts
50
+ if ".." in file_path and not file_path.startswith("/"):
51
+ raise ValidationError(
52
+ "Path traversal detected in file path",
53
+ context={"file_path": file_path, "resolved_path": str(path)},
54
+ )
55
+
56
+ if not path.exists():
57
+ raise ValidationError(
58
+ f"File not found: {file_path}",
59
+ context={"file_path": file_path, "resolved_path": str(path)},
60
+ )
61
+
62
+ if not path.is_file():
63
+ raise ValidationError(
64
+ f"Path is not a file: {file_path}",
65
+ context={"file_path": file_path, "resolved_path": str(path)},
66
+ )
67
+
68
+ return path
69
+
70
+
71
+ def _validate_file_path_with_context(file_path: str, index: int, total: int) -> Path:
72
+ """Validate file path and add context for batch operations."""
73
+ try:
74
+ return _validate_file_path(file_path)
75
+ except ValidationError as e:
76
+ # Add context about which file in the batch failed
77
+ e.context = e.context or {}
78
+ e.context["batch_index"] = index
79
+ e.context["total_files"] = total
80
+ raise
81
+
82
+
83
+ def _validate_base64_content(content_base64: str, context_info: str | None = None) -> bytes:
84
+ """Validate and decode base64 content with proper error handling.
85
+
86
+ Args:
87
+ content_base64: The base64 string to validate and decode
88
+ context_info: Additional context information for error reporting
89
+
90
+ Returns:
91
+ bytes: The decoded content
92
+
93
+ Raises:
94
+ ValidationError: If the base64 content is invalid
95
+ """
96
+ if not content_base64:
97
+ raise ValidationError(
98
+ "Base64 content cannot be empty",
99
+ context={"context": context_info},
100
+ )
101
+
102
+ # Check for whitespace-only content
103
+ if not content_base64.strip():
104
+ raise ValidationError(
105
+ "Base64 content cannot be whitespace only",
106
+ context={"content_preview": content_base64[:50], "context": context_info},
107
+ )
108
+
109
+ try:
110
+ content_bytes = base64.b64decode(content_base64, validate=True)
111
+ except (ValueError, binascii.Error) as e:
112
+ error_type = type(e).__name__
113
+ raise ValidationError(
114
+ f"Invalid base64 content: {error_type}: {e}",
115
+ context={
116
+ "error_type": error_type,
117
+ "error": str(e),
118
+ "content_preview": content_base64[:50] + "..." if len(content_base64) > 50 else content_base64,
119
+ "context": context_info,
120
+ },
121
+ ) from e
122
+
123
+ return content_bytes
124
+
17
125
 
18
126
  def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
19
127
  base_config = discover_config()
20
128
 
129
+ # Extract Tesseract-specific parameters from kwargs first
130
+ tesseract_lang = kwargs.pop("tesseract_lang", None)
131
+ tesseract_psm = kwargs.pop("tesseract_psm", None)
132
+ tesseract_output_format = kwargs.pop("tesseract_output_format", None)
133
+ enable_table_detection = kwargs.pop("enable_table_detection", None)
134
+
21
135
  if base_config is None:
22
- return ExtractionConfig(**kwargs)
23
-
24
- config_dict: dict[str, Any] = {
25
- "force_ocr": base_config.force_ocr,
26
- "chunk_content": base_config.chunk_content,
27
- "extract_tables": base_config.extract_tables,
28
- "extract_entities": base_config.extract_entities,
29
- "extract_keywords": base_config.extract_keywords,
30
- "ocr_backend": base_config.ocr_backend,
31
- "max_chars": base_config.max_chars,
32
- "max_overlap": base_config.max_overlap,
33
- "keyword_count": base_config.keyword_count,
34
- "auto_detect_language": base_config.auto_detect_language,
35
- "ocr_config": base_config.ocr_config,
36
- "gmft_config": base_config.gmft_config,
37
- }
38
-
39
- config_dict = config_dict | kwargs
136
+ config_dict = kwargs
137
+ else:
138
+ config_dict = {
139
+ "force_ocr": base_config.force_ocr,
140
+ "chunk_content": base_config.chunk_content,
141
+ "extract_tables": base_config.extract_tables,
142
+ "extract_entities": base_config.extract_entities,
143
+ "extract_keywords": base_config.extract_keywords,
144
+ "ocr_backend": base_config.ocr_backend,
145
+ "max_chars": base_config.max_chars,
146
+ "max_overlap": base_config.max_overlap,
147
+ "keyword_count": base_config.keyword_count,
148
+ "auto_detect_language": base_config.auto_detect_language,
149
+ "ocr_config": base_config.ocr_config,
150
+ "gmft_config": base_config.gmft_config,
151
+ }
152
+ config_dict = config_dict | kwargs
153
+
154
+ # Handle Tesseract OCR configuration
155
+ ocr_backend = config_dict.get("ocr_backend")
156
+ if ocr_backend == "tesseract" and (
157
+ tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
158
+ ):
159
+ tesseract_config_dict = {}
160
+
161
+ if tesseract_lang:
162
+ tesseract_config_dict["language"] = tesseract_lang
163
+ if tesseract_psm is not None:
164
+ try:
165
+ tesseract_config_dict["psm"] = PSMMode(tesseract_psm)
166
+ except ValueError as e:
167
+ raise ValidationError(
168
+ f"Invalid PSM mode value: {tesseract_psm}",
169
+ context={"psm_value": tesseract_psm, "error": str(e)},
170
+ ) from e
171
+ if tesseract_output_format:
172
+ tesseract_config_dict["output_format"] = tesseract_output_format
173
+ if enable_table_detection:
174
+ tesseract_config_dict["enable_table_detection"] = True
175
+
176
+ if tesseract_config_dict:
177
+ # Merge with existing tesseract config if present
178
+ existing_ocr_config = config_dict.get("ocr_config")
179
+ if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
180
+ # Convert existing config to dict, merge, and recreate
181
+ existing_dict = existing_ocr_config.to_dict()
182
+ merged_dict = existing_dict | tesseract_config_dict
183
+ config_dict["ocr_config"] = TesseractConfig(**merged_dict)
184
+ else:
185
+ config_dict["ocr_config"] = TesseractConfig(**tesseract_config_dict)
40
186
 
41
187
  return ExtractionConfig(**config_dict)
42
188
 
@@ -55,7 +201,13 @@ def extract_document( # noqa: PLR0913
55
201
  max_overlap: int = 200,
56
202
  keyword_count: int = 10,
57
203
  auto_detect_language: bool = False,
204
+ tesseract_lang: str | None = None,
205
+ tesseract_psm: int | None = None,
206
+ tesseract_output_format: str | None = None,
207
+ enable_table_detection: bool | None = None,
58
208
  ) -> dict[str, Any]:
209
+ # Validate file path for security
210
+ validated_path = _validate_file_path(file_path)
59
211
  config = _create_config_with_overrides(
60
212
  force_ocr=force_ocr,
61
213
  chunk_content=chunk_content,
@@ -67,9 +219,13 @@ def extract_document( # noqa: PLR0913
67
219
  max_overlap=max_overlap,
68
220
  keyword_count=keyword_count,
69
221
  auto_detect_language=auto_detect_language,
222
+ tesseract_lang=tesseract_lang,
223
+ tesseract_psm=tesseract_psm,
224
+ tesseract_output_format=tesseract_output_format,
225
+ enable_table_detection=enable_table_detection,
70
226
  )
71
227
 
72
- result = extract_file_sync(file_path, mime_type, config)
228
+ result = extract_file_sync(str(validated_path), mime_type, config)
73
229
  return result.to_dict(include_none=True)
74
230
 
75
231
 
@@ -87,8 +243,12 @@ def extract_bytes( # noqa: PLR0913
87
243
  max_overlap: int = 200,
88
244
  keyword_count: int = 10,
89
245
  auto_detect_language: bool = False,
246
+ tesseract_lang: str | None = None,
247
+ tesseract_psm: int | None = None,
248
+ tesseract_output_format: str | None = None,
249
+ enable_table_detection: bool | None = None,
90
250
  ) -> dict[str, Any]:
91
- content_bytes = base64.b64decode(content_base64)
251
+ content_bytes = _validate_base64_content(content_base64, "extract_bytes")
92
252
 
93
253
  config = _create_config_with_overrides(
94
254
  force_ocr=force_ocr,
@@ -101,19 +261,175 @@ def extract_bytes( # noqa: PLR0913
101
261
  max_overlap=max_overlap,
102
262
  keyword_count=keyword_count,
103
263
  auto_detect_language=auto_detect_language,
264
+ tesseract_lang=tesseract_lang,
265
+ tesseract_psm=tesseract_psm,
266
+ tesseract_output_format=tesseract_output_format,
267
+ enable_table_detection=enable_table_detection,
104
268
  )
105
269
 
106
270
  result = extract_bytes_sync(content_bytes, mime_type, config)
107
271
  return result.to_dict(include_none=True)
108
272
 
109
273
 
274
+ @mcp.tool()
275
+ def batch_extract_document( # noqa: PLR0913
276
+ file_paths: list[str],
277
+ force_ocr: bool = False,
278
+ chunk_content: bool = False,
279
+ extract_tables: bool = False,
280
+ extract_entities: bool = False,
281
+ extract_keywords: bool = False,
282
+ ocr_backend: OcrBackendType = "tesseract",
283
+ max_chars: int = 1000,
284
+ max_overlap: int = 200,
285
+ keyword_count: int = 10,
286
+ auto_detect_language: bool = False,
287
+ tesseract_lang: str | None = None,
288
+ tesseract_psm: int | None = None,
289
+ tesseract_output_format: str | None = None,
290
+ enable_table_detection: bool | None = None,
291
+ ) -> list[dict[str, Any]]:
292
+ # Validate batch size
293
+ if len(file_paths) > MAX_BATCH_SIZE:
294
+ raise ValidationError(
295
+ f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
296
+ context={"batch_size": len(file_paths), "max_batch_size": MAX_BATCH_SIZE},
297
+ )
298
+
299
+ if not file_paths:
300
+ raise ValidationError(
301
+ "File paths list cannot be empty",
302
+ context={"file_paths": file_paths},
303
+ )
304
+
305
+ # Validate all file paths for security
306
+ validated_paths = []
307
+ for i, file_path in enumerate(file_paths):
308
+ validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
309
+ validated_paths.append(str(validated_path))
310
+ config = _create_config_with_overrides(
311
+ force_ocr=force_ocr,
312
+ chunk_content=chunk_content,
313
+ extract_tables=extract_tables,
314
+ extract_entities=extract_entities,
315
+ extract_keywords=extract_keywords,
316
+ ocr_backend=ocr_backend,
317
+ max_chars=max_chars,
318
+ max_overlap=max_overlap,
319
+ keyword_count=keyword_count,
320
+ auto_detect_language=auto_detect_language,
321
+ tesseract_lang=tesseract_lang,
322
+ tesseract_psm=tesseract_psm,
323
+ tesseract_output_format=tesseract_output_format,
324
+ enable_table_detection=enable_table_detection,
325
+ )
326
+
327
+ results = batch_extract_file_sync(validated_paths, config)
328
+ return [result.to_dict(include_none=True) for result in results]
329
+
330
+
331
+ @mcp.tool()
332
+ def batch_extract_bytes( # noqa: PLR0913
333
+ content_items: list[dict[str, str]],
334
+ force_ocr: bool = False,
335
+ chunk_content: bool = False,
336
+ extract_tables: bool = False,
337
+ extract_entities: bool = False,
338
+ extract_keywords: bool = False,
339
+ ocr_backend: OcrBackendType = "tesseract",
340
+ max_chars: int = 1000,
341
+ max_overlap: int = 200,
342
+ keyword_count: int = 10,
343
+ auto_detect_language: bool = False,
344
+ tesseract_lang: str | None = None,
345
+ tesseract_psm: int | None = None,
346
+ tesseract_output_format: str | None = None,
347
+ enable_table_detection: bool | None = None,
348
+ ) -> list[dict[str, Any]]:
349
+ # Validate input
350
+ if not content_items:
351
+ raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
352
+
353
+ if not isinstance(content_items, list):
354
+ raise ValidationError(
355
+ "content_items must be a list", context={"content_items_type": type(content_items).__name__}
356
+ )
357
+
358
+ # Validate batch size
359
+ if len(content_items) > MAX_BATCH_SIZE:
360
+ raise ValidationError(
361
+ f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
362
+ context={"batch_size": len(content_items), "max_batch_size": MAX_BATCH_SIZE},
363
+ )
364
+
365
+ config = _create_config_with_overrides(
366
+ force_ocr=force_ocr,
367
+ chunk_content=chunk_content,
368
+ extract_tables=extract_tables,
369
+ extract_entities=extract_entities,
370
+ extract_keywords=extract_keywords,
371
+ ocr_backend=ocr_backend,
372
+ max_chars=max_chars,
373
+ max_overlap=max_overlap,
374
+ keyword_count=keyword_count,
375
+ auto_detect_language=auto_detect_language,
376
+ tesseract_lang=tesseract_lang,
377
+ tesseract_psm=tesseract_psm,
378
+ tesseract_output_format=tesseract_output_format,
379
+ enable_table_detection=enable_table_detection,
380
+ )
381
+
382
+ # Convert list of dicts to list of tuples (bytes, mime_type)
383
+ contents = []
384
+ for i, item in enumerate(content_items):
385
+ # Validate item structure
386
+ if not isinstance(item, dict):
387
+ raise ValidationError(
388
+ f"Item at index {i} must be a dictionary",
389
+ context={"item_index": i, "item_type": type(item).__name__, "item": item},
390
+ )
391
+
392
+ # Check for required keys
393
+ if "content_base64" not in item:
394
+ raise ValidationError(
395
+ f"Item at index {i} is missing required key 'content_base64'",
396
+ context={"item_index": i, "item_keys": list(item.keys()), "item": item},
397
+ )
398
+
399
+ if "mime_type" not in item:
400
+ raise ValidationError(
401
+ f"Item at index {i} is missing required key 'mime_type'",
402
+ context={"item_index": i, "item_keys": list(item.keys()), "item": item},
403
+ )
404
+
405
+ content_base64 = item["content_base64"]
406
+ mime_type = item["mime_type"]
407
+
408
+ # Validate base64 content
409
+ try:
410
+ content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
411
+ except ValidationError as e:
412
+ # Add batch-specific context
413
+ e.context = e.context or {}
414
+ e.context["item_index"] = i
415
+ e.context["total_items"] = len(content_items)
416
+ raise
417
+
418
+ contents.append((content_bytes, mime_type))
419
+
420
+ results = batch_extract_bytes_sync(contents, config)
421
+ return [result.to_dict(include_none=True) for result in results]
422
+
423
+
110
424
  @mcp.tool()
111
425
  def extract_simple(
112
426
  file_path: str,
113
427
  mime_type: str | None = None,
114
428
  ) -> str:
429
+ # Validate file path for security
430
+ validated_path = _validate_file_path(file_path)
115
431
  config = _create_config_with_overrides()
116
- result = extract_file_sync(file_path, mime_type, config)
432
+ result = extract_file_sync(str(validated_path), mime_type, config)
117
433
  return result.content
118
434
 
119
435
 
@@ -151,7 +467,9 @@ def get_supported_formats() -> str:
151
467
 
152
468
  @mcp.prompt()
153
469
  def extract_and_summarize(file_path: str) -> list[TextContent]:
154
- result = extract_file_sync(file_path, None, _create_config_with_overrides())
470
+ # Validate file path for security
471
+ validated_path = _validate_file_path(file_path)
472
+ result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
155
473
 
156
474
  return [
157
475
  TextContent(
@@ -163,12 +481,14 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
163
481
 
164
482
  @mcp.prompt()
165
483
  def extract_structured(file_path: str) -> list[TextContent]:
484
+ # Validate file path for security
485
+ validated_path = _validate_file_path(file_path)
166
486
  config = _create_config_with_overrides(
167
487
  extract_entities=True,
168
488
  extract_keywords=True,
169
489
  extract_tables=True,
170
490
  )
171
- result = extract_file_sync(file_path, None, config)
491
+ result = extract_file_sync(str(validated_path), None, config)
172
492
 
173
493
  content = f"Document Content:\n{result.content}\n\n"
174
494
 
kreuzberg/_mime_types.py CHANGED
@@ -56,6 +56,48 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
56
56
  "image/x-tiff",
57
57
  }
58
58
 
59
+ IMAGE_FORMATS: Final[frozenset[str]] = frozenset(
60
+ {
61
+ "jpg",
62
+ "jpeg",
63
+ "png",
64
+ "gif",
65
+ "bmp",
66
+ "tiff",
67
+ "tif",
68
+ "webp",
69
+ "jp2",
70
+ "jpx",
71
+ "jpm",
72
+ "mj2",
73
+ "pnm",
74
+ "pbm",
75
+ "pgm",
76
+ "ppm",
77
+ }
78
+ )
79
+
80
+ IMAGE_MIME_TO_EXT: Final[dict[str, str]] = {
81
+ "image/bmp": "bmp",
82
+ "image/x-bmp": "bmp",
83
+ "image/x-ms-bmp": "bmp",
84
+ "image/gif": "gif",
85
+ "image/jpeg": "jpg",
86
+ "image/pjpeg": "jpg",
87
+ "image/png": "png",
88
+ "image/tiff": "tiff",
89
+ "image/x-tiff": "tiff",
90
+ "image/jp2": "jp2",
91
+ "image/jpx": "jpx",
92
+ "image/jpm": "jpm",
93
+ "image/mj2": "mj2",
94
+ "image/webp": "webp",
95
+ "image/x-portable-anymap": "pnm",
96
+ "image/x-portable-bitmap": "pbm",
97
+ "image/x-portable-graymap": "pgm",
98
+ "image/x-portable-pixmap": "ppm",
99
+ }
100
+
59
101
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
60
102
  "application/csl+json",
61
103
  "application/docbook+xml",
@@ -239,7 +239,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
239
239
  )
240
240
 
241
241
  return ExtractionResult(
242
- content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
242
+ content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
243
243
  )
244
244
 
245
245
  # Group text boxes by lines based on Y coordinate # ~keep
@@ -287,7 +287,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
287
287
  )
288
288
 
289
289
  return ExtractionResult(
290
- content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
290
+ content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
291
291
  )
292
292
 
293
293
  @classmethod
@@ -192,7 +192,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
192
192
  )
193
193
 
194
194
  return ExtractionResult(
195
- content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
195
+ content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata
196
196
  )
197
197
 
198
198
  @classmethod