kreuzberg 3.14.1__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +127 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_constants.py +2 -0
  6. kreuzberg/_document_classification.py +4 -6
  7. kreuzberg/_entity_extraction.py +9 -4
  8. kreuzberg/_extractors/_base.py +269 -3
  9. kreuzberg/_extractors/_email.py +95 -27
  10. kreuzberg/_extractors/_html.py +85 -7
  11. kreuzberg/_extractors/_image.py +23 -22
  12. kreuzberg/_extractors/_pandoc.py +106 -75
  13. kreuzberg/_extractors/_pdf.py +209 -99
  14. kreuzberg/_extractors/_presentation.py +72 -8
  15. kreuzberg/_extractors/_spread_sheet.py +25 -30
  16. kreuzberg/_mcp/server.py +345 -25
  17. kreuzberg/_mime_types.py +42 -0
  18. kreuzberg/_ocr/_easyocr.py +2 -2
  19. kreuzberg/_ocr/_paddleocr.py +1 -1
  20. kreuzberg/_ocr/_tesseract.py +74 -34
  21. kreuzberg/_types.py +180 -21
  22. kreuzberg/_utils/_cache.py +10 -4
  23. kreuzberg/_utils/_device.py +2 -4
  24. kreuzberg/_utils/_image_preprocessing.py +12 -39
  25. kreuzberg/_utils/_process_pool.py +29 -8
  26. kreuzberg/_utils/_quality.py +7 -2
  27. kreuzberg/_utils/_resource_managers.py +65 -0
  28. kreuzberg/_utils/_sync.py +36 -6
  29. kreuzberg/_utils/_tmp.py +37 -1
  30. kreuzberg/cli.py +34 -20
  31. kreuzberg/extraction.py +43 -27
  32. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
  33. kreuzberg-3.15.0.dist-info/RECORD +60 -0
  34. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  35. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
- import contextlib
4
- import os
3
+ import logging
5
4
  import re
6
5
  import subprocess
7
6
  import sys
8
- import tempfile
9
- from json import JSONDecodeError, loads
7
+ from itertools import chain
8
+ from json import loads
10
9
  from pathlib import Path
11
10
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
12
11
 
@@ -16,10 +15,10 @@ from anyio import run_process
16
15
  from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
17
16
  from kreuzberg._extractors._base import Extractor
18
17
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
19
- from kreuzberg._types import ExtractionResult, Metadata
18
+ from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult, Metadata
20
19
  from kreuzberg._utils._string import normalize_spaces
21
- from kreuzberg._utils._sync import run_taskgroup
22
- from kreuzberg._utils._tmp import create_temp_file
20
+ from kreuzberg._utils._sync import run_maybe_async, run_taskgroup
21
+ from kreuzberg._utils._tmp import temporary_directory, temporary_file, temporary_file_sync
23
22
  from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
24
23
 
25
24
  if TYPE_CHECKING: # pragma: no cover
@@ -152,13 +151,8 @@ class PandocExtractor(Extractor):
152
151
 
153
152
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
154
153
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
155
- input_file, unlink = await create_temp_file(f".{extension}")
156
-
157
- try:
158
- await AsyncPath(input_file).write_bytes(content)
154
+ async with temporary_file(f".{extension}", content) as input_file:
159
155
  return await self.extract_path_async(input_file)
160
- finally:
161
- await unlink()
162
156
 
163
157
  async def extract_path_async(self, path: Path) -> ExtractionResult:
164
158
  await self._validate_pandoc_version()
@@ -170,24 +164,25 @@ class PandocExtractor(Extractor):
170
164
  results = await run_taskgroup(metadata_task, content_task)
171
165
  metadata, content = cast("tuple[Metadata, str]", results)
172
166
 
173
- return ExtractionResult(
174
- content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE, chunks=[]
167
+ result = ExtractionResult(
168
+ content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE
175
169
  )
170
+
171
+ if self.config.extract_images:
172
+ images = await self._extract_images_with_pandoc(str(path))
173
+ result.images = images
174
+ if self.config.ocr_extracted_images and result.images:
175
+ image_ocr_results = await self._process_images_with_ocr(result.images)
176
+ result.image_ocr_results = image_ocr_results
177
+
178
+ return result
176
179
  except ExceptionGroup as eg:
177
180
  raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
178
181
 
179
182
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
180
183
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
181
- fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
182
-
183
- try:
184
- with os.fdopen(fd, "wb") as f:
185
- f.write(content)
186
-
187
- return self.extract_path_sync(Path(temp_path))
188
- finally:
189
- with contextlib.suppress(OSError):
190
- Path(temp_path).unlink()
184
+ with temporary_file_sync(f".{extension}", content) as temp_path:
185
+ return self.extract_path_sync(temp_path)
191
186
 
192
187
  def extract_path_sync(self, path: Path) -> ExtractionResult:
193
188
  self._validate_pandoc_version_sync()
@@ -197,9 +192,20 @@ class PandocExtractor(Extractor):
197
192
  metadata = self._extract_metadata_sync(path)
198
193
  content = self._extract_file_sync(path)
199
194
 
200
- return ExtractionResult(
201
- content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE, chunks=[]
195
+ result = ExtractionResult(
196
+ content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE
202
197
  )
198
+
199
+ if self.config.extract_images:
200
+ images: list[ExtractedImage] = run_maybe_async(self._extract_images_with_pandoc, str(path))
201
+ result.images = images
202
+ if self.config.ocr_extracted_images and result.images:
203
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(
204
+ self._process_images_with_ocr, result.images
205
+ )
206
+ result.image_ocr_results = image_ocr_results
207
+
208
+ return result
203
209
  except Exception as e:
204
210
  raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
205
211
 
@@ -286,8 +292,7 @@ class PandocExtractor(Extractor):
286
292
 
287
293
  async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
288
294
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
289
- metadata_file, unlink = await create_temp_file(".json")
290
- try:
295
+ async with temporary_file(".json") as metadata_file:
291
296
  command = [
292
297
  "pandoc",
293
298
  str(input_file),
@@ -308,15 +313,10 @@ class PandocExtractor(Extractor):
308
313
 
309
314
  json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
310
315
  return self._extract_metadata(json_data)
311
- except (RuntimeError, OSError, JSONDecodeError) as e:
312
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
313
- finally:
314
- await unlink()
315
316
 
316
317
  async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
317
318
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
318
- output_path, unlink = await create_temp_file(".md")
319
- try:
319
+ async with temporary_file(".md") as output_path:
320
320
  command = [
321
321
  "pandoc",
322
322
  str(input_file),
@@ -339,10 +339,6 @@ class PandocExtractor(Extractor):
339
339
  text = await AsyncPath(output_path).read_text("utf-8")
340
340
 
341
341
  return normalize_spaces(text)
342
- except (RuntimeError, OSError) as e:
343
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
344
- finally:
345
- await unlink()
346
342
 
347
343
  def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
348
344
  meta: Metadata = {}
@@ -372,15 +368,17 @@ class PandocExtractor(Extractor):
372
368
 
373
369
  extracted = self._extract_meta_value(value)
374
370
  if extracted:
375
- if pandoc_key in ("languages", "authors"):
376
- extracted = [extracted] # type: ignore[list-item]
371
+ if pandoc_key in ("languages", "authors") and not isinstance(extracted, list):
372
+ extracted = [extracted]
377
373
  meta[pandoc_key] = extracted # type: ignore[literal-required]
378
374
 
379
375
  citations_from_blocks = [
380
376
  cite["citationId"]
381
- for block in raw_meta.get("blocks", [])
382
- if block.get(TYPE_FIELD) == "Cite"
383
- for cite in block.get(CONTENT_FIELD, [[{}]])[0]
377
+ for cite in chain.from_iterable(
378
+ block.get(CONTENT_FIELD, [[{}]])[0]
379
+ for block in raw_meta.get("blocks", [])
380
+ if block.get(TYPE_FIELD) == "Cite"
381
+ )
384
382
  if isinstance(cite, dict)
385
383
  ]
386
384
  if citations_from_blocks and "citations" not in meta:
@@ -391,14 +389,15 @@ class PandocExtractor(Extractor):
391
389
  return meta
392
390
 
393
391
  def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
394
- if node_type := node.get(type_field):
395
- if node_type == "Str":
392
+ match node.get(type_field):
393
+ case "Str":
396
394
  return node.get(content_field)
397
- if node_type == "Space":
395
+ case "Space":
398
396
  return " "
399
- if node_type in ("Emph", "Strong"):
397
+ case "Emph" | "Strong":
400
398
  return self._extract_inlines(node.get(content_field, []))
401
- return None
399
+ case _:
400
+ return None
402
401
 
403
402
  def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
404
403
  texts = [text for node in nodes if (text := self._extract_inline_text(node))]
@@ -431,13 +430,8 @@ class PandocExtractor(Extractor):
431
430
  return self._extract_inlines(content)
432
431
 
433
432
  if node_type == "MetaList":
434
- results = []
435
- for value in [value for item in content if (value := self._extract_meta_value(item))]:
436
- if isinstance(value, list):
437
- results.extend(value)
438
- else:
439
- results.append(value)
440
- return results
433
+ values = [value for item in content if (value := self._extract_meta_value(item))]
434
+ return list(chain.from_iterable(value if isinstance(value, list) else [value] for value in values))
441
435
 
442
436
  if node_type == "MetaBlocks" and (
443
437
  blocks := [block for block in content if block.get(type_field) == "Para"]
@@ -505,10 +499,8 @@ class PandocExtractor(Extractor):
505
499
 
506
500
  def _extract_metadata_sync(self, path: Path) -> Metadata:
507
501
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
508
- fd, metadata_file = tempfile.mkstemp(suffix=".json")
509
- os.close(fd)
510
502
 
511
- try:
503
+ with temporary_file_sync(".json") as metadata_file:
512
504
  command = [
513
505
  "pandoc",
514
506
  str(path),
@@ -525,23 +517,15 @@ class PandocExtractor(Extractor):
525
517
  if result.returncode != 0:
526
518
  raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
527
519
 
528
- with Path(metadata_file).open(encoding="utf-8") as f:
520
+ with metadata_file.open(encoding="utf-8") as f:
529
521
  json_data = loads(f.read())
530
522
 
531
523
  return self._extract_metadata(json_data)
532
524
 
533
- except (OSError, JSONDecodeError) as e:
534
- raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
535
- finally:
536
- with contextlib.suppress(OSError):
537
- Path(metadata_file).unlink()
538
-
539
525
  def _extract_file_sync(self, path: Path) -> str:
540
526
  pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
541
- fd, output_path = tempfile.mkstemp(suffix=".md")
542
- os.close(fd)
543
527
 
544
- try:
528
+ with temporary_file_sync(".md") as output_path:
545
529
  command = [
546
530
  "pandoc",
547
531
  str(path),
@@ -559,16 +543,63 @@ class PandocExtractor(Extractor):
559
543
  if result.returncode != 0:
560
544
  raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
561
545
 
562
- with Path(output_path).open(encoding="utf-8") as f:
546
+ with output_path.open(encoding="utf-8") as f:
563
547
  text = f.read()
564
548
 
565
549
  return normalize_spaces(text)
566
550
 
567
- except OSError as e:
568
- raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
569
- finally:
570
- with contextlib.suppress(OSError):
571
- Path(output_path).unlink()
551
+ async def _extract_images_with_pandoc(self, file_path: str) -> list[ExtractedImage]:
552
+ images = []
553
+
554
+ with temporary_directory() as temp_dir:
555
+ media_dir = Path(temp_dir) / "media"
556
+ media_dir.mkdir()
557
+
558
+ try:
559
+ cmd = [
560
+ "pandoc",
561
+ str(file_path),
562
+ "--extract-media",
563
+ str(media_dir),
564
+ "-t",
565
+ "markdown",
566
+ "-o",
567
+ "/dev/null",
568
+ ]
569
+
570
+ await run_process(cmd)
571
+
572
+ if media_dir.exists():
573
+ for img_path in media_dir.rglob("*"):
574
+ if img_path.is_file() and img_path.suffix.lower() in {
575
+ ".jpg",
576
+ ".jpeg",
577
+ ".png",
578
+ ".gif",
579
+ ".bmp",
580
+ ".tiff",
581
+ ".webp",
582
+ }:
583
+ try:
584
+ image_data = await AsyncPath(img_path).read_bytes()
585
+
586
+ images.append(
587
+ ExtractedImage(
588
+ data=image_data,
589
+ format=img_path.suffix[1:].lower(),
590
+ filename=img_path.name,
591
+ page_number=None,
592
+ )
593
+ )
594
+ except Exception as e: # noqa: BLE001
595
+ logging.getLogger(__name__).warning(
596
+ "Failed to read extracted image %s: %s", img_path, e
597
+ )
598
+
599
+ except Exception as e: # noqa: BLE001
600
+ logging.getLogger(__name__).warning("Pandoc image extraction failed: %s", e)
601
+
602
+ return images
572
603
 
573
604
 
574
605
  class MarkdownExtractor(PandocExtractor):