kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +156 -30
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +4 -6
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +95 -27
- kreuzberg/_extractors/_html.py +85 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +209 -99
- kreuzberg/_extractors/_presentation.py +72 -8
- kreuzberg/_extractors/_spread_sheet.py +25 -30
- kreuzberg/_mcp/server.py +345 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +2 -2
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +74 -34
- kreuzberg/_types.py +182 -23
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_sync.py +36 -6
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +43 -27
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
- kreuzberg-3.15.0.dist-info/RECORD +60 -0
- kreuzberg-3.14.0.dist-info/RECORD +0 -58
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import contextlib
|
4
|
-
import os
|
3
|
+
import logging
|
5
4
|
import re
|
6
5
|
import subprocess
|
7
6
|
import sys
|
8
|
-
import tempfile
|
9
|
-
from json import JSONDecodeError, loads
|
7
|
+
from itertools import chain
|
8
|
+
from json import loads
|
10
9
|
from pathlib import Path
|
11
10
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
|
12
11
|
|
@@ -16,10 +15,10 @@ from anyio import run_process
|
|
16
15
|
from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
|
17
16
|
from kreuzberg._extractors._base import Extractor
|
18
17
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
19
|
-
from kreuzberg._types import ExtractionResult, Metadata
|
18
|
+
from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult, Metadata
|
20
19
|
from kreuzberg._utils._string import normalize_spaces
|
21
|
-
from kreuzberg._utils._sync import run_taskgroup
|
22
|
-
from kreuzberg._utils._tmp import
|
20
|
+
from kreuzberg._utils._sync import run_maybe_async, run_taskgroup
|
21
|
+
from kreuzberg._utils._tmp import temporary_directory, temporary_file, temporary_file_sync
|
23
22
|
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
24
23
|
|
25
24
|
if TYPE_CHECKING: # pragma: no cover
|
@@ -152,13 +151,8 @@ class PandocExtractor(Extractor):
|
|
152
151
|
|
153
152
|
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
154
153
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
155
|
-
|
156
|
-
|
157
|
-
try:
|
158
|
-
await AsyncPath(input_file).write_bytes(content)
|
154
|
+
async with temporary_file(f".{extension}", content) as input_file:
|
159
155
|
return await self.extract_path_async(input_file)
|
160
|
-
finally:
|
161
|
-
await unlink()
|
162
156
|
|
163
157
|
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
164
158
|
await self._validate_pandoc_version()
|
@@ -170,24 +164,25 @@ class PandocExtractor(Extractor):
|
|
170
164
|
results = await run_taskgroup(metadata_task, content_task)
|
171
165
|
metadata, content = cast("tuple[Metadata, str]", results)
|
172
166
|
|
173
|
-
|
174
|
-
content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE
|
167
|
+
result = ExtractionResult(
|
168
|
+
content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE
|
175
169
|
)
|
170
|
+
|
171
|
+
if self.config.extract_images:
|
172
|
+
images = await self._extract_images_with_pandoc(str(path))
|
173
|
+
result.images = images
|
174
|
+
if self.config.ocr_extracted_images and result.images:
|
175
|
+
image_ocr_results = await self._process_images_with_ocr(result.images)
|
176
|
+
result.image_ocr_results = image_ocr_results
|
177
|
+
|
178
|
+
return result
|
176
179
|
except ExceptionGroup as eg:
|
177
180
|
raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
|
178
181
|
|
179
182
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
180
183
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
181
|
-
|
182
|
-
|
183
|
-
try:
|
184
|
-
with os.fdopen(fd, "wb") as f:
|
185
|
-
f.write(content)
|
186
|
-
|
187
|
-
return self.extract_path_sync(Path(temp_path))
|
188
|
-
finally:
|
189
|
-
with contextlib.suppress(OSError):
|
190
|
-
Path(temp_path).unlink()
|
184
|
+
with temporary_file_sync(f".{extension}", content) as temp_path:
|
185
|
+
return self.extract_path_sync(temp_path)
|
191
186
|
|
192
187
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
193
188
|
self._validate_pandoc_version_sync()
|
@@ -197,9 +192,20 @@ class PandocExtractor(Extractor):
|
|
197
192
|
metadata = self._extract_metadata_sync(path)
|
198
193
|
content = self._extract_file_sync(path)
|
199
194
|
|
200
|
-
|
201
|
-
content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE
|
195
|
+
result = ExtractionResult(
|
196
|
+
content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE
|
202
197
|
)
|
198
|
+
|
199
|
+
if self.config.extract_images:
|
200
|
+
images: list[ExtractedImage] = run_maybe_async(self._extract_images_with_pandoc, str(path))
|
201
|
+
result.images = images
|
202
|
+
if self.config.ocr_extracted_images and result.images:
|
203
|
+
image_ocr_results: list[ImageOCRResult] = run_maybe_async(
|
204
|
+
self._process_images_with_ocr, result.images
|
205
|
+
)
|
206
|
+
result.image_ocr_results = image_ocr_results
|
207
|
+
|
208
|
+
return result
|
203
209
|
except Exception as e:
|
204
210
|
raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
|
205
211
|
|
@@ -286,8 +292,7 @@ class PandocExtractor(Extractor):
|
|
286
292
|
|
287
293
|
async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
|
288
294
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
289
|
-
|
290
|
-
try:
|
295
|
+
async with temporary_file(".json") as metadata_file:
|
291
296
|
command = [
|
292
297
|
"pandoc",
|
293
298
|
str(input_file),
|
@@ -308,15 +313,10 @@ class PandocExtractor(Extractor):
|
|
308
313
|
|
309
314
|
json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
|
310
315
|
return self._extract_metadata(json_data)
|
311
|
-
except (RuntimeError, OSError, JSONDecodeError) as e:
|
312
|
-
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
313
|
-
finally:
|
314
|
-
await unlink()
|
315
316
|
|
316
317
|
async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
|
317
318
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
318
|
-
|
319
|
-
try:
|
319
|
+
async with temporary_file(".md") as output_path:
|
320
320
|
command = [
|
321
321
|
"pandoc",
|
322
322
|
str(input_file),
|
@@ -339,10 +339,6 @@ class PandocExtractor(Extractor):
|
|
339
339
|
text = await AsyncPath(output_path).read_text("utf-8")
|
340
340
|
|
341
341
|
return normalize_spaces(text)
|
342
|
-
except (RuntimeError, OSError) as e:
|
343
|
-
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
344
|
-
finally:
|
345
|
-
await unlink()
|
346
342
|
|
347
343
|
def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
|
348
344
|
meta: Metadata = {}
|
@@ -372,15 +368,17 @@ class PandocExtractor(Extractor):
|
|
372
368
|
|
373
369
|
extracted = self._extract_meta_value(value)
|
374
370
|
if extracted:
|
375
|
-
if pandoc_key in ("languages", "authors"):
|
376
|
-
extracted = [extracted]
|
371
|
+
if pandoc_key in ("languages", "authors") and not isinstance(extracted, list):
|
372
|
+
extracted = [extracted]
|
377
373
|
meta[pandoc_key] = extracted # type: ignore[literal-required]
|
378
374
|
|
379
375
|
citations_from_blocks = [
|
380
376
|
cite["citationId"]
|
381
|
-
for
|
382
|
-
|
383
|
-
|
377
|
+
for cite in chain.from_iterable(
|
378
|
+
block.get(CONTENT_FIELD, [[{}]])[0]
|
379
|
+
for block in raw_meta.get("blocks", [])
|
380
|
+
if block.get(TYPE_FIELD) == "Cite"
|
381
|
+
)
|
384
382
|
if isinstance(cite, dict)
|
385
383
|
]
|
386
384
|
if citations_from_blocks and "citations" not in meta:
|
@@ -391,14 +389,15 @@ class PandocExtractor(Extractor):
|
|
391
389
|
return meta
|
392
390
|
|
393
391
|
def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
|
394
|
-
|
395
|
-
|
392
|
+
match node.get(type_field):
|
393
|
+
case "Str":
|
396
394
|
return node.get(content_field)
|
397
|
-
|
395
|
+
case "Space":
|
398
396
|
return " "
|
399
|
-
|
397
|
+
case "Emph" | "Strong":
|
400
398
|
return self._extract_inlines(node.get(content_field, []))
|
401
|
-
|
399
|
+
case _:
|
400
|
+
return None
|
402
401
|
|
403
402
|
def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
|
404
403
|
texts = [text for node in nodes if (text := self._extract_inline_text(node))]
|
@@ -431,13 +430,8 @@ class PandocExtractor(Extractor):
|
|
431
430
|
return self._extract_inlines(content)
|
432
431
|
|
433
432
|
if node_type == "MetaList":
|
434
|
-
|
435
|
-
|
436
|
-
if isinstance(value, list):
|
437
|
-
results.extend(value)
|
438
|
-
else:
|
439
|
-
results.append(value)
|
440
|
-
return results
|
433
|
+
values = [value for item in content if (value := self._extract_meta_value(item))]
|
434
|
+
return list(chain.from_iterable(value if isinstance(value, list) else [value] for value in values))
|
441
435
|
|
442
436
|
if node_type == "MetaBlocks" and (
|
443
437
|
blocks := [block for block in content if block.get(type_field) == "Para"]
|
@@ -505,10 +499,8 @@ class PandocExtractor(Extractor):
|
|
505
499
|
|
506
500
|
def _extract_metadata_sync(self, path: Path) -> Metadata:
|
507
501
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
508
|
-
fd, metadata_file = tempfile.mkstemp(suffix=".json")
|
509
|
-
os.close(fd)
|
510
502
|
|
511
|
-
|
503
|
+
with temporary_file_sync(".json") as metadata_file:
|
512
504
|
command = [
|
513
505
|
"pandoc",
|
514
506
|
str(path),
|
@@ -525,23 +517,15 @@ class PandocExtractor(Extractor):
|
|
525
517
|
if result.returncode != 0:
|
526
518
|
raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
|
527
519
|
|
528
|
-
with
|
520
|
+
with metadata_file.open(encoding="utf-8") as f:
|
529
521
|
json_data = loads(f.read())
|
530
522
|
|
531
523
|
return self._extract_metadata(json_data)
|
532
524
|
|
533
|
-
except (OSError, JSONDecodeError) as e:
|
534
|
-
raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
|
535
|
-
finally:
|
536
|
-
with contextlib.suppress(OSError):
|
537
|
-
Path(metadata_file).unlink()
|
538
|
-
|
539
525
|
def _extract_file_sync(self, path: Path) -> str:
|
540
526
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
541
|
-
fd, output_path = tempfile.mkstemp(suffix=".md")
|
542
|
-
os.close(fd)
|
543
527
|
|
544
|
-
|
528
|
+
with temporary_file_sync(".md") as output_path:
|
545
529
|
command = [
|
546
530
|
"pandoc",
|
547
531
|
str(path),
|
@@ -559,16 +543,63 @@ class PandocExtractor(Extractor):
|
|
559
543
|
if result.returncode != 0:
|
560
544
|
raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
|
561
545
|
|
562
|
-
with
|
546
|
+
with output_path.open(encoding="utf-8") as f:
|
563
547
|
text = f.read()
|
564
548
|
|
565
549
|
return normalize_spaces(text)
|
566
550
|
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
551
|
+
async def _extract_images_with_pandoc(self, file_path: str) -> list[ExtractedImage]:
|
552
|
+
images = []
|
553
|
+
|
554
|
+
with temporary_directory() as temp_dir:
|
555
|
+
media_dir = Path(temp_dir) / "media"
|
556
|
+
media_dir.mkdir()
|
557
|
+
|
558
|
+
try:
|
559
|
+
cmd = [
|
560
|
+
"pandoc",
|
561
|
+
str(file_path),
|
562
|
+
"--extract-media",
|
563
|
+
str(media_dir),
|
564
|
+
"-t",
|
565
|
+
"markdown",
|
566
|
+
"-o",
|
567
|
+
"/dev/null",
|
568
|
+
]
|
569
|
+
|
570
|
+
await run_process(cmd)
|
571
|
+
|
572
|
+
if media_dir.exists():
|
573
|
+
for img_path in media_dir.rglob("*"):
|
574
|
+
if img_path.is_file() and img_path.suffix.lower() in {
|
575
|
+
".jpg",
|
576
|
+
".jpeg",
|
577
|
+
".png",
|
578
|
+
".gif",
|
579
|
+
".bmp",
|
580
|
+
".tiff",
|
581
|
+
".webp",
|
582
|
+
}:
|
583
|
+
try:
|
584
|
+
image_data = await AsyncPath(img_path).read_bytes()
|
585
|
+
|
586
|
+
images.append(
|
587
|
+
ExtractedImage(
|
588
|
+
data=image_data,
|
589
|
+
format=img_path.suffix[1:].lower(),
|
590
|
+
filename=img_path.name,
|
591
|
+
page_number=None,
|
592
|
+
)
|
593
|
+
)
|
594
|
+
except Exception as e: # noqa: BLE001
|
595
|
+
logging.getLogger(__name__).warning(
|
596
|
+
"Failed to read extracted image %s: %s", img_path, e
|
597
|
+
)
|
598
|
+
|
599
|
+
except Exception as e: # noqa: BLE001
|
600
|
+
logging.getLogger(__name__).warning("Pandoc image extraction failed: %s", e)
|
601
|
+
|
602
|
+
return images
|
572
603
|
|
573
604
|
|
574
605
|
class MarkdownExtractor(PandocExtractor):
|