kreuzberg 3.0.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,613 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import sys
5
+ from json import JSONDecodeError, loads
6
+ from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
7
+
8
+ import anyio
9
+ from anyio import Path as AsyncPath
10
+ from anyio import run_process
11
+
12
+ from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
13
+ from kreuzberg._extractors._base import Extractor
14
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
15
+ from kreuzberg._types import ExtractionResult, Metadata
16
+ from kreuzberg._utils._string import normalize_spaces
17
+ from kreuzberg._utils._sync import run_taskgroup
18
+ from kreuzberg._utils._tmp import create_temp_file
19
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
20
+
21
+ if TYPE_CHECKING: # pragma: no cover
22
+ from collections.abc import Mapping
23
+ from os import PathLike
24
+ from pathlib import Path
25
+
26
+ if sys.version_info < (3, 11): # pragma: no cover
27
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
28
+
29
+
30
# Block-level node type tags.
BLOCK_HEADER: Final = "Header"
BLOCK_PARA: Final = "Para"
BLOCK_CODE: Final = "CodeBlock"
BLOCK_QUOTE: Final = "BlockQuote"
BLOCK_LIST: Final = "BulletList"
BLOCK_ORDERED: Final = "OrderedList"

# Inline node type tags.
INLINE_STR: Final = "Str"
INLINE_SPACE: Final = "Space"
INLINE_EMPH: Final = "Emph"
INLINE_STRONG: Final = "Strong"
INLINE_LINK: Final = "Link"
INLINE_IMAGE: Final = "Image"
INLINE_CODE: Final = "Code"
INLINE_MATH: Final = "Math"

# Metadata value type tags.
META_MAP: Final = "MetaMap"
META_LIST: Final = "MetaList"
META_INLINES: Final = "MetaInlines"
META_STRING: Final = "MetaString"
META_BLOCKS: Final = "MetaBlocks"

# Keys under which a JSON node stores its content and its type tag.
CONTENT_FIELD: Final = "c"
TYPE_FIELD: Final = "t"

# Every type tag named by the constants above, in the same order.
NodeType = Literal[
    "Header", "Para", "CodeBlock", "BlockQuote", "BulletList", "OrderedList",
    "Str", "Space", "Emph", "Strong", "Link", "Image", "Code", "Math",
    "MetaMap", "MetaList", "MetaInlines", "MetaString", "MetaBlocks",
]
80
+
81
+
82
class PandocExtractor(Extractor):
    """Extractor for documents supported by Pandoc.

    The extractor shells out to the ``pandoc`` executable twice per document:
    once to render the document as Markdown and once to dump its JSON AST, from
    which metadata is collected.
    """

    # Assigned through ``self`` in ``_validate_pandoc_version``, so a successful
    # version check is remembered per instance; this is only the default.
    _checked_version: bool = False

    # MIME type -> value passed to pandoc's ``--from`` option.
    MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
        "application/csl+json": "csljson",
        "application/docbook+xml": "docbook",
        "application/epub+zip": "epub",
        "application/rtf": "rtf",
        "application/vnd.oasis.opendocument.text": "odt",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/x-biblatex": "biblatex",
        "application/x-bibtex": "bibtex",
        "application/x-endnote+xml": "endnotexml",
        "application/x-fictionbook+xml": "fb2",
        "application/x-ipynb+json": "ipynb",
        "application/x-jats+xml": "jats",
        "application/x-latex": "latex",
        "application/x-opml+xml": "opml",
        "application/x-research-info-systems": "ris",
        "application/x-typst": "typst",
        "text/csv": "csv",
        "text/tab-separated-values": "tsv",
        "text/troff": "man",
        "text/x-commonmark": "commonmark",
        "text/x-dokuwiki": "dokuwiki",
        "text/x-gfm": "gfm",
        "text/x-markdown": "markdown",
        "text/x-markdown-extra": "markdown_phpextra",
        "text/x-mdoc": "mdoc",
        "text/x-multimarkdown": "markdown_mmd",
        "text/x-org": "org",
        "text/x-pod": "pod",
        "text/x-rst": "rst",
    }

    # MIME type -> file extension (without the dot) used for temporary input files.
    MIMETYPE_TO_FILE_EXTENSION_MAPPING: ClassVar[Mapping[str, str]] = {
        "application/csl+json": "json",
        "application/docbook+xml": "xml",
        "application/epub+zip": "epub",
        "application/rtf": "rtf",
        "application/vnd.oasis.opendocument.text": "odt",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "application/x-biblatex": "bib",
        "application/x-bibtex": "bib",
        "application/x-endnote+xml": "xml",
        "application/x-fictionbook+xml": "fb2",
        "application/x-ipynb+json": "ipynb",
        "application/x-jats+xml": "xml",
        "application/x-latex": "tex",
        "application/x-opml+xml": "opml",
        "application/x-research-info-systems": "ris",
        "application/x-typst": "typst",
        "text/csv": "csv",
        "text/tab-separated-values": "tsv",
        "text/troff": "1",
        "text/x-commonmark": "md",
        "text/x-dokuwiki": "wiki",
        "text/x-gfm": "md",
        "text/x-markdown": "md",
        "text/x-markdown-extra": "md",
        "text/x-mdoc": "md",
        "text/x-multimarkdown": "md",
        "text/x-org": "org",
        "text/x-pod": "pod",
        "text/x-rst": "rst",
    }

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Extract text and metadata from bytes content using Pandoc.

        The bytes are written to a temporary file which is then handed to
        ``extract_path_async``; the file is removed afterwards.

        Args:
            content: The content bytes to process.

        Raises:
            ValidationError: If the extractor's MIME type is not supported.

        Returns:
            ExtractionResult with the extracted text and metadata.
        """
        # Resolving the pandoc type first rejects unsupported MIME types before
        # any temporary file is created.
        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
        # Prefer the real file extension; fall back to the pandoc format name for
        # MIME types that were only matched by prefix (or "text/markdown").
        extension = self.MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(self.mime_type, pandoc_type)
        input_file, unlink = await create_temp_file(f".{extension}")

        try:
            await AsyncPath(input_file).write_bytes(content)
            return await self.extract_path_async(input_file)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Extract text and metadata from a file using Pandoc.

        Args:
            path: The path to the file to process.

        Raises:
            ParsingError: If the file data could not be extracted.

        Returns:
            ExtractionResult with the extracted text and metadata.
        """
        await self._validate_pandoc_version()
        # Called only for its validation side effect (raises on unsupported MIME types).
        self._get_pandoc_type_from_mime_type(self.mime_type)

        try:
            metadata_task = self._handle_extract_metadata(path)
            content_task = self._handle_extract_file(path)
            results = await run_taskgroup(metadata_task, content_task)
            metadata, content = cast("tuple[Metadata, str]", results)

            return ExtractionResult(
                content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE, chunks=[]
            )
        except ExceptionGroup as eg:
            raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous version of extract_bytes_async.

        Args:
            content: The content bytes to process.

        Returns:
            ExtractionResult with the extracted text and metadata.
        """
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous version of extract_path_async.

        Args:
            path: The path to the file to process.

        Returns:
            ExtractionResult with the extracted text and metadata.
        """
        return anyio.run(self.extract_path_async, path)

    async def _validate_pandoc_version(self) -> None:
        """Validate that the installed Pandoc version meets the minimum requirement.

        Raises:
            MissingDependencyError: If Pandoc is not installed or version is too low
        """
        if self._checked_version:
            return

        message = "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."

        try:
            result = await run_process(["pandoc", "--version"])
        except FileNotFoundError as e:
            raise MissingDependencyError(message) from e

        # Only the major version matters. Any number of dotted components is
        # accepted because pandoc prints e.g. "pandoc 3.5" as well as
        # "pandoc 3.1.11.1"; the executable name may carry an ".exe" suffix.
        version_match = re.search(r"pandoc(?:\.exe)?\s+v?(\d+)(?:\.\d+)*", result.stdout.decode())
        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
            raise MissingDependencyError(message)

        self._checked_version = True

    @staticmethod
    def _get_pandoc_key(key: str) -> str | None:
        """Map Pandoc metadata keys to our standard metadata keys.

        Args:
            key: The key from Pandoc metadata

        Returns:
            The mapped key name for our system, or None if not mapped
        """
        if key == "abstract":
            return "summary"

        if key == "date":
            return "created_at"

        if key in ("contributors", "author"):
            return "authors"

        if key == "institute":
            return "organization"

        # Anything else is kept only when it is already a declared Metadata field.
        if key not in Metadata.__annotations__:
            return None

        return key

    def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
        """Get Pandoc format type from MIME type.

        Args:
            mime_type: The MIME type to look up

        Returns:
            The corresponding Pandoc type

        Raises:
            ValidationError: If mime_type is not supported
        """
        if pandoc_type := self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, ""):
            return pandoc_type

        if mime_type == "text/markdown":
            return "markdown"

        # Fall back to prefix matching, e.g. for MIME types carrying parameters.
        for known_mime_type, known_pandoc_type in self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.items():
            if mime_type.startswith(known_mime_type):
                return known_pandoc_type

        raise ValidationError(f"Unsupported mime type: {mime_type}")

    async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
        """Extract metadata from a file using Pandoc.

        Pandoc converts the file to its JSON representation in a temporary file,
        which is then parsed and passed to ``_extract_metadata``.

        Args:
            input_file: The file to extract metadata from

        Returns:
            The extracted metadata

        Raises:
            ParsingError: If metadata extraction fails
        """
        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
        metadata_file, unlink = await create_temp_file(".json")
        try:
            command = [
                "pandoc",
                str(input_file),
                f"--from={pandoc_type}",
                "--to=json",
                "--standalone",
                "--quiet",
                "--output",
                str(metadata_file),
            ]

            # check=False: anyio raises CalledProcessError on a non-zero exit by
            # default, which would bypass the ParsingError (with stderr) below.
            result = await run_process(command, check=False)

            if result.returncode != 0:
                raise ParsingError(
                    "Failed to extract file data",
                    context={"file": str(input_file), "error": (result.stderr or b"").decode(errors="replace")},
                )

            json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
            return self._extract_metadata(json_data)
        except (RuntimeError, OSError, JSONDecodeError) as e:
            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
        finally:
            await unlink()

    async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
        """Extract text content from a file using Pandoc.

        Pandoc renders the file as Markdown into a temporary file whose content
        is read back and whitespace-normalized.

        Args:
            input_file: The file to extract content from

        Returns:
            The extracted text content

        Raises:
            ParsingError: If content extraction fails
        """
        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
        output_path, unlink = await create_temp_file(".md")
        try:
            command = [
                "pandoc",
                str(input_file),
                f"--from={pandoc_type}",
                "--to=markdown",
                "--standalone",
                "--wrap=preserve",
                "--quiet",
                "--output",
                str(output_path),
            ]

            # check=False: see _handle_extract_metadata; keeps the explicit
            # return-code handling below reachable.
            result = await run_process(command, check=False)

            if result.returncode != 0:
                raise ParsingError(
                    "Failed to extract file data",
                    context={"file": str(input_file), "error": (result.stderr or b"").decode(errors="replace")},
                )

            text = await AsyncPath(output_path).read_text("utf-8")

            return normalize_spaces(text)
        except (RuntimeError, OSError) as e:
            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
        finally:
            await unlink()

    def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
        """Extract structured metadata from Pandoc JSON metadata.

        Accepts either a flat mapping of metadata keys to Pandoc meta values or
        a full Pandoc JSON document, whose metadata lives under the ``"meta"``
        key next to ``"blocks"``.

        Args:
            raw_meta: The raw metadata from Pandoc

        Returns:
            Structured metadata
        """
        meta: Metadata = {}

        raw_citations = raw_meta.get("citations")
        if isinstance(raw_citations, list) and (
            citations := [c["citationId"] for c in raw_citations if isinstance(c, dict) and "citationId" in c]
        ):
            meta["citations"] = citations

        # A full pandoc JSON document nests its metadata under "meta". Scan that
        # mapping first so that top-level keys, processed afterwards, win on
        # conflict and flat inputs behave exactly as before.
        sources: list[dict[str, Any]] = [raw_meta]
        document_meta = raw_meta.get("meta")
        if isinstance(document_meta, dict):
            sources.insert(0, document_meta)

        for source in sources:
            for key, value in source.items():
                if key == "citations":
                    continue

                pandoc_key = self._get_pandoc_key(key)
                if pandoc_key is None:
                    continue

                if key == "valid" and isinstance(value, dict) and value.get("t") == "MetaString" and "c" in value:
                    meta[key] = value["c"]  # type: ignore[literal-required]
                    continue

                extracted = self._extract_meta_value(value)
                if not extracted:
                    continue

                # These fields are list-valued; wrap a lone string, but leave a
                # MetaList result alone so it does not become a list of lists.
                if pandoc_key in ("languages", "authors") and not isinstance(extracted, list):
                    extracted = [extracted]
                meta[pandoc_key] = extracted  # type: ignore[literal-required]

        citations_from_blocks: list[str] = []
        for block in raw_meta.get("blocks", []):
            if not isinstance(block, dict) or block.get(TYPE_FIELD) != "Cite":
                continue
            block_content = block.get(CONTENT_FIELD)
            # The first element of a Cite node's content holds the citation list.
            if not isinstance(block_content, list) or not block_content or not isinstance(block_content[0], list):
                continue
            citations_from_blocks.extend(
                cite["citationId"] for cite in block_content[0] if isinstance(cite, dict) and "citationId" in cite
            )

        if citations_from_blocks:
            meta.setdefault("citations", []).extend(citations_from_blocks)

        return meta

    def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
        """Extract text from an inline node in a document structure.

        Args:
            node: The node to extract text from
            type_field: The field name for the node type
            content_field: The field name for the node content

        Returns:
            The extracted text or None if no text could be extracted
        """
        node_type = node.get(type_field)

        if node_type == "Str":
            return node.get(content_field)

        # Line breaks separate words just like spaces do; dropping them would
        # glue the neighbouring words together.
        if node_type in ("Space", "SoftBreak", "LineBreak"):
            return " "

        if node_type in ("Emph", "Strong"):
            return self._extract_inlines(node.get(content_field, []))

        return None

    def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
        """Extract text from a list of inline nodes.

        Args:
            nodes: The list of nodes to extract text from

        Returns:
            The extracted text or None if no text could be extracted
        """
        texts = [text for node in nodes if isinstance(node, dict) and (text := self._extract_inline_text(node))]
        result = "".join(texts).strip()
        return result if result else None

    def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
        """Extract a metadata value from a node.

        Handles ``MetaString``, ``MetaInlines``, ``MetaList`` (flattened
        recursively) and ``MetaBlocks`` (``Para`` blocks joined by spaces).

        Args:
            node: The node to extract metadata from
            type_field: The field name for the node type
            content_field: The field name for the node content

        Returns:
            The extracted metadata value or None if no metadata could be extracted
        """
        if not isinstance(node, dict) or type_field not in node or content_field not in node:
            return None

        node_type = node[type_field]
        content = node[content_field]

        if node_type == "MetaString" and isinstance(content, str):
            return content

        if not content or not isinstance(content, list):
            return None

        # Only dict entries can be Pandoc nodes; ignore anything else.
        children = [child for child in content if isinstance(child, dict)]
        if not children:
            return None

        if node_type == "MetaInlines":
            return self._extract_inlines(children)

        if node_type == "MetaList":
            results: list[str] = []
            for child in children:
                value = self._extract_meta_value(child)
                if not value:
                    continue
                if isinstance(value, list):
                    results.extend(value)
                else:
                    results.append(value)
            return results

        if node_type == "MetaBlocks":
            block_texts = []
            for block in children:
                if block.get(type_field) != "Para":
                    continue
                block_content = block.get(content_field, [])
                if isinstance(block_content, list) and (text := self._extract_inlines(block_content)):
                    block_texts.append(text)
            return " ".join(block_texts) if block_texts else None

        return None
522
+
523
+
524
class MarkdownExtractor(PandocExtractor):
    """Pandoc-backed extractor registered for the Markdown-family MIME types below."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
        "text/x-commonmark",
        "text/x-gfm",
        "text/x-markdown",
        "text/x-markdown-extra",
        "text/x-mdoc",
        "text/x-multimarkdown",
    }
535
+
536
+
537
class OfficeDocumentExtractor(PandocExtractor):
    """Pandoc-backed extractor for word-processor documents (ODT and DOCX)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
        "application/vnd.oasis.opendocument.text",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }
544
+
545
+
546
class EbookExtractor(PandocExtractor):
    """Pandoc-backed extractor for e-books (FictionBook and EPUB)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"application/x-fictionbook+xml", "application/epub+zip"}
553
+
554
+
555
class StructuredTextExtractor(PandocExtractor):
    """Pandoc-backed extractor for structured plain-text markup (DokuWiki, Org, POD, reStructuredText)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
        "text/x-dokuwiki",
        "text/x-org",
        "text/x-pod",
        "text/x-rst",
    }
564
+
565
+
566
class LaTeXExtractor(PandocExtractor):
    """Pandoc-backed extractor for typesetting sources (Typst and LaTeX)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"application/x-typst", "application/x-latex"}
573
+
574
+
575
class BibliographyExtractor(PandocExtractor):
    """Pandoc-backed extractor for bibliography data (CSL JSON, BibLaTeX, BibTeX, EndNote XML, RIS)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
        "application/csl+json",
        "application/x-biblatex",
        "application/x-bibtex",
        "application/x-endnote+xml",
        "application/x-research-info-systems",
    }
585
+
586
+
587
class XMLBasedExtractor(PandocExtractor):
    """Pandoc-backed extractor for XML document formats (OPML, JATS, DocBook)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
        "application/x-opml+xml",
        "application/x-jats+xml",
        "application/docbook+xml",
    }
595
+
596
+
597
class TabularDataExtractor(PandocExtractor):
    """Pandoc-backed extractor for delimiter-separated tables (TSV and CSV)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"text/tab-separated-values", "text/csv"}
604
+
605
+
606
class MiscFormatExtractor(PandocExtractor):
    """Pandoc-backed extractor for the remaining formats (Jupyter notebooks, troff/man, RTF)."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
        "application/x-ipynb+json",
        "text/troff",
        "application/rtf",
    }