kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +46 -81
- kreuzberg/_mime_types.py +22 -31
- kreuzberg/_pandoc.py +416 -0
- kreuzberg/_string.py +9 -12
- kreuzberg/_tesseract.py +318 -0
- kreuzberg/exceptions.py +9 -1
- kreuzberg/extraction.py +16 -16
- kreuzberg-1.5.0.dist-info/METADATA +318 -0
- kreuzberg-1.5.0.dist-info/RECORD +15 -0
- kreuzberg-1.3.0.dist-info/METADATA +0 -306
- kreuzberg-1.3.0.dist-info/RECORD +0 -13
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.3.0.dist-info → kreuzberg-1.5.0.dist-info}/top_level.txt +0 -0
kreuzberg/_pandoc.py
ADDED
@@ -0,0 +1,416 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
import subprocess
|
5
|
+
from asyncio import gather
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from tempfile import NamedTemporaryFile
|
8
|
+
from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
|
9
|
+
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
|
12
|
+
from kreuzberg._string import normalize_spaces
|
13
|
+
from kreuzberg._sync import run_sync
|
14
|
+
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from collections.abc import Mapping
|
18
|
+
from os import PathLike
|
19
|
+
|
20
|
+
try: # pragma: no cover
|
21
|
+
from typing import NotRequired # type: ignore[attr-defined]
|
22
|
+
except ImportError: # pragma: no cover
|
23
|
+
from typing_extensions import NotRequired
|
24
|
+
|
25
|
+
# Process-wide flag recording whether the installed Pandoc version has
# already been validated (so the subprocess probe runs only once).
version_ref: Final[dict[str, bool]] = {"checked": False}


# Pandoc AST block-level node type tags.
BLOCK_HEADER: Final = "Header"  # Header with level, attributes and inline content
BLOCK_PARA: Final = "Para"  # Paragraph containing inline content
BLOCK_CODE: Final = "CodeBlock"  # Code block with attributes and string content
BLOCK_QUOTE: Final = "BlockQuote"  # Block quote containing blocks
BLOCK_LIST: Final = "BulletList"  # Bullet list containing items (blocks)
BLOCK_ORDERED: Final = "OrderedList"  # Numbered list with attrs and items

# Pandoc AST inline-level node type tags.
INLINE_STR: Final = "Str"  # Plain text string
INLINE_SPACE: Final = "Space"  # Single space
INLINE_EMPH: Final = "Emph"  # Emphasized text (contains inlines)
INLINE_STRONG: Final = "Strong"  # Strong/bold text (contains inlines)
INLINE_LINK: Final = "Link"  # Link with text and target
INLINE_IMAGE: Final = "Image"  # Image with alt text and source
INLINE_CODE: Final = "Code"  # Inline code span
INLINE_MATH: Final = "Math"  # Math expression

# Pandoc AST metadata node type tags.
META_MAP: Final = "MetaMap"  # Key-value mapping of metadata
META_LIST: Final = "MetaList"  # List of metadata values
META_INLINES: Final = "MetaInlines"  # Inline content in metadata
META_STRING: Final = "MetaString"  # Plain string in metadata
META_BLOCKS: Final = "MetaBlocks"  # Block content in metadata

# Field names carried by every Pandoc AST node.
CONTENT_FIELD: Final = "c"
TYPE_FIELD: Final = "t"

# Union of all node type tags this module understands.
NodeType = Literal[
    # Block types
    "Header",
    "Para",
    "CodeBlock",
    "BlockQuote",
    "BulletList",
    "OrderedList",
    # Inline types
    "Str",
    "Space",
    "Emph",
    "Strong",
    "Link",
    "Image",
    "Code",
    "Math",
    # Meta types
    "MetaMap",
    "MetaList",
    "MetaInlines",
    "MetaString",
    "MetaBlocks",
]

# Mime types accepted by this module mapped to the Pandoc reader name
# passed via --from=<format>.
PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
    "application/csl+json": "csljson",
    "application/docbook+xml": "docbook",
    "application/epub+zip": "epub",
    "application/rtf": "rtf",
    "application/vnd.oasis.opendocument.text": "odt",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
    "application/x-biblatex": "biblatex",
    "application/x-bibtex": "bibtex",
    "application/x-endnote+xml": "endnotexml",
    "application/x-fictionbook+xml": "fb2",
    "application/x-ipynb+json": "ipynb",
    "application/x-jats+xml": "jats",
    "application/x-latex": "latex",
    "application/x-opml+xml": "opml",
    "application/x-research-info-systems": "ris",
    "application/x-typst": "typst",
    "text/csv": "csv",
    "text/tab-separated-values": "tsv",
    "text/troff": "man",
    "text/x-commonmark": "commonmark",
    "text/x-dokuwiki": "dokuwiki",
    "text/x-gfm": "gfm",
    "text/x-markdown": "markdown",
    "text/x-markdown-extra": "markdown_phpextra",
    "text/x-mdoc": "mdoc",
    "text/x-multimarkdown": "markdown_mmd",
    "text/x-org": "org",
    "text/x-pod": "pod",
    "text/x-rst": "rst",
}
|
114
|
+
|
115
|
+
|
116
|
+
class Metadata(TypedDict, total=False):
|
117
|
+
"""Document metadata extracted from Pandoc document.
|
118
|
+
|
119
|
+
All fields are optional but will only be included if they contain non-empty values.
|
120
|
+
Any field that would be empty or None is omitted from the dictionary.
|
121
|
+
"""
|
122
|
+
|
123
|
+
title: NotRequired[str]
|
124
|
+
"""Document title."""
|
125
|
+
subtitle: NotRequired[str]
|
126
|
+
"""Document subtitle."""
|
127
|
+
abstract: NotRequired[str | list[str]]
|
128
|
+
"""Document abstract, summary or description."""
|
129
|
+
authors: NotRequired[list[str]]
|
130
|
+
"""List of document authors."""
|
131
|
+
date: NotRequired[str]
|
132
|
+
"""Document date as string to preserve original format."""
|
133
|
+
subject: NotRequired[str]
|
134
|
+
"""Document subject or topic."""
|
135
|
+
description: NotRequired[str]
|
136
|
+
"""Extended description."""
|
137
|
+
keywords: NotRequired[list[str]]
|
138
|
+
"""Keywords or tags."""
|
139
|
+
categories: NotRequired[list[str]]
|
140
|
+
"""Categories or classifications."""
|
141
|
+
version: NotRequired[str]
|
142
|
+
"""Version identifier."""
|
143
|
+
language: NotRequired[str]
|
144
|
+
"""Document language code."""
|
145
|
+
references: NotRequired[list[str]]
|
146
|
+
"""Reference entries."""
|
147
|
+
citations: NotRequired[list[str]]
|
148
|
+
"""Citation identifiers."""
|
149
|
+
copyright: NotRequired[str]
|
150
|
+
"""Copyright information."""
|
151
|
+
license: NotRequired[str]
|
152
|
+
"""License information."""
|
153
|
+
identifier: NotRequired[str]
|
154
|
+
"""Document identifier."""
|
155
|
+
publisher: NotRequired[str]
|
156
|
+
"""Publisher name."""
|
157
|
+
contributors: NotRequired[list[str]]
|
158
|
+
"""Additional contributors."""
|
159
|
+
creator: NotRequired[str]
|
160
|
+
"""Document creator."""
|
161
|
+
institute: NotRequired[str | list[str]]
|
162
|
+
"""Institute or organization."""
|
163
|
+
|
164
|
+
|
165
|
+
@dataclass
class PandocResult:
    """Result of a pandoc conversion including content and metadata."""

    content: str  # The processed markdown content.
    metadata: Metadata  # Document metadata extracted from the source.
|
173
|
+
|
174
|
+
|
175
|
+
def _extract_inline_text(node: dict[str, Any]) -> str | None:
    """Return the plain-text rendering of one inline AST node, or None.

    Handles Str, Space, Emph and Strong nodes; all other (or untagged)
    nodes yield None.
    """
    node_type = node.get(TYPE_FIELD)
    if not node_type:
        return None  # pragma: no cover
    if node_type == INLINE_SPACE:
        return " "
    if node_type == INLINE_STR:
        return node.get(CONTENT_FIELD)
    if node_type in (INLINE_EMPH, INLINE_STRONG):
        # Formatting wrappers: flatten their child inlines to text.
        return _extract_inlines(node.get(CONTENT_FIELD, []))
    return None  # pragma: no cover
|
184
|
+
|
185
|
+
|
186
|
+
def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
    """Join the text of a sequence of inline nodes; None when nothing remains."""
    joined = "".join(filter(None, (_extract_inline_text(n) for n in nodes))).strip()
    return joined or None
|
190
|
+
|
191
|
+
|
192
|
+
def _extract_meta_value(node: Any) -> str | list[str] | None:
    """Convert a Pandoc metadata AST node into a plain string or list of strings.

    Returns None for malformed nodes, unsupported node types, and empty content.
    """
    if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
        return None

    node_type = node[TYPE_FIELD]
    value = node[CONTENT_FIELD]

    if not value or node_type not in {META_STRING, META_INLINES, META_LIST, META_BLOCKS}:
        return None

    if node_type == META_STRING and isinstance(value, str):
        return value

    if not isinstance(value, list):
        return None

    # Only dict-shaped children are meaningful AST nodes.
    dict_items = [item for item in value if isinstance(item, dict)]
    if not dict_items:
        return None

    if node_type == META_INLINES:
        return _extract_inlines(cast(list[dict[str, Any]], dict_items))

    if node_type == META_LIST:
        collected: list[str] = []
        for extracted in (_extract_meta_value(item) for item in dict_items):
            if isinstance(extracted, list):
                collected.extend(extracted)  # pragma: no cover
            elif extracted:
                collected.append(extracted)
        return collected

    # Block content: pull text out of each paragraph node.
    paragraphs = [item for item in dict_items if item.get(TYPE_FIELD) == BLOCK_PARA]
    if paragraphs:
        texts: list[str] = []
        for para in paragraphs:
            inner = para.get(CONTENT_FIELD, [])
            if isinstance(inner, list) and (text := _extract_inlines(inner)):
                texts.append(text)
        return texts if texts else None

    return None
|
232
|
+
|
233
|
+
|
234
|
+
def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
    """Extract all non-empty metadata values from Pandoc AST metadata."""
    meta: Metadata = {}

    for field, node in raw_meta.items():
        extracted = _extract_meta_value(node)
        if extracted:
            meta[field] = extracted  # type: ignore[literal-required]

    # Collect citation identifiers from any Cite blocks present.
    citations: list[str] = []
    for block in raw_meta.get("blocks", []):
        if block.get(TYPE_FIELD) != "Cite":
            continue
        for cite in block.get(CONTENT_FIELD, [[{}]])[0]:
            if isinstance(cite, dict):
                citations.append(cite["citationId"])
    if citations:
        meta["citations"] = citations

    return meta
|
253
|
+
|
254
|
+
|
255
|
+
def _get_extension_from_mime_type(mime_type: str) -> str:
    """Resolve a mime type to the Pandoc reader format name.

    An exact match in PANDOC_MIMETYPE_TO_FORMAT_MAPPING wins; otherwise the
    first mapping key that is a prefix of the given mime type is used (so
    parameterized values such as "text/csv; charset=utf-8" still resolve).

    Note: the original guard combined the two checks with `or`, which made
    the prefix fallback unreachable (any non-exact mime type raised first),
    and the fallback tested the prefix in the opposite direction from the
    guard. Both are fixed here.

    Args:
        mime_type: The mime type to resolve.

    Raises:
        ValidationError: If the mime type matches no supported format.

    Returns:
        The Pandoc format name for ``--from=``.
    """
    if fmt := PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type):
        return fmt

    for key, fmt in PANDOC_MIMETYPE_TO_FORMAT_MAPPING.items():
        if mime_type.startswith(key):
            return fmt

    raise ValidationError(
        f"Unsupported mime type: {mime_type}",
        context={
            "mime_type": mime_type,
            "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
        },
    )
|
270
|
+
|
271
|
+
|
272
|
+
async def validate_pandoc_version() -> None:
    """Validate that Pandoc is installed and is version 3 or above.

    The result is cached in ``version_ref`` so the subprocess probe runs
    at most once per process.

    Note: the original test was ``version.startswith("3.")``, which would
    wrongly reject any future Pandoc 4.x despite the documented "3 or
    above" contract, and a malformed version line leaked a raw
    IndexError/ValueError. This parses the major version numerically.

    Raises:
        MissingDependencyError: If Pandoc is not installed or is below version 3.
    """
    if version_ref["checked"]:
        return

    try:
        result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
        # First line looks like "pandoc 3.1.2"; take the second token.
        version = result.stdout.decode().split("\n")[0].split()[1]
        major = int(version.split(".")[0])
    except FileNotFoundError as e:
        raise MissingDependencyError("Pandoc is not installed.") from e
    except (IndexError, ValueError) as e:
        raise MissingDependencyError("Pandoc version 3 or above is required.") from e

    if major < 3:
        raise MissingDependencyError("Pandoc version 3 or above is required.")

    version_ref["checked"] = True
|
291
|
+
|
292
|
+
|
293
|
+
async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
    """Extract metadata from a document using pandoc.

    Args:
        input_file: The path to the file to process.
        mime_type: The mime type of the file.

    Raises:
        ParsingError: If Pandoc fails to extract metadata.

    Returns:
        Dictionary containing document metadata.
    """
    pandoc_format = _get_extension_from_mime_type(mime_type)

    with NamedTemporaryFile(suffix=".json") as json_output:
        try:
            args = [
                "pandoc",
                str(input_file),
                f"--from={pandoc_format}",
                "--to=json",
                "--standalone",
                "--quiet",
                "--output",
                json_output.name,
            ]

            completed = await run_sync(
                subprocess.run,
                args,
                capture_output=True,
            )

            if completed.returncode != 0:
                raise ParsingError(
                    "Failed to extract file data", context={"file": str(input_file), "error": completed.stderr.decode()}
                )

            raw_json = await AsyncPath(json_output.name).read_text()
            return _extract_metadata(json.loads(raw_json))

        except (RuntimeError, OSError, json.JSONDecodeError) as e:
            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
337
|
+
|
338
|
+
|
339
|
+
async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
    """Convert a file to markdown via Pandoc and return normalized text.

    Raises ParsingError when the pandoc subprocess exits non-zero.
    """
    pandoc_format = _get_extension_from_mime_type(mime_type)

    with NamedTemporaryFile(suffix=".md") as markdown_output:
        args = [
            "pandoc",
            str(input_file),
            f"--from={pandoc_format}",
            "--to=markdown",
            "--standalone",
            "--wrap=preserve",
            "--quiet",
            "--output",
            markdown_output.name,
        ]
        if extra_args:
            args.extend(extra_args)

        completed = await run_sync(
            subprocess.run,
            args,
            capture_output=True,
        )

        if completed.returncode != 0:
            raise ParsingError(
                "Failed to extract file data", context={"file": str(input_file), "error": completed.stderr.decode()}
            )

        markdown = await AsyncPath(markdown_output.name).read_text()
        return normalize_spaces(markdown)
|
372
|
+
|
373
|
+
|
374
|
+
async def process_file(
    input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
) -> PandocResult:
    """Process a single file using Pandoc and convert to markdown.

    Metadata extraction and content conversion run concurrently.

    Args:
        input_file: The path to the file to process.
        mime_type: The mime type of the file.
        extra_args: Additional Pandoc command line arguments.

    Returns:
        PandocResult containing processed content and metadata.
    """
    await validate_pandoc_version()

    metadata, content = await gather(
        extract_metadata(input_file, mime_type=mime_type),
        _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
    )
    return PandocResult(
        content=content,  # type: ignore[arg-type]
        metadata=metadata,  # type: ignore[arg-type]
    )
|
399
|
+
|
400
|
+
|
401
|
+
async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
    """Process content using Pandoc and convert to markdown.

    The bytes are staged in a temporary file (named with the format's
    extension) and delegated to process_file.

    Args:
        content: The content to process.
        mime_type: The mime type of the content.
        extra_args: Additional Pandoc command line arguments.

    Returns:
        PandocResult containing processed content and metadata.
    """
    suffix = f".{_get_extension_from_mime_type(mime_type)}"

    with NamedTemporaryFile(suffix=suffix) as staged:
        await AsyncPath(staged.name).write_bytes(content)
        return await process_file(staged.name, mime_type=mime_type, extra_args=extra_args)
|
kreuzberg/_string.py
CHANGED
@@ -4,6 +4,8 @@ from contextlib import suppress
|
|
4
4
|
|
5
5
|
from charset_normalizer import detect
|
6
6
|
|
7
|
+
from kreuzberg.exceptions import ParsingError
|
8
|
+
|
7
9
|
|
8
10
|
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
    """Decode a byte string safely, removing invalid sequences.

    Candidate encodings are tried in order: the caller-supplied encoding,
    the encoding detected by charset_normalizer, then "utf-8" and "latin-1".

    Note: the original suppressed only UnicodeDecodeError, so an unknown
    codec name (a bogus caller-supplied encoding, or an exotic detection
    result not known to Python) escaped as a raw LookupError instead of
    falling through to the next candidate. LookupError is now suppressed too.

    Args:
        byte_data: The byte string to decode.
        encoding: The encoding to use when decoding the byte string.

    Raises:
        ParsingError: If the byte string could not be decoded.

    Returns:
        The decoded string.
    """
    if not byte_data:
        return ""

    # detect() may report None for the encoding; falsy candidates are skipped.
    candidates = [encoding, detect(byte_data).get("encoding", ""), "utf-8", "latin-1"]

    for enc in [e for e in candidates if e]:
        with suppress(UnicodeDecodeError, LookupError):
            return byte_data.decode(enc)

    raise ParsingError("Could not decode byte string. Please provide an encoding.")
|
36
33
|
|
37
34
|
|
38
35
|
def normalize_spaces(text: str) -> str:
|