kreuzberg 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +8 -34
- kreuzberg/_mime_types.py +22 -31
- kreuzberg/_pandoc.py +416 -0
- kreuzberg-1.5.0.dist-info/METADATA +318 -0
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.5.0.dist-info}/RECORD +8 -7
- kreuzberg-1.4.0.dist-info/METADATA +0 -304
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.5.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.5.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.5.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -4,16 +4,15 @@ import re
|
|
4
4
|
from contextlib import suppress
|
5
5
|
from html import escape
|
6
6
|
from io import BytesIO
|
7
|
-
from typing import TYPE_CHECKING
|
7
|
+
from typing import TYPE_CHECKING
|
8
8
|
|
9
9
|
import html_to_markdown
|
10
10
|
import pptx
|
11
|
-
import pypandoc
|
12
11
|
import pypdfium2
|
13
12
|
from anyio import Path as AsyncPath
|
14
|
-
from
|
13
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
15
14
|
|
16
|
-
from kreuzberg.
|
15
|
+
from kreuzberg._pandoc import process_content, process_file
|
17
16
|
from kreuzberg._string import normalize_spaces, safe_decode
|
18
17
|
from kreuzberg._sync import run_sync
|
19
18
|
from kreuzberg._tesseract import batch_process_images
|
@@ -98,32 +97,18 @@ async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
|
98
97
|
return await extract_pdf_with_tesseract(file_path)
|
99
98
|
|
100
99
|
|
101
|
-
async def extract_content_with_pandoc(file_data: bytes, mime_type: str
|
100
|
+
async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
|
102
101
|
"""Extract text using pandoc.
|
103
102
|
|
104
103
|
Args:
|
105
104
|
file_data: The content of the file.
|
106
105
|
mime_type: The mime type of the file.
|
107
|
-
encoding: An optional encoding to use when decoding the string.
|
108
|
-
|
109
|
-
Raises:
|
110
|
-
ParsingError: If the text could not be extracted from the file using pandoc.
|
111
106
|
|
112
107
|
Returns:
|
113
108
|
The extracted text.
|
114
109
|
"""
|
115
|
-
|
116
|
-
|
117
|
-
try:
|
118
|
-
return normalize_spaces(
|
119
|
-
cast(str, await run_sync(pypandoc.convert_text, file_data, to="md", format=ext, encoding=encoding))
|
120
|
-
)
|
121
|
-
except RuntimeError as e:
|
122
|
-
# TODO: add test case
|
123
|
-
raise ParsingError(
|
124
|
-
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
|
125
|
-
context={"error": str(e)},
|
126
|
-
) from e
|
110
|
+
result = await process_content(file_data, mime_type=mime_type)
|
111
|
+
return normalize_spaces(result.content)
|
127
112
|
|
128
113
|
|
129
114
|
async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
|
@@ -133,20 +118,11 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
|
|
133
118
|
file_path: The path to the file.
|
134
119
|
mime_type: The mime type of the file.
|
135
120
|
|
136
|
-
Raises:
|
137
|
-
ParsingError: If the text could not be extracted from the file using pandoc.
|
138
|
-
|
139
121
|
Returns:
|
140
122
|
The extracted text.
|
141
123
|
"""
|
142
|
-
|
143
|
-
|
144
|
-
return normalize_spaces(cast(str, await run_sync(pypandoc.convert_file, file_path, to="md", format=ext)))
|
145
|
-
except RuntimeError as e:
|
146
|
-
raise ParsingError(
|
147
|
-
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
|
148
|
-
context={"file_path": str(file_path), "error": str(e)},
|
149
|
-
) from e
|
124
|
+
result = await process_file(file_path, mime_type=mime_type)
|
125
|
+
return normalize_spaces(result.content)
|
150
126
|
|
151
127
|
|
152
128
|
async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
@@ -161,8 +137,6 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
161
137
|
Returns:
|
162
138
|
The extracted text content
|
163
139
|
"""
|
164
|
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
165
|
-
|
166
140
|
md_content = ""
|
167
141
|
file_contents = (
|
168
142
|
file_path_or_contents
|
kreuzberg/_mime_types.py
CHANGED
@@ -54,44 +54,35 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
|
54
54
|
"image/x-portable-pixmap": "ppm",
|
55
55
|
}
|
56
56
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
57
|
-
"application/
|
58
|
-
"application/
|
57
|
+
"application/csl+json",
|
58
|
+
"application/docbook+xml",
|
59
|
+
"application/epub+zip",
|
59
60
|
"application/rtf",
|
60
61
|
"application/vnd.oasis.opendocument.text",
|
61
62
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
62
|
-
"application/x-
|
63
|
+
"application/x-biblatex",
|
64
|
+
"application/x-bibtex",
|
65
|
+
"application/x-endnote+xml",
|
66
|
+
"application/x-fictionbook+xml",
|
67
|
+
"application/x-ipynb+json",
|
68
|
+
"application/x-jats+xml",
|
63
69
|
"application/x-latex",
|
64
|
-
"application/x-
|
65
|
-
"application/x-
|
70
|
+
"application/x-opml+xml",
|
71
|
+
"application/x-research-info-systems",
|
72
|
+
"application/x-typst",
|
66
73
|
"text/csv",
|
67
|
-
"text/latex",
|
68
|
-
"text/rst",
|
69
|
-
"text/rtf",
|
70
74
|
"text/tab-separated-values",
|
71
|
-
"text/
|
72
|
-
"text/x-
|
75
|
+
"text/troff",
|
76
|
+
"text/x-commonmark",
|
77
|
+
"text/x-dokuwiki",
|
78
|
+
"text/x-gfm",
|
79
|
+
"text/x-markdown",
|
80
|
+
"text/x-markdown-extra",
|
81
|
+
"text/x-mdoc",
|
82
|
+
"text/x-multimarkdown",
|
83
|
+
"text/x-org",
|
84
|
+
"text/x-pod",
|
73
85
|
"text/x-rst",
|
74
|
-
"text/x-tsv",
|
75
|
-
}
|
76
|
-
PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
77
|
-
"application/csv": "csv",
|
78
|
-
"application/latex": "latex",
|
79
|
-
"application/rtf": "rtf",
|
80
|
-
"application/vnd.oasis.opendocument.text": "odt",
|
81
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
82
|
-
"application/x-csv": "csv",
|
83
|
-
"application/x-latex": "latex",
|
84
|
-
"application/x-rtf": "rtf",
|
85
|
-
"application/x-vnd.oasis.opendocument.text": "odt",
|
86
|
-
"text/csv": "csv",
|
87
|
-
"text/latex": "latex",
|
88
|
-
"text/rst": "rst",
|
89
|
-
"text/rtf": "rtf",
|
90
|
-
"text/tab-separated-values": "tsv",
|
91
|
-
"text/x-csv": "csv",
|
92
|
-
"text/x-latex": "latex",
|
93
|
-
"text/x-rst": "rst",
|
94
|
-
"text/x-tsv": "tsv",
|
95
86
|
}
|
96
87
|
|
97
88
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
kreuzberg/_pandoc.py
ADDED
@@ -0,0 +1,416 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
import subprocess
|
5
|
+
from asyncio import gather
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from tempfile import NamedTemporaryFile
|
8
|
+
from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
|
9
|
+
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
|
12
|
+
from kreuzberg._string import normalize_spaces
|
13
|
+
from kreuzberg._sync import run_sync
|
14
|
+
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from collections.abc import Mapping
|
18
|
+
from os import PathLike
|
19
|
+
|
20
|
+
try: # pragma: no cover
|
21
|
+
from typing import NotRequired # type: ignore[attr-defined]
|
22
|
+
except ImportError: # pragma: no cover
|
23
|
+
from typing_extensions import NotRequired
|
24
|
+
|
25
|
+
version_ref: Final[dict[str, bool]] = {"checked": False}
|
26
|
+
|
27
|
+
|
28
|
+
# Block-level node types in Pandoc AST
|
29
|
+
BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
|
30
|
+
BLOCK_PARA: Final = "Para" # Paragraph containing inline content
|
31
|
+
BLOCK_CODE: Final = "CodeBlock" # Code block with attributes and string content
|
32
|
+
BLOCK_QUOTE: Final = "BlockQuote" # Block quote containing blocks
|
33
|
+
BLOCK_LIST: Final = "BulletList" # Bullet list containing items (blocks)
|
34
|
+
BLOCK_ORDERED: Final = "OrderedList" # Numbered list with attrs and items
|
35
|
+
|
36
|
+
# Inline-level node types in Pandoc AST
|
37
|
+
INLINE_STR: Final = "Str" # Plain text string
|
38
|
+
INLINE_SPACE: Final = "Space" # Single space
|
39
|
+
INLINE_EMPH: Final = "Emph" # Emphasized text (contains inlines)
|
40
|
+
INLINE_STRONG: Final = "Strong" # Strong/bold text (contains inlines)
|
41
|
+
INLINE_LINK: Final = "Link" # Link with text and target
|
42
|
+
INLINE_IMAGE: Final = "Image" # Image with alt text and source
|
43
|
+
INLINE_CODE: Final = "Code" # Inline code span
|
44
|
+
INLINE_MATH: Final = "Math" # Math expression
|
45
|
+
|
46
|
+
# Metadata node types in Pandoc AST
|
47
|
+
META_MAP: Final = "MetaMap" # Key-value mapping of metadata
|
48
|
+
META_LIST: Final = "MetaList" # List of metadata values
|
49
|
+
META_INLINES: Final = "MetaInlines" # Inline content in metadata
|
50
|
+
META_STRING: Final = "MetaString" # Plain string in metadata
|
51
|
+
META_BLOCKS: Final = "MetaBlocks" # Block content in metadata
|
52
|
+
|
53
|
+
# Node content field name
|
54
|
+
CONTENT_FIELD: Final = "c"
|
55
|
+
TYPE_FIELD: Final = "t"
|
56
|
+
|
57
|
+
# Valid node types
|
58
|
+
NodeType = Literal[
|
59
|
+
# Block types
|
60
|
+
"Header",
|
61
|
+
"Para",
|
62
|
+
"CodeBlock",
|
63
|
+
"BlockQuote",
|
64
|
+
"BulletList",
|
65
|
+
"OrderedList",
|
66
|
+
# Inline types
|
67
|
+
"Str",
|
68
|
+
"Space",
|
69
|
+
"Emph",
|
70
|
+
"Strong",
|
71
|
+
"Link",
|
72
|
+
"Image",
|
73
|
+
"Code",
|
74
|
+
"Math",
|
75
|
+
# Meta types
|
76
|
+
"MetaMap",
|
77
|
+
"MetaList",
|
78
|
+
"MetaInlines",
|
79
|
+
"MetaString",
|
80
|
+
"MetaBlocks",
|
81
|
+
]
|
82
|
+
|
83
|
+
PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
|
84
|
+
"application/csl+json": "csljson",
|
85
|
+
"application/docbook+xml": "docbook",
|
86
|
+
"application/epub+zip": "epub",
|
87
|
+
"application/rtf": "rtf",
|
88
|
+
"application/vnd.oasis.opendocument.text": "odt",
|
89
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
90
|
+
"application/x-biblatex": "biblatex",
|
91
|
+
"application/x-bibtex": "bibtex",
|
92
|
+
"application/x-endnote+xml": "endnotexml",
|
93
|
+
"application/x-fictionbook+xml": "fb2",
|
94
|
+
"application/x-ipynb+json": "ipynb",
|
95
|
+
"application/x-jats+xml": "jats",
|
96
|
+
"application/x-latex": "latex",
|
97
|
+
"application/x-opml+xml": "opml",
|
98
|
+
"application/x-research-info-systems": "ris",
|
99
|
+
"application/x-typst": "typst",
|
100
|
+
"text/csv": "csv",
|
101
|
+
"text/tab-separated-values": "tsv",
|
102
|
+
"text/troff": "man",
|
103
|
+
"text/x-commonmark": "commonmark",
|
104
|
+
"text/x-dokuwiki": "dokuwiki",
|
105
|
+
"text/x-gfm": "gfm",
|
106
|
+
"text/x-markdown": "markdown",
|
107
|
+
"text/x-markdown-extra": "markdown_phpextra",
|
108
|
+
"text/x-mdoc": "mdoc",
|
109
|
+
"text/x-multimarkdown": "markdown_mmd",
|
110
|
+
"text/x-org": "org",
|
111
|
+
"text/x-pod": "pod",
|
112
|
+
"text/x-rst": "rst",
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
class Metadata(TypedDict, total=False):
|
117
|
+
"""Document metadata extracted from Pandoc document.
|
118
|
+
|
119
|
+
All fields are optional but will only be included if they contain non-empty values.
|
120
|
+
Any field that would be empty or None is omitted from the dictionary.
|
121
|
+
"""
|
122
|
+
|
123
|
+
title: NotRequired[str]
|
124
|
+
"""Document title."""
|
125
|
+
subtitle: NotRequired[str]
|
126
|
+
"""Document subtitle."""
|
127
|
+
abstract: NotRequired[str | list[str]]
|
128
|
+
"""Document abstract, summary or description."""
|
129
|
+
authors: NotRequired[list[str]]
|
130
|
+
"""List of document authors."""
|
131
|
+
date: NotRequired[str]
|
132
|
+
"""Document date as string to preserve original format."""
|
133
|
+
subject: NotRequired[str]
|
134
|
+
"""Document subject or topic."""
|
135
|
+
description: NotRequired[str]
|
136
|
+
"""Extended description."""
|
137
|
+
keywords: NotRequired[list[str]]
|
138
|
+
"""Keywords or tags."""
|
139
|
+
categories: NotRequired[list[str]]
|
140
|
+
"""Categories or classifications."""
|
141
|
+
version: NotRequired[str]
|
142
|
+
"""Version identifier."""
|
143
|
+
language: NotRequired[str]
|
144
|
+
"""Document language code."""
|
145
|
+
references: NotRequired[list[str]]
|
146
|
+
"""Reference entries."""
|
147
|
+
citations: NotRequired[list[str]]
|
148
|
+
"""Citation identifiers."""
|
149
|
+
copyright: NotRequired[str]
|
150
|
+
"""Copyright information."""
|
151
|
+
license: NotRequired[str]
|
152
|
+
"""License information."""
|
153
|
+
identifier: NotRequired[str]
|
154
|
+
"""Document identifier."""
|
155
|
+
publisher: NotRequired[str]
|
156
|
+
"""Publisher name."""
|
157
|
+
contributors: NotRequired[list[str]]
|
158
|
+
"""Additional contributors."""
|
159
|
+
creator: NotRequired[str]
|
160
|
+
"""Document creator."""
|
161
|
+
institute: NotRequired[str | list[str]]
|
162
|
+
"""Institute or organization."""
|
163
|
+
|
164
|
+
|
165
|
+
@dataclass
|
166
|
+
class PandocResult:
|
167
|
+
"""Result of a pandoc conversion including content and metadata."""
|
168
|
+
|
169
|
+
content: str
|
170
|
+
"""The processed markdown content."""
|
171
|
+
metadata: Metadata
|
172
|
+
"""Document metadata extracted from the source."""
|
173
|
+
|
174
|
+
|
175
|
+
def _extract_inline_text(node: dict[str, Any]) -> str | None:
|
176
|
+
if node_type := node.get(TYPE_FIELD):
|
177
|
+
if node_type == INLINE_STR:
|
178
|
+
return node.get(CONTENT_FIELD)
|
179
|
+
if node_type == INLINE_SPACE:
|
180
|
+
return " "
|
181
|
+
if node_type in (INLINE_EMPH, INLINE_STRONG):
|
182
|
+
return _extract_inlines(node.get(CONTENT_FIELD, []))
|
183
|
+
return None # pragma: no cover
|
184
|
+
|
185
|
+
|
186
|
+
def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
|
187
|
+
texts = [text for node in nodes if (text := _extract_inline_text(node))]
|
188
|
+
result = "".join(texts).strip()
|
189
|
+
return result if result else None
|
190
|
+
|
191
|
+
|
192
|
+
def _extract_meta_value(node: Any) -> str | list[str] | None:
|
193
|
+
if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
|
194
|
+
return None
|
195
|
+
|
196
|
+
content = node[CONTENT_FIELD]
|
197
|
+
node_type = node[TYPE_FIELD]
|
198
|
+
|
199
|
+
if not content or node_type not in {
|
200
|
+
META_STRING,
|
201
|
+
META_INLINES,
|
202
|
+
META_LIST,
|
203
|
+
META_BLOCKS,
|
204
|
+
}:
|
205
|
+
return None
|
206
|
+
|
207
|
+
if node_type == META_STRING and isinstance(content, str):
|
208
|
+
return content
|
209
|
+
|
210
|
+
if isinstance(content, list) and (content := [v for v in content if isinstance(v, dict)]):
|
211
|
+
if node_type == META_INLINES:
|
212
|
+
return _extract_inlines(cast(list[dict[str, Any]], content))
|
213
|
+
|
214
|
+
if node_type == META_LIST:
|
215
|
+
results = []
|
216
|
+
for value in [value for item in content if (value := _extract_meta_value(item))]:
|
217
|
+
if isinstance(value, list):
|
218
|
+
results.extend(value) # pragma: no cover
|
219
|
+
else:
|
220
|
+
results.append(value)
|
221
|
+
return results
|
222
|
+
|
223
|
+
if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
|
224
|
+
block_texts = []
|
225
|
+
for block in blocks:
|
226
|
+
block_content = block.get(CONTENT_FIELD, [])
|
227
|
+
if isinstance(block_content, list) and (text := _extract_inlines(block_content)):
|
228
|
+
block_texts.append(text)
|
229
|
+
return block_texts if block_texts else None
|
230
|
+
|
231
|
+
return None
|
232
|
+
|
233
|
+
|
234
|
+
def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
235
|
+
"""Extract all non-empty metadata values from Pandoc AST metadata."""
|
236
|
+
meta: Metadata = {}
|
237
|
+
|
238
|
+
for key, value in raw_meta.items():
|
239
|
+
if extracted := _extract_meta_value(value):
|
240
|
+
meta[key] = extracted # type: ignore[literal-required]
|
241
|
+
|
242
|
+
citations = [
|
243
|
+
cite["citationId"]
|
244
|
+
for block in raw_meta.get("blocks", [])
|
245
|
+
if block.get(TYPE_FIELD) == "Cite"
|
246
|
+
for cite in block.get(CONTENT_FIELD, [[{}]])[0]
|
247
|
+
if isinstance(cite, dict)
|
248
|
+
]
|
249
|
+
if citations:
|
250
|
+
meta["citations"] = citations
|
251
|
+
|
252
|
+
return meta
|
253
|
+
|
254
|
+
|
255
|
+
def _get_extension_from_mime_type(mime_type: str) -> str:
|
256
|
+
if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
|
257
|
+
mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
|
258
|
+
):
|
259
|
+
raise ValidationError(
|
260
|
+
f"Unsupported mime type: {mime_type}",
|
261
|
+
context={
|
262
|
+
"mime_type": mime_type,
|
263
|
+
"supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
|
264
|
+
},
|
265
|
+
)
|
266
|
+
|
267
|
+
return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
|
268
|
+
PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
|
269
|
+
)
|
270
|
+
|
271
|
+
|
272
|
+
async def validate_pandoc_version() -> None:
|
273
|
+
"""Validate that Pandoc is installed and is version 3 or above.
|
274
|
+
|
275
|
+
Raises:
|
276
|
+
MissingDependencyError: If Pandoc is not installed or is below version 3.
|
277
|
+
"""
|
278
|
+
try:
|
279
|
+
if version_ref["checked"]:
|
280
|
+
return
|
281
|
+
|
282
|
+
result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
|
283
|
+
version = result.stdout.decode().split("\n")[0].split()[1]
|
284
|
+
if not version.startswith("3."):
|
285
|
+
raise MissingDependencyError("Pandoc version 3 or above is required.")
|
286
|
+
|
287
|
+
version_ref["checked"] = True
|
288
|
+
|
289
|
+
except FileNotFoundError as e:
|
290
|
+
raise MissingDependencyError("Pandoc is not installed.") from e
|
291
|
+
|
292
|
+
|
293
|
+
async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
|
294
|
+
"""Extract metadata from a document using pandoc.
|
295
|
+
|
296
|
+
Args:
|
297
|
+
input_file: The path to the file to process.
|
298
|
+
mime_type: The mime type of the file.
|
299
|
+
|
300
|
+
Raises:
|
301
|
+
ParsingError: If Pandoc fails to extract metadata.
|
302
|
+
|
303
|
+
Returns:
|
304
|
+
Dictionary containing document metadata.
|
305
|
+
"""
|
306
|
+
extension = _get_extension_from_mime_type(mime_type)
|
307
|
+
|
308
|
+
with NamedTemporaryFile(suffix=".json") as metadata_file:
|
309
|
+
try:
|
310
|
+
command = [
|
311
|
+
"pandoc",
|
312
|
+
str(input_file),
|
313
|
+
f"--from={extension}",
|
314
|
+
"--to=json",
|
315
|
+
"--standalone",
|
316
|
+
"--quiet",
|
317
|
+
"--output",
|
318
|
+
metadata_file.name,
|
319
|
+
]
|
320
|
+
|
321
|
+
result = await run_sync(
|
322
|
+
subprocess.run,
|
323
|
+
command,
|
324
|
+
capture_output=True,
|
325
|
+
)
|
326
|
+
|
327
|
+
if result.returncode != 0:
|
328
|
+
raise ParsingError(
|
329
|
+
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
330
|
+
)
|
331
|
+
|
332
|
+
json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
|
333
|
+
return _extract_metadata(json_data)
|
334
|
+
|
335
|
+
except (RuntimeError, OSError, json.JSONDecodeError) as e:
|
336
|
+
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
337
|
+
|
338
|
+
|
339
|
+
async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
|
340
|
+
extension = _get_extension_from_mime_type(mime_type)
|
341
|
+
|
342
|
+
with NamedTemporaryFile(suffix=".md") as output_file:
|
343
|
+
command = [
|
344
|
+
"pandoc",
|
345
|
+
str(input_file),
|
346
|
+
f"--from={extension}",
|
347
|
+
"--to=markdown",
|
348
|
+
"--standalone",
|
349
|
+
"--wrap=preserve",
|
350
|
+
"--quiet",
|
351
|
+
"--output",
|
352
|
+
output_file.name,
|
353
|
+
]
|
354
|
+
|
355
|
+
if extra_args:
|
356
|
+
command.extend(extra_args)
|
357
|
+
|
358
|
+
result = await run_sync(
|
359
|
+
subprocess.run,
|
360
|
+
command,
|
361
|
+
capture_output=True,
|
362
|
+
)
|
363
|
+
|
364
|
+
if result.returncode != 0:
|
365
|
+
raise ParsingError(
|
366
|
+
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
367
|
+
)
|
368
|
+
|
369
|
+
text = await AsyncPath(output_file.name).read_text()
|
370
|
+
|
371
|
+
return normalize_spaces(text)
|
372
|
+
|
373
|
+
|
374
|
+
async def process_file(
|
375
|
+
input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
|
376
|
+
) -> PandocResult:
|
377
|
+
"""Process a single file using Pandoc and convert to markdown.
|
378
|
+
|
379
|
+
Args:
|
380
|
+
input_file: The path to the file to process.
|
381
|
+
mime_type: The mime type of the file.
|
382
|
+
extra_args: Additional Pandoc command line arguments.
|
383
|
+
|
384
|
+
Returns:
|
385
|
+
PandocResult containing processed content and metadata.
|
386
|
+
"""
|
387
|
+
await validate_pandoc_version()
|
388
|
+
|
389
|
+
metadata, content = await gather(
|
390
|
+
*[
|
391
|
+
extract_metadata(input_file, mime_type=mime_type),
|
392
|
+
_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
|
393
|
+
]
|
394
|
+
)
|
395
|
+
return PandocResult(
|
396
|
+
content=content, # type: ignore[arg-type]
|
397
|
+
metadata=metadata, # type: ignore[arg-type]
|
398
|
+
)
|
399
|
+
|
400
|
+
|
401
|
+
async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
|
402
|
+
"""Process content using Pandoc and convert to markdown.
|
403
|
+
|
404
|
+
Args:
|
405
|
+
content: The content to process.
|
406
|
+
mime_type: The mime type of the content.
|
407
|
+
extra_args: Additional Pandoc command line arguments.
|
408
|
+
|
409
|
+
Returns:
|
410
|
+
PandocResult containing processed content and metadata.
|
411
|
+
"""
|
412
|
+
extension = _get_extension_from_mime_type(mime_type)
|
413
|
+
|
414
|
+
with NamedTemporaryFile(suffix=f".{extension}") as input_file:
|
415
|
+
await AsyncPath(input_file.name).write_bytes(content)
|
416
|
+
return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
|
@@ -0,0 +1,318 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 1.5.0
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Operating System :: OS Independent
|
13
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Classifier: Topic :: Text Processing :: General
|
22
|
+
Classifier: Topic :: Utilities
|
23
|
+
Classifier: Typing :: Typed
|
24
|
+
Requires-Python: >=3.9
|
25
|
+
Description-Content-Type: text/markdown
|
26
|
+
License-File: LICENSE
|
27
|
+
Requires-Dist: anyio>=4.8.0
|
28
|
+
Requires-Dist: charset-normalizer>=3.4.1
|
29
|
+
Requires-Dist: html-to-markdown>=1.2.0
|
30
|
+
Requires-Dist: pypdfium2>=4.30.1
|
31
|
+
Requires-Dist: python-pptx>=1.0.2
|
32
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
|
33
|
+
|
34
|
+
# Kreuzberg
|
35
|
+
|
36
|
+
Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
|
37
|
+
|
38
|
+
## Why Kreuzberg?
|
39
|
+
|
40
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
41
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
42
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
43
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
44
|
+
- **Modern Python**: Built with async/await, type hints, and current best practices
|
45
|
+
|
46
|
+
Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
|
47
|
+
|
48
|
+
## Features
|
49
|
+
|
50
|
+
- **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
|
51
|
+
- **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
|
52
|
+
- **Modern Python Design**:
|
53
|
+
- Async-first API using `anyio`
|
54
|
+
- Comprehensive type hints for better IDE support
|
55
|
+
- Detailed error handling with context information
|
56
|
+
- **Production Ready**:
|
57
|
+
- Robust error handling
|
58
|
+
- Detailed debugging information
|
59
|
+
- Memory efficient processing
|
60
|
+
|
61
|
+
## Installation
|
62
|
+
|
63
|
+
### 1. Install the Python Package
|
64
|
+
|
65
|
+
```shell
|
66
|
+
pip install kreuzberg
|
67
|
+
```
|
68
|
+
|
69
|
+
### 2. Install System Dependencies
|
70
|
+
|
71
|
+
Kreuzberg requires two open-source tools:
|
72
|
+
|
73
|
+
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
74
|
+
|
75
|
+
- GPL v2.0 licensed (used via CLI only)
|
76
|
+
- Handles office documents and markup formats
|
77
|
+
|
78
|
+
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
79
|
+
- Apache License
|
80
|
+
- Required for scanned documents and images
|
81
|
+
|
82
|
+
## Architecture
|
83
|
+
|
84
|
+
Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
|
85
|
+
|
86
|
+
- **PDF Processing**:
|
87
|
+
- `pdfium2` for searchable PDFs
|
88
|
+
- Tesseract OCR for scanned content
|
89
|
+
- **Document Conversion**:
|
90
|
+
- Pandoc for office documents and markup
|
91
|
+
- `python-pptx` for PowerPoint files
|
92
|
+
- `html-to-markdown` for HTML content
|
93
|
+
- **Text Processing**:
|
94
|
+
- Smart encoding detection
|
95
|
+
- Markdown and plain text handling
|
96
|
+
|
97
|
+
### Supported Formats
|
98
|
+
|
99
|
+
#### Document Formats
|
100
|
+
|
101
|
+
- PDF (`.pdf`, both searchable and scanned documents)
|
102
|
+
- Microsoft Word (`.docx`, `.doc`)
|
103
|
+
- PowerPoint presentations (`.pptx`)
|
104
|
+
- OpenDocument Text (`.odt`)
|
105
|
+
- Rich Text Format (`.rtf`)
|
106
|
+
- EPUB (`.epub`)
|
107
|
+
- DocBook XML (`.dbk`, `.xml`)
|
108
|
+
- FictionBook (`.fb2`)
|
109
|
+
- LaTeX (`.tex`, `.latex`)
|
110
|
+
- Typst (`.typ`)
|
111
|
+
|
112
|
+
#### Markup and Text Formats
|
113
|
+
|
114
|
+
- HTML (`.html`, `.htm`)
|
115
|
+
- Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
|
116
|
+
- reStructuredText (`.rst`)
|
117
|
+
- Org-mode (`.org`)
|
118
|
+
- DokuWiki (`.txt`)
|
119
|
+
- Pod (`.pod`)
|
120
|
+
- Man pages (`.1`, `.2`, etc.)
|
121
|
+
|
122
|
+
#### Data and Research Formats
|
123
|
+
|
124
|
+
- CSV (`.csv`) and TSV (`.tsv`) files
|
125
|
+
- Jupyter Notebooks (`.ipynb`)
|
126
|
+
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
127
|
+
- CSL-JSON (`.json`)
|
128
|
+
- EndNote XML (`.xml`)
|
129
|
+
- RIS (`.ris`)
|
130
|
+
- JATS XML (`.xml`)
|
131
|
+
|
132
|
+
#### Image Formats
|
133
|
+
|
134
|
+
- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
|
135
|
+
- PNG (`.png`)
|
136
|
+
- TIFF (`.tiff`, `.tif`)
|
137
|
+
- BMP (`.bmp`)
|
138
|
+
- GIF (`.gif`)
|
139
|
+
- WebP (`.webp`)
|
140
|
+
- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
|
141
|
+
- Portable Anymap (`.pnm`)
|
142
|
+
- Portable Bitmap (`.pbm`)
|
143
|
+
- Portable Graymap (`.pgm`)
|
144
|
+
- Portable Pixmap (`.ppm`)
|
145
|
+
|
146
|
+
## Usage
|
147
|
+
|
148
|
+
Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
|
149
|
+
|
150
|
+
- `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
|
151
|
+
- `extract_bytes()`: Extract text from bytes (accepts a byte string)
|
152
|
+
|
153
|
+
### Quick Start
|
154
|
+
|
155
|
+
```python
|
156
|
+
from pathlib import Path
|
157
|
+
from kreuzberg import extract_file, extract_bytes
|
158
|
+
|
159
|
+
# Basic file extraction
|
160
|
+
async def extract_document():
|
161
|
+
# Extract from a PDF file
|
162
|
+
pdf_result = await extract_file("document.pdf")
|
163
|
+
print(f"PDF text: {pdf_result.content}")
|
164
|
+
|
165
|
+
# Extract from an image
|
166
|
+
img_result = await extract_file("scan.png")
|
167
|
+
print(f"Image text: {img_result.content}")
|
168
|
+
|
169
|
+
# Extract from Word document
|
170
|
+
docx_result = await extract_file(Path("document.docx"))
|
171
|
+
print(f"Word text: {docx_result.content}")
|
172
|
+
```
|
173
|
+
|
174
|
+
### Processing Uploaded Files
|
175
|
+
|
176
|
+
```python
|
177
|
+
from kreuzberg import extract_bytes
|
178
|
+
|
179
|
+
async def process_upload(file_content: bytes, mime_type: str):
|
180
|
+
"""Process uploaded file content with known MIME type."""
|
181
|
+
result = await extract_bytes(file_content, mime_type=mime_type)
|
182
|
+
return result.content
|
183
|
+
|
184
|
+
# Example usage with different file types
|
185
|
+
async def handle_uploads():
|
186
|
+
# Process PDF upload
|
187
|
+
pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
|
188
|
+
|
189
|
+
# Process image upload
|
190
|
+
img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
|
191
|
+
|
192
|
+
# Process Word document upload
|
193
|
+
docx_result = await extract_bytes(docx_bytes,
|
194
|
+
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
195
|
+
```
|
196
|
+
|
197
|
+
### Advanced Features
|
198
|
+
|
199
|
+
#### PDF Processing Options
|
200
|
+
|
201
|
+
```python
|
202
|
+
from kreuzberg import extract_file
|
203
|
+
|
204
|
+
async def process_pdf():
|
205
|
+
# Force OCR for PDFs with embedded images or scanned content
|
206
|
+
result = await extract_file("document.pdf", force_ocr=True)
|
207
|
+
|
208
|
+
# Process a scanned PDF (automatically uses OCR)
|
209
|
+
scanned = await extract_file("scanned.pdf")
|
210
|
+
```
|
211
|
+
|
212
|
+
#### ExtractionResult Object
|
213
|
+
|
214
|
+
All extraction functions return an `ExtractionResult` containing:
|
215
|
+
|
216
|
+
- `content`: The extracted text (str)
|
217
|
+
- `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
|
218
|
+
|
219
|
+
```python
|
220
|
+
from kreuzberg import ExtractionResult
|
221
|
+
|
222
|
+
async def process_document(path: str) -> tuple[str, str]:
|
223
|
+
# Access as a named tuple
|
224
|
+
result: ExtractionResult = await extract_file(path)
|
225
|
+
print(f"Content: {result.content}")
|
226
|
+
print(f"Format: {result.mime_type}")
|
227
|
+
|
228
|
+
# Or unpack as a tuple
|
229
|
+
content, mime_type = await extract_file(path)
|
230
|
+
return content, mime_type
|
231
|
+
```
|
232
|
+
|
233
|
+
### Error Handling
|
234
|
+
|
235
|
+
Kreuzberg provides detailed error handling with two main exception types:
|
236
|
+
|
237
|
+
```python
|
238
|
+
from kreuzberg import extract_file
|
239
|
+
from kreuzberg.exceptions import ValidationError, ParsingError
|
240
|
+
|
241
|
+
async def safe_extract(path: str) -> str:
|
242
|
+
try:
|
243
|
+
result = await extract_file(path)
|
244
|
+
return result.content
|
245
|
+
|
246
|
+
except ValidationError as e:
|
247
|
+
# Handles input validation issues:
|
248
|
+
# - Unsupported file types
|
249
|
+
# - Missing files
|
250
|
+
# - Invalid MIME types
|
251
|
+
print(f"Invalid input: {e.message}")
|
252
|
+
print(f"Details: {e.context}")
|
253
|
+
|
254
|
+
except ParsingError as e:
|
255
|
+
# Handles processing errors:
|
256
|
+
# - PDF parsing failures
|
257
|
+
# - OCR errors
|
258
|
+
# - Format conversion issues
|
259
|
+
print(f"Processing failed: {e.message}")
|
260
|
+
print(f"Details: {e.context}")
|
261
|
+
|
262
|
+
return ""
|
263
|
+
|
264
|
+
# Example error contexts
|
265
|
+
try:
|
266
|
+
result = await extract_file("document.xyz")
|
267
|
+
except ValidationError as e:
|
268
|
+
# e.context might contain:
|
269
|
+
# {
|
270
|
+
# "file_path": "document.xyz",
|
271
|
+
# "error": "Unsupported file type",
|
272
|
+
# "supported_types": ["pdf", "docx", ...]
|
273
|
+
# }
|
274
|
+
|
275
|
+
try:
|
276
|
+
result = await extract_file("scan.pdf")
|
277
|
+
except ParsingError as e:
|
278
|
+
# e.context might contain:
|
279
|
+
# {
|
280
|
+
# "file_path": "scan.pdf",
|
281
|
+
# "error": "OCR processing failed",
|
282
|
+
# "details": "Tesseract error: Unable to process image"
|
283
|
+
# }
|
284
|
+
```
|
285
|
+
|
286
|
+
## Roadmap
|
287
|
+
|
288
|
+
V1:
|
289
|
+
|
290
|
+
- [x] - html file text extraction
|
291
|
+
- [ ] - better PDF table extraction
|
292
|
+
- [ ] - batch APIs
|
293
|
+
- [ ] - sync APIs
|
294
|
+
|
295
|
+
V2:
|
296
|
+
|
297
|
+
- [ ] - metadata extraction (breaking change)
|
298
|
+
- [ ] - TBD
|
299
|
+
|
300
|
+
## Contribution
|
301
|
+
|
302
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
303
|
+
submitting PRs to avoid disappointment.
|
304
|
+
|
305
|
+
### Local Development
|
306
|
+
|
307
|
+
1. Clone the repo
|
308
|
+
2. Install the system dependencies
|
309
|
+
3. Install the full dependencies with `uv sync`
|
310
|
+
4. Install the pre-commit hooks with:
|
311
|
+
```shell
|
312
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
313
|
+
```
|
314
|
+
5. Make your changes and submit a PR
|
315
|
+
|
316
|
+
## License
|
317
|
+
|
318
|
+
This library uses the MIT license.
|
@@ -1,14 +1,15 @@
|
|
1
1
|
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
-
kreuzberg/_extractors.py,sha256=
|
3
|
-
kreuzberg/_mime_types.py,sha256=
|
2
|
+
kreuzberg/_extractors.py,sha256=k6xO_2ItaftPmlqzfXyxTn8rdaWdwrJHGziBbo7gCio,6599
|
3
|
+
kreuzberg/_mime_types.py,sha256=0ZYtRrMAaKpCMDkhpTbWAXHCsVob5MFRMGlbni8iYSA,2573
|
4
|
+
kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
|
4
5
|
kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
|
5
6
|
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
6
7
|
kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
|
7
8
|
kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
|
8
9
|
kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
|
9
10
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
kreuzberg-1.
|
11
|
-
kreuzberg-1.
|
12
|
-
kreuzberg-1.
|
13
|
-
kreuzberg-1.
|
14
|
-
kreuzberg-1.
|
11
|
+
kreuzberg-1.5.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
12
|
+
kreuzberg-1.5.0.dist-info/METADATA,sha256=O462ss7M6Cb8cO6fJXwqsOdzkzaZekqa1oGwb7Vrgx8,9641
|
13
|
+
kreuzberg-1.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
14
|
+
kreuzberg-1.5.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
15
|
+
kreuzberg-1.5.0.dist-info/RECORD,,
|
@@ -1,304 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.2
|
2
|
-
Name: kreuzberg
|
3
|
-
Version: 1.4.0
|
4
|
-
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
-
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
-
License: MIT
|
7
|
-
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
-
Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
|
9
|
-
Classifier: Development Status :: 4 - Beta
|
10
|
-
Classifier: Intended Audience :: Developers
|
11
|
-
Classifier: License :: OSI Approved :: MIT License
|
12
|
-
Classifier: Operating System :: OS Independent
|
13
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
-
Classifier: Programming Language :: Python :: 3.9
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
16
|
-
Classifier: Programming Language :: Python :: 3.11
|
17
|
-
Classifier: Programming Language :: Python :: 3.12
|
18
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
19
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
20
|
-
Classifier: Topic :: Text Processing :: General
|
21
|
-
Classifier: Topic :: Utilities
|
22
|
-
Classifier: Typing :: Typed
|
23
|
-
Requires-Python: >=3.9
|
24
|
-
Description-Content-Type: text/markdown
|
25
|
-
License-File: LICENSE
|
26
|
-
Requires-Dist: anyio>=4.8.0
|
27
|
-
Requires-Dist: charset-normalizer>=3.4.1
|
28
|
-
Requires-Dist: html-to-markdown>=1.2.0
|
29
|
-
Requires-Dist: pypandoc>=1.15
|
30
|
-
Requires-Dist: pypdfium2>=4.30.1
|
31
|
-
Requires-Dist: python-pptx>=1.0.2
|
32
|
-
|
33
|
-
# Kreuzberg
|
34
|
-
|
35
|
-
Kreuzberg is a library for simplified text extraction from PDF files. It's meant to offer simple, hassle free text
|
36
|
-
extraction.
|
37
|
-
|
38
|
-
Why?
|
39
|
-
|
40
|
-
I am building, like many do now, a RAG focused service (checkout https://grantflow.ai). I have text extraction needs.
|
41
|
-
There are quite a lot of commercial options out there, and several open-source + paid options.
|
42
|
-
But I wanted something simple, which does not require expensive round-trips to an external API.
|
43
|
-
Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
|
44
|
-
|
45
|
-
Hence, this library.
|
46
|
-
|
47
|
-
## Features
|
48
|
-
|
49
|
-
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
|
-
- Use modern Python with async (via `anyio`) and proper type hints
|
51
|
-
- Extensive error handling for easy debugging
|
52
|
-
|
53
|
-
## Installation
|
54
|
-
|
55
|
-
1. Begin by installing the python package:
|
56
|
-
|
57
|
-
```shell
|
58
|
-
|
59
|
-
pip install kreuzberg
|
60
|
-
|
61
|
-
```
|
62
|
-
|
63
|
-
2. Install the system dependencies:
|
64
|
-
|
65
|
-
- [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
|
66
|
-
- [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
|
67
|
-
|
68
|
-
## Dependencies and Philosophy
|
69
|
-
|
70
|
-
This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
|
71
|
-
high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
|
72
|
-
polished and well maintained.
|
73
|
-
|
74
|
-
### Dependencies
|
75
|
-
|
76
|
-
- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
|
77
|
-
- Images are processed using Tesseract OCR
|
78
|
-
- Office documents and other formats are processed using Pandoc
|
79
|
-
- PPTX files are converted using python-pptx
|
80
|
-
- HTML files are converted using html-to-markdown
|
81
|
-
- Plain text files are read directly with appropriate encoding detection
|
82
|
-
|
83
|
-
### Roadmap
|
84
|
-
|
85
|
-
V1:
|
86
|
-
|
87
|
-
- [x] - html file text extraction
|
88
|
-
- [ ] - better PDF table extraction
|
89
|
-
- [ ] - TBD
|
90
|
-
|
91
|
-
V2:
|
92
|
-
|
93
|
-
- [ ] - extra install groups (to make dependencies optional)
|
94
|
-
- [ ] - metadata extraction (possible breaking change)
|
95
|
-
- [ ] - TBD
|
96
|
-
|
97
|
-
### Feature Requests
|
98
|
-
|
99
|
-
Feel free to open a discussion in GitHub or an issue if you have any feature requests
|
100
|
-
|
101
|
-
### Contribution
|
102
|
-
|
103
|
-
Is welcome! Read guidelines below.
|
104
|
-
|
105
|
-
## Supported File Types
|
106
|
-
|
107
|
-
Kreuzberg supports a wide range of file formats:
|
108
|
-
|
109
|
-
### Document Formats
|
110
|
-
|
111
|
-
- PDF (`.pdf`) - both searchable and scanned documents
|
112
|
-
- Word Documents (`.docx`, `.doc`)
|
113
|
-
- PowerPoint Presentations (`.pptx`)
|
114
|
-
- OpenDocument Text (`.odt`)
|
115
|
-
- Rich Text Format (`.rtf`)
|
116
|
-
|
117
|
-
### Image Formats
|
118
|
-
|
119
|
-
- JPEG, JPG (`.jpg`, `.jpeg`, `.pjpeg`)
|
120
|
-
- PNG (`.png`)
|
121
|
-
- TIFF (`.tiff`, `.tif`)
|
122
|
-
- BMP (`.bmp`)
|
123
|
-
- GIF (`.gif`)
|
124
|
-
- WebP (`.webp`)
|
125
|
-
- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
|
126
|
-
- Portable Anymap (`.pnm`)
|
127
|
-
- Portable Bitmap (`.pbm`)
|
128
|
-
- Portable Graymap (`.pgm`)
|
129
|
-
- Portable Pixmap (`.ppm`)
|
130
|
-
|
131
|
-
#### Text and Markup Formats
|
132
|
-
|
133
|
-
- HTML (`.html`, `.htm`)
|
134
|
-
- Plain Text (`.txt`)
|
135
|
-
- Markdown (`.md`)
|
136
|
-
- reStructuredText (`.rst`)
|
137
|
-
- LaTeX (`.tex`)
|
138
|
-
|
139
|
-
#### Data Formats
|
140
|
-
|
141
|
-
- Comma-Separated Values (`.csv`)
|
142
|
-
- Tab-Separated Values (`.tsv`)
|
143
|
-
|
144
|
-
## Usage
|
145
|
-
|
146
|
-
Kreuzberg exports two async functions:
|
147
|
-
|
148
|
-
- Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
|
149
|
-
- Extract text from a byte-string using `extract_bytes()`
|
150
|
-
|
151
|
-
### Extract from File
|
152
|
-
|
153
|
-
```python
|
154
|
-
from pathlib import Path
|
155
|
-
from kreuzberg import extract_file
|
156
|
-
|
157
|
-
|
158
|
-
# Extract text from a PDF file
|
159
|
-
async def extract_pdf():
|
160
|
-
result = await extract_file("document.pdf")
|
161
|
-
print(f"Extracted text: {result.content}")
|
162
|
-
print(f"Output mime type: {result.mime_type}")
|
163
|
-
|
164
|
-
|
165
|
-
# Extract text from an image
|
166
|
-
async def extract_image():
|
167
|
-
result = await extract_file("scan.png")
|
168
|
-
print(f"Extracted text: {result.content}")
|
169
|
-
|
170
|
-
|
171
|
-
# or use Path
|
172
|
-
|
173
|
-
async def extract_pdf():
|
174
|
-
result = await extract_file(Path("document.pdf"))
|
175
|
-
print(f"Extracted text: {result.content}")
|
176
|
-
print(f"Output mime type: {result.mime_type}")
|
177
|
-
```
|
178
|
-
|
179
|
-
### Extract from Bytes
|
180
|
-
|
181
|
-
```python
|
182
|
-
from kreuzberg import extract_bytes
|
183
|
-
|
184
|
-
|
185
|
-
# Extract text from PDF bytes
|
186
|
-
async def process_uploaded_pdf(pdf_content: bytes):
|
187
|
-
result = await extract_bytes(pdf_content, mime_type="application/pdf")
|
188
|
-
return result.content
|
189
|
-
|
190
|
-
|
191
|
-
# Extract text from image bytes
|
192
|
-
async def process_uploaded_image(image_content: bytes):
|
193
|
-
result = await extract_bytes(image_content, mime_type="image/jpeg")
|
194
|
-
return result.content
|
195
|
-
```
|
196
|
-
|
197
|
-
### Forcing OCR
|
198
|
-
|
199
|
-
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
|
200
|
-
You can do this by passing `force_ocr=True`:
|
201
|
-
|
202
|
-
```python
|
203
|
-
from kreuzberg import extract_bytes
|
204
|
-
|
205
|
-
|
206
|
-
# Extract text from PDF bytes and force OCR
|
207
|
-
async def process_uploaded_pdf(pdf_content: bytes):
|
208
|
-
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
209
|
-
return result.content
|
210
|
-
```
|
211
|
-
|
212
|
-
### Error Handling
|
213
|
-
|
214
|
-
Kreuzberg raises two exception types:
|
215
|
-
|
216
|
-
#### ValidationError
|
217
|
-
|
218
|
-
Raised when there are issues with input validation:
|
219
|
-
|
220
|
-
- Unsupported mime types
|
221
|
-
- Undetectable mime types
|
222
|
-
- Path doesn't point at an existing file
|
223
|
-
|
224
|
-
#### ParsingError
|
225
|
-
|
226
|
-
Raised when there are issues during the text extraction process:
|
227
|
-
|
228
|
-
- PDF parsing failures
|
229
|
-
- OCR errors
|
230
|
-
- Pandoc conversion errors
|
231
|
-
|
232
|
-
```python
|
233
|
-
from kreuzberg import extract_file
|
234
|
-
from kreuzberg.exceptions import ValidationError, ParsingError
|
235
|
-
|
236
|
-
|
237
|
-
async def safe_extract():
|
238
|
-
try:
|
239
|
-
result = await extract_file("document.doc")
|
240
|
-
return result.content
|
241
|
-
except ValidationError as e:
|
242
|
-
print(f"Validation error: {e.message}")
|
243
|
-
print(f"Context: {e.context}")
|
244
|
-
except ParsingError as e:
|
245
|
-
print(f"Parsing error: {e.message}")
|
246
|
-
print(f"Context: {e.context}") # Contains detailed error information
|
247
|
-
```
|
248
|
-
|
249
|
-
Both error types include helpful context information for debugging:
|
250
|
-
|
251
|
-
```python
|
252
|
-
try:
|
253
|
-
result = await extract_file("scanned.pdf")
|
254
|
-
except ParsingError as e:
|
255
|
-
# e.context might contain:
|
256
|
-
# {
|
257
|
-
# "file_path": "scanned.pdf",
|
258
|
-
# "error": "Tesseract OCR failed: Unable to process image"
|
259
|
-
# }
|
260
|
-
```
|
261
|
-
|
262
|
-
### ExtractionResult
|
263
|
-
|
264
|
-
All extraction functions return an ExtractionResult named tuple containing:
|
265
|
-
|
266
|
-
- `content`: The extracted text as a string
|
267
|
-
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
|
268
|
-
|
269
|
-
```python
|
270
|
-
from kreuzberg import ExtractionResult
|
271
|
-
|
272
|
-
|
273
|
-
async def process_document(path: str) -> str:
|
274
|
-
result: ExtractionResult = await extract_file(path)
|
275
|
-
return result.content
|
276
|
-
|
277
|
-
|
278
|
-
# or access the result as tuple
|
279
|
-
|
280
|
-
async def process_document(path: str) -> str:
|
281
|
-
content, mime_type = await extract_file(path)
|
282
|
-
# do something with mime_type
|
283
|
-
return content
|
284
|
-
```
|
285
|
-
|
286
|
-
## Contribution
|
287
|
-
|
288
|
-
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
289
|
-
submitting PRs to avoid disappointment.
|
290
|
-
|
291
|
-
### Local Development
|
292
|
-
|
293
|
-
1. Clone the repo
|
294
|
-
2. Install the system dependencies
|
295
|
-
3. Install the full dependencies with `uv sync`
|
296
|
-
4. Install the pre-commit hooks with:
|
297
|
-
```shell
|
298
|
-
pre-commit install && pre-commit install --hook-type commit-msg
|
299
|
-
```
|
300
|
-
5. Make your changes and submit a PR
|
301
|
-
|
302
|
-
## License
|
303
|
-
|
304
|
-
This library uses the MIT license.
|
File without changes
|
File without changes
|
File without changes
|