kreuzberg 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +45 -36
- kreuzberg/_mime_types.py +24 -33
- kreuzberg/_pandoc.py +416 -0
- kreuzberg/extraction.py +8 -0
- kreuzberg-1.6.0.dist-info/METADATA +317 -0
- kreuzberg-1.6.0.dist-info/RECORD +15 -0
- kreuzberg-1.4.0.dist-info/METADATA +0 -304
- kreuzberg-1.4.0.dist-info/RECORD +0 -14
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.6.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.6.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.4.0.dist-info → kreuzberg-1.6.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -4,24 +4,24 @@ import re
|
|
4
4
|
from contextlib import suppress
|
5
5
|
from html import escape
|
6
6
|
from io import BytesIO
|
7
|
-
from
|
7
|
+
from pathlib import Path
|
8
|
+
from tempfile import NamedTemporaryFile
|
9
|
+
from typing import TYPE_CHECKING
|
8
10
|
|
9
11
|
import html_to_markdown
|
10
12
|
import pptx
|
11
|
-
import pypandoc
|
12
13
|
import pypdfium2
|
13
14
|
from anyio import Path as AsyncPath
|
14
|
-
from
|
15
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
16
|
+
from xlsx2csv import Xlsx2csv
|
15
17
|
|
16
|
-
from kreuzberg.
|
18
|
+
from kreuzberg._pandoc import process_content, process_file
|
17
19
|
from kreuzberg._string import normalize_spaces, safe_decode
|
18
20
|
from kreuzberg._sync import run_sync
|
19
21
|
from kreuzberg._tesseract import batch_process_images
|
20
22
|
from kreuzberg.exceptions import ParsingError
|
21
23
|
|
22
24
|
if TYPE_CHECKING: # pragma: no cover
|
23
|
-
from pathlib import Path
|
24
|
-
|
25
25
|
from PIL.Image import Image
|
26
26
|
|
27
27
|
|
@@ -98,32 +98,18 @@ async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
|
98
98
|
return await extract_pdf_with_tesseract(file_path)
|
99
99
|
|
100
100
|
|
101
|
-
async def extract_content_with_pandoc(file_data: bytes, mime_type: str
|
101
|
+
async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
|
102
102
|
"""Extract text using pandoc.
|
103
103
|
|
104
104
|
Args:
|
105
105
|
file_data: The content of the file.
|
106
106
|
mime_type: The mime type of the file.
|
107
|
-
encoding: An optional encoding to use when decoding the string.
|
108
|
-
|
109
|
-
Raises:
|
110
|
-
ParsingError: If the text could not be extracted from the file using pandoc.
|
111
107
|
|
112
108
|
Returns:
|
113
109
|
The extracted text.
|
114
110
|
"""
|
115
|
-
|
116
|
-
|
117
|
-
try:
|
118
|
-
return normalize_spaces(
|
119
|
-
cast(str, await run_sync(pypandoc.convert_text, file_data, to="md", format=ext, encoding=encoding))
|
120
|
-
)
|
121
|
-
except RuntimeError as e:
|
122
|
-
# TODO: add test case
|
123
|
-
raise ParsingError(
|
124
|
-
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
|
125
|
-
context={"error": str(e)},
|
126
|
-
) from e
|
111
|
+
result = await process_content(file_data, mime_type=mime_type)
|
112
|
+
return normalize_spaces(result.content)
|
127
113
|
|
128
114
|
|
129
115
|
async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
|
@@ -133,20 +119,11 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
|
|
133
119
|
file_path: The path to the file.
|
134
120
|
mime_type: The mime type of the file.
|
135
121
|
|
136
|
-
Raises:
|
137
|
-
ParsingError: If the text could not be extracted from the file using pandoc.
|
138
|
-
|
139
122
|
Returns:
|
140
123
|
The extracted text.
|
141
124
|
"""
|
142
|
-
|
143
|
-
|
144
|
-
return normalize_spaces(cast(str, await run_sync(pypandoc.convert_file, file_path, to="md", format=ext)))
|
145
|
-
except RuntimeError as e:
|
146
|
-
raise ParsingError(
|
147
|
-
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
|
148
|
-
context={"file_path": str(file_path), "error": str(e)},
|
149
|
-
) from e
|
125
|
+
result = await process_file(file_path, mime_type=mime_type)
|
126
|
+
return normalize_spaces(result.content)
|
150
127
|
|
151
128
|
|
152
129
|
async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
@@ -161,8 +138,6 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
161
138
|
Returns:
|
162
139
|
The extracted text content
|
163
140
|
"""
|
164
|
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
165
|
-
|
166
141
|
md_content = ""
|
167
142
|
file_contents = (
|
168
143
|
file_path_or_contents
|
@@ -221,6 +196,40 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
221
196
|
return normalize_spaces(md_content)
|
222
197
|
|
223
198
|
|
199
|
+
async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
|
200
|
+
"""Extract text from an XLSX file by converting it to CSV and then to markdown.
|
201
|
+
|
202
|
+
Args:
|
203
|
+
file_path_or_contents: The path to the XLSX file or its contents as bytes.
|
204
|
+
|
205
|
+
Returns:
|
206
|
+
The extracted text content.
|
207
|
+
|
208
|
+
Raises:
|
209
|
+
ParsingError: If the XLSX file could not be parsed.
|
210
|
+
"""
|
211
|
+
try:
|
212
|
+
with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
|
213
|
+
if isinstance(file_path_or_contents, bytes):
|
214
|
+
xlsx_file.write(file_path_or_contents)
|
215
|
+
xlsx_file.flush()
|
216
|
+
xlsx_path = xlsx_file.name
|
217
|
+
else:
|
218
|
+
xlsx_path = str(file_path_or_contents)
|
219
|
+
|
220
|
+
await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
|
221
|
+
result = await process_file(csv_file.name, mime_type="text/csv")
|
222
|
+
return normalize_spaces(result.content)
|
223
|
+
except Exception as e:
|
224
|
+
raise ParsingError(
|
225
|
+
"Could not extract text from XLSX file",
|
226
|
+
context={
|
227
|
+
"error": str(e),
|
228
|
+
"file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
|
229
|
+
},
|
230
|
+
) from e
|
231
|
+
|
232
|
+
|
224
233
|
async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
|
225
234
|
"""Extract text from an HTML string.
|
226
235
|
|
kreuzberg/_mime_types.py
CHANGED
@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
|
10
10
|
PDF_MIME_TYPE: Final = "application/pdf"
|
11
11
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
12
12
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
13
|
-
|
13
|
+
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
14
14
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
15
15
|
|
16
16
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -54,49 +54,40 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
|
54
54
|
"image/x-portable-pixmap": "ppm",
|
55
55
|
}
|
56
56
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
57
|
-
"application/
|
58
|
-
"application/
|
57
|
+
"application/csl+json",
|
58
|
+
"application/docbook+xml",
|
59
|
+
"application/epub+zip",
|
59
60
|
"application/rtf",
|
60
61
|
"application/vnd.oasis.opendocument.text",
|
61
62
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
62
|
-
"application/x-
|
63
|
+
"application/x-biblatex",
|
64
|
+
"application/x-bibtex",
|
65
|
+
"application/x-endnote+xml",
|
66
|
+
"application/x-fictionbook+xml",
|
67
|
+
"application/x-ipynb+json",
|
68
|
+
"application/x-jats+xml",
|
63
69
|
"application/x-latex",
|
64
|
-
"application/x-
|
65
|
-
"application/x-
|
70
|
+
"application/x-opml+xml",
|
71
|
+
"application/x-research-info-systems",
|
72
|
+
"application/x-typst",
|
66
73
|
"text/csv",
|
67
|
-
"text/latex",
|
68
|
-
"text/rst",
|
69
|
-
"text/rtf",
|
70
74
|
"text/tab-separated-values",
|
71
|
-
"text/
|
72
|
-
"text/x-
|
75
|
+
"text/troff",
|
76
|
+
"text/x-commonmark",
|
77
|
+
"text/x-dokuwiki",
|
78
|
+
"text/x-gfm",
|
79
|
+
"text/x-markdown",
|
80
|
+
"text/x-markdown-extra",
|
81
|
+
"text/x-mdoc",
|
82
|
+
"text/x-multimarkdown",
|
83
|
+
"text/x-org",
|
84
|
+
"text/x-pod",
|
73
85
|
"text/x-rst",
|
74
|
-
"text/x-tsv",
|
75
|
-
}
|
76
|
-
PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
77
|
-
"application/csv": "csv",
|
78
|
-
"application/latex": "latex",
|
79
|
-
"application/rtf": "rtf",
|
80
|
-
"application/vnd.oasis.opendocument.text": "odt",
|
81
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
82
|
-
"application/x-csv": "csv",
|
83
|
-
"application/x-latex": "latex",
|
84
|
-
"application/x-rtf": "rtf",
|
85
|
-
"application/x-vnd.oasis.opendocument.text": "odt",
|
86
|
-
"text/csv": "csv",
|
87
|
-
"text/latex": "latex",
|
88
|
-
"text/rst": "rst",
|
89
|
-
"text/rtf": "rtf",
|
90
|
-
"text/tab-separated-values": "tsv",
|
91
|
-
"text/x-csv": "csv",
|
92
|
-
"text/x-latex": "latex",
|
93
|
-
"text/x-rst": "rst",
|
94
|
-
"text/x-tsv": "tsv",
|
95
86
|
}
|
96
87
|
|
97
88
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
98
89
|
PLAIN_TEXT_MIME_TYPES
|
99
90
|
| IMAGE_MIME_TYPES
|
100
91
|
| PANDOC_SUPPORTED_MIME_TYPES
|
101
|
-
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
|
92
|
+
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
|
102
93
|
)
|
kreuzberg/_pandoc.py
ADDED
@@ -0,0 +1,416 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
import subprocess
|
5
|
+
from asyncio import gather
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from tempfile import NamedTemporaryFile
|
8
|
+
from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
|
9
|
+
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
|
12
|
+
from kreuzberg._string import normalize_spaces
|
13
|
+
from kreuzberg._sync import run_sync
|
14
|
+
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from collections.abc import Mapping
|
18
|
+
from os import PathLike
|
19
|
+
|
20
|
+
try: # pragma: no cover
|
21
|
+
from typing import NotRequired # type: ignore[attr-defined]
|
22
|
+
except ImportError: # pragma: no cover
|
23
|
+
from typing_extensions import NotRequired
|
24
|
+
|
25
|
+
version_ref: Final[dict[str, bool]] = {"checked": False}
|
26
|
+
|
27
|
+
|
28
|
+
# Block-level node types in Pandoc AST
|
29
|
+
BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
|
30
|
+
BLOCK_PARA: Final = "Para" # Paragraph containing inline content
|
31
|
+
BLOCK_CODE: Final = "CodeBlock" # Code block with attributes and string content
|
32
|
+
BLOCK_QUOTE: Final = "BlockQuote" # Block quote containing blocks
|
33
|
+
BLOCK_LIST: Final = "BulletList" # Bullet list containing items (blocks)
|
34
|
+
BLOCK_ORDERED: Final = "OrderedList" # Numbered list with attrs and items
|
35
|
+
|
36
|
+
# Inline-level node types in Pandoc AST
|
37
|
+
INLINE_STR: Final = "Str" # Plain text string
|
38
|
+
INLINE_SPACE: Final = "Space" # Single space
|
39
|
+
INLINE_EMPH: Final = "Emph" # Emphasized text (contains inlines)
|
40
|
+
INLINE_STRONG: Final = "Strong" # Strong/bold text (contains inlines)
|
41
|
+
INLINE_LINK: Final = "Link" # Link with text and target
|
42
|
+
INLINE_IMAGE: Final = "Image" # Image with alt text and source
|
43
|
+
INLINE_CODE: Final = "Code" # Inline code span
|
44
|
+
INLINE_MATH: Final = "Math" # Math expression
|
45
|
+
|
46
|
+
# Metadata node types in Pandoc AST
|
47
|
+
META_MAP: Final = "MetaMap" # Key-value mapping of metadata
|
48
|
+
META_LIST: Final = "MetaList" # List of metadata values
|
49
|
+
META_INLINES: Final = "MetaInlines" # Inline content in metadata
|
50
|
+
META_STRING: Final = "MetaString" # Plain string in metadata
|
51
|
+
META_BLOCKS: Final = "MetaBlocks" # Block content in metadata
|
52
|
+
|
53
|
+
# Node content field name
|
54
|
+
CONTENT_FIELD: Final = "c"
|
55
|
+
TYPE_FIELD: Final = "t"
|
56
|
+
|
57
|
+
# Valid node types
|
58
|
+
NodeType = Literal[
|
59
|
+
# Block types
|
60
|
+
"Header",
|
61
|
+
"Para",
|
62
|
+
"CodeBlock",
|
63
|
+
"BlockQuote",
|
64
|
+
"BulletList",
|
65
|
+
"OrderedList",
|
66
|
+
# Inline types
|
67
|
+
"Str",
|
68
|
+
"Space",
|
69
|
+
"Emph",
|
70
|
+
"Strong",
|
71
|
+
"Link",
|
72
|
+
"Image",
|
73
|
+
"Code",
|
74
|
+
"Math",
|
75
|
+
# Meta types
|
76
|
+
"MetaMap",
|
77
|
+
"MetaList",
|
78
|
+
"MetaInlines",
|
79
|
+
"MetaString",
|
80
|
+
"MetaBlocks",
|
81
|
+
]
|
82
|
+
|
83
|
+
PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
|
84
|
+
"application/csl+json": "csljson",
|
85
|
+
"application/docbook+xml": "docbook",
|
86
|
+
"application/epub+zip": "epub",
|
87
|
+
"application/rtf": "rtf",
|
88
|
+
"application/vnd.oasis.opendocument.text": "odt",
|
89
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
90
|
+
"application/x-biblatex": "biblatex",
|
91
|
+
"application/x-bibtex": "bibtex",
|
92
|
+
"application/x-endnote+xml": "endnotexml",
|
93
|
+
"application/x-fictionbook+xml": "fb2",
|
94
|
+
"application/x-ipynb+json": "ipynb",
|
95
|
+
"application/x-jats+xml": "jats",
|
96
|
+
"application/x-latex": "latex",
|
97
|
+
"application/x-opml+xml": "opml",
|
98
|
+
"application/x-research-info-systems": "ris",
|
99
|
+
"application/x-typst": "typst",
|
100
|
+
"text/csv": "csv",
|
101
|
+
"text/tab-separated-values": "tsv",
|
102
|
+
"text/troff": "man",
|
103
|
+
"text/x-commonmark": "commonmark",
|
104
|
+
"text/x-dokuwiki": "dokuwiki",
|
105
|
+
"text/x-gfm": "gfm",
|
106
|
+
"text/x-markdown": "markdown",
|
107
|
+
"text/x-markdown-extra": "markdown_phpextra",
|
108
|
+
"text/x-mdoc": "mdoc",
|
109
|
+
"text/x-multimarkdown": "markdown_mmd",
|
110
|
+
"text/x-org": "org",
|
111
|
+
"text/x-pod": "pod",
|
112
|
+
"text/x-rst": "rst",
|
113
|
+
}
|
114
|
+
|
115
|
+
|
116
|
+
class Metadata(TypedDict, total=False):
|
117
|
+
"""Document metadata extracted from Pandoc document.
|
118
|
+
|
119
|
+
All fields are optional but will only be included if they contain non-empty values.
|
120
|
+
Any field that would be empty or None is omitted from the dictionary.
|
121
|
+
"""
|
122
|
+
|
123
|
+
title: NotRequired[str]
|
124
|
+
"""Document title."""
|
125
|
+
subtitle: NotRequired[str]
|
126
|
+
"""Document subtitle."""
|
127
|
+
abstract: NotRequired[str | list[str]]
|
128
|
+
"""Document abstract, summary or description."""
|
129
|
+
authors: NotRequired[list[str]]
|
130
|
+
"""List of document authors."""
|
131
|
+
date: NotRequired[str]
|
132
|
+
"""Document date as string to preserve original format."""
|
133
|
+
subject: NotRequired[str]
|
134
|
+
"""Document subject or topic."""
|
135
|
+
description: NotRequired[str]
|
136
|
+
"""Extended description."""
|
137
|
+
keywords: NotRequired[list[str]]
|
138
|
+
"""Keywords or tags."""
|
139
|
+
categories: NotRequired[list[str]]
|
140
|
+
"""Categories or classifications."""
|
141
|
+
version: NotRequired[str]
|
142
|
+
"""Version identifier."""
|
143
|
+
language: NotRequired[str]
|
144
|
+
"""Document language code."""
|
145
|
+
references: NotRequired[list[str]]
|
146
|
+
"""Reference entries."""
|
147
|
+
citations: NotRequired[list[str]]
|
148
|
+
"""Citation identifiers."""
|
149
|
+
copyright: NotRequired[str]
|
150
|
+
"""Copyright information."""
|
151
|
+
license: NotRequired[str]
|
152
|
+
"""License information."""
|
153
|
+
identifier: NotRequired[str]
|
154
|
+
"""Document identifier."""
|
155
|
+
publisher: NotRequired[str]
|
156
|
+
"""Publisher name."""
|
157
|
+
contributors: NotRequired[list[str]]
|
158
|
+
"""Additional contributors."""
|
159
|
+
creator: NotRequired[str]
|
160
|
+
"""Document creator."""
|
161
|
+
institute: NotRequired[str | list[str]]
|
162
|
+
"""Institute or organization."""
|
163
|
+
|
164
|
+
|
165
|
+
@dataclass
|
166
|
+
class PandocResult:
|
167
|
+
"""Result of a pandoc conversion including content and metadata."""
|
168
|
+
|
169
|
+
content: str
|
170
|
+
"""The processed markdown content."""
|
171
|
+
metadata: Metadata
|
172
|
+
"""Document metadata extracted from the source."""
|
173
|
+
|
174
|
+
|
175
|
+
def _extract_inline_text(node: dict[str, Any]) -> str | None:
|
176
|
+
if node_type := node.get(TYPE_FIELD):
|
177
|
+
if node_type == INLINE_STR:
|
178
|
+
return node.get(CONTENT_FIELD)
|
179
|
+
if node_type == INLINE_SPACE:
|
180
|
+
return " "
|
181
|
+
if node_type in (INLINE_EMPH, INLINE_STRONG):
|
182
|
+
return _extract_inlines(node.get(CONTENT_FIELD, []))
|
183
|
+
return None # pragma: no cover
|
184
|
+
|
185
|
+
|
186
|
+
def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
|
187
|
+
texts = [text for node in nodes if (text := _extract_inline_text(node))]
|
188
|
+
result = "".join(texts).strip()
|
189
|
+
return result if result else None
|
190
|
+
|
191
|
+
|
192
|
+
def _extract_meta_value(node: Any) -> str | list[str] | None:
|
193
|
+
if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
|
194
|
+
return None
|
195
|
+
|
196
|
+
content = node[CONTENT_FIELD]
|
197
|
+
node_type = node[TYPE_FIELD]
|
198
|
+
|
199
|
+
if not content or node_type not in {
|
200
|
+
META_STRING,
|
201
|
+
META_INLINES,
|
202
|
+
META_LIST,
|
203
|
+
META_BLOCKS,
|
204
|
+
}:
|
205
|
+
return None
|
206
|
+
|
207
|
+
if node_type == META_STRING and isinstance(content, str):
|
208
|
+
return content
|
209
|
+
|
210
|
+
if isinstance(content, list) and (content := [v for v in content if isinstance(v, dict)]):
|
211
|
+
if node_type == META_INLINES:
|
212
|
+
return _extract_inlines(cast(list[dict[str, Any]], content))
|
213
|
+
|
214
|
+
if node_type == META_LIST:
|
215
|
+
results = []
|
216
|
+
for value in [value for item in content if (value := _extract_meta_value(item))]:
|
217
|
+
if isinstance(value, list):
|
218
|
+
results.extend(value) # pragma: no cover
|
219
|
+
else:
|
220
|
+
results.append(value)
|
221
|
+
return results
|
222
|
+
|
223
|
+
if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
|
224
|
+
block_texts = []
|
225
|
+
for block in blocks:
|
226
|
+
block_content = block.get(CONTENT_FIELD, [])
|
227
|
+
if isinstance(block_content, list) and (text := _extract_inlines(block_content)):
|
228
|
+
block_texts.append(text)
|
229
|
+
return block_texts if block_texts else None
|
230
|
+
|
231
|
+
return None
|
232
|
+
|
233
|
+
|
234
|
+
def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
235
|
+
"""Extract all non-empty metadata values from Pandoc AST metadata."""
|
236
|
+
meta: Metadata = {}
|
237
|
+
|
238
|
+
for key, value in raw_meta.items():
|
239
|
+
if extracted := _extract_meta_value(value):
|
240
|
+
meta[key] = extracted # type: ignore[literal-required]
|
241
|
+
|
242
|
+
citations = [
|
243
|
+
cite["citationId"]
|
244
|
+
for block in raw_meta.get("blocks", [])
|
245
|
+
if block.get(TYPE_FIELD) == "Cite"
|
246
|
+
for cite in block.get(CONTENT_FIELD, [[{}]])[0]
|
247
|
+
if isinstance(cite, dict)
|
248
|
+
]
|
249
|
+
if citations:
|
250
|
+
meta["citations"] = citations
|
251
|
+
|
252
|
+
return meta
|
253
|
+
|
254
|
+
|
255
|
+
def _get_extension_from_mime_type(mime_type: str) -> str:
|
256
|
+
if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
|
257
|
+
mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
|
258
|
+
):
|
259
|
+
raise ValidationError(
|
260
|
+
f"Unsupported mime type: {mime_type}",
|
261
|
+
context={
|
262
|
+
"mime_type": mime_type,
|
263
|
+
"supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
|
264
|
+
},
|
265
|
+
)
|
266
|
+
|
267
|
+
return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
|
268
|
+
PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
|
269
|
+
)
|
270
|
+
|
271
|
+
|
272
|
+
async def validate_pandoc_version() -> None:
|
273
|
+
"""Validate that Pandoc is installed and is version 3 or above.
|
274
|
+
|
275
|
+
Raises:
|
276
|
+
MissingDependencyError: If Pandoc is not installed or is below version 3.
|
277
|
+
"""
|
278
|
+
try:
|
279
|
+
if version_ref["checked"]:
|
280
|
+
return
|
281
|
+
|
282
|
+
result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
|
283
|
+
version = result.stdout.decode().split("\n")[0].split()[1]
|
284
|
+
if not version.startswith("3."):
|
285
|
+
raise MissingDependencyError("Pandoc version 3 or above is required.")
|
286
|
+
|
287
|
+
version_ref["checked"] = True
|
288
|
+
|
289
|
+
except FileNotFoundError as e:
|
290
|
+
raise MissingDependencyError("Pandoc is not installed.") from e
|
291
|
+
|
292
|
+
|
293
|
+
async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
|
294
|
+
"""Extract metadata from a document using pandoc.
|
295
|
+
|
296
|
+
Args:
|
297
|
+
input_file: The path to the file to process.
|
298
|
+
mime_type: The mime type of the file.
|
299
|
+
|
300
|
+
Raises:
|
301
|
+
ParsingError: If Pandoc fails to extract metadata.
|
302
|
+
|
303
|
+
Returns:
|
304
|
+
Dictionary containing document metadata.
|
305
|
+
"""
|
306
|
+
extension = _get_extension_from_mime_type(mime_type)
|
307
|
+
|
308
|
+
with NamedTemporaryFile(suffix=".json") as metadata_file:
|
309
|
+
try:
|
310
|
+
command = [
|
311
|
+
"pandoc",
|
312
|
+
str(input_file),
|
313
|
+
f"--from={extension}",
|
314
|
+
"--to=json",
|
315
|
+
"--standalone",
|
316
|
+
"--quiet",
|
317
|
+
"--output",
|
318
|
+
metadata_file.name,
|
319
|
+
]
|
320
|
+
|
321
|
+
result = await run_sync(
|
322
|
+
subprocess.run,
|
323
|
+
command,
|
324
|
+
capture_output=True,
|
325
|
+
)
|
326
|
+
|
327
|
+
if result.returncode != 0:
|
328
|
+
raise ParsingError(
|
329
|
+
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
330
|
+
)
|
331
|
+
|
332
|
+
json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
|
333
|
+
return _extract_metadata(json_data)
|
334
|
+
|
335
|
+
except (RuntimeError, OSError, json.JSONDecodeError) as e:
|
336
|
+
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
337
|
+
|
338
|
+
|
339
|
+
async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
|
340
|
+
extension = _get_extension_from_mime_type(mime_type)
|
341
|
+
|
342
|
+
with NamedTemporaryFile(suffix=".md") as output_file:
|
343
|
+
command = [
|
344
|
+
"pandoc",
|
345
|
+
str(input_file),
|
346
|
+
f"--from={extension}",
|
347
|
+
"--to=markdown",
|
348
|
+
"--standalone",
|
349
|
+
"--wrap=preserve",
|
350
|
+
"--quiet",
|
351
|
+
"--output",
|
352
|
+
output_file.name,
|
353
|
+
]
|
354
|
+
|
355
|
+
if extra_args:
|
356
|
+
command.extend(extra_args)
|
357
|
+
|
358
|
+
result = await run_sync(
|
359
|
+
subprocess.run,
|
360
|
+
command,
|
361
|
+
capture_output=True,
|
362
|
+
)
|
363
|
+
|
364
|
+
if result.returncode != 0:
|
365
|
+
raise ParsingError(
|
366
|
+
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
367
|
+
)
|
368
|
+
|
369
|
+
text = await AsyncPath(output_file.name).read_text()
|
370
|
+
|
371
|
+
return normalize_spaces(text)
|
372
|
+
|
373
|
+
|
374
|
+
async def process_file(
|
375
|
+
input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
|
376
|
+
) -> PandocResult:
|
377
|
+
"""Process a single file using Pandoc and convert to markdown.
|
378
|
+
|
379
|
+
Args:
|
380
|
+
input_file: The path to the file to process.
|
381
|
+
mime_type: The mime type of the file.
|
382
|
+
extra_args: Additional Pandoc command line arguments.
|
383
|
+
|
384
|
+
Returns:
|
385
|
+
PandocResult containing processed content and metadata.
|
386
|
+
"""
|
387
|
+
await validate_pandoc_version()
|
388
|
+
|
389
|
+
metadata, content = await gather(
|
390
|
+
*[
|
391
|
+
extract_metadata(input_file, mime_type=mime_type),
|
392
|
+
_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
|
393
|
+
]
|
394
|
+
)
|
395
|
+
return PandocResult(
|
396
|
+
content=content, # type: ignore[arg-type]
|
397
|
+
metadata=metadata, # type: ignore[arg-type]
|
398
|
+
)
|
399
|
+
|
400
|
+
|
401
|
+
async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
|
402
|
+
"""Process content using Pandoc and convert to markdown.
|
403
|
+
|
404
|
+
Args:
|
405
|
+
content: The content to process.
|
406
|
+
mime_type: The mime type of the content.
|
407
|
+
extra_args: Additional Pandoc command line arguments.
|
408
|
+
|
409
|
+
Returns:
|
410
|
+
PandocResult containing processed content and metadata.
|
411
|
+
"""
|
412
|
+
extension = _get_extension_from_mime_type(mime_type)
|
413
|
+
|
414
|
+
with NamedTemporaryFile(suffix=f".{extension}") as input_file:
|
415
|
+
await AsyncPath(input_file.name).write_bytes(content)
|
416
|
+
return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
|
kreuzberg/extraction.py
CHANGED
@@ -22,8 +22,10 @@ from kreuzberg._extractors import (
|
|
22
22
|
extract_html_string,
|
23
23
|
extract_pdf_file,
|
24
24
|
extract_pptx_file,
|
25
|
+
extract_xlsx_file,
|
25
26
|
)
|
26
27
|
from kreuzberg._mime_types import (
|
28
|
+
EXCEL_MIME_TYPE,
|
27
29
|
HTML_MIME_TYPE,
|
28
30
|
IMAGE_MIME_TYPE_EXT_MAP,
|
29
31
|
IMAGE_MIME_TYPES,
|
@@ -75,6 +77,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
|
|
75
77
|
content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
|
76
78
|
)
|
77
79
|
|
80
|
+
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
81
|
+
return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
|
82
|
+
|
78
83
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
79
84
|
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
|
80
85
|
temp_file.write(content)
|
@@ -134,6 +139,9 @@ async def extract_file(
|
|
134
139
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
135
140
|
return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
136
141
|
|
142
|
+
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
143
|
+
return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
144
|
+
|
137
145
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
138
146
|
return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
139
147
|
|
@@ -0,0 +1,317 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 1.6.0
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Operating System :: OS Independent
|
13
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Classifier: Topic :: Text Processing :: General
|
22
|
+
Classifier: Topic :: Utilities
|
23
|
+
Classifier: Typing :: Typed
|
24
|
+
Requires-Python: >=3.9
|
25
|
+
Description-Content-Type: text/markdown
|
26
|
+
License-File: LICENSE
|
27
|
+
Requires-Dist: anyio>=4.8.0
|
28
|
+
Requires-Dist: charset-normalizer>=3.4.1
|
29
|
+
Requires-Dist: html-to-markdown>=1.2.0
|
30
|
+
Requires-Dist: pypdfium2>=4.30.1
|
31
|
+
Requires-Dist: python-pptx>=1.0.2
|
32
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
|
33
|
+
Requires-Dist: xlsx2csv>=0.8.4
|
34
|
+
|
35
|
+
# Kreuzberg
|
36
|
+
|
37
|
+
Kreuzberg is a modern Python library for text extraction from documents, designed for simplicity and efficiency. It provides a unified async interface for extracting text from a wide range of file formats including PDFs, images, office documents, and more.
|
38
|
+
|
39
|
+
## Why Kreuzberg?
|
40
|
+
|
41
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
42
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
43
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
44
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
45
|
+
- **Modern Python**: Built with async/await, type hints, and current best practices
|
46
|
+
|
47
|
+
Kreuzberg was created to solve text extraction needs in RAG (Retrieval Augmented Generation) applications, but it's suitable for any text extraction use case. Unlike many commercial solutions that require API calls or complex setups, Kreuzberg focuses on local processing with minimal dependencies.
|
48
|
+
|
49
|
+
## Features
|
50
|
+
|
51
|
+
- **Universal Text Extraction**: Extract text from PDFs (both searchable and scanned), images, office documents, and more
|
52
|
+
- **Smart Processing**: Automatic OCR for scanned documents, encoding detection for text files
|
53
|
+
- **Modern Python Design**:
|
54
|
+
- Async-first API using `anyio`
|
55
|
+
- Comprehensive type hints for better IDE support
|
56
|
+
- Detailed error handling with context information
|
57
|
+
- **Production Ready**:
|
58
|
+
- Robust error handling
|
59
|
+
- Detailed debugging information
|
60
|
+
- Memory efficient processing
|
61
|
+
|
62
|
+
## Installation
|
63
|
+
|
64
|
+
### 1. Install the Python Package
|
65
|
+
|
66
|
+
```shell
|
67
|
+
pip install kreuzberg
|
68
|
+
```
|
69
|
+
|
70
|
+
### 2. Install System Dependencies
|
71
|
+
|
72
|
+
Kreuzberg requires two system level dependencies:
|
73
|
+
|
74
|
+
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
75
|
+
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
76
|
+
|
77
|
+
Please install these using their respective installation guides.
|
78
|
+
|
79
|
+
## Architecture
|
80
|
+
|
81
|
+
Kreuzberg is designed as a high-level async abstraction over established open-source tools. It integrates:
|
82
|
+
|
83
|
+
- **PDF Processing**:
|
84
|
+
- `pdfium2` for searchable PDFs
|
85
|
+
- Tesseract OCR for scanned content
|
86
|
+
- **Document Conversion**:
|
87
|
+
- Pandoc for many document and markup formats
|
88
|
+
- `python-pptx` for PowerPoint files
|
89
|
+
- `html-to-markdown` for HTML content
|
90
|
+
- `xlsx2csv` for Excel spreadsheets
|
91
|
+
- **Text Processing**:
|
92
|
+
- Smart encoding detection
|
93
|
+
- Markdown and plain text handling
|
94
|
+
|
95
|
+
### Supported Formats
|
96
|
+
|
97
|
+
#### Document Formats
|
98
|
+
|
99
|
+
- PDF (`.pdf`, both searchable and scanned documents)
|
100
|
+
- Microsoft Word (`.docx`, `.doc`)
|
101
|
+
- PowerPoint presentations (`.pptx`)
|
102
|
+
- OpenDocument Text (`.odt`)
|
103
|
+
- Rich Text Format (`.rtf`)
|
104
|
+
- EPUB (`.epub`)
|
105
|
+
- DocBook XML (`.dbk`, `.xml`)
|
106
|
+
- FictionBook (`.fb2`)
|
107
|
+
- LaTeX (`.tex`, `.latex`)
|
108
|
+
- Typst (`.typ`)
|
109
|
+
|
110
|
+
#### Markup and Text Formats
|
111
|
+
|
112
|
+
- HTML (`.html`, `.htm`)
|
113
|
+
- Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
|
114
|
+
- reStructuredText (`.rst`)
|
115
|
+
- Org-mode (`.org`)
|
116
|
+
- DokuWiki (`.txt`)
|
117
|
+
- Pod (`.pod`)
|
118
|
+
- Man pages (`.1`, `.2`, etc.)
|
119
|
+
|
120
|
+
#### Data and Research Formats
|
121
|
+
|
122
|
+
- Excel spreadsheets (`.xlsx`)
|
123
|
+
- CSV (`.csv`) and TSV (`.tsv`) files
|
124
|
+
- Jupyter Notebooks (`.ipynb`)
|
125
|
+
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
126
|
+
- CSL-JSON (`.json`)
|
127
|
+
- EndNote XML (`.xml`)
|
128
|
+
- RIS (`.ris`)
|
129
|
+
- JATS XML (`.xml`)
|
130
|
+
|
131
|
+
#### Image Formats
|
132
|
+
|
133
|
+
- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
|
134
|
+
- PNG (`.png`)
|
135
|
+
- TIFF (`.tiff`, `.tif`)
|
136
|
+
- BMP (`.bmp`)
|
137
|
+
- GIF (`.gif`)
|
138
|
+
- WebP (`.webp`)
|
139
|
+
- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
|
140
|
+
- Portable Anymap (`.pnm`)
|
141
|
+
- Portable Bitmap (`.pbm`)
|
142
|
+
- Portable Graymap (`.pgm`)
|
143
|
+
- Portable Pixmap (`.ppm`)
|
144
|
+
|
145
|
+
## Usage
|
146
|
+
|
147
|
+
Kreuzberg provides a simple, async-first API for text extraction. The library exports two main functions:
|
148
|
+
|
149
|
+
- `extract_file()`: Extract text from a file (accepts string path or `pathlib.Path`)
|
150
|
+
- `extract_bytes()`: Extract text from bytes (accepts a byte string)
|
151
|
+
|
152
|
+
### Quick Start
|
153
|
+
|
154
|
+
```python
|
155
|
+
from pathlib import Path
|
156
|
+
from kreuzberg import extract_file, extract_bytes
|
157
|
+
|
158
|
+
# Basic file extraction
|
159
|
+
async def extract_document():
|
160
|
+
# Extract from a PDF file
|
161
|
+
pdf_result = await extract_file("document.pdf")
|
162
|
+
print(f"PDF text: {pdf_result.content}")
|
163
|
+
|
164
|
+
# Extract from an image
|
165
|
+
img_result = await extract_file("scan.png")
|
166
|
+
print(f"Image text: {img_result.content}")
|
167
|
+
|
168
|
+
# Extract from Word document
|
169
|
+
docx_result = await extract_file(Path("document.docx"))
|
170
|
+
print(f"Word text: {docx_result.content}")
|
171
|
+
```
|
172
|
+
|
173
|
+
### Processing Uploaded Files
|
174
|
+
|
175
|
+
```python
|
176
|
+
from kreuzberg import extract_bytes
|
177
|
+
|
178
|
+
async def process_upload(file_content: bytes, mime_type: str):
|
179
|
+
"""Process uploaded file content with known MIME type."""
|
180
|
+
result = await extract_bytes(file_content, mime_type=mime_type)
|
181
|
+
return result.content
|
182
|
+
|
183
|
+
# Example usage with different file types
|
184
|
+
async def handle_uploads():
|
185
|
+
# Process PDF upload
|
186
|
+
pdf_result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
|
187
|
+
|
188
|
+
# Process image upload
|
189
|
+
img_result = await extract_bytes(image_bytes, mime_type="image/jpeg")
|
190
|
+
|
191
|
+
# Process Word document upload
|
192
|
+
docx_result = await extract_bytes(docx_bytes,
|
193
|
+
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
194
|
+
```
|
195
|
+
|
196
|
+
### Advanced Features
|
197
|
+
|
198
|
+
#### PDF Processing Options
|
199
|
+
|
200
|
+
```python
|
201
|
+
from kreuzberg import extract_file
|
202
|
+
|
203
|
+
async def process_pdf():
|
204
|
+
# Force OCR for PDFs with embedded images or scanned content
|
205
|
+
result = await extract_file("document.pdf", force_ocr=True)
|
206
|
+
|
207
|
+
# Process a scanned PDF (automatically uses OCR)
|
208
|
+
scanned = await extract_file("scanned.pdf")
|
209
|
+
```
|
210
|
+
|
211
|
+
#### ExtractionResult Object
|
212
|
+
|
213
|
+
All extraction functions return an `ExtractionResult` containing:
|
214
|
+
|
215
|
+
- `content`: The extracted text (str)
|
216
|
+
- `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
|
217
|
+
|
218
|
+
```python
|
219
|
+
from kreuzberg import ExtractionResult
|
220
|
+
|
221
|
+
async def process_document(path: str) -> tuple[str, str]:
|
222
|
+
# Access as a named tuple
|
223
|
+
result: ExtractionResult = await extract_file(path)
|
224
|
+
print(f"Content: {result.content}")
|
225
|
+
print(f"Format: {result.mime_type}")
|
226
|
+
|
227
|
+
# Or unpack as a tuple
|
228
|
+
content, mime_type = await extract_file(path)
|
229
|
+
return content, mime_type
|
230
|
+
```
|
231
|
+
|
232
|
+
### Error Handling
|
233
|
+
|
234
|
+
Kreuzberg provides detailed error handling with two main exception types:
|
235
|
+
|
236
|
+
```python
|
237
|
+
from kreuzberg import extract_file
|
238
|
+
from kreuzberg.exceptions import ValidationError, ParsingError
|
239
|
+
|
240
|
+
async def safe_extract(path: str) -> str:
|
241
|
+
try:
|
242
|
+
result = await extract_file(path)
|
243
|
+
return result.content
|
244
|
+
|
245
|
+
except ValidationError as e:
|
246
|
+
# Handles input validation issues:
|
247
|
+
# - Unsupported file types
|
248
|
+
# - Missing files
|
249
|
+
# - Invalid MIME types
|
250
|
+
print(f"Invalid input: {e.message}")
|
251
|
+
print(f"Details: {e.context}")
|
252
|
+
|
253
|
+
except ParsingError as e:
|
254
|
+
# Handles processing errors:
|
255
|
+
# - PDF parsing failures
|
256
|
+
# - OCR errors
|
257
|
+
# - Format conversion issues
|
258
|
+
print(f"Processing failed: {e.message}")
|
259
|
+
print(f"Details: {e.context}")
|
260
|
+
|
261
|
+
return ""
|
262
|
+
|
263
|
+
# Example error contexts
|
264
|
+
try:
|
265
|
+
result = await extract_file("document.xyz")
|
266
|
+
except ValidationError as e:
|
267
|
+
# e.context might contain:
|
268
|
+
# {
|
269
|
+
# "file_path": "document.xyz",
|
270
|
+
# "error": "Unsupported file type",
|
271
|
+
# "supported_types": ["pdf", "docx", ...]
|
272
|
+
# }
|
273
|
+
|
274
|
+
try:
|
275
|
+
result = await extract_file("scan.pdf")
|
276
|
+
except ParsingError as e:
|
277
|
+
# e.context might contain:
|
278
|
+
# {
|
279
|
+
# "file_path": "scan.pdf",
|
280
|
+
# "error": "OCR processing failed",
|
281
|
+
# "details": "Tesseract error: Unable to process image"
|
282
|
+
# }
|
283
|
+
```
|
284
|
+
|
285
|
+
## Roadmap
|
286
|
+
|
287
|
+
V1:
|
288
|
+
|
289
|
+
- [x] - html file text extraction
|
290
|
+
- [ ] - better PDF table extraction
|
291
|
+
- [ ] - batch APIs
|
292
|
+
- [ ] - sync APIs
|
293
|
+
|
294
|
+
V2:
|
295
|
+
|
296
|
+
- [ ] - metadata extraction (breaking change)
|
297
|
+
- [ ] - TBD
|
298
|
+
|
299
|
+
## Contribution
|
300
|
+
|
301
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
302
|
+
submitting PRs to avoid disappointment.
|
303
|
+
|
304
|
+
### Local Development
|
305
|
+
|
306
|
+
1. Clone the repo
|
307
|
+
2. Install the system dependencies
|
308
|
+
3. Install the full dependencies with `uv sync`
|
309
|
+
4. Install the pre-commit hooks with:
|
310
|
+
```shell
|
311
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
312
|
+
```
|
313
|
+
5. Make your changes and submit a PR
|
314
|
+
|
315
|
+
## License
|
316
|
+
|
317
|
+
This library uses the MIT license.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
+
kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
|
3
|
+
kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
|
4
|
+
kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
|
5
|
+
kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
|
6
|
+
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
7
|
+
kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
|
8
|
+
kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
|
9
|
+
kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
|
10
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
12
|
+
kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
|
13
|
+
kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
14
|
+
kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
15
|
+
kreuzberg-1.6.0.dist-info/RECORD,,
|
@@ -1,304 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.2
|
2
|
-
Name: kreuzberg
|
3
|
-
Version: 1.4.0
|
4
|
-
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
-
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
-
License: MIT
|
7
|
-
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
-
Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
|
9
|
-
Classifier: Development Status :: 4 - Beta
|
10
|
-
Classifier: Intended Audience :: Developers
|
11
|
-
Classifier: License :: OSI Approved :: MIT License
|
12
|
-
Classifier: Operating System :: OS Independent
|
13
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
-
Classifier: Programming Language :: Python :: 3.9
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
16
|
-
Classifier: Programming Language :: Python :: 3.11
|
17
|
-
Classifier: Programming Language :: Python :: 3.12
|
18
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
19
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
20
|
-
Classifier: Topic :: Text Processing :: General
|
21
|
-
Classifier: Topic :: Utilities
|
22
|
-
Classifier: Typing :: Typed
|
23
|
-
Requires-Python: >=3.9
|
24
|
-
Description-Content-Type: text/markdown
|
25
|
-
License-File: LICENSE
|
26
|
-
Requires-Dist: anyio>=4.8.0
|
27
|
-
Requires-Dist: charset-normalizer>=3.4.1
|
28
|
-
Requires-Dist: html-to-markdown>=1.2.0
|
29
|
-
Requires-Dist: pypandoc>=1.15
|
30
|
-
Requires-Dist: pypdfium2>=4.30.1
|
31
|
-
Requires-Dist: python-pptx>=1.0.2
|
32
|
-
|
33
|
-
# Kreuzberg
|
34
|
-
|
35
|
-
Kreuzberg is a library for simplified text extraction from PDF files. It's meant to offer simple, hassle free text
|
36
|
-
extraction.
|
37
|
-
|
38
|
-
Why?
|
39
|
-
|
40
|
-
I am building, like many do now, a RAG focused service (check out https://grantflow.ai). I have text extraction needs.
|
41
|
-
There are quite a lot of commercial options out there, and several open-source + paid options.
|
42
|
-
But I wanted something simple, which does not require expansive round-trips to an external API.
|
43
|
-
Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
|
44
|
-
|
45
|
-
Hence, this library.
|
46
|
-
|
47
|
-
## Features
|
48
|
-
|
49
|
-
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
|
-
- Use modern Python with async (via `anyio`) and proper type hints
|
51
|
-
- Extensive error handling for easy debugging
|
52
|
-
|
53
|
-
## Installation
|
54
|
-
|
55
|
-
1. Begin by installing the python package:
|
56
|
-
|
57
|
-
```shell
|
58
|
-
|
59
|
-
pip install kreuzberg
|
60
|
-
|
61
|
-
```
|
62
|
-
|
63
|
-
2. Install the system dependencies:
|
64
|
-
|
65
|
-
- [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
|
66
|
-
- [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
|
67
|
-
|
68
|
-
## Dependencies and Philosophy
|
69
|
-
|
70
|
-
This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
|
71
|
-
high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
|
72
|
-
polished and well maintained.
|
73
|
-
|
74
|
-
### Dependencies
|
75
|
-
|
76
|
-
- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
|
77
|
-
- Images are processed using Tesseract OCR
|
78
|
-
- Office documents and other formats are processed using Pandoc
|
79
|
-
- PPTX files are converted using python-pptx
|
80
|
-
- HTML files are converted using html-to-markdown
|
81
|
-
- Plain text files are read directly with appropriate encoding detection
|
82
|
-
|
83
|
-
### Roadmap
|
84
|
-
|
85
|
-
V1:
|
86
|
-
|
87
|
-
- [x] - html file text extraction
|
88
|
-
- [ ] - better PDF table extraction
|
89
|
-
- [ ] - TBD
|
90
|
-
|
91
|
-
V2:
|
92
|
-
|
93
|
-
- [ ] - extra install groups (to make dependencies optional)
|
94
|
-
- [ ] - metadata extraction (possible breaking change)
|
95
|
-
- [ ] - TBD
|
96
|
-
|
97
|
-
### Feature Requests
|
98
|
-
|
99
|
-
Feel free to open a discussion in GitHub or an issue if you have any feature requests
|
100
|
-
|
101
|
-
### Contribution
|
102
|
-
|
103
|
-
Is welcome! Read guidelines below.
|
104
|
-
|
105
|
-
## Supported File Types
|
106
|
-
|
107
|
-
Kreuzberg supports a wide range of file formats:
|
108
|
-
|
109
|
-
### Document Formats
|
110
|
-
|
111
|
-
- PDF (`.pdf`) - both searchable and scanned documents
|
112
|
-
- Word Documents (`.docx`, `.doc`)
|
113
|
-
- Power Point Presentations (`.pptx`)
|
114
|
-
- OpenDocument Text (`.odt`)
|
115
|
-
- Rich Text Format (`.rtf`)
|
116
|
-
|
117
|
-
### Image Formats
|
118
|
-
|
119
|
-
- JPEG, JPG (`.jpg`, `.jpeg`, `.pjpeg`)
|
120
|
-
- PNG (`.png`)
|
121
|
-
- TIFF (`.tiff`, `.tif`)
|
122
|
-
- BMP (`.bmp`)
|
123
|
-
- GIF (`.gif`)
|
124
|
-
- WebP (`.webp`)
|
125
|
-
- JPEG 2000 (`.jp2`, `.jpx`, `.jpm`, `.mj2`)
|
126
|
-
- Portable Anymap (`.pnm`)
|
127
|
-
- Portable Bitmap (`.pbm`)
|
128
|
-
- Portable Graymap (`.pgm`)
|
129
|
-
- Portable Pixmap (`.ppm`)
|
130
|
-
|
131
|
-
#### Text and Markup Formats
|
132
|
-
|
133
|
-
- HTML (`.html`, `.htm`)
|
134
|
-
- Plain Text (`.txt`)
|
135
|
-
- Markdown (`.md`)
|
136
|
-
- reStructuredText (`.rst`)
|
137
|
-
- LaTeX (`.tex`)
|
138
|
-
|
139
|
-
#### Data Formats
|
140
|
-
|
141
|
-
- Comma-Separated Values (`.csv`)
|
142
|
-
- Tab-Separated Values (`.tsv`)
|
143
|
-
|
144
|
-
## Usage
|
145
|
-
|
146
|
-
Kreuzberg exports two async functions:
|
147
|
-
|
148
|
-
- Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
|
149
|
-
- Extract text from a byte-string using `extract_bytes()`
|
150
|
-
|
151
|
-
### Extract from File
|
152
|
-
|
153
|
-
```python
|
154
|
-
from pathlib import Path
|
155
|
-
from kreuzberg import extract_file
|
156
|
-
|
157
|
-
|
158
|
-
# Extract text from a PDF file
|
159
|
-
async def extract_pdf():
|
160
|
-
result = await extract_file("document.pdf")
|
161
|
-
print(f"Extracted text: {result.content}")
|
162
|
-
print(f"Output mime type: {result.mime_type}")
|
163
|
-
|
164
|
-
|
165
|
-
# Extract text from an image
|
166
|
-
async def extract_image():
|
167
|
-
result = await extract_file("scan.png")
|
168
|
-
print(f"Extracted text: {result.content}")
|
169
|
-
|
170
|
-
|
171
|
-
# or use Path
|
172
|
-
|
173
|
-
async def extract_pdf():
|
174
|
-
result = await extract_file(Path("document.pdf"))
|
175
|
-
print(f"Extracted text: {result.content}")
|
176
|
-
print(f"Output mime type: {result.mime_type}")
|
177
|
-
```
|
178
|
-
|
179
|
-
### Extract from Bytes
|
180
|
-
|
181
|
-
```python
|
182
|
-
from kreuzberg import extract_bytes
|
183
|
-
|
184
|
-
|
185
|
-
# Extract text from PDF bytes
|
186
|
-
async def process_uploaded_pdf(pdf_content: bytes):
|
187
|
-
result = await extract_bytes(pdf_content, mime_type="application/pdf")
|
188
|
-
return result.content
|
189
|
-
|
190
|
-
|
191
|
-
# Extract text from image bytes
|
192
|
-
async def process_uploaded_image(image_content: bytes):
|
193
|
-
result = await extract_bytes(image_content, mime_type="image/jpeg")
|
194
|
-
return result.content
|
195
|
-
```
|
196
|
-
|
197
|
-
### Forcing OCR
|
198
|
-
|
199
|
-
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
|
200
|
-
You can do this by passing `force_ocr=True`:
|
201
|
-
|
202
|
-
```python
|
203
|
-
from kreuzberg import extract_bytes
|
204
|
-
|
205
|
-
|
206
|
-
# Extract text from PDF bytes and force OCR
|
207
|
-
async def process_uploaded_pdf(pdf_content: bytes):
|
208
|
-
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
209
|
-
return result.content
|
210
|
-
```
|
211
|
-
|
212
|
-
### Error Handling
|
213
|
-
|
214
|
-
Kreuzberg raises two exception types:
|
215
|
-
|
216
|
-
#### ValidationError
|
217
|
-
|
218
|
-
Raised when there are issues with input validation:
|
219
|
-
|
220
|
-
- Unsupported mime types
|
221
|
-
- Undetectable mime types
|
222
|
-
- Path doesn't point at an existing file
|
223
|
-
|
224
|
-
#### ParsingError
|
225
|
-
|
226
|
-
Raised when there are issues during the text extraction process:
|
227
|
-
|
228
|
-
- PDF parsing failures
|
229
|
-
- OCR errors
|
230
|
-
- Pandoc conversion errors
|
231
|
-
|
232
|
-
```python
|
233
|
-
from kreuzberg import extract_file
|
234
|
-
from kreuzberg.exceptions import ValidationError, ParsingError
|
235
|
-
|
236
|
-
|
237
|
-
async def safe_extract():
|
238
|
-
try:
|
239
|
-
result = await extract_file("document.doc")
|
240
|
-
return result.content
|
241
|
-
except ValidationError as e:
|
242
|
-
print(f"Validation error: {e.message}")
|
243
|
-
print(f"Context: {e.context}")
|
244
|
-
except ParsingError as e:
|
245
|
-
print(f"Parsing error: {e.message}")
|
246
|
-
print(f"Context: {e.context}") # Contains detailed error information
|
247
|
-
```
|
248
|
-
|
249
|
-
Both error types include helpful context information for debugging:
|
250
|
-
|
251
|
-
```python
|
252
|
-
try:
|
253
|
-
result = await extract_file("scanned.pdf")
|
254
|
-
except ParsingError as e:
|
255
|
-
# e.context might contain:
|
256
|
-
# {
|
257
|
-
# "file_path": "scanned.pdf",
|
258
|
-
# "error": "Tesseract OCR failed: Unable to process image"
|
259
|
-
# }
|
260
|
-
```
|
261
|
-
|
262
|
-
### ExtractionResult
|
263
|
-
|
264
|
-
All extraction functions return an ExtractionResult named tuple containing:
|
265
|
-
|
266
|
-
- `content`: The extracted text as a string
|
267
|
-
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
|
268
|
-
|
269
|
-
```python
|
270
|
-
from kreuzberg import ExtractionResult
|
271
|
-
|
272
|
-
|
273
|
-
async def process_document(path: str) -> str:
|
274
|
-
result: ExtractionResult = await extract_file(path)
|
275
|
-
return result.content
|
276
|
-
|
277
|
-
|
278
|
-
# or access the result as tuple
|
279
|
-
|
280
|
-
async def process_document(path: str) -> str:
|
281
|
-
content, mime_type = await extract_file(path)
|
282
|
-
# do something with mime_type
|
283
|
-
return content
|
284
|
-
```
|
285
|
-
|
286
|
-
## Contribution
|
287
|
-
|
288
|
-
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
|
289
|
-
submitting PRs to avoid disappointment.
|
290
|
-
|
291
|
-
### Local Development
|
292
|
-
|
293
|
-
1. Clone the repo
|
294
|
-
2. Install the system dependencies
|
295
|
-
3. Install the full dependencies with `uv sync`
|
296
|
-
4. Install the pre-commit hooks with:
|
297
|
-
```shell
|
298
|
-
pre-commit install && pre-commit install --hook-type commit-msg
|
299
|
-
```
|
300
|
-
5. Make your changes and submit a PR
|
301
|
-
|
302
|
-
## License
|
303
|
-
|
304
|
-
This library uses the MIT license.
|
kreuzberg-1.4.0.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
-
kreuzberg/_extractors.py,sha256=Z6fxNMODsiNGPBv8gYpZ0jrc2hPbX-56xdrVPJ-6SQ4,7658
|
3
|
-
kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
|
4
|
-
kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
|
5
|
-
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
6
|
-
kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
|
7
|
-
kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
|
8
|
-
kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
|
9
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
kreuzberg-1.4.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
11
|
-
kreuzberg-1.4.0.dist-info/METADATA,sha256=ul0iSWSu_1i029aq8X4T4ZboOzWpKK8wZRuvvLVqAoQ,8503
|
12
|
-
kreuzberg-1.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
13
|
-
kreuzberg-1.4.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
14
|
-
kreuzberg-1.4.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|