kreuzberg 2.1.1__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.0.dist-info/METADATA +178 -0
- kreuzberg-3.0.0.dist-info/RECORD +15 -0
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_string.py +0 -41
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_tmp.py +0 -37
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.1.dist-info/METADATA +0 -446
- kreuzberg-2.1.1.dist-info/RECORD +0 -21
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/extraction.py
CHANGED
@@ -1,198 +1,140 @@
|
|
1
|
-
"""This module provides functions to extract textual content from files.
|
2
|
-
|
3
|
-
It includes vendored code:
|
4
|
-
|
5
|
-
- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
|
6
|
-
See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
|
7
|
-
Refer to the markitdown repository for its license (MIT).
|
8
|
-
"""
|
9
|
-
|
10
1
|
from __future__ import annotations
|
11
2
|
|
12
|
-
from functools import partial
|
13
|
-
from io import BytesIO
|
14
3
|
from pathlib import Path
|
15
|
-
from typing import TYPE_CHECKING, cast
|
4
|
+
from typing import TYPE_CHECKING, Final, cast
|
16
5
|
|
17
6
|
import anyio
|
18
|
-
from anyio import Path as AsyncPath
|
19
|
-
from PIL.Image import open as open_image
|
20
7
|
|
21
8
|
from kreuzberg import ExtractionResult
|
22
|
-
from kreuzberg.
|
23
|
-
from kreuzberg._html import extract_html_string
|
9
|
+
from kreuzberg._chunker import get_chunker
|
24
10
|
from kreuzberg._mime_types import (
|
25
|
-
EXCEL_MIME_TYPE,
|
26
|
-
HTML_MIME_TYPE,
|
27
|
-
IMAGE_MIME_TYPES,
|
28
|
-
PANDOC_SUPPORTED_MIME_TYPES,
|
29
|
-
PDF_MIME_TYPE,
|
30
|
-
POWER_POINT_MIME_TYPE,
|
31
|
-
SUPPORTED_MIME_TYPES,
|
32
11
|
validate_mime_type,
|
33
12
|
)
|
34
|
-
from kreuzberg.
|
35
|
-
from kreuzberg.
|
36
|
-
|
37
|
-
|
38
|
-
)
|
39
|
-
from kreuzberg._pptx import extract_pptx_file_content
|
40
|
-
from kreuzberg._string import safe_decode
|
41
|
-
from kreuzberg._tesseract import PSMMode, process_image_with_tesseract
|
42
|
-
from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
|
43
|
-
from kreuzberg.exceptions import ValidationError
|
13
|
+
from kreuzberg._registry import ExtractorRegistry
|
14
|
+
from kreuzberg._types import ExtractionConfig
|
15
|
+
from kreuzberg._utils._string import safe_decode
|
16
|
+
from kreuzberg._utils._sync import run_maybe_async, run_maybe_sync
|
44
17
|
|
45
18
|
if TYPE_CHECKING:
|
46
19
|
from collections.abc import Sequence
|
47
20
|
from os import PathLike
|
48
21
|
|
49
22
|
|
50
|
-
|
51
|
-
content: bytes,
|
52
|
-
mime_type: str,
|
53
|
-
*,
|
54
|
-
force_ocr: bool = False,
|
55
|
-
language: str = "eng",
|
56
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
57
|
-
psm: PSMMode = PSMMode.AUTO,
|
58
|
-
) -> ExtractionResult:
|
59
|
-
"""Extract the textual content from a given byte string representing a file's contents.
|
23
|
+
DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
|
60
24
|
|
61
|
-
Args:
|
62
|
-
content: The content to extract.
|
63
|
-
mime_type: The mime type of the content.
|
64
|
-
force_ocr: Whether to force OCR on PDF files that have a text layer.
|
65
|
-
language: The language code for OCR. Defaults to "eng".
|
66
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
67
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
68
25
|
|
69
|
-
|
70
|
-
|
26
|
+
async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
27
|
+
for validator in config.validators or []:
|
28
|
+
await run_maybe_sync(validator, result)
|
71
29
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
f"Unsupported mime type: {mime_type}",
|
78
|
-
context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
|
30
|
+
if config.chunk_content:
|
31
|
+
result.chunks = _handle_chunk_content(
|
32
|
+
mime_type=result.mime_type,
|
33
|
+
config=config,
|
34
|
+
content=result.content,
|
79
35
|
)
|
80
36
|
|
81
|
-
|
82
|
-
|
83
|
-
content, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
|
84
|
-
)
|
37
|
+
for post_processor in config.post_processing_hooks or []:
|
38
|
+
result = await run_maybe_sync(post_processor, result)
|
85
39
|
|
86
|
-
|
87
|
-
return await extract_xlsx_content(content)
|
40
|
+
return result
|
88
41
|
|
89
|
-
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
90
|
-
return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
|
91
42
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
return await process_content_with_pandoc(content=content, mime_type=mime_type)
|
43
|
+
def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
44
|
+
for validator in config.validators or []:
|
45
|
+
run_maybe_async(validator, result)
|
96
46
|
|
97
|
-
if
|
98
|
-
|
47
|
+
if config.chunk_content:
|
48
|
+
result.chunks = _handle_chunk_content(
|
49
|
+
mime_type=result.mime_type,
|
50
|
+
config=config,
|
51
|
+
content=result.content,
|
52
|
+
)
|
99
53
|
|
100
|
-
|
101
|
-
|
54
|
+
for post_processor in config.post_processing_hooks or []:
|
55
|
+
result = run_maybe_async(post_processor, result)
|
102
56
|
|
103
|
-
return
|
104
|
-
content=safe_decode(content),
|
105
|
-
mime_type=mime_type,
|
106
|
-
metadata={},
|
107
|
-
)
|
57
|
+
return result
|
108
58
|
|
109
59
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
60
|
+
def _handle_chunk_content(
|
61
|
+
mime_type: str,
|
62
|
+
config: ExtractionConfig,
|
63
|
+
content: str,
|
64
|
+
) -> list[str]:
|
65
|
+
chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
|
66
|
+
return chunker.chunks(content)
|
67
|
+
|
68
|
+
|
69
|
+
async def extract_bytes(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
|
70
|
+
"""Extract the textual content from a given byte string representing a file's contents.
|
120
71
|
|
121
72
|
Args:
|
122
|
-
|
73
|
+
content: The content to extract.
|
123
74
|
mime_type: The mime type of the content.
|
124
|
-
|
125
|
-
language: The language code for OCR. Defaults to "eng".
|
126
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
127
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
75
|
+
config: Extraction options object, defaults to the default object.
|
128
76
|
|
129
|
-
Raises:
|
130
|
-
ValidationError: If the mime type is not supported.
|
131
77
|
|
132
78
|
Returns:
|
133
79
|
The extracted content and the mime type of the content.
|
134
80
|
"""
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
Path(input_file), force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
|
81
|
+
mime_type = validate_mime_type(mime_type=mime_type)
|
82
|
+
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
83
|
+
result = await extractor.extract_bytes_async(content)
|
84
|
+
else:
|
85
|
+
result = ExtractionResult(
|
86
|
+
content=safe_decode(content),
|
87
|
+
chunks=[],
|
88
|
+
mime_type=mime_type,
|
89
|
+
metadata={},
|
145
90
|
)
|
146
91
|
|
147
|
-
|
148
|
-
return await extract_xlsx_file(Path(input_file))
|
92
|
+
return await _validate_and_post_process_async(result=result, config=config)
|
149
93
|
|
150
|
-
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
151
|
-
return await process_image_with_tesseract(input_file, psm=psm, language=language)
|
152
94
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
95
|
+
async def extract_file(
|
96
|
+
file_path: PathLike[str] | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG
|
97
|
+
) -> ExtractionResult:
|
98
|
+
"""Extract the textual content from a given file.
|
157
99
|
|
158
|
-
|
159
|
-
|
100
|
+
Args:
|
101
|
+
file_path: The path to the file.
|
102
|
+
mime_type: The mime type of the content.
|
103
|
+
config: Extraction options object, defaults to the default object.
|
160
104
|
|
161
|
-
|
162
|
-
|
105
|
+
Returns:
|
106
|
+
The extracted content and the mime type of the content.
|
107
|
+
"""
|
108
|
+
mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
|
109
|
+
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
110
|
+
result = await extractor.extract_path_async(Path(file_path))
|
111
|
+
else:
|
112
|
+
result = ExtractionResult(
|
113
|
+
content=safe_decode(await anyio.Path(file_path).read_bytes()), chunks=[], mime_type=mime_type, metadata={}
|
114
|
+
)
|
163
115
|
|
164
|
-
return
|
116
|
+
return await _validate_and_post_process_async(result=result, config=config)
|
165
117
|
|
166
118
|
|
167
119
|
async def batch_extract_file(
|
168
|
-
file_paths: Sequence[PathLike[str] | str],
|
169
|
-
*,
|
170
|
-
force_ocr: bool = False,
|
171
|
-
language: str = "eng",
|
172
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
173
|
-
psm: PSMMode = PSMMode.AUTO,
|
120
|
+
file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
|
174
121
|
) -> list[ExtractionResult]:
|
175
122
|
"""Extract text from multiple files concurrently.
|
176
123
|
|
177
124
|
Args:
|
178
125
|
file_paths: A sequence of paths to files to extract text from.
|
179
|
-
|
180
|
-
language: The language code for OCR. Defaults to "eng".
|
181
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
182
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
126
|
+
config: Extraction options object, defaults to the default object.
|
183
127
|
|
184
128
|
Returns:
|
185
129
|
A list of extraction results in the same order as the input paths.
|
186
130
|
"""
|
187
|
-
results = cast(list[ExtractionResult], ([None] * len(file_paths)))
|
131
|
+
results = cast("list[ExtractionResult]", ([None] * len(file_paths)))
|
188
132
|
|
189
133
|
async def _extract_file(path: PathLike[str] | str, index: int) -> None:
|
190
134
|
result = await extract_file(
|
191
135
|
path,
|
192
|
-
|
193
|
-
|
194
|
-
psm=psm,
|
195
|
-
language=language,
|
136
|
+
None,
|
137
|
+
config,
|
196
138
|
)
|
197
139
|
results[index] = result
|
198
140
|
|
@@ -204,36 +146,21 @@ async def batch_extract_file(
|
|
204
146
|
|
205
147
|
|
206
148
|
async def batch_extract_bytes(
|
207
|
-
contents: Sequence[tuple[bytes, str]],
|
208
|
-
*,
|
209
|
-
force_ocr: bool = False,
|
210
|
-
language: str = "eng",
|
211
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
212
|
-
psm: PSMMode = PSMMode.AUTO,
|
149
|
+
contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
|
213
150
|
) -> list[ExtractionResult]:
|
214
151
|
"""Extract text from multiple byte contents concurrently.
|
215
152
|
|
216
153
|
Args:
|
217
154
|
contents: A sequence of tuples containing (content, mime_type) pairs.
|
218
|
-
|
219
|
-
language: The language code for OCR. Defaults to "eng".
|
220
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
221
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
155
|
+
config: Extraction options object, defaults to the default object.
|
222
156
|
|
223
157
|
Returns:
|
224
158
|
A list of extraction results in the same order as the input contents.
|
225
159
|
"""
|
226
|
-
results = cast(list[ExtractionResult], [None] * len(contents))
|
160
|
+
results = cast("list[ExtractionResult]", [None] * len(contents))
|
227
161
|
|
228
162
|
async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
|
229
|
-
result = await extract_bytes(
|
230
|
-
content,
|
231
|
-
mime_type,
|
232
|
-
force_ocr=force_ocr,
|
233
|
-
max_processes=max_processes,
|
234
|
-
psm=psm,
|
235
|
-
language=language,
|
236
|
-
)
|
163
|
+
result = await extract_bytes(content, mime_type, config)
|
237
164
|
results[index] = result
|
238
165
|
|
239
166
|
async with anyio.create_task_group() as tg:
|
@@ -243,122 +170,82 @@ async def batch_extract_bytes(
|
|
243
170
|
return results
|
244
171
|
|
245
172
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
def extract_bytes_sync(
|
250
|
-
content: bytes,
|
251
|
-
mime_type: str,
|
252
|
-
*,
|
253
|
-
force_ocr: bool = False,
|
254
|
-
language: str = "eng",
|
255
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
256
|
-
psm: PSMMode = PSMMode.AUTO,
|
257
|
-
) -> ExtractionResult:
|
173
|
+
def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
|
258
174
|
"""Synchronous version of extract_bytes.
|
259
175
|
|
260
176
|
Args:
|
261
177
|
content: The content to extract.
|
262
178
|
mime_type: The mime type of the content.
|
263
|
-
|
264
|
-
language: The language code for OCR. Defaults to "eng".
|
265
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
266
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
179
|
+
config: Extraction options object, defaults to the default object.
|
267
180
|
|
268
181
|
Returns:
|
269
182
|
The extracted content and the mime type of the content.
|
270
183
|
"""
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
184
|
+
mime_type = validate_mime_type(mime_type=mime_type)
|
185
|
+
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
186
|
+
result = extractor.extract_bytes_sync(content)
|
187
|
+
else:
|
188
|
+
result = ExtractionResult(
|
189
|
+
content=safe_decode(content),
|
190
|
+
chunks=[],
|
191
|
+
mime_type=mime_type,
|
192
|
+
metadata={},
|
193
|
+
)
|
194
|
+
|
195
|
+
return _validate_and_post_process_sync(result=result, config=config)
|
275
196
|
|
276
197
|
|
277
198
|
def extract_file_sync(
|
278
|
-
file_path: Path | str,
|
279
|
-
mime_type: str | None = None,
|
280
|
-
*,
|
281
|
-
force_ocr: bool = False,
|
282
|
-
language: str = "eng",
|
283
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
284
|
-
psm: PSMMode = PSMMode.AUTO,
|
199
|
+
file_path: Path | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG
|
285
200
|
) -> ExtractionResult:
|
286
201
|
"""Synchronous version of extract_file.
|
287
202
|
|
288
203
|
Args:
|
289
204
|
file_path: The path to the file.
|
290
205
|
mime_type: The mime type of the content.
|
291
|
-
|
292
|
-
language: The language code for OCR. Defaults to "eng".
|
293
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
294
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
206
|
+
config: Extraction options object, defaults to the default object.
|
295
207
|
|
296
208
|
Returns:
|
297
209
|
The extracted content and the mime type of the content.
|
298
210
|
"""
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
211
|
+
mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
|
212
|
+
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
213
|
+
result = extractor.extract_path_sync(Path(file_path))
|
214
|
+
else:
|
215
|
+
result = ExtractionResult(
|
216
|
+
content=Path(file_path).read_text(),
|
217
|
+
chunks=[],
|
218
|
+
mime_type=mime_type,
|
219
|
+
metadata={},
|
220
|
+
)
|
221
|
+
return _validate_and_post_process_sync(result=result, config=config)
|
303
222
|
|
304
223
|
|
305
224
|
def batch_extract_file_sync(
|
306
|
-
file_paths: Sequence[PathLike[str] | str],
|
307
|
-
*,
|
308
|
-
force_ocr: bool = False,
|
309
|
-
language: str = "eng",
|
310
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
311
|
-
psm: PSMMode = PSMMode.AUTO,
|
225
|
+
file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
|
312
226
|
) -> list[ExtractionResult]:
|
313
227
|
"""Synchronous version of batch_extract_file.
|
314
228
|
|
315
229
|
Args:
|
316
230
|
file_paths: A sequence of paths to files to extract text from.
|
317
|
-
|
318
|
-
language: The language code for OCR. Defaults to "eng".
|
319
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
320
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
231
|
+
config: Extraction options object, defaults to the default object.
|
321
232
|
|
322
233
|
Returns:
|
323
234
|
A list of extraction results in the same order as the input paths.
|
324
235
|
"""
|
325
|
-
|
326
|
-
batch_extract_file,
|
327
|
-
file_paths,
|
328
|
-
force_ocr=force_ocr,
|
329
|
-
max_processes=max_processes,
|
330
|
-
language=language,
|
331
|
-
psm=psm,
|
332
|
-
)
|
333
|
-
return anyio.run(handler)
|
236
|
+
return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
|
334
237
|
|
335
238
|
|
336
239
|
def batch_extract_bytes_sync(
|
337
|
-
contents: Sequence[tuple[bytes, str]],
|
338
|
-
*,
|
339
|
-
force_ocr: bool = False,
|
340
|
-
language: str = "eng",
|
341
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
342
|
-
psm: PSMMode = PSMMode.AUTO,
|
240
|
+
contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
|
343
241
|
) -> list[ExtractionResult]:
|
344
242
|
"""Synchronous version of batch_extract_bytes.
|
345
243
|
|
346
244
|
Args:
|
347
245
|
contents: A sequence of tuples containing (content, mime_type) pairs.
|
348
|
-
|
349
|
-
language: The language code for OCR. Defaults to "eng".
|
350
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
351
|
-
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
246
|
+
config: Extraction options object, defaults to the default object.
|
352
247
|
|
353
248
|
Returns:
|
354
249
|
A list of extraction results in the same order as the input contents.
|
355
250
|
"""
|
356
|
-
|
357
|
-
batch_extract_bytes,
|
358
|
-
contents,
|
359
|
-
force_ocr=force_ocr,
|
360
|
-
max_processes=max_processes,
|
361
|
-
language=language,
|
362
|
-
psm=psm,
|
363
|
-
)
|
364
|
-
return anyio.run(handler)
|
251
|
+
return [extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents]
|
@@ -0,0 +1,178 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.0.0
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Operating System :: OS Independent
|
13
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Classifier: Topic :: Text Processing :: General
|
22
|
+
Classifier: Topic :: Utilities
|
23
|
+
Classifier: Typing :: Typed
|
24
|
+
Requires-Python: >=3.9
|
25
|
+
Description-Content-Type: text/markdown
|
26
|
+
License-File: LICENSE
|
27
|
+
Requires-Dist: anyio>=4.9.0
|
28
|
+
Requires-Dist: charset-normalizer>=3.4.1
|
29
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
+
Requires-Dist: html-to-markdown>=1.2.0
|
31
|
+
Requires-Dist: playa-pdf>=0.4.1
|
32
|
+
Requires-Dist: pypdfium2==4.30.0
|
33
|
+
Requires-Dist: python-calamine>=0.3.1
|
34
|
+
Requires-Dist: python-pptx>=1.0.2
|
35
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
|
36
|
+
Provides-Extra: all
|
37
|
+
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
|
+
Requires-Dist: numpy>=2.0.2; extra == "all"
|
39
|
+
Requires-Dist: paddleocr>=2.10.0; extra == "all"
|
40
|
+
Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
|
41
|
+
Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
|
42
|
+
Requires-Dist: setuptools>=76.0.0; extra == "all"
|
43
|
+
Provides-Extra: chunking
|
44
|
+
Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
|
45
|
+
Provides-Extra: easyocr
|
46
|
+
Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
|
47
|
+
Provides-Extra: paddleocr
|
48
|
+
Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
|
49
|
+
Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
|
50
|
+
Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
|
51
|
+
Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
|
52
|
+
Dynamic: license-file
|
53
|
+
|
54
|
+
# Kreuzberg
|
55
|
+
|
56
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
57
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
58
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
59
|
+
|
60
|
+
Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
|
61
|
+
|
62
|
+
## Why Kreuzberg?
|
63
|
+
|
64
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
65
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
66
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
67
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
68
|
+
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
69
|
+
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
70
|
+
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
71
|
+
|
72
|
+
## Quick Start
|
73
|
+
|
74
|
+
```bash
|
75
|
+
pip install kreuzberg
|
76
|
+
```
|
77
|
+
|
78
|
+
Install the system dependencies (Tesseract OCR and Pandoc):
|
79
|
+
|
80
|
+
```bash
|
81
|
+
# Ubuntu/Debian
|
82
|
+
sudo apt-get install tesseract-ocr pandoc
|
83
|
+
|
84
|
+
# macOS
|
85
|
+
brew install tesseract pandoc
|
86
|
+
|
87
|
+
# Windows
|
88
|
+
choco install -y tesseract pandoc
|
89
|
+
```
|
90
|
+
|
91
|
+
Tesseract is the default OCR engine. If you prefer not to use it, you can either use one of the two alternative OCR engines or disable OCR entirely.
|
92
|
+
|
93
|
+
### Alternative OCR engines
|
94
|
+
|
95
|
+
```bash
|
96
|
+
# Install with EasyOCR support
|
97
|
+
pip install "kreuzberg[easyocr]"
|
98
|
+
|
99
|
+
# Install with PaddleOCR support
|
100
|
+
pip install "kreuzberg[paddleocr]"
|
101
|
+
```
|
102
|
+
|
103
|
+
## Quick Example
|
104
|
+
|
105
|
+
```python
|
106
|
+
import asyncio
|
107
|
+
from kreuzberg import extract_file
|
108
|
+
|
109
|
+
async def main():
|
110
|
+
# Extract text from a PDF
|
111
|
+
result = await extract_file("document.pdf")
|
112
|
+
print(result.content)
|
113
|
+
|
114
|
+
# Extract text from an image
|
115
|
+
result = await extract_file("scan.jpg")
|
116
|
+
print(result.content)
|
117
|
+
|
118
|
+
# Extract text from a Word document
|
119
|
+
result = await extract_file("report.docx")
|
120
|
+
print(result.content)
|
121
|
+
|
122
|
+
asyncio.run(main())
|
123
|
+
```
|
124
|
+
|
125
|
+
## Documentation
|
126
|
+
|
127
|
+
For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
|
128
|
+
|
129
|
+
- [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
|
130
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
|
131
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
|
132
|
+
- [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
|
133
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
|
134
|
+
- [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
|
135
|
+
|
136
|
+
## Supported Formats
|
137
|
+
|
138
|
+
Kreuzberg supports a wide range of document formats:
|
139
|
+
|
140
|
+
- **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
|
141
|
+
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
142
|
+
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
143
|
+
- **Presentations**: PPTX, PPT, etc.
|
144
|
+
- **Web Content**: HTML, XML, etc.
|
145
|
+
|
146
|
+
## OCR Engines
|
147
|
+
|
148
|
+
Kreuzberg supports multiple OCR engines:
|
149
|
+
|
150
|
+
- **Tesseract** (Default): Lightweight, fast startup, requires system installation
|
151
|
+
- **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
|
152
|
+
- **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
|
153
|
+
|
154
|
+
For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
|
155
|
+
|
156
|
+
## Contribution
|
157
|
+
|
158
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
|
159
|
+
|
160
|
+
### Local Development
|
161
|
+
|
162
|
+
1. Clone the repo
|
163
|
+
|
164
|
+
1. Install the system dependencies
|
165
|
+
|
166
|
+
1. Install the full dependencies with `uv sync`
|
167
|
+
|
168
|
+
1. Install the pre-commit hooks with:
|
169
|
+
|
170
|
+
```shell
|
171
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
172
|
+
```
|
173
|
+
|
174
|
+
1. Make your changes and submit a PR
|
175
|
+
|
176
|
+
## License
|
177
|
+
|
178
|
+
This library is released under the MIT license.
|
@@ -0,0 +1,15 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
|
2
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
3
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
4
|
+
kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
|
5
|
+
kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
|
6
|
+
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
7
|
+
kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
|
8
|
+
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
9
|
+
kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
|
10
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
kreuzberg-3.0.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
12
|
+
kreuzberg-3.0.0.dist-info/METADATA,sha256=wlO9VCvZQy_gJJTmhGzH9j8BlPQPFQdmMZQxJOcQAUg,6515
|
13
|
+
kreuzberg-3.0.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
|
14
|
+
kreuzberg-3.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
15
|
+
kreuzberg-3.0.0.dist-info/RECORD,,
|