kreuzberg-3.2.0-py3-none-any.whl → kreuzberg-3.3.0-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +93 -24
- kreuzberg-3.3.0.dist-info/RECORD +48 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/extraction.py
CHANGED
@@ -13,7 +13,8 @@ from kreuzberg._mime_types import (
|
|
13
13
|
from kreuzberg._registry import ExtractorRegistry
|
14
14
|
from kreuzberg._types import ExtractionConfig
|
15
15
|
from kreuzberg._utils._string import safe_decode
|
16
|
-
from kreuzberg._utils._sync import
|
16
|
+
from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
|
17
|
+
from kreuzberg.exceptions import ValidationError
|
17
18
|
|
18
19
|
if TYPE_CHECKING:
|
19
20
|
from collections.abc import Sequence
|
@@ -42,7 +43,7 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
|
|
42
43
|
|
43
44
|
def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
44
45
|
for validator in config.validators or []:
|
45
|
-
|
46
|
+
run_sync_only(validator, result)
|
46
47
|
|
47
48
|
if config.chunk_content:
|
48
49
|
result.chunks = _handle_chunk_content(
|
@@ -52,7 +53,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
|
|
52
53
|
)
|
53
54
|
|
54
55
|
for post_processor in config.post_processing_hooks or []:
|
55
|
-
result =
|
56
|
+
result = run_sync_only(post_processor, result)
|
56
57
|
|
57
58
|
return result
|
58
59
|
|
@@ -104,22 +105,57 @@ async def extract_file(
|
|
104
105
|
|
105
106
|
Returns:
|
106
107
|
The extracted content and the mime type of the content.
|
108
|
+
|
109
|
+
Raises:
|
110
|
+
ValidationError: If the file path or configuration is invalid.
|
107
111
|
"""
|
108
|
-
|
109
|
-
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
110
|
-
result = await extractor.extract_path_async(Path(file_path))
|
111
|
-
else:
|
112
|
-
result = ExtractionResult(
|
113
|
-
content=safe_decode(await anyio.Path(file_path).read_bytes()), chunks=[], mime_type=mime_type, metadata={}
|
114
|
-
)
|
112
|
+
from kreuzberg._utils._document_cache import get_document_cache
|
115
113
|
|
116
|
-
|
114
|
+
cache = get_document_cache()
|
115
|
+
path = Path(file_path)
|
116
|
+
cached_result = cache.get(path, config)
|
117
|
+
if cached_result is not None:
|
118
|
+
return cached_result
|
119
|
+
|
120
|
+
if cache.is_processing(path, config):
|
121
|
+
event = cache.mark_processing(path, config)
|
122
|
+
await anyio.to_thread.run_sync(event.wait) # pragma: no cover
|
123
|
+
|
124
|
+
# Try cache again after waiting for other process to complete # ~keep
|
125
|
+
cached_result = cache.get(path, config) # pragma: no cover
|
126
|
+
if cached_result is not None: # pragma: no cover
|
127
|
+
return cached_result
|
128
|
+
|
129
|
+
cache.mark_processing(path, config)
|
130
|
+
|
131
|
+
try:
|
132
|
+
if not path.exists():
|
133
|
+
raise ValidationError("The file does not exist", context={"file_path": str(path)})
|
134
|
+
|
135
|
+
mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
|
136
|
+
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
137
|
+
result = await extractor.extract_path_async(Path(file_path))
|
138
|
+
else:
|
139
|
+
result = ExtractionResult(
|
140
|
+
content=safe_decode(await anyio.Path(file_path).read_bytes()),
|
141
|
+
chunks=[],
|
142
|
+
mime_type=mime_type,
|
143
|
+
metadata={},
|
144
|
+
)
|
145
|
+
|
146
|
+
result = await _validate_and_post_process_async(result=result, config=config)
|
147
|
+
|
148
|
+
cache.set(path, config, result)
|
149
|
+
|
150
|
+
return result
|
151
|
+
finally:
|
152
|
+
cache.mark_complete(path, config)
|
117
153
|
|
118
154
|
|
119
155
|
async def batch_extract_file(
|
120
156
|
file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
|
121
157
|
) -> list[ExtractionResult]:
|
122
|
-
"""Extract text from multiple files concurrently.
|
158
|
+
"""Extract text from multiple files concurrently with optimizations.
|
123
159
|
|
124
160
|
Args:
|
125
161
|
file_paths: A sequence of paths to files to extract text from.
|
@@ -128,15 +164,43 @@ async def batch_extract_file(
|
|
128
164
|
Returns:
|
129
165
|
A list of extraction results in the same order as the input paths.
|
130
166
|
"""
|
167
|
+
if not file_paths:
|
168
|
+
return []
|
169
|
+
|
170
|
+
import multiprocessing as mp
|
171
|
+
|
172
|
+
max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
|
173
|
+
semaphore = anyio.Semaphore(max_concurrency)
|
174
|
+
|
131
175
|
results = cast("list[ExtractionResult]", ([None] * len(file_paths)))
|
132
176
|
|
133
177
|
async def _extract_file(path: PathLike[str] | str, index: int) -> None:
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
178
|
+
async with semaphore:
|
179
|
+
try:
|
180
|
+
result = await extract_file(
|
181
|
+
path,
|
182
|
+
None,
|
183
|
+
config,
|
184
|
+
)
|
185
|
+
results[index] = result
|
186
|
+
except Exception as e: # noqa: BLE001
|
187
|
+
from kreuzberg._utils._errors import create_error_context
|
188
|
+
|
189
|
+
error_result = ExtractionResult(
|
190
|
+
content=f"Error: {type(e).__name__}: {e!s}",
|
191
|
+
mime_type="text/plain",
|
192
|
+
metadata={ # type: ignore[typeddict-unknown-key]
|
193
|
+
"error": True,
|
194
|
+
"error_context": create_error_context(
|
195
|
+
operation="batch_extract_file",
|
196
|
+
file_path=path,
|
197
|
+
error=e,
|
198
|
+
index=index,
|
199
|
+
),
|
200
|
+
},
|
201
|
+
chunks=[],
|
202
|
+
)
|
203
|
+
results[index] = error_result
|
140
204
|
|
141
205
|
async with anyio.create_task_group() as tg:
|
142
206
|
for i, path in enumerate(file_paths):
|
@@ -148,7 +212,7 @@ async def batch_extract_file(
|
|
148
212
|
async def batch_extract_bytes(
|
149
213
|
contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
|
150
214
|
) -> list[ExtractionResult]:
|
151
|
-
"""Extract text from multiple byte contents concurrently.
|
215
|
+
"""Extract text from multiple byte contents concurrently with optimizations.
|
152
216
|
|
153
217
|
Args:
|
154
218
|
contents: A sequence of tuples containing (content, mime_type) pairs.
|
@@ -157,11 +221,40 @@ async def batch_extract_bytes(
|
|
157
221
|
Returns:
|
158
222
|
A list of extraction results in the same order as the input contents.
|
159
223
|
"""
|
224
|
+
if not contents:
|
225
|
+
return []
|
226
|
+
|
227
|
+
import multiprocessing as mp
|
228
|
+
|
229
|
+
max_concurrency = min(len(contents), mp.cpu_count() * 2)
|
230
|
+
semaphore = anyio.Semaphore(max_concurrency)
|
231
|
+
|
160
232
|
results = cast("list[ExtractionResult]", [None] * len(contents))
|
161
233
|
|
162
234
|
async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
|
163
|
-
|
164
|
-
|
235
|
+
async with semaphore:
|
236
|
+
try:
|
237
|
+
result = await extract_bytes(content, mime_type, config)
|
238
|
+
results[index] = result
|
239
|
+
except Exception as e: # noqa: BLE001
|
240
|
+
from kreuzberg._utils._errors import create_error_context
|
241
|
+
|
242
|
+
error_result = ExtractionResult(
|
243
|
+
content=f"Error: {type(e).__name__}: {e!s}",
|
244
|
+
mime_type="text/plain",
|
245
|
+
metadata={ # type: ignore[typeddict-unknown-key]
|
246
|
+
"error": True,
|
247
|
+
"error_context": create_error_context(
|
248
|
+
operation="batch_extract_bytes",
|
249
|
+
error=e,
|
250
|
+
index=index,
|
251
|
+
mime_type=mime_type,
|
252
|
+
content_size=len(content),
|
253
|
+
),
|
254
|
+
},
|
255
|
+
chunks=[],
|
256
|
+
)
|
257
|
+
results[index] = error_result
|
165
258
|
|
166
259
|
async with anyio.create_task_group() as tg:
|
167
260
|
for i, (content, mime_type) in enumerate(contents):
|
@@ -207,24 +300,57 @@ def extract_file_sync(
|
|
207
300
|
|
208
301
|
Returns:
|
209
302
|
The extracted content and the mime type of the content.
|
303
|
+
|
304
|
+
Raises:
|
305
|
+
ValidationError: If the file path or configuration is invalid.
|
210
306
|
"""
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
)
|
221
|
-
|
307
|
+
from kreuzberg._utils._document_cache import get_document_cache
|
308
|
+
|
309
|
+
cache = get_document_cache()
|
310
|
+
path = Path(file_path)
|
311
|
+
cached_result = cache.get(path, config)
|
312
|
+
if cached_result is not None:
|
313
|
+
return cached_result
|
314
|
+
|
315
|
+
if cache.is_processing(path, config):
|
316
|
+
event = cache.mark_processing(path, config)
|
317
|
+
event.wait() # pragma: no cover
|
318
|
+
|
319
|
+
# Try cache again after waiting for other process to complete # ~keep
|
320
|
+
cached_result = cache.get(path, config) # pragma: no cover
|
321
|
+
if cached_result is not None: # pragma: no cover
|
322
|
+
return cached_result
|
323
|
+
|
324
|
+
cache.mark_processing(path, config)
|
325
|
+
|
326
|
+
try:
|
327
|
+
if not path.exists():
|
328
|
+
raise ValidationError("The file does not exist", context={"file_path": str(path)})
|
329
|
+
|
330
|
+
mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
|
331
|
+
if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
|
332
|
+
result = extractor.extract_path_sync(Path(file_path))
|
333
|
+
else:
|
334
|
+
result = ExtractionResult(
|
335
|
+
content=Path(file_path).read_text(),
|
336
|
+
chunks=[],
|
337
|
+
mime_type=mime_type,
|
338
|
+
metadata={},
|
339
|
+
)
|
340
|
+
|
341
|
+
result = _validate_and_post_process_sync(result=result, config=config)
|
342
|
+
|
343
|
+
cache.set(path, config, result)
|
344
|
+
|
345
|
+
return result
|
346
|
+
finally:
|
347
|
+
cache.mark_complete(path, config)
|
222
348
|
|
223
349
|
|
224
350
|
def batch_extract_file_sync(
|
225
351
|
file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
|
226
352
|
) -> list[ExtractionResult]:
|
227
|
-
"""Synchronous version of batch_extract_file.
|
353
|
+
"""Synchronous version of batch_extract_file with parallel processing.
|
228
354
|
|
229
355
|
Args:
|
230
356
|
file_paths: A sequence of paths to files to extract text from.
|
@@ -233,13 +359,54 @@ def batch_extract_file_sync(
|
|
233
359
|
Returns:
|
234
360
|
A list of extraction results in the same order as the input paths.
|
235
361
|
"""
|
236
|
-
|
362
|
+
if len(file_paths) <= 1:
|
363
|
+
return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
|
364
|
+
|
365
|
+
import multiprocessing as mp
|
366
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
367
|
+
|
368
|
+
max_workers = min(len(file_paths), mp.cpu_count())
|
369
|
+
|
370
|
+
def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
|
371
|
+
"""Extract single file with index for ordering."""
|
372
|
+
try:
|
373
|
+
return (
|
374
|
+
file_paths.index(file_path),
|
375
|
+
extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
|
376
|
+
)
|
377
|
+
except Exception as e: # noqa: BLE001
|
378
|
+
from kreuzberg._utils._errors import create_error_context
|
379
|
+
|
380
|
+
error_result = ExtractionResult(
|
381
|
+
content=f"Error: {type(e).__name__}: {e!s}",
|
382
|
+
mime_type="text/plain",
|
383
|
+
metadata={ # type: ignore[typeddict-unknown-key]
|
384
|
+
"error": True,
|
385
|
+
"error_context": create_error_context(
|
386
|
+
operation="batch_extract_file_sync",
|
387
|
+
file_path=file_path,
|
388
|
+
error=e,
|
389
|
+
),
|
390
|
+
},
|
391
|
+
chunks=[],
|
392
|
+
)
|
393
|
+
return (file_paths.index(file_path), error_result)
|
394
|
+
|
395
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
396
|
+
future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
|
397
|
+
|
398
|
+
results: list[ExtractionResult] = [None] * len(file_paths) # type: ignore[list-item]
|
399
|
+
for future in as_completed(future_to_index):
|
400
|
+
index, result = future.result()
|
401
|
+
results[index] = result
|
402
|
+
|
403
|
+
return results
|
237
404
|
|
238
405
|
|
239
406
|
def batch_extract_bytes_sync(
|
240
407
|
contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
|
241
408
|
) -> list[ExtractionResult]:
|
242
|
-
"""Synchronous version of batch_extract_bytes.
|
409
|
+
"""Synchronous version of batch_extract_bytes with parallel processing.
|
243
410
|
|
244
411
|
Args:
|
245
412
|
contents: A sequence of tuples containing (content, mime_type) pairs.
|
@@ -248,4 +415,48 @@ def batch_extract_bytes_sync(
|
|
248
415
|
Returns:
|
249
416
|
A list of extraction results in the same order as the input contents.
|
250
417
|
"""
|
251
|
-
|
418
|
+
if len(contents) <= 1:
|
419
|
+
return [
|
420
|
+
extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
|
421
|
+
]
|
422
|
+
|
423
|
+
import multiprocessing as mp
|
424
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
425
|
+
|
426
|
+
max_workers = min(len(contents), mp.cpu_count())
|
427
|
+
|
428
|
+
def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
|
429
|
+
"""Extract single content with index for ordering."""
|
430
|
+
index, (content, mime_type) = index_and_content
|
431
|
+
try:
|
432
|
+
return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
|
433
|
+
except Exception as e: # noqa: BLE001
|
434
|
+
from kreuzberg._utils._errors import create_error_context
|
435
|
+
|
436
|
+
error_result = ExtractionResult(
|
437
|
+
content=f"Error: {type(e).__name__}: {e!s}",
|
438
|
+
mime_type="text/plain",
|
439
|
+
metadata={ # type: ignore[typeddict-unknown-key]
|
440
|
+
"error": True,
|
441
|
+
"error_context": create_error_context(
|
442
|
+
operation="batch_extract_bytes_sync",
|
443
|
+
error=e,
|
444
|
+
index=index,
|
445
|
+
mime_type=mime_type,
|
446
|
+
content_size=len(content),
|
447
|
+
),
|
448
|
+
},
|
449
|
+
chunks=[],
|
450
|
+
)
|
451
|
+
return (index, error_result)
|
452
|
+
|
453
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
454
|
+
indexed_contents = list(enumerate(contents))
|
455
|
+
future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
|
456
|
+
|
457
|
+
results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
|
458
|
+
for future in as_completed(future_to_index):
|
459
|
+
index, result = future.result()
|
460
|
+
results[index] = result
|
461
|
+
|
462
|
+
return results
|
@@ -1,56 +1,60 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.2.0
|
3
|
+
Version: 3.3.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
5
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
7
|
License: MIT
|
7
|
-
|
8
|
+
License-File: LICENSE
|
8
9
|
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
9
10
|
Classifier: Development Status :: 4 - Beta
|
10
11
|
Classifier: Intended Audience :: Developers
|
11
12
|
Classifier: License :: OSI Approved :: MIT License
|
12
13
|
Classifier: Operating System :: OS Independent
|
13
14
|
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
-
Classifier: Programming Language :: Python :: 3.9
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
16
|
-
Classifier: Programming Language :: Python :: 3.11
|
17
|
-
Classifier: Programming Language :: Python :: 3.12
|
18
15
|
Classifier: Programming Language :: Python :: 3.13
|
19
16
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
17
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
18
|
Classifier: Topic :: Text Processing :: General
|
22
19
|
Classifier: Topic :: Utilities
|
23
20
|
Classifier: Typing :: Typed
|
24
|
-
Requires-Python: >=3.9
|
25
|
-
Description-Content-Type: text/markdown
|
26
|
-
License-File: LICENSE
|
21
|
+
Requires-Python: >=3.13
|
27
22
|
Requires-Dist: anyio>=4.9.0
|
28
23
|
Requires-Dist: charset-normalizer>=3.4.2
|
29
|
-
Requires-Dist: exceptiongroup>=1.2.2; python_version <
|
24
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
30
25
|
Requires-Dist: html-to-markdown>=1.4.0
|
26
|
+
Requires-Dist: msgspec>=0.18.0
|
31
27
|
Requires-Dist: playa-pdf>=0.6.1
|
28
|
+
Requires-Dist: psutil>=7.0.0
|
32
29
|
Requires-Dist: pypdfium2==4.30.0
|
33
30
|
Requires-Dist: python-calamine>=0.3.2
|
34
31
|
Requires-Dist: python-pptx>=1.0.2
|
35
|
-
Requires-Dist: typing-extensions>=4.14.0; python_version <
|
32
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
36
33
|
Provides-Extra: all
|
37
|
-
Requires-Dist:
|
38
|
-
Requires-Dist:
|
39
|
-
Requires-Dist:
|
40
|
-
Requires-Dist:
|
41
|
-
Requires-Dist:
|
42
|
-
Requires-Dist:
|
34
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
35
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
36
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
37
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
38
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
39
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
40
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
41
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
42
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
43
43
|
Provides-Extra: chunking
|
44
|
-
Requires-Dist: semantic-text-splitter>=0.27.0; extra ==
|
44
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
45
|
+
Provides-Extra: cli
|
46
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
47
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
48
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
45
49
|
Provides-Extra: easyocr
|
46
|
-
Requires-Dist: easyocr>=1.7.2; extra ==
|
50
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
47
51
|
Provides-Extra: gmft
|
48
|
-
Requires-Dist: gmft>=0.4.
|
52
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
49
53
|
Provides-Extra: paddleocr
|
50
|
-
Requires-Dist: paddleocr>=3.0
|
51
|
-
Requires-Dist: paddlepaddle>=3.
|
52
|
-
Requires-Dist: setuptools>=80.9.0; extra ==
|
53
|
-
|
54
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
55
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
56
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
57
|
+
Description-Content-Type: text/markdown
|
54
58
|
|
55
59
|
# Kreuzberg
|
56
60
|
|
@@ -68,6 +72,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
68
72
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
69
73
|
- **Format Support**: Comprehensive support for documents, images, and text formats
|
70
74
|
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
75
|
+
- **Command Line Interface**: Powerful CLI for batch processing and automation
|
71
76
|
- **Metadata Extraction**: Get document metadata alongside text content
|
72
77
|
- **Table Extraction**: Extract tables from documents using the excellent GMFT library
|
73
78
|
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
@@ -77,6 +82,9 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
77
82
|
|
78
83
|
```bash
|
79
84
|
pip install kreuzberg
|
85
|
+
|
86
|
+
# Or install with CLI support
|
87
|
+
pip install "kreuzberg[cli]"
|
80
88
|
```
|
81
89
|
|
82
90
|
Install pandoc:
|
@@ -126,12 +134,53 @@ async def main():
|
|
126
134
|
asyncio.run(main())
|
127
135
|
```
|
128
136
|
|
137
|
+
## Command Line Interface
|
138
|
+
|
139
|
+
Kreuzberg includes a powerful CLI for processing documents from the command line:
|
140
|
+
|
141
|
+
```bash
|
142
|
+
# Extract text from a file
|
143
|
+
kreuzberg extract document.pdf
|
144
|
+
|
145
|
+
# Extract with JSON output and metadata
|
146
|
+
kreuzberg extract document.pdf --output-format json --show-metadata
|
147
|
+
|
148
|
+
# Extract from stdin
|
149
|
+
cat document.html | kreuzberg extract
|
150
|
+
|
151
|
+
# Use specific OCR backend
|
152
|
+
kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
|
153
|
+
|
154
|
+
# Extract with configuration file
|
155
|
+
kreuzberg extract document.pdf --config config.toml
|
156
|
+
```
|
157
|
+
|
158
|
+
### CLI Configuration
|
159
|
+
|
160
|
+
Configure via `pyproject.toml`:
|
161
|
+
|
162
|
+
```toml
|
163
|
+
[tool.kreuzberg]
|
164
|
+
force_ocr = true
|
165
|
+
chunk_content = false
|
166
|
+
extract_tables = true
|
167
|
+
max_chars = 4000
|
168
|
+
ocr_backend = "tesseract"
|
169
|
+
|
170
|
+
[tool.kreuzberg.tesseract]
|
171
|
+
language = "eng+deu"
|
172
|
+
psm = 3
|
173
|
+
```
|
174
|
+
|
175
|
+
For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
|
176
|
+
|
129
177
|
## Documentation
|
130
178
|
|
131
179
|
For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
|
132
180
|
|
133
181
|
- [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
|
134
182
|
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
|
183
|
+
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
|
135
184
|
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
|
136
185
|
- [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
|
137
186
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
|
@@ -157,6 +206,26 @@ Kreuzberg supports multiple OCR engines:
|
|
157
206
|
|
158
207
|
For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
|
159
208
|
|
209
|
+
## Performance
|
210
|
+
|
211
|
+
Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
|
212
|
+
|
213
|
+
| Operation | Sync Time | Async Time | Async Advantage |
|
214
|
+
| ---------------------- | --------- | ---------- | ------------------ |
|
215
|
+
| Simple text (Markdown) | 0.4ms | 17.5ms | **❌ 41x slower** |
|
216
|
+
| HTML documents | 1.6ms | 1.1ms | **✅ 1.5x faster** |
|
217
|
+
| Complex PDFs | 39.0s | 8.5s | **✅ 4.6x faster** |
|
218
|
+
| OCR processing | 0.4s | 0.7s | **✅ 1.7x faster** |
|
219
|
+
| Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
|
220
|
+
|
221
|
+
**Rule of thumb:**
|
222
|
+
|
223
|
+
- Use **sync** for simple documents and CLI applications
|
224
|
+
- Use **async** for complex PDFs, OCR, and batch processing
|
225
|
+
- Use **batch operations** for multiple files
|
226
|
+
|
227
|
+
For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
|
228
|
+
|
160
229
|
## Contributing
|
161
230
|
|
162
231
|
We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
|
@@ -0,0 +1,48 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=jRm2U-loiKWwJpgOFgZ8Ev2mfz9sI1qJOZ2V3OoJUlg,1258
|
2
|
+
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
|
+
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
5
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
+
kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
|
7
|
+
kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
|
8
|
+
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
9
|
+
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
10
|
+
kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
|
11
|
+
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
12
|
+
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
13
|
+
kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
|
14
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
+
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
17
|
+
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
18
|
+
kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
|
19
|
+
kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
|
20
|
+
kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
|
21
|
+
kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
|
22
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
|
23
|
+
kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
|
24
|
+
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
|
25
|
+
kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
|
26
|
+
kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
|
27
|
+
kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
|
28
|
+
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
29
|
+
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
30
|
+
kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
|
31
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
|
32
|
+
kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
|
33
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
|
+
kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
|
35
|
+
kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
|
36
|
+
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
37
|
+
kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
|
38
|
+
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
39
|
+
kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
|
40
|
+
kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
|
41
|
+
kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
|
42
|
+
kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
|
43
|
+
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
44
|
+
kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
|
45
|
+
kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
46
|
+
kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
|
47
|
+
kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
48
|
+
kreuzberg-3.3.0.dist-info/RECORD,,
|
kreuzberg-3.2.0.dist-info/RECORD
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=lT9OwIdf5CEhSX7IVmtSFPgRhz6B2z2A-RE8Zdm0PH4,1216
|
2
|
-
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
3
|
-
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
4
|
-
kreuzberg/_gmft.py,sha256=qLhfepQuaROjPOdI-tDRqqqnOcqDY1D411ZXzoywnpg,7229
|
5
|
-
kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
|
6
|
-
kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
|
7
|
-
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
8
|
-
kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
|
9
|
-
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
10
|
-
kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
|
11
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
|
-
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
-
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
14
|
-
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
15
|
-
kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
|
16
|
-
kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
|
17
|
-
kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
|
18
|
-
kreuzberg/_extractors/_presentation.py,sha256=7W6RHTk-zksuHoSk0i6UaSBf5NatnPo17MxepQoI6XI,8758
|
19
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
|
20
|
-
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
21
|
-
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
22
|
-
kreuzberg/_ocr/_easyocr.py,sha256=1OG2IbLdg4cXouV0FVzMnCkYYh6GN1pvXqXWw40PUz8,14054
|
23
|
-
kreuzberg/_ocr/_paddleocr.py,sha256=K6D3B2cn-JIhipI5UHMa0Kn2M-GKtyUFCahs8wJQZcA,13855
|
24
|
-
kreuzberg/_ocr/_tesseract.py,sha256=KcJMK4o__2H2ftibk1lC7HVqEfpaE_jVZgLhUXkxTvk,9773
|
25
|
-
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
|
-
kreuzberg/_utils/_device.py,sha256=Ja28S2psgEwWzjdO05ZI11RFb3MSlUZDT19sC4SAyVE,10955
|
27
|
-
kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
|
28
|
-
kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
|
29
|
-
kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
|
30
|
-
kreuzberg-3.2.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
31
|
-
kreuzberg-3.2.0.dist-info/METADATA,sha256=xffQAGQur7sCgUT9RDqZpfkYTdthsuYIhCvbUDKFnmA,6504
|
32
|
-
kreuzberg-3.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
33
|
-
kreuzberg-3.2.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
34
|
-
kreuzberg-3.2.0.dist-info/RECORD,,
|
@@ -1 +0,0 @@
|
|
1
|
-
kreuzberg
|
File without changes
|