kreuzberg-3.2.0-py3-none-any.whl → kreuzberg-3.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_cli_config.py +175 -0
  4. kreuzberg/_extractors/_image.py +39 -4
  5. kreuzberg/_extractors/_pandoc.py +158 -18
  6. kreuzberg/_extractors/_pdf.py +199 -19
  7. kreuzberg/_extractors/_presentation.py +1 -1
  8. kreuzberg/_extractors/_spread_sheet.py +65 -7
  9. kreuzberg/_gmft.py +222 -16
  10. kreuzberg/_mime_types.py +62 -16
  11. kreuzberg/_multiprocessing/__init__.py +6 -0
  12. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  13. kreuzberg/_multiprocessing/process_manager.py +188 -0
  14. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  15. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  16. kreuzberg/_ocr/_easyocr.py +6 -12
  17. kreuzberg/_ocr/_paddleocr.py +15 -13
  18. kreuzberg/_ocr/_tesseract.py +136 -46
  19. kreuzberg/_playa.py +43 -0
  20. kreuzberg/_utils/_cache.py +372 -0
  21. kreuzberg/_utils/_device.py +10 -27
  22. kreuzberg/_utils/_document_cache.py +220 -0
  23. kreuzberg/_utils/_errors.py +232 -0
  24. kreuzberg/_utils/_pdf_lock.py +72 -0
  25. kreuzberg/_utils/_process_pool.py +100 -0
  26. kreuzberg/_utils/_serialization.py +82 -0
  27. kreuzberg/_utils/_string.py +1 -1
  28. kreuzberg/_utils/_sync.py +21 -0
  29. kreuzberg/cli.py +338 -0
  30. kreuzberg/extraction.py +247 -36
  31. {kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +93 -24
  32. kreuzberg-3.3.0.dist-info/RECORD +48 -0
  33. {kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
  34. kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
  35. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  36. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  37. {kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/extraction.py CHANGED
@@ -13,7 +13,8 @@ from kreuzberg._mime_types import (
13
13
  from kreuzberg._registry import ExtractorRegistry
14
14
  from kreuzberg._types import ExtractionConfig
15
15
  from kreuzberg._utils._string import safe_decode
16
- from kreuzberg._utils._sync import run_maybe_async, run_maybe_sync
16
+ from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
17
+ from kreuzberg.exceptions import ValidationError
17
18
 
18
19
  if TYPE_CHECKING:
19
20
  from collections.abc import Sequence
@@ -42,7 +43,7 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
42
43
 
43
44
  def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
44
45
  for validator in config.validators or []:
45
- run_maybe_async(validator, result)
46
+ run_sync_only(validator, result)
46
47
 
47
48
  if config.chunk_content:
48
49
  result.chunks = _handle_chunk_content(
@@ -52,7 +53,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
52
53
  )
53
54
 
54
55
  for post_processor in config.post_processing_hooks or []:
55
- result = run_maybe_async(post_processor, result)
56
+ result = run_sync_only(post_processor, result)
56
57
 
57
58
  return result
58
59
 
@@ -104,22 +105,57 @@ async def extract_file(
104
105
 
105
106
  Returns:
106
107
  The extracted content and the mime type of the content.
108
+
109
+ Raises:
110
+ ValidationError: If the file path or configuration is invalid.
107
111
  """
108
- mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
109
- if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
110
- result = await extractor.extract_path_async(Path(file_path))
111
- else:
112
- result = ExtractionResult(
113
- content=safe_decode(await anyio.Path(file_path).read_bytes()), chunks=[], mime_type=mime_type, metadata={}
114
- )
112
+ from kreuzberg._utils._document_cache import get_document_cache
115
113
 
116
- return await _validate_and_post_process_async(result=result, config=config)
114
+ cache = get_document_cache()
115
+ path = Path(file_path)
116
+ cached_result = cache.get(path, config)
117
+ if cached_result is not None:
118
+ return cached_result
119
+
120
+ if cache.is_processing(path, config):
121
+ event = cache.mark_processing(path, config)
122
+ await anyio.to_thread.run_sync(event.wait) # pragma: no cover
123
+
124
+ # Try cache again after waiting for other process to complete # ~keep
125
+ cached_result = cache.get(path, config) # pragma: no cover
126
+ if cached_result is not None: # pragma: no cover
127
+ return cached_result
128
+
129
+ cache.mark_processing(path, config)
130
+
131
+ try:
132
+ if not path.exists():
133
+ raise ValidationError("The file does not exist", context={"file_path": str(path)})
134
+
135
+ mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
136
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
137
+ result = await extractor.extract_path_async(Path(file_path))
138
+ else:
139
+ result = ExtractionResult(
140
+ content=safe_decode(await anyio.Path(file_path).read_bytes()),
141
+ chunks=[],
142
+ mime_type=mime_type,
143
+ metadata={},
144
+ )
145
+
146
+ result = await _validate_and_post_process_async(result=result, config=config)
147
+
148
+ cache.set(path, config, result)
149
+
150
+ return result
151
+ finally:
152
+ cache.mark_complete(path, config)
117
153
 
118
154
 
119
155
  async def batch_extract_file(
120
156
  file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
121
157
  ) -> list[ExtractionResult]:
122
- """Extract text from multiple files concurrently.
158
+ """Extract text from multiple files concurrently with optimizations.
123
159
 
124
160
  Args:
125
161
  file_paths: A sequence of paths to files to extract text from.
@@ -128,15 +164,43 @@ async def batch_extract_file(
128
164
  Returns:
129
165
  A list of extraction results in the same order as the input paths.
130
166
  """
167
+ if not file_paths:
168
+ return []
169
+
170
+ import multiprocessing as mp
171
+
172
+ max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
173
+ semaphore = anyio.Semaphore(max_concurrency)
174
+
131
175
  results = cast("list[ExtractionResult]", ([None] * len(file_paths)))
132
176
 
133
177
  async def _extract_file(path: PathLike[str] | str, index: int) -> None:
134
- result = await extract_file(
135
- path,
136
- None,
137
- config,
138
- )
139
- results[index] = result
178
+ async with semaphore:
179
+ try:
180
+ result = await extract_file(
181
+ path,
182
+ None,
183
+ config,
184
+ )
185
+ results[index] = result
186
+ except Exception as e: # noqa: BLE001
187
+ from kreuzberg._utils._errors import create_error_context
188
+
189
+ error_result = ExtractionResult(
190
+ content=f"Error: {type(e).__name__}: {e!s}",
191
+ mime_type="text/plain",
192
+ metadata={ # type: ignore[typeddict-unknown-key]
193
+ "error": True,
194
+ "error_context": create_error_context(
195
+ operation="batch_extract_file",
196
+ file_path=path,
197
+ error=e,
198
+ index=index,
199
+ ),
200
+ },
201
+ chunks=[],
202
+ )
203
+ results[index] = error_result
140
204
 
141
205
  async with anyio.create_task_group() as tg:
142
206
  for i, path in enumerate(file_paths):
@@ -148,7 +212,7 @@ async def batch_extract_file(
148
212
  async def batch_extract_bytes(
149
213
  contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
150
214
  ) -> list[ExtractionResult]:
151
- """Extract text from multiple byte contents concurrently.
215
+ """Extract text from multiple byte contents concurrently with optimizations.
152
216
 
153
217
  Args:
154
218
  contents: A sequence of tuples containing (content, mime_type) pairs.
@@ -157,11 +221,40 @@ async def batch_extract_bytes(
157
221
  Returns:
158
222
  A list of extraction results in the same order as the input contents.
159
223
  """
224
+ if not contents:
225
+ return []
226
+
227
+ import multiprocessing as mp
228
+
229
+ max_concurrency = min(len(contents), mp.cpu_count() * 2)
230
+ semaphore = anyio.Semaphore(max_concurrency)
231
+
160
232
  results = cast("list[ExtractionResult]", [None] * len(contents))
161
233
 
162
234
  async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
163
- result = await extract_bytes(content, mime_type, config)
164
- results[index] = result
235
+ async with semaphore:
236
+ try:
237
+ result = await extract_bytes(content, mime_type, config)
238
+ results[index] = result
239
+ except Exception as e: # noqa: BLE001
240
+ from kreuzberg._utils._errors import create_error_context
241
+
242
+ error_result = ExtractionResult(
243
+ content=f"Error: {type(e).__name__}: {e!s}",
244
+ mime_type="text/plain",
245
+ metadata={ # type: ignore[typeddict-unknown-key]
246
+ "error": True,
247
+ "error_context": create_error_context(
248
+ operation="batch_extract_bytes",
249
+ error=e,
250
+ index=index,
251
+ mime_type=mime_type,
252
+ content_size=len(content),
253
+ ),
254
+ },
255
+ chunks=[],
256
+ )
257
+ results[index] = error_result
165
258
 
166
259
  async with anyio.create_task_group() as tg:
167
260
  for i, (content, mime_type) in enumerate(contents):
@@ -207,24 +300,57 @@ def extract_file_sync(
207
300
 
208
301
  Returns:
209
302
  The extracted content and the mime type of the content.
303
+
304
+ Raises:
305
+ ValidationError: If the file path or configuration is invalid.
210
306
  """
211
- mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
212
- if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
213
- result = extractor.extract_path_sync(Path(file_path))
214
- else:
215
- result = ExtractionResult(
216
- content=Path(file_path).read_text(),
217
- chunks=[],
218
- mime_type=mime_type,
219
- metadata={},
220
- )
221
- return _validate_and_post_process_sync(result=result, config=config)
307
+ from kreuzberg._utils._document_cache import get_document_cache
308
+
309
+ cache = get_document_cache()
310
+ path = Path(file_path)
311
+ cached_result = cache.get(path, config)
312
+ if cached_result is not None:
313
+ return cached_result
314
+
315
+ if cache.is_processing(path, config):
316
+ event = cache.mark_processing(path, config)
317
+ event.wait() # pragma: no cover
318
+
319
+ # Try cache again after waiting for other process to complete # ~keep
320
+ cached_result = cache.get(path, config) # pragma: no cover
321
+ if cached_result is not None: # pragma: no cover
322
+ return cached_result
323
+
324
+ cache.mark_processing(path, config)
325
+
326
+ try:
327
+ if not path.exists():
328
+ raise ValidationError("The file does not exist", context={"file_path": str(path)})
329
+
330
+ mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
331
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
332
+ result = extractor.extract_path_sync(Path(file_path))
333
+ else:
334
+ result = ExtractionResult(
335
+ content=Path(file_path).read_text(),
336
+ chunks=[],
337
+ mime_type=mime_type,
338
+ metadata={},
339
+ )
340
+
341
+ result = _validate_and_post_process_sync(result=result, config=config)
342
+
343
+ cache.set(path, config, result)
344
+
345
+ return result
346
+ finally:
347
+ cache.mark_complete(path, config)
222
348
 
223
349
 
224
350
  def batch_extract_file_sync(
225
351
  file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
226
352
  ) -> list[ExtractionResult]:
227
- """Synchronous version of batch_extract_file.
353
+ """Synchronous version of batch_extract_file with parallel processing.
228
354
 
229
355
  Args:
230
356
  file_paths: A sequence of paths to files to extract text from.
@@ -233,13 +359,54 @@ def batch_extract_file_sync(
233
359
  Returns:
234
360
  A list of extraction results in the same order as the input paths.
235
361
  """
236
- return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
362
+ if len(file_paths) <= 1:
363
+ return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
364
+
365
+ import multiprocessing as mp
366
+ from concurrent.futures import ThreadPoolExecutor, as_completed
367
+
368
+ max_workers = min(len(file_paths), mp.cpu_count())
369
+
370
+ def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
371
+ """Extract single file with index for ordering."""
372
+ try:
373
+ return (
374
+ file_paths.index(file_path),
375
+ extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
376
+ )
377
+ except Exception as e: # noqa: BLE001
378
+ from kreuzberg._utils._errors import create_error_context
379
+
380
+ error_result = ExtractionResult(
381
+ content=f"Error: {type(e).__name__}: {e!s}",
382
+ mime_type="text/plain",
383
+ metadata={ # type: ignore[typeddict-unknown-key]
384
+ "error": True,
385
+ "error_context": create_error_context(
386
+ operation="batch_extract_file_sync",
387
+ file_path=file_path,
388
+ error=e,
389
+ ),
390
+ },
391
+ chunks=[],
392
+ )
393
+ return (file_paths.index(file_path), error_result)
394
+
395
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
396
+ future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
397
+
398
+ results: list[ExtractionResult] = [None] * len(file_paths) # type: ignore[list-item]
399
+ for future in as_completed(future_to_index):
400
+ index, result = future.result()
401
+ results[index] = result
402
+
403
+ return results
237
404
 
238
405
 
239
406
  def batch_extract_bytes_sync(
240
407
  contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
241
408
  ) -> list[ExtractionResult]:
242
- """Synchronous version of batch_extract_bytes.
409
+ """Synchronous version of batch_extract_bytes with parallel processing.
243
410
 
244
411
  Args:
245
412
  contents: A sequence of tuples containing (content, mime_type) pairs.
@@ -248,4 +415,48 @@ def batch_extract_bytes_sync(
248
415
  Returns:
249
416
  A list of extraction results in the same order as the input contents.
250
417
  """
251
- return [extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents]
418
+ if len(contents) <= 1:
419
+ return [
420
+ extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
421
+ ]
422
+
423
+ import multiprocessing as mp
424
+ from concurrent.futures import ThreadPoolExecutor, as_completed
425
+
426
+ max_workers = min(len(contents), mp.cpu_count())
427
+
428
+ def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
429
+ """Extract single content with index for ordering."""
430
+ index, (content, mime_type) = index_and_content
431
+ try:
432
+ return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
433
+ except Exception as e: # noqa: BLE001
434
+ from kreuzberg._utils._errors import create_error_context
435
+
436
+ error_result = ExtractionResult(
437
+ content=f"Error: {type(e).__name__}: {e!s}",
438
+ mime_type="text/plain",
439
+ metadata={ # type: ignore[typeddict-unknown-key]
440
+ "error": True,
441
+ "error_context": create_error_context(
442
+ operation="batch_extract_bytes_sync",
443
+ error=e,
444
+ index=index,
445
+ mime_type=mime_type,
446
+ content_size=len(content),
447
+ ),
448
+ },
449
+ chunks=[],
450
+ )
451
+ return (index, error_result)
452
+
453
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
454
+ indexed_contents = list(enumerate(contents))
455
+ future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
456
+
457
+ results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
458
+ for future in as_completed(future_to_index):
459
+ index, result = future.result()
460
+ results[index] = result
461
+
462
+ return results
@@ -1,56 +1,60 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.2.0
3
+ Version: 3.3.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
5
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
7
  License: MIT
7
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
+ License-File: LICENSE
8
9
  Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
9
10
  Classifier: Development Status :: 4 - Beta
10
11
  Classifier: Intended Audience :: Developers
11
12
  Classifier: License :: OSI Approved :: MIT License
12
13
  Classifier: Operating System :: OS Independent
13
14
  Classifier: Programming Language :: Python :: 3 :: Only
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
17
- Classifier: Programming Language :: Python :: 3.12
18
15
  Classifier: Programming Language :: Python :: 3.13
19
16
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
17
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
18
  Classifier: Topic :: Text Processing :: General
22
19
  Classifier: Topic :: Utilities
23
20
  Classifier: Typing :: Typed
24
- Requires-Python: >=3.9
25
- Description-Content-Type: text/markdown
26
- License-File: LICENSE
21
+ Requires-Python: >=3.13
27
22
  Requires-Dist: anyio>=4.9.0
28
23
  Requires-Dist: charset-normalizer>=3.4.2
29
- Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
24
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
30
25
  Requires-Dist: html-to-markdown>=1.4.0
26
+ Requires-Dist: msgspec>=0.18.0
31
27
  Requires-Dist: playa-pdf>=0.6.1
28
+ Requires-Dist: psutil>=7.0.0
32
29
  Requires-Dist: pypdfium2==4.30.0
33
30
  Requires-Dist: python-calamine>=0.3.2
34
31
  Requires-Dist: python-pptx>=1.0.2
35
- Requires-Dist: typing-extensions>=4.14.0; python_version < "3.12"
32
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
36
33
  Provides-Extra: all
37
- Requires-Dist: easyocr>=1.7.2; extra == "all"
38
- Requires-Dist: gmft>=0.4.1; extra == "all"
39
- Requires-Dist: paddleocr>=3.0.2; extra == "all"
40
- Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
41
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == "all"
42
- Requires-Dist: setuptools>=80.9.0; extra == "all"
34
+ Requires-Dist: click>=8.2.1; extra == 'all'
35
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
36
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
37
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
38
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
39
+ Requires-Dist: rich>=14.0.0; extra == 'all'
40
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
41
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
42
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
43
43
  Provides-Extra: chunking
44
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == "chunking"
44
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
45
+ Provides-Extra: cli
46
+ Requires-Dist: click>=8.2.1; extra == 'cli'
47
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
48
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
45
49
  Provides-Extra: easyocr
46
- Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
50
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
47
51
  Provides-Extra: gmft
48
- Requires-Dist: gmft>=0.4.1; extra == "gmft"
52
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
49
53
  Provides-Extra: paddleocr
50
- Requires-Dist: paddleocr>=3.0.2; extra == "paddleocr"
51
- Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
52
- Requires-Dist: setuptools>=80.9.0; extra == "paddleocr"
53
- Dynamic: license-file
54
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
55
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
56
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
57
+ Description-Content-Type: text/markdown
54
58
 
55
59
  # Kreuzberg
56
60
 
@@ -68,6 +72,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
68
72
  - **Resource Efficient**: Lightweight processing without GPU requirements
69
73
  - **Format Support**: Comprehensive support for documents, images, and text formats
70
74
  - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
75
+ - **Command Line Interface**: Powerful CLI for batch processing and automation
71
76
  - **Metadata Extraction**: Get document metadata alongside text content
72
77
  - **Table Extraction**: Extract tables from documents using the excellent GMFT library
73
78
  - **Modern Python**: Built with async/await, type hints, and a functional-first approach
@@ -77,6 +82,9 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
77
82
 
78
83
  ```bash
79
84
  pip install kreuzberg
85
+
86
+ # Or install with CLI support
87
+ pip install "kreuzberg[cli]"
80
88
  ```
81
89
 
82
90
  Install pandoc:
@@ -126,12 +134,53 @@ async def main():
126
134
  asyncio.run(main())
127
135
  ```
128
136
 
137
+ ## Command Line Interface
138
+
139
+ Kreuzberg includes a powerful CLI for processing documents from the command line:
140
+
141
+ ```bash
142
+ # Extract text from a file
143
+ kreuzberg extract document.pdf
144
+
145
+ # Extract with JSON output and metadata
146
+ kreuzberg extract document.pdf --output-format json --show-metadata
147
+
148
+ # Extract from stdin
149
+ cat document.html | kreuzberg extract
150
+
151
+ # Use specific OCR backend
152
+ kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
153
+
154
+ # Extract with configuration file
155
+ kreuzberg extract document.pdf --config config.toml
156
+ ```
157
+
158
+ ### CLI Configuration
159
+
160
+ Configure via `pyproject.toml`:
161
+
162
+ ```toml
163
+ [tool.kreuzberg]
164
+ force_ocr = true
165
+ chunk_content = false
166
+ extract_tables = true
167
+ max_chars = 4000
168
+ ocr_backend = "tesseract"
169
+
170
+ [tool.kreuzberg.tesseract]
171
+ language = "eng+deu"
172
+ psm = 3
173
+ ```
174
+
175
+ For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
176
+
129
177
  ## Documentation
130
178
 
131
179
  For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
132
180
 
133
181
  - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
134
182
  - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
183
+ - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
135
184
  - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
136
185
  - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
137
186
  - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
@@ -157,6 +206,26 @@ Kreuzberg supports multiple OCR engines:
157
206
 
158
207
  For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
159
208
 
209
+ ## Performance
210
+
211
+ Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
212
+
213
+ | Operation | Sync Time | Async Time | Async Advantage |
214
+ | ---------------------- | --------- | ---------- | ------------------ |
215
+ | Simple text (Markdown) | 0.4ms | 17.5ms | **❌ 41x slower** |
216
+ | HTML documents | 1.6ms | 1.1ms | **✅ 1.5x faster** |
217
+ | Complex PDFs | 39.0s | 8.5s | **✅ 4.6x faster** |
218
+ | OCR processing | 0.7s | 0.4s | **✅ 1.7x faster** |
219
+ | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
220
+
221
+ **Rule of thumb:**
222
+
223
+ - Use **sync** for simple documents and CLI applications
224
+ - Use **async** for complex PDFs, OCR, and batch processing
225
+ - Use **batch operations** for multiple files
226
+
227
+ For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
228
+
160
229
  ## Contributing
161
230
 
162
231
  We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
@@ -0,0 +1,48 @@
1
+ kreuzberg/__init__.py,sha256=jRm2U-loiKWwJpgOFgZ8Ev2mfz9sI1qJOZ2V3OoJUlg,1258
2
+ kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
+ kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
+ kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
7
+ kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
8
+ kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
9
+ kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
10
+ kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
11
+ kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
12
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
13
+ kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
14
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
17
+ kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
18
+ kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
19
+ kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
20
+ kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
21
+ kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
22
+ kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
23
+ kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
24
+ kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
25
+ kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
26
+ kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
27
+ kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
28
+ kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
29
+ kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
30
+ kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
31
+ kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
32
+ kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
33
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
35
+ kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
36
+ kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
37
+ kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
38
+ kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
39
+ kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
40
+ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
41
+ kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
42
+ kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
43
+ kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
44
+ kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
45
+ kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
46
+ kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
47
+ kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
48
+ kreuzberg-3.3.0.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ kreuzberg = kreuzberg.cli:cli
@@ -1,34 +0,0 @@
1
- kreuzberg/__init__.py,sha256=lT9OwIdf5CEhSX7IVmtSFPgRhz6B2z2A-RE8Zdm0PH4,1216
2
- kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
- kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
- kreuzberg/_gmft.py,sha256=qLhfepQuaROjPOdI-tDRqqqnOcqDY1D411ZXzoywnpg,7229
5
- kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
6
- kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
7
- kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
8
- kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
9
- kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
10
- kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
11
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
- kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
14
- kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
15
- kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
16
- kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
17
- kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
18
- kreuzberg/_extractors/_presentation.py,sha256=7W6RHTk-zksuHoSk0i6UaSBf5NatnPo17MxepQoI6XI,8758
19
- kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
20
- kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
21
- kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
22
- kreuzberg/_ocr/_easyocr.py,sha256=1OG2IbLdg4cXouV0FVzMnCkYYh6GN1pvXqXWw40PUz8,14054
23
- kreuzberg/_ocr/_paddleocr.py,sha256=K6D3B2cn-JIhipI5UHMa0Kn2M-GKtyUFCahs8wJQZcA,13855
24
- kreuzberg/_ocr/_tesseract.py,sha256=KcJMK4o__2H2ftibk1lC7HVqEfpaE_jVZgLhUXkxTvk,9773
25
- kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- kreuzberg/_utils/_device.py,sha256=Ja28S2psgEwWzjdO05ZI11RFb3MSlUZDT19sC4SAyVE,10955
27
- kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
28
- kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
29
- kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
30
- kreuzberg-3.2.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
31
- kreuzberg-3.2.0.dist-info/METADATA,sha256=xffQAGQur7sCgUT9RDqZpfkYTdthsuYIhCvbUDKFnmA,6504
32
- kreuzberg-3.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
- kreuzberg-3.2.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
34
- kreuzberg-3.2.0.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- kreuzberg