kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/extraction.py CHANGED
@@ -13,7 +13,8 @@ from kreuzberg._mime_types import (
13
13
  from kreuzberg._registry import ExtractorRegistry
14
14
  from kreuzberg._types import ExtractionConfig
15
15
  from kreuzberg._utils._string import safe_decode
16
- from kreuzberg._utils._sync import run_maybe_async, run_maybe_sync
16
+ from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
17
+ from kreuzberg.exceptions import ValidationError
17
18
 
18
19
  if TYPE_CHECKING:
19
20
  from collections.abc import Sequence
@@ -42,7 +43,7 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
42
43
 
43
44
  def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
44
45
  for validator in config.validators or []:
45
- run_maybe_async(validator, result)
46
+ run_sync_only(validator, result)
46
47
 
47
48
  if config.chunk_content:
48
49
  result.chunks = _handle_chunk_content(
@@ -52,7 +53,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
52
53
  )
53
54
 
54
55
  for post_processor in config.post_processing_hooks or []:
55
- result = run_maybe_async(post_processor, result)
56
+ result = run_sync_only(post_processor, result)
56
57
 
57
58
  return result
58
59
 
@@ -104,22 +105,57 @@ async def extract_file(
104
105
 
105
106
  Returns:
106
107
  The extracted content and the mime type of the content.
108
+
109
+ Raises:
110
+ ValidationError: If the file path or configuration is invalid.
107
111
  """
108
- mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
109
- if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
110
- result = await extractor.extract_path_async(Path(file_path))
111
- else:
112
- result = ExtractionResult(
113
- content=safe_decode(await anyio.Path(file_path).read_bytes()), chunks=[], mime_type=mime_type, metadata={}
114
- )
112
+ from kreuzberg._utils._document_cache import get_document_cache
115
113
 
116
- return await _validate_and_post_process_async(result=result, config=config)
114
+ cache = get_document_cache()
115
+ path = Path(file_path)
116
+ cached_result = cache.get(path, config)
117
+ if cached_result is not None:
118
+ return cached_result
119
+
120
+ if cache.is_processing(path, config):
121
+ event = cache.mark_processing(path, config)
122
+ await anyio.to_thread.run_sync(event.wait) # pragma: no cover
123
+
124
+ # Try cache again after waiting for other process to complete # ~keep
125
+ cached_result = cache.get(path, config) # pragma: no cover
126
+ if cached_result is not None: # pragma: no cover
127
+ return cached_result
128
+
129
+ cache.mark_processing(path, config)
130
+
131
+ try:
132
+ if not path.exists():
133
+ raise ValidationError("The file does not exist", context={"file_path": str(path)})
134
+
135
+ mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
136
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
137
+ result = await extractor.extract_path_async(Path(file_path))
138
+ else:
139
+ result = ExtractionResult(
140
+ content=safe_decode(await anyio.Path(file_path).read_bytes()),
141
+ chunks=[],
142
+ mime_type=mime_type,
143
+ metadata={},
144
+ )
145
+
146
+ result = await _validate_and_post_process_async(result=result, config=config)
147
+
148
+ cache.set(path, config, result)
149
+
150
+ return result
151
+ finally:
152
+ cache.mark_complete(path, config)
117
153
 
118
154
 
119
155
  async def batch_extract_file(
120
156
  file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
121
157
  ) -> list[ExtractionResult]:
122
- """Extract text from multiple files concurrently.
158
+ """Extract text from multiple files concurrently with optimizations.
123
159
 
124
160
  Args:
125
161
  file_paths: A sequence of paths to files to extract text from.
@@ -128,15 +164,43 @@ async def batch_extract_file(
128
164
  Returns:
129
165
  A list of extraction results in the same order as the input paths.
130
166
  """
167
+ if not file_paths:
168
+ return []
169
+
170
+ import multiprocessing as mp
171
+
172
+ max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
173
+ semaphore = anyio.Semaphore(max_concurrency)
174
+
131
175
  results = cast("list[ExtractionResult]", ([None] * len(file_paths)))
132
176
 
133
177
  async def _extract_file(path: PathLike[str] | str, index: int) -> None:
134
- result = await extract_file(
135
- path,
136
- None,
137
- config,
138
- )
139
- results[index] = result
178
+ async with semaphore:
179
+ try:
180
+ result = await extract_file(
181
+ path,
182
+ None,
183
+ config,
184
+ )
185
+ results[index] = result
186
+ except Exception as e: # noqa: BLE001
187
+ from kreuzberg._utils._errors import create_error_context
188
+
189
+ error_result = ExtractionResult(
190
+ content=f"Error: {type(e).__name__}: {e!s}",
191
+ mime_type="text/plain",
192
+ metadata={ # type: ignore[typeddict-unknown-key]
193
+ "error": True,
194
+ "error_context": create_error_context(
195
+ operation="batch_extract_file",
196
+ file_path=path,
197
+ error=e,
198
+ index=index,
199
+ ),
200
+ },
201
+ chunks=[],
202
+ )
203
+ results[index] = error_result
140
204
 
141
205
  async with anyio.create_task_group() as tg:
142
206
  for i, path in enumerate(file_paths):
@@ -148,7 +212,7 @@ async def batch_extract_file(
148
212
  async def batch_extract_bytes(
149
213
  contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
150
214
  ) -> list[ExtractionResult]:
151
- """Extract text from multiple byte contents concurrently.
215
+ """Extract text from multiple byte contents concurrently with optimizations.
152
216
 
153
217
  Args:
154
218
  contents: A sequence of tuples containing (content, mime_type) pairs.
@@ -157,11 +221,40 @@ async def batch_extract_bytes(
157
221
  Returns:
158
222
  A list of extraction results in the same order as the input contents.
159
223
  """
224
+ if not contents:
225
+ return []
226
+
227
+ import multiprocessing as mp
228
+
229
+ max_concurrency = min(len(contents), mp.cpu_count() * 2)
230
+ semaphore = anyio.Semaphore(max_concurrency)
231
+
160
232
  results = cast("list[ExtractionResult]", [None] * len(contents))
161
233
 
162
234
  async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
163
- result = await extract_bytes(content, mime_type, config)
164
- results[index] = result
235
+ async with semaphore:
236
+ try:
237
+ result = await extract_bytes(content, mime_type, config)
238
+ results[index] = result
239
+ except Exception as e: # noqa: BLE001
240
+ from kreuzberg._utils._errors import create_error_context
241
+
242
+ error_result = ExtractionResult(
243
+ content=f"Error: {type(e).__name__}: {e!s}",
244
+ mime_type="text/plain",
245
+ metadata={ # type: ignore[typeddict-unknown-key]
246
+ "error": True,
247
+ "error_context": create_error_context(
248
+ operation="batch_extract_bytes",
249
+ error=e,
250
+ index=index,
251
+ mime_type=mime_type,
252
+ content_size=len(content),
253
+ ),
254
+ },
255
+ chunks=[],
256
+ )
257
+ results[index] = error_result
165
258
 
166
259
  async with anyio.create_task_group() as tg:
167
260
  for i, (content, mime_type) in enumerate(contents):
@@ -207,24 +300,57 @@ def extract_file_sync(
207
300
 
208
301
  Returns:
209
302
  The extracted content and the mime type of the content.
303
+
304
+ Raises:
305
+ ValidationError: If the file path or configuration is invalid.
210
306
  """
211
- mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
212
- if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
213
- result = extractor.extract_path_sync(Path(file_path))
214
- else:
215
- result = ExtractionResult(
216
- content=Path(file_path).read_text(),
217
- chunks=[],
218
- mime_type=mime_type,
219
- metadata={},
220
- )
221
- return _validate_and_post_process_sync(result=result, config=config)
307
+ from kreuzberg._utils._document_cache import get_document_cache
308
+
309
+ cache = get_document_cache()
310
+ path = Path(file_path)
311
+ cached_result = cache.get(path, config)
312
+ if cached_result is not None:
313
+ return cached_result
314
+
315
+ if cache.is_processing(path, config):
316
+ event = cache.mark_processing(path, config)
317
+ event.wait() # pragma: no cover
318
+
319
+ # Try cache again after waiting for other process to complete # ~keep
320
+ cached_result = cache.get(path, config) # pragma: no cover
321
+ if cached_result is not None: # pragma: no cover
322
+ return cached_result
323
+
324
+ cache.mark_processing(path, config)
325
+
326
+ try:
327
+ if not path.exists():
328
+ raise ValidationError("The file does not exist", context={"file_path": str(path)})
329
+
330
+ mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
331
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
332
+ result = extractor.extract_path_sync(Path(file_path))
333
+ else:
334
+ result = ExtractionResult(
335
+ content=Path(file_path).read_text(),
336
+ chunks=[],
337
+ mime_type=mime_type,
338
+ metadata={},
339
+ )
340
+
341
+ result = _validate_and_post_process_sync(result=result, config=config)
342
+
343
+ cache.set(path, config, result)
344
+
345
+ return result
346
+ finally:
347
+ cache.mark_complete(path, config)
222
348
 
223
349
 
224
350
  def batch_extract_file_sync(
225
351
  file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
226
352
  ) -> list[ExtractionResult]:
227
- """Synchronous version of batch_extract_file.
353
+ """Synchronous version of batch_extract_file with parallel processing.
228
354
 
229
355
  Args:
230
356
  file_paths: A sequence of paths to files to extract text from.
@@ -233,13 +359,54 @@ def batch_extract_file_sync(
233
359
  Returns:
234
360
  A list of extraction results in the same order as the input paths.
235
361
  """
236
- return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
362
+ if len(file_paths) <= 1:
363
+ return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
364
+
365
+ import multiprocessing as mp
366
+ from concurrent.futures import ThreadPoolExecutor, as_completed
367
+
368
+ max_workers = min(len(file_paths), mp.cpu_count())
369
+
370
+ def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
371
+ """Extract single file with index for ordering."""
372
+ try:
373
+ return (
374
+ file_paths.index(file_path),
375
+ extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
376
+ )
377
+ except Exception as e: # noqa: BLE001
378
+ from kreuzberg._utils._errors import create_error_context
379
+
380
+ error_result = ExtractionResult(
381
+ content=f"Error: {type(e).__name__}: {e!s}",
382
+ mime_type="text/plain",
383
+ metadata={ # type: ignore[typeddict-unknown-key]
384
+ "error": True,
385
+ "error_context": create_error_context(
386
+ operation="batch_extract_file_sync",
387
+ file_path=file_path,
388
+ error=e,
389
+ ),
390
+ },
391
+ chunks=[],
392
+ )
393
+ return (file_paths.index(file_path), error_result)
394
+
395
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
396
+ future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
397
+
398
+ results: list[ExtractionResult] = [None] * len(file_paths) # type: ignore[list-item]
399
+ for future in as_completed(future_to_index):
400
+ index, result = future.result()
401
+ results[index] = result
402
+
403
+ return results
237
404
 
238
405
 
239
406
  def batch_extract_bytes_sync(
240
407
  contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
241
408
  ) -> list[ExtractionResult]:
242
- """Synchronous version of batch_extract_bytes.
409
+ """Synchronous version of batch_extract_bytes with parallel processing.
243
410
 
244
411
  Args:
245
412
  contents: A sequence of tuples containing (content, mime_type) pairs.
@@ -248,4 +415,48 @@ def batch_extract_bytes_sync(
248
415
  Returns:
249
416
  A list of extraction results in the same order as the input contents.
250
417
  """
251
- return [extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents]
418
+ if len(contents) <= 1:
419
+ return [
420
+ extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
421
+ ]
422
+
423
+ import multiprocessing as mp
424
+ from concurrent.futures import ThreadPoolExecutor, as_completed
425
+
426
+ max_workers = min(len(contents), mp.cpu_count())
427
+
428
+ def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
429
+ """Extract single content with index for ordering."""
430
+ index, (content, mime_type) = index_and_content
431
+ try:
432
+ return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
433
+ except Exception as e: # noqa: BLE001
434
+ from kreuzberg._utils._errors import create_error_context
435
+
436
+ error_result = ExtractionResult(
437
+ content=f"Error: {type(e).__name__}: {e!s}",
438
+ mime_type="text/plain",
439
+ metadata={ # type: ignore[typeddict-unknown-key]
440
+ "error": True,
441
+ "error_context": create_error_context(
442
+ operation="batch_extract_bytes_sync",
443
+ error=e,
444
+ index=index,
445
+ mime_type=mime_type,
446
+ content_size=len(content),
447
+ ),
448
+ },
449
+ chunks=[],
450
+ )
451
+ return (index, error_result)
452
+
453
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
454
+ indexed_contents = list(enumerate(contents))
455
+ future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
456
+
457
+ results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
458
+ for future in as_completed(future_to_index):
459
+ index, result = future.result()
460
+ results[index] = result
461
+
462
+ return results
@@ -0,0 +1,290 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.4.0
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Classifier: Topic :: Text Processing :: General
19
+ Classifier: Topic :: Utilities
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.13
22
+ Requires-Dist: anyio>=4.9.0
23
+ Requires-Dist: charset-normalizer>=3.4.2
24
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
25
+ Requires-Dist: html-to-markdown>=1.4.0
26
+ Requires-Dist: msgspec>=0.18.0
27
+ Requires-Dist: playa-pdf>=0.6.1
28
+ Requires-Dist: psutil>=7.0.0
29
+ Requires-Dist: pypdfium2==4.30.0
30
+ Requires-Dist: python-calamine>=0.3.2
31
+ Requires-Dist: python-pptx>=1.0.2
32
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
33
+ Provides-Extra: all
34
+ Requires-Dist: click>=8.2.1; extra == 'all'
35
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
36
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
37
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
38
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
39
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
40
+ Requires-Dist: rich>=14.0.0; extra == 'all'
41
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
42
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
43
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
44
+ Provides-Extra: api
45
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
46
+ Provides-Extra: chunking
47
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
48
+ Provides-Extra: cli
49
+ Requires-Dist: click>=8.2.1; extra == 'cli'
50
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
51
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
52
+ Provides-Extra: easyocr
53
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
54
+ Provides-Extra: gmft
55
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
56
+ Provides-Extra: paddleocr
57
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
58
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
59
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
60
+ Description-Content-Type: text/markdown
61
+
62
+ # Kreuzberg
63
+
64
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
65
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
66
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
67
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
68
+
69
+ Kreuzberg is a **high-performance** Python library for text extraction from documents. **Benchmarked as one of the fastest text extraction libraries available**, it provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs optimized for speed and efficiency.
70
+
71
+ ## Why Kreuzberg?
72
+
73
+ - **🚀 Substantially Faster**: Extraction speeds that significantly outperform other text extraction libraries
74
+ - **⚡ Unique Dual API**: The only framework supporting both sync and async APIs for maximum flexibility
75
+ - **💾 Memory Efficient**: Lower memory footprint compared to competing libraries
76
+ - **📊 Proven Performance**: [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) demonstrate superior performance across formats
77
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
78
+ - **Local Processing**: No external API calls or cloud dependencies required
79
+ - **Resource Efficient**: Lightweight processing without GPU requirements
80
+ - **Format Support**: Comprehensive support for documents, images, and text formats
81
+ - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
82
+ - **Command Line Interface**: Powerful CLI for batch processing and automation
83
+ - **Metadata Extraction**: Get document metadata alongside text content
84
+ - **Table Extraction**: Extract tables from documents using the excellent GMFT library
85
+ - **Modern Python**: Built with async/await, type hints, and a functional-first approach
86
+ - **Permissive OSS**: MIT licensed with permissively licensed dependencies
87
+
88
+ ## Quick Start
89
+
90
+ ```bash
91
+ pip install kreuzberg
92
+
93
+ # Or install with CLI support
94
+ pip install "kreuzberg[cli]"
95
+
96
+ # Or install with API server
97
+ pip install "kreuzberg[api]"
98
+ ```
99
+
100
+ Install the system dependencies (Tesseract OCR and Pandoc):
101
+
102
+ ```bash
103
+ # Ubuntu/Debian
104
+ sudo apt-get install tesseract-ocr pandoc
105
+
106
+ # macOS
107
+ brew install tesseract pandoc
108
+
109
+ # Windows
110
+ choco install -y tesseract pandoc
111
+ ```
112
+
113
+ Tesseract is the default OCR engine. You can opt out of it and either use one of the two alternative OCR engines (EasyOCR or PaddleOCR) or disable OCR entirely.
114
+
115
+ ### Alternative OCR engines
116
+
117
+ ```bash
118
+ # Install with EasyOCR support
119
+ pip install "kreuzberg[easyocr]"
120
+
121
+ # Install with PaddleOCR support
122
+ pip install "kreuzberg[paddleocr]"
123
+ ```
124
+
125
+ ## Quick Example
126
+
127
+ ```python
128
+ import asyncio
129
+ from kreuzberg import extract_file
130
+
131
+ async def main():
132
+ # Extract text from a PDF
133
+ result = await extract_file("document.pdf")
134
+ print(result.content)
135
+
136
+ # Extract text from an image
137
+ result = await extract_file("scan.jpg")
138
+ print(result.content)
139
+
140
+ # Extract text from a Word document
141
+ result = await extract_file("report.docx")
142
+ print(result.content)
143
+
144
+ asyncio.run(main())
145
+ ```
146
+
147
+ ## Docker
148
+
149
+ Docker images are available for easy deployment:
150
+
151
+ ```bash
152
+ # Run the API server
153
+ docker run -p 8000:8000 goldziher/kreuzberg:latest
154
+
155
+ # Extract files via API
156
+ curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
157
+ ```
158
+
159
+ See the [Docker documentation](https://goldziher.github.io/kreuzberg/user-guide/docker/) for more options.
160
+
161
+ ## REST API
162
+
163
+ Run Kreuzberg as a REST API server:
164
+
165
+ ```bash
166
+ pip install "kreuzberg[api]"
167
+ litestar --app kreuzberg._api.main:app run
168
+ ```
169
+
170
+ See the [API documentation](https://goldziher.github.io/kreuzberg/user-guide/api-server/) for endpoints and usage.
171
+
172
+ ## Command Line Interface
173
+
174
+ Kreuzberg includes a powerful CLI for processing documents from the command line:
175
+
176
+ ```bash
177
+ # Extract text from a file
178
+ kreuzberg extract document.pdf
179
+
180
+ # Extract with JSON output and metadata
181
+ kreuzberg extract document.pdf --output-format json --show-metadata
182
+
183
+ # Extract from stdin
184
+ cat document.html | kreuzberg extract
185
+
186
+ # Use specific OCR backend
187
+ kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
188
+
189
+ # Extract with configuration file
190
+ kreuzberg extract document.pdf --config config.toml
191
+ ```
192
+
193
+ ### CLI Configuration
194
+
195
+ Configure via `pyproject.toml`:
196
+
197
+ ```toml
198
+ [tool.kreuzberg]
199
+ force_ocr = true
200
+ chunk_content = false
201
+ extract_tables = true
202
+ max_chars = 4000
203
+ ocr_backend = "tesseract"
204
+
205
+ [tool.kreuzberg.tesseract]
206
+ language = "eng+deu"
207
+ psm = 3
208
+ ```
209
+
210
+ For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
211
+
212
+ ## Documentation
213
+
214
+ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
215
+
216
+ - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
217
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
218
+ - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
219
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
220
+ - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
221
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
222
+ - [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
223
+
224
+ ## Supported Formats
225
+
226
+ Kreuzberg supports a wide range of document formats:
227
+
228
+ - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
229
+ - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
230
+ - **Spreadsheets**: XLSX, XLS, CSV, etc.
231
+ - **Presentations**: PPTX, PPT, etc.
232
+ - **Web Content**: HTML, XML, etc.
233
+
234
+ ## OCR Engines
235
+
236
+ Kreuzberg supports multiple OCR engines:
237
+
238
+ - **Tesseract** (Default): Lightweight, fast startup, requires system installation
239
+ - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
240
+ - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
241
+
242
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
243
+
244
+ ## Performance
245
+
246
+ Kreuzberg delivers **exceptional performance** compared to other text extraction libraries:
247
+
248
+ ### 🏆 Competitive Benchmarks
249
+
250
+ [Comprehensive benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) comparing Kreuzberg against other popular Python text extraction libraries show:
251
+
252
+ - **Fastest Extraction**: Consistently fastest processing times across file formats
253
+ - **Lowest Memory Usage**: Most memory-efficient text extraction solution
254
+ - **100% Success Rate**: Reliable extraction across all tested document types
255
+ - **Optimal for High-Throughput**: Designed for real-time, production applications
256
+
257
+ ### 💾 Installation Size Efficiency
258
+
259
+ Kreuzberg delivers maximum performance with minimal overhead:
260
+
261
+ 1. **Kreuzberg**: 71.0 MB (20 deps) - Most lightweight
262
+ 1. **Unstructured**: 145.8 MB (54 deps) - Moderate footprint
263
+ 1. **MarkItDown**: 250.7 MB (25 deps) - ML inference overhead
264
+ 1. **Docling**: 1,031.9 MB (88 deps) - Full ML stack included
265
+
266
+ **Kreuzberg is up to 14x smaller** than competing solutions while delivering superior performance.
267
+
268
+ ### ⚡ Sync vs Async Performance
269
+
270
+ Kreuzberg is the only library offering both sync and async APIs. Choose based on your use case:
271
+
272
+ | Operation | Sync Time | Async Time | Async Advantage |
273
+ | ---------------------- | --------- | ---------- | ------------------ |
274
+ | Simple text (Markdown) | 0.4ms | 17.5ms | **❌ 41x slower** |
275
+ | HTML documents | 1.6ms | 1.1ms | **✅ 1.5x faster** |
276
+ | Complex PDFs | 39.0s | 8.5s | **✅ 4.6x faster** |
277
+ | OCR processing | 0.7s | 0.4s | **✅ 1.7x faster** |
278
+ | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
279
+
280
+ **Rule of thumb:** Use async for complex documents, OCR, batch processing, and backend APIs.
281
+
282
+ For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
283
+
284
+ ## Contributing
285
+
286
+ We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
287
+
288
+ ## License
289
+
290
+ This library is released under the MIT license.