kreuzberg 1.7.0__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. kreuzberg-2.0.1/PKG-INFO +451 -0
  2. kreuzberg-2.0.1/README.md +416 -0
  3. kreuzberg-2.0.1/kreuzberg/__init__.py +26 -0
  4. kreuzberg-2.0.1/kreuzberg/_constants.py +6 -0
  5. kreuzberg-2.0.1/kreuzberg/_html.py +32 -0
  6. kreuzberg-2.0.1/kreuzberg/_mime_types.py +201 -0
  7. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg/_pandoc.py +122 -169
  8. kreuzberg-2.0.1/kreuzberg/_pdf.py +189 -0
  9. kreuzberg-2.0.1/kreuzberg/_pptx.py +88 -0
  10. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg/_string.py +5 -8
  11. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg/_sync.py +6 -1
  12. kreuzberg-2.0.1/kreuzberg/_tesseract.py +225 -0
  13. kreuzberg-2.0.1/kreuzberg/_tmp.py +37 -0
  14. kreuzberg-2.0.1/kreuzberg/_types.py +71 -0
  15. kreuzberg-2.0.1/kreuzberg/_xlsx.py +92 -0
  16. kreuzberg-2.0.1/kreuzberg/extraction.py +366 -0
  17. kreuzberg-2.0.1/kreuzberg.egg-info/PKG-INFO +451 -0
  18. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/SOURCES.txt +7 -1
  19. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/requires.txt +4 -1
  20. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/pyproject.toml +42 -40
  21. kreuzberg-1.7.0/PKG-INFO +0 -342
  22. kreuzberg-1.7.0/README.md +0 -308
  23. kreuzberg-1.7.0/kreuzberg/__init__.py +0 -11
  24. kreuzberg-1.7.0/kreuzberg/_extractors.py +0 -280
  25. kreuzberg-1.7.0/kreuzberg/_mime_types.py +0 -93
  26. kreuzberg-1.7.0/kreuzberg/_tesseract.py +0 -328
  27. kreuzberg-1.7.0/kreuzberg/extraction.py +0 -161
  28. kreuzberg-1.7.0/kreuzberg.egg-info/PKG-INFO +0 -342
  29. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/LICENSE +0 -0
  30. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg/exceptions.py +0 -0
  31. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg/py.typed +0 -0
  32. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/dependency_links.txt +0 -0
  33. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/top_level.txt +0 -0
  34. {kreuzberg-1.7.0 → kreuzberg-2.0.1}/setup.cfg +0 -0
@@ -0,0 +1,451 @@
1
+ Metadata-Version: 2.2
2
+ Name: kreuzberg
3
+ Version: 2.0.1
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
+ License: MIT
7
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: General
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: anyio>=4.8.0
28
+ Requires-Dist: charset-normalizer>=3.4.1
29
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
+ Requires-Dist: html-to-markdown>=1.2.0
31
+ Requires-Dist: pypdfium2>=4.30.1
32
+ Requires-Dist: python-calamine>=0.3.1
33
+ Requires-Dist: python-pptx>=1.0.2
34
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.11"
35
+
36
+ # Kreuzberg
37
+
38
+ Kreuzberg is a Python library for text extraction from documents. It provides a unified async interface for extracting text from PDFs, images, office documents, and more.
39
+
40
+ ## Why Kreuzberg?
41
+
42
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
43
+ - **Local Processing**: No external API calls or cloud dependencies required
44
+ - **Resource Efficient**: Lightweight processing without GPU requirements
45
+ - **Lightweight**: Has few curated dependencies and a minimal footprint
46
+ - **Format Support**: Comprehensive support for documents, images, and text formats
47
+ - **Modern Python**: Built with async/await, type hints, and a functional-first approach
48
+ - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
49
+
50
+ Kreuzberg was built for RAG (Retrieval Augmented Generation) applications, focusing on local processing with minimal dependencies. It's designed for modern async applications, serverless functions, and dockerized applications.
51
+
52
+ ## Installation
53
+
54
+ ### 1. Install the Python Package
55
+
56
+ ```shell
57
+ pip install kreuzberg
58
+ ```
59
+
60
+ ### 2. Install System Dependencies
61
+
62
+ Kreuzberg requires two system-level dependencies:
63
+
64
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
66
+
67
+ You can install these with:
68
+
69
+ #### Linux (Ubuntu)
70
+
71
+ ```shell
72
+ sudo apt-get install pandoc tesseract-ocr
73
+ ```
74
+
75
+ #### MacOS
76
+
77
+ ```shell
78
+ # MacOS
79
+ brew install tesseract pandoc
80
+ ```
81
+
82
+ #### Windows
83
+
84
+ ```shell
85
+ choco install -y tesseract pandoc
86
+ ```
87
+
88
+ Notes:
89
+
90
+ - In most distributions the tesseract-ocr package is split into multiple packages; you may need to separately install any language models you need other than English.
91
+ - Please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
92
+
93
+ ## Architecture
94
+
95
+ Kreuzberg integrates:
96
+
97
+ - **PDF Processing**:
98
+ - `pypdfium2` for searchable PDFs
99
+ - Tesseract OCR for scanned content
100
+ - **Document Conversion**:
101
+ - Pandoc for many document and markup formats
102
+ - `python-pptx` for PowerPoint files
103
+ - `html-to-markdown` for HTML content
104
+ - `calamine` for Excel spreadsheets (with multi-sheet support)
105
+ - **Text Processing**:
106
+ - Smart encoding detection
107
+ - Markdown and plain text handling
108
+
109
+ ### Supported Formats
110
+
111
+ #### Document Formats
112
+
113
+ - PDF (`.pdf`, both searchable and scanned)
114
+ - Microsoft Word (`.docx`)
115
+ - PowerPoint presentations (`.pptx`)
116
+ - OpenDocument Text (`.odt`)
117
+ - Rich Text Format (`.rtf`)
118
+ - EPUB (`.epub`)
119
+ - DocBook XML (`.dbk`, `.xml`)
120
+ - FictionBook (`.fb2`)
121
+ - LaTeX (`.tex`, `.latex`)
122
+ - Typst (`.typ`)
123
+
124
+ #### Markup and Text Formats
125
+
126
+ - HTML (`.html`, `.htm`)
127
+ - Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
128
+ - reStructuredText (`.rst`)
129
+ - Org-mode (`.org`)
130
+ - DokuWiki (`.txt`)
131
+ - Pod (`.pod`)
132
+ - Troff/Man (`.1`, `.2`, etc.)
133
+
134
+ #### Data and Research Formats
135
+
136
+ - Spreadsheets (`.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlam`, `.xla`, `.ods`)
137
+ - CSV (`.csv`) and TSV (`.tsv`) files
138
+ - OPML files (`.opml`)
139
+ - Jupyter Notebooks (`.ipynb`)
140
+ - BibTeX (`.bib`) and BibLaTeX (`.bib`)
141
+ - CSL-JSON (`.json`)
142
+ - EndNote and JATS XML (`.xml`)
143
+ - RIS (`.ris`)
144
+
145
+ #### Image Formats
146
+
147
+ - JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
148
+ - PNG (`.png`)
149
+ - TIFF (`.tiff`, `.tif`)
150
+ - BMP (`.bmp`)
151
+ - GIF (`.gif`)
152
+ - JPEG 2000 family (`.jp2`, `.jpm`, `.jpx`, `.mj2`)
153
+ - WebP (`.webp`)
154
+ - Portable anymap formats (`.pbm`, `.pgm`, `.ppm`, `.pnm`)
155
+
156
+ ## Usage
157
+
158
+ Kreuzberg provides both async and sync APIs for text extraction, including batch processing. The library exports the following main functions:
159
+
160
+ - Single Item Processing:
161
+
162
+ - `extract_file()`: Async function to extract text from a file (accepts string path or `pathlib.Path`)
163
+ - `extract_bytes()`: Async function to extract text from bytes (accepts a byte string)
164
+ - `extract_file_sync()`: Synchronous version of `extract_file()`
165
+ - `extract_bytes_sync()`: Synchronous version of `extract_bytes()`
166
+
167
+ - Batch Processing:
168
+ - `batch_extract_file()`: Async function to extract text from multiple files concurrently
169
+ - `batch_extract_bytes()`: Async function to extract text from multiple byte contents concurrently
170
+ - `batch_extract_file_sync()`: Synchronous version of `batch_extract_file()`
171
+ - `batch_extract_bytes_sync()`: Synchronous version of `batch_extract_bytes()`
172
+
173
+ ### Configuration Parameters
174
+
175
+ All extraction functions accept the following optional parameters for configuring OCR and performance:
176
+
177
+ #### OCR Configuration
178
+
179
+ - `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
180
+ - `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
181
+
182
+ - `eng` for English
183
+ - `deu` for German
184
+ - `eng+deu` for English and German
185
+
186
+ Note: the order of languages affects processing time; the first language is the primary language, the second is the secondary language, and so on.
187
+
188
+ - `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
189
+
190
+ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
191
+
192
+ #### Processing Configuration
193
+
194
+ - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
195
+
196
+ Notes:
197
+
198
+ - Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
199
+
200
+ ### Quick Start
201
+
202
+ ```python
203
+ from pathlib import Path
204
+ from kreuzberg import extract_file
205
+ from kreuzberg.extraction import ExtractionResult
206
+ from kreuzberg._tesseract import PSMMode
207
+
208
+
209
+ # Basic file extraction
210
+ async def extract_document():
211
+ # Extract from a PDF file with default settings
212
+ pdf_result: ExtractionResult = await extract_file("document.pdf")
213
+ print(f"Content: {pdf_result.content}")
214
+
215
+ # Extract from an image with German language model
216
+ img_result = await extract_file(
217
+ "scan.png",
218
+ language="deu", # German language model
219
+ psm=PSMMode.SINGLE_BLOCK, # Treat as single block of text
220
+ max_processes=4 # Limit concurrent processes
221
+ )
222
+ print(f"Image text: {img_result.content}")
223
+
224
+ # Extract from Word document with metadata
225
+ docx_result = await extract_file(Path("document.docx"))
226
+ if docx_result.metadata:
227
+ print(f"Title: {docx_result.metadata.get('title')}")
228
+ print(f"Author: {docx_result.metadata.get('creator')}")
229
+ ```
230
+
231
+ ### Extracting Bytes
232
+
233
+ ```python
234
+ from kreuzberg import extract_bytes
235
+ from kreuzberg.extraction import ExtractionResult
236
+
237
+
238
+ async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
239
+ """Process uploaded file content with known MIME type."""
240
+ return await extract_bytes(
241
+ file_content,
242
+ mime_type=mime_type,
243
+ )
244
+
245
+
246
+ # Example usage with different file types
247
+ async def handle_uploads(docx_bytes: bytes, pdf_bytes: bytes, image_bytes: bytes):
248
+ # Process PDF upload
249
+ pdf_result = await process_upload(pdf_bytes, mime_type="application/pdf")
250
+ print(f"PDF content: {pdf_result.content}")
251
+ print(f"PDF metadata: {pdf_result.metadata}")
252
+
253
+ # Process image upload (will use OCR)
254
+ img_result = await process_upload(image_bytes, mime_type="image/jpeg")
255
+ print(f"Image text: {img_result.content}")
256
+
257
+ # Process Word document upload
258
+ docx_result = await process_upload(
259
+ docx_bytes,
260
+ mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
261
+ )
262
+ print(f"Word content: {docx_result.content}")
263
+ ```
264
+
265
+ ### Batch Processing
266
+
267
+ Kreuzberg supports efficient batch processing of multiple files or byte contents:
268
+
269
+ ```python
270
+ from pathlib import Path
271
+ from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
272
+
273
+
274
+ # Process multiple files concurrently
275
+ async def process_documents(file_paths: list[Path]) -> None:
276
+ # Extract from multiple files
277
+ results = await batch_extract_file(file_paths)
278
+ for path, result in zip(file_paths, results):
279
+ print(f"File {path}: {result.content[:100]}...")
280
+
281
+
282
+ # Process multiple uploaded files concurrently
283
+ async def process_uploads(contents: list[tuple[bytes, str]]) -> None:
284
+ # Each item is a tuple of (content, mime_type)
285
+ results = await batch_extract_bytes(contents)
286
+ for (_, mime_type), result in zip(contents, results):
287
+ print(f"Upload {mime_type}: {result.content[:100]}...")
288
+
289
+
290
+ # Synchronous batch processing is also available
291
+ def process_documents_sync(file_paths: list[Path]) -> None:
292
+ results = batch_extract_file_sync(file_paths)
293
+ for path, result in zip(file_paths, results):
294
+ print(f"File {path}: {result.content[:100]}...")
295
+ ```
296
+
297
+ Features:
298
+
299
+ - Ordered results
300
+ - Concurrent processing
301
+ - Error handling per item
302
+ - Async and sync interfaces
303
+ - Same options as single extraction
304
+
305
+ ### PDF Processing
306
+
307
+ Kreuzberg employs a smart approach to PDF text extraction:
308
+
309
+ 1. **Searchable Text Detection**: First attempts to extract text directly from searchable PDFs using `pypdfium2`.
310
+
311
+ 2. **Text Validation**: Extracted text is validated for corruption by checking for:
312
+
313
+ - Control and non-printable characters
314
+ - Unicode replacement characters (�)
315
+ - Zero-width spaces and other invisible characters
316
+ - Empty or whitespace-only content
317
+
318
+ 3. **Automatic OCR Fallback**: If the extracted text appears corrupted or if the PDF is image-based, automatically falls back to OCR using Tesseract.
319
+
320
+ This approach works well for searchable PDFs and standard text documents. For complex OCR (e.g., handwriting, photographs), use a specialized tool.
321
+
322
+ ### PDF Processing Options
323
+
324
+ You can control PDF processing behavior using optional parameters:
325
+
326
+ ```python
327
+ from kreuzberg import extract_file
328
+
329
+
330
+ async def process_pdf():
331
+ # Default behavior: auto-detect and use OCR if needed
332
+ # By default, max_processes is CPU count / 2 (see "Processing Configuration")
333
+ result = await extract_file("document.pdf")
334
+ print(result.content)
335
+
336
+ # Force OCR even for searchable PDFs
337
+ result = await extract_file("document.pdf", force_ocr=True)
338
+ print(result.content)
339
+
340
+ # Control OCR concurrency for large documents
341
+ # Warning: High concurrency values can cause system resource exhaustion
342
+ # Start with a low value and increase based on your system's capabilities
343
+ result = await extract_file(
344
+ "large_document.pdf",
345
+ max_processes=4 # Process up to 4 pages concurrently
346
+ )
347
+ print(result.content)
348
+
349
+ # Process a scanned PDF (automatically uses OCR)
350
+ result = await extract_file("scanned.pdf")
351
+ print(result.content)
352
+ ```
353
+
354
+ ### ExtractionResult Object
355
+
356
+ All extraction functions return an `ExtractionResult` or a list thereof (for batch functions). The `ExtractionResult` object is a `NamedTuple`:
357
+
358
+ - `content`: The extracted text (str)
359
+ - `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
360
+ - `metadata`: A metadata dictionary. Currently this dictionary is only populated when extracting documents using pandoc.
361
+
362
+ ```python
363
+ from kreuzberg import extract_file, ExtractionResult, Metadata
364
+
365
+ async def process_document(path: str) -> tuple[str, str, Metadata]:
366
+ # Access as a named tuple
367
+ result: ExtractionResult = await extract_file(path)
368
+ print(f"Content: {result.content}")
369
+ print(f"Format: {result.mime_type}")
370
+
371
+ # Or unpack as a tuple
372
+ content, mime_type, metadata = await extract_file(path)
373
+ return content, mime_type, metadata
374
+ ```
375
+
376
+ ### Error Handling
377
+
378
+ Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
379
+
380
+ ```python
381
+ from kreuzberg import extract_file
382
+ from kreuzberg.exceptions import (
383
+ ValidationError,
384
+ ParsingError,
385
+ OCRError,
386
+ MissingDependencyError
387
+ )
388
+
389
+ async def safe_extract(path: str) -> str:
390
+ try:
391
+ result = await extract_file(path)
392
+ return result.content
393
+
394
+ except ValidationError as e:
395
+ # Input validation issues
396
+ # - Unsupported or undetectable MIME types
397
+ # - Missing files
398
+ # - Invalid input parameters
399
+ print(f"Validation failed: {e}")
400
+
401
+ except OCRError as e:
402
+ # OCR-specific issues
403
+ # - Tesseract processing failures
404
+ # - Image conversion problems
405
+ print(f"OCR failed: {e}")
406
+
407
+ except MissingDependencyError as e:
408
+ # System dependency issues
409
+ # - Missing Tesseract OCR
410
+ # - Missing Pandoc
411
+ # - Incompatible versions
412
+ print(f"Dependency missing: {e}")
413
+
414
+ except ParsingError as e:
415
+ # General processing errors
416
+ # - PDF parsing failures
417
+ # - Format conversion issues
418
+ # - Encoding problems
419
+ print(f"Processing failed: {e}")
420
+
421
+ return ""
422
+ ```
423
+
424
+ All exceptions include:
425
+
426
+ - Error message
427
+ - Context in the `context` attribute
428
+ - String representation
429
+ - Exception chaining
430
+
431
+ ## Contribution
432
+
433
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
434
+ submitting PRs to avoid disappointment.
435
+
436
+ ### Local Development
437
+
438
+ 1. Clone the repo
439
+ 2. Install the system dependencies
440
+ 3. Install the full dependencies with `uv sync`
441
+ 4. Install the pre-commit hooks with:
442
+
443
+ ```shell
444
+ pre-commit install && pre-commit install --hook-type commit-msg
445
+ ```
446
+
447
+ 5. Make your changes and submit a PR
448
+
449
+ ## License
450
+
451
+ This library uses the MIT license.