kreuzberg 1.7.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. kreuzberg-2.0.0/PKG-INFO +419 -0
  2. kreuzberg-2.0.0/README.md +384 -0
  3. kreuzberg-2.0.0/kreuzberg/__init__.py +15 -0
  4. kreuzberg-2.0.0/kreuzberg/_constants.py +6 -0
  5. kreuzberg-2.0.0/kreuzberg/_html.py +32 -0
  6. kreuzberg-2.0.0/kreuzberg/_mime_types.py +201 -0
  7. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg/_pandoc.py +122 -169
  8. kreuzberg-2.0.0/kreuzberg/_pdf.py +189 -0
  9. kreuzberg-2.0.0/kreuzberg/_pptx.py +88 -0
  10. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg/_string.py +5 -8
  11. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg/_sync.py +6 -1
  12. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg/_tesseract.py +98 -71
  13. kreuzberg-2.0.0/kreuzberg/_tmp.py +37 -0
  14. kreuzberg-2.0.0/kreuzberg/_types.py +71 -0
  15. kreuzberg-2.0.0/kreuzberg/_xlsx.py +92 -0
  16. kreuzberg-2.0.0/kreuzberg/extraction.py +366 -0
  17. kreuzberg-2.0.0/kreuzberg.egg-info/PKG-INFO +419 -0
  18. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg.egg-info/SOURCES.txt +7 -1
  19. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg.egg-info/requires.txt +4 -1
  20. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/pyproject.toml +42 -40
  21. kreuzberg-1.7.0/PKG-INFO +0 -342
  22. kreuzberg-1.7.0/README.md +0 -308
  23. kreuzberg-1.7.0/kreuzberg/__init__.py +0 -11
  24. kreuzberg-1.7.0/kreuzberg/_extractors.py +0 -280
  25. kreuzberg-1.7.0/kreuzberg/_mime_types.py +0 -93
  26. kreuzberg-1.7.0/kreuzberg/extraction.py +0 -161
  27. kreuzberg-1.7.0/kreuzberg.egg-info/PKG-INFO +0 -342
  28. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/LICENSE +0 -0
  29. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg/exceptions.py +0 -0
  30. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg/py.typed +0 -0
  31. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
  32. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/kreuzberg.egg-info/top_level.txt +0 -0
  33. {kreuzberg-1.7.0 → kreuzberg-2.0.0}/setup.cfg +0 -0
@@ -0,0 +1,419 @@
1
+ Metadata-Version: 2.2
2
+ Name: kreuzberg
3
+ Version: 2.0.0
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
+ License: MIT
7
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: General
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: anyio>=4.8.0
28
+ Requires-Dist: charset-normalizer>=3.4.1
29
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
+ Requires-Dist: html-to-markdown>=1.2.0
31
+ Requires-Dist: pypdfium2>=4.30.1
32
+ Requires-Dist: python-calamine>=0.3.1
33
+ Requires-Dist: python-pptx>=1.0.2
34
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.11"
35
+
36
+ # Kreuzberg
37
+
38
+ Kreuzberg is a Python library for text extraction from documents. It provides a unified async interface for extracting text from PDFs, images, office documents, and more.
39
+
40
+ ## Why Kreuzberg?
41
+
42
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
43
+ - **Local Processing**: No external API calls or cloud dependencies required
44
+ - **Resource Efficient**: Lightweight processing without GPU requirements
45
+ - **Lightweight**: Has few curated dependencies and a minimal footprint
46
+ - **Format Support**: Comprehensive support for documents, images, and text formats
47
+ - **Modern Python**: Built with async/await, type hints, and a functional-first approach
48
+ - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
49
+
50
+ Kreuzberg was built for RAG (Retrieval Augmented Generation) applications, focusing on local processing with minimal dependencies. It's designed for modern async applications, serverless functions, and dockerized applications.
51
+
52
+ ## Installation
53
+
54
+ ### 1. Install the Python Package
55
+
56
+ ```shell
57
+ pip install kreuzberg
58
+ ```
59
+
60
+ ### 2. Install System Dependencies
61
+
62
+ Kreuzberg requires two system-level dependencies:
63
+
64
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
66
+
67
+ Please install these using their respective installation guides.
68
+
69
+ ## Architecture
70
+
71
+ Kreuzberg integrates:
72
+
73
+ - **PDF Processing**:
74
+ - `pypdfium2` for searchable PDFs
75
+ - Tesseract OCR for scanned content
76
+ - **Document Conversion**:
77
+ - Pandoc for many document and markup formats
78
+ - `python-pptx` for PowerPoint files
79
+ - `html-to-markdown` for HTML content
80
+ - `calamine` for Excel spreadsheets (with multi-sheet support)
81
+ - **Text Processing**:
82
+ - Smart encoding detection
83
+ - Markdown and plain text handling
84
+
85
+ ### Supported Formats
86
+
87
+ #### Document Formats
88
+
89
+ - PDF (`.pdf`, both searchable and scanned)
90
+ - Microsoft Word (`.docx`)
91
+ - PowerPoint presentations (`.pptx`)
92
+ - OpenDocument Text (`.odt`)
93
+ - Rich Text Format (`.rtf`)
94
+ - EPUB (`.epub`)
95
+ - DocBook XML (`.dbk`, `.xml`)
96
+ - FictionBook (`.fb2`)
97
+ - LaTeX (`.tex`, `.latex`)
98
+ - Typst (`.typ`)
99
+
100
+ #### Markup and Text Formats
101
+
102
+ - HTML (`.html`, `.htm`)
103
+ - Plain text (`.txt`) and Markdown (`.md`, `.markdown`)
104
+ - reStructuredText (`.rst`)
105
+ - Org-mode (`.org`)
106
+ - DokuWiki (`.txt`)
107
+ - Pod (`.pod`)
108
+ - Troff/Man (`.1`, `.2`, etc.)
109
+
110
+ #### Data and Research Formats
111
+
112
+ - Spreadsheets (`.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlam`, `.xla`, `.ods`)
113
+ - CSV (`.csv`) and TSV (`.tsv`) files
114
+ - OPML files (`.opml`)
115
+ - Jupyter Notebooks (`.ipynb`)
116
+ - BibTeX (`.bib`) and BibLaTeX (`.bib`)
117
+ - CSL-JSON (`.json`)
118
+ - EndNote and JATS XML (`.xml`)
119
+ - RIS (`.ris`)
120
+
121
+ #### Image Formats
122
+
123
+ - JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
124
+ - PNG (`.png`)
125
+ - TIFF (`.tiff`, `.tif`)
126
+ - BMP (`.bmp`)
127
+ - GIF (`.gif`)
128
+ - JPEG 2000 family (`.jp2`, `.jpm`, `.jpx`, `.mj2`)
129
+ - WebP (`.webp`)
130
+ - Portable anymap formats (`.pbm`, `.pgm`, `.ppm`, `.pnm`)
131
+
132
+ ## Usage
133
+
134
+ Kreuzberg provides both async and sync APIs for text extraction, including batch processing. The library exports the following main functions:
135
+
136
+ - Single Item Processing:
137
+
138
+ - `extract_file()`: Async function to extract text from a file (accepts string path or `pathlib.Path`)
139
+ - `extract_bytes()`: Async function to extract text from bytes (accepts a byte string)
140
+ - `extract_file_sync()`: Synchronous version of `extract_file()`
141
+ - `extract_bytes_sync()`: Synchronous version of `extract_bytes()`
142
+
143
+ - Batch Processing:
144
+ - `batch_extract_file()`: Async function to extract text from multiple files concurrently
145
+ - `batch_extract_bytes()`: Async function to extract text from multiple byte contents concurrently
146
+ - `batch_extract_file_sync()`: Synchronous version of `batch_extract_file()`
147
+ - `batch_extract_bytes_sync()`: Synchronous version of `batch_extract_bytes()`
148
+
149
+ ### Configuration Parameters
150
+
151
+ All extraction functions accept the following optional parameters for configuring OCR and performance:
152
+
153
+ #### OCR Configuration
154
+
155
+ - `language` (default: "eng"): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for non-English documents. Examples:
156
+ - "eng" for English
157
+ - "deu" for German
158
+ - "fra" for French
159
+
160
+ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information.
161
+
162
+ - `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
163
+
164
+ #### Performance Configuration
165
+
166
+ - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc. Higher values can lead to performance improvements, but may cause resource exhaustion and deadlocks (especially for Tesseract).
167
+
168
+ ### Quick Start
169
+
170
+ ```python
171
+ from pathlib import Path
172
+ from kreuzberg import extract_file
173
+ from kreuzberg.extraction import ExtractionResult
174
+ from kreuzberg._tesseract import PSMMode, SupportedLanguage
175
+
176
+
177
+ # Basic file extraction
178
+ async def extract_document():
179
+ # Extract from a PDF file with default settings
180
+ pdf_result: ExtractionResult = await extract_file("document.pdf")
181
+ print(f"Content: {pdf_result.content}")
182
+
183
+ # Extract from an image with German language model
184
+ img_result = await extract_file(
185
+ "scan.png",
186
+ language="deu", # German language model
187
+ psm=PSMMode.SINGLE_BLOCK, # Treat as single block of text
188
+ max_processes=4 # Limit concurrent processes
189
+ )
190
+ print(f"Image text: {img_result.content}")
191
+
192
+ # Extract from Word document with metadata
193
+ docx_result = await extract_file(Path("document.docx"))
194
+ if docx_result.metadata:
195
+ print(f"Title: {docx_result.metadata.get('title')}")
196
+ print(f"Author: {docx_result.metadata.get('author')}")
197
+ ```
198
+
199
+ ### Extracting Bytes
200
+
201
+ ```python
202
+ from kreuzberg import extract_bytes
203
+ from kreuzberg.extraction import ExtractionResult
204
+
205
+
206
+ async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
207
+ """Process uploaded file content with known MIME type."""
208
+ return await extract_bytes(
209
+ file_content,
210
+ mime_type=mime_type,
211
+ )
212
+
213
+
214
+ # Example usage with different file types
215
+ async def handle_uploads(docx_bytes: bytes, pdf_bytes: bytes, image_bytes: bytes):
216
+ # Process PDF upload
217
+ pdf_result = await process_upload(pdf_bytes, mime_type="application/pdf")
218
+ print(f"PDF content: {pdf_result.content}")
219
+ print(f"PDF metadata: {pdf_result.metadata}")
220
+
221
+ # Process image upload (will use OCR)
222
+ img_result = await process_upload(image_bytes, mime_type="image/jpeg")
223
+ print(f"Image text: {img_result.content}")
224
+
225
+ # Process Word document upload
226
+ docx_result = await process_upload(
227
+ docx_bytes,
228
+ mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
229
+ )
230
+ print(f"Word content: {docx_result.content}")
231
+ ```
232
+
233
+ ### Batch Processing
234
+
235
+ Kreuzberg supports efficient batch processing of multiple files or byte contents:
236
+
237
+ ```python
238
+ from pathlib import Path
239
+ from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
240
+
241
+
242
+ # Process multiple files concurrently
243
+ async def process_documents(file_paths: list[Path]) -> None:
244
+ # Extract from multiple files
245
+ results = await batch_extract_file(file_paths)
246
+ for path, result in zip(file_paths, results):
247
+ print(f"File {path}: {result.content[:100]}...")
248
+
249
+
250
+ # Process multiple uploaded files concurrently
251
+ async def process_uploads(contents: list[tuple[bytes, str]]) -> None:
252
+ # Each item is a tuple of (content, mime_type)
253
+ results = await batch_extract_bytes(contents)
254
+ for (_, mime_type), result in zip(contents, results):
255
+ print(f"Upload {mime_type}: {result.content[:100]}...")
256
+
257
+
258
+ # Synchronous batch processing is also available
259
+ def process_documents_sync(file_paths: list[Path]) -> None:
260
+ results = batch_extract_file_sync(file_paths)
261
+ for path, result in zip(file_paths, results):
262
+ print(f"File {path}: {result.content[:100]}...")
263
+ ```
264
+
265
+ Features:
266
+
267
+ - Ordered results
268
+ - Concurrent processing
269
+ - Error handling per item
270
+ - Async and sync interfaces
271
+ - Same options as single extraction
272
+
273
+ ### PDF Processing
274
+
275
+ Kreuzberg employs a smart approach to PDF text extraction:
276
+
277
+ 1. **Searchable Text Detection**: First attempts to extract text directly from searchable PDFs using `pypdfium2`.
278
+
279
+ 2. **Text Validation**: Extracted text is validated for corruption by checking for:
280
+
281
+ - Control and non-printable characters
282
+ - Unicode replacement characters (�)
283
+ - Zero-width spaces and other invisible characters
284
+ - Empty or whitespace-only content
285
+
286
+ 3. **Automatic OCR Fallback**: If the extracted text appears corrupted or if the PDF is image-based, automatically falls back to OCR using Tesseract.
287
+
288
+ This approach works well for searchable PDFs and standard text documents. For complex OCR (e.g., handwriting, photographs), use a specialized tool.
289
+
290
+ ### PDF Processing Options
291
+
292
+ You can control PDF processing behavior using optional parameters:
293
+
294
+ ```python
295
+ from kreuzberg import extract_file
296
+
297
+
298
+ async def process_pdf():
299
+ # Default behavior: auto-detect and use OCR if needed
300
+ # By default, max_processes is half the CPU count; lower it if resources are constrained
301
+ result = await extract_file("document.pdf")
302
+ print(result.content)
303
+
304
+ # Force OCR even for searchable PDFs
305
+ result = await extract_file("document.pdf", force_ocr=True)
306
+ print(result.content)
307
+
308
+ # Control OCR concurrency for large documents
309
+ # Warning: High concurrency values can cause system resource exhaustion
310
+ # Start with a low value and increase based on your system's capabilities
311
+ result = await extract_file(
312
+ "large_document.pdf",
313
+ max_processes=4 # Process up to 4 pages concurrently
314
+ )
315
+ print(result.content)
316
+
317
+ # Process a scanned PDF (automatically uses OCR)
318
+ result = await extract_file("scanned.pdf")
319
+ print(result.content)
320
+ ```
321
+
322
+ ### ExtractionResult Object
323
+
324
+ All extraction functions return an `ExtractionResult` or a list thereof (for batch functions). The `ExtractionResult` object is a `NamedTuple`:
325
+
326
+ - `content`: The extracted text (str)
327
+ - `mime_type`: Output format ("text/plain" or "text/markdown" for Pandoc conversions)
328
+ - `metadata`: A metadata dictionary. Currently this dictionary is only populated when extracting documents using Pandoc.
329
+
330
+ ```python
331
+ from kreuzberg import extract_file, ExtractionResult, Metadata
332
+
333
+ async def process_document(path: str) -> tuple[str, str, Metadata]:
334
+ # Access as a named tuple
335
+ result: ExtractionResult = await extract_file(path)
336
+ print(f"Content: {result.content}")
337
+ print(f"Format: {result.mime_type}")
338
+
339
+ # Or unpack as a tuple
340
+ content, mime_type, metadata = await extract_file(path)
341
+ return content, mime_type, metadata
342
+ ```
343
+
344
+ ### Error Handling
345
+
346
+ Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
347
+
348
+ ```python
349
+ from kreuzberg import extract_file
350
+ from kreuzberg.exceptions import (
351
+ ValidationError,
352
+ ParsingError,
353
+ OCRError,
354
+ MissingDependencyError
355
+ )
356
+
357
+ async def safe_extract(path: str) -> str:
358
+ try:
359
+ result = await extract_file(path)
360
+ return result.content
361
+
362
+ except ValidationError as e:
363
+ # Input validation issues
364
+ # - Unsupported or undetectable MIME types
365
+ # - Missing files
366
+ # - Invalid input parameters
367
+ print(f"Validation failed: {e}")
368
+
369
+ except OCRError as e:
370
+ # OCR-specific issues
371
+ # - Tesseract processing failures
372
+ # - Image conversion problems
373
+ print(f"OCR failed: {e}")
374
+
375
+ except MissingDependencyError as e:
376
+ # System dependency issues
377
+ # - Missing Tesseract OCR
378
+ # - Missing Pandoc
379
+ # - Incompatible versions
380
+ print(f"Dependency missing: {e}")
381
+
382
+ except ParsingError as e:
383
+ # General processing errors
384
+ # - PDF parsing failures
385
+ # - Format conversion issues
386
+ # - Encoding problems
387
+ print(f"Processing failed: {e}")
388
+
389
+ return ""
390
+ ```
391
+
392
+ All exceptions include:
393
+
394
+ - Error message
395
+ - Context in the `context` attribute
396
+ - String representation
397
+ - Exception chaining
398
+
399
+ ## Contribution
400
+
401
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before
402
+ submitting PRs to avoid disappointment.
403
+
404
+ ### Local Development
405
+
406
+ 1. Clone the repo
407
+ 2. Install the system dependencies
408
+ 3. Install the full dependencies with `uv sync`
409
+ 4. Install the pre-commit hooks with:
410
+
411
+ ```shell
412
+ pre-commit install && pre-commit install --hook-type commit-msg
413
+ ```
414
+
415
+ 5. Make your changes and submit a PR
416
+
417
+ ## License
418
+
419
+ This library uses the MIT license.