kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_config.py +404 -0
- kreuzberg/_entity_extraction.py +4 -5
- kreuzberg/_extractors/_base.py +3 -5
- kreuzberg/_extractors/_image.py +18 -32
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +39 -57
- kreuzberg/_extractors/_spread_sheet.py +2 -3
- kreuzberg/_extractors/_structured.py +10 -7
- kreuzberg/_gmft.py +314 -10
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_ocr/__init__.py +1 -22
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +90 -1
- kreuzberg/_ocr/_tesseract.py +556 -5
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +46 -24
- kreuzberg/_utils/_cache.py +35 -4
- kreuzberg/_utils/_device.py +10 -20
- kreuzberg/_utils/_errors.py +44 -45
- kreuzberg/_utils/_process_pool.py +2 -6
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +4 -5
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +6 -24
- kreuzberg-3.8.2.dist-info/METADATA +265 -0
- kreuzberg-3.8.2.dist-info/RECORD +53 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg/_multiprocessing/__init__.py +0 -5
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_ocr/_pool.py +0 -357
- kreuzberg/_ocr/_sync.py +0 -566
- kreuzberg-3.8.0.dist-info/METADATA +0 -313
- kreuzberg-3.8.0.dist-info/RECORD +0 -57
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/exceptions.py
CHANGED
@@ -7,6 +7,8 @@ from typing import Any
|
|
7
7
|
class KreuzbergError(Exception):
|
8
8
|
"""Base exception for all Kreuzberg errors."""
|
9
9
|
|
10
|
+
__slots__ = ("context",)
|
11
|
+
|
10
12
|
context: Any
|
11
13
|
"""The context of the error."""
|
12
14
|
|
@@ -43,14 +45,20 @@ class KreuzbergError(Exception):
|
|
43
45
|
class ParsingError(KreuzbergError):
|
44
46
|
"""Raised when a parsing error occurs."""
|
45
47
|
|
48
|
+
__slots__ = ()
|
49
|
+
|
46
50
|
|
47
51
|
class ValidationError(KreuzbergError):
|
48
52
|
"""Raised when a validation error occurs."""
|
49
53
|
|
54
|
+
__slots__ = ()
|
55
|
+
|
50
56
|
|
51
57
|
class MissingDependencyError(KreuzbergError):
|
52
58
|
"""Raised when a dependency is missing."""
|
53
59
|
|
60
|
+
__slots__ = ()
|
61
|
+
|
54
62
|
@classmethod
|
55
63
|
def create_for_package(
|
56
64
|
cls, *, dependency_group: str, functionality: str, package_name: str
|
@@ -79,3 +87,5 @@ class MissingDependencyError(KreuzbergError):
|
|
79
87
|
|
80
88
|
class OCRError(KreuzbergError):
|
81
89
|
"""Raised when an OCR error occurs."""
|
90
|
+
|
91
|
+
__slots__ = ()
|
kreuzberg/extraction.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import multiprocessing as mp
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import TYPE_CHECKING, Any, Final, cast
|
5
7
|
|
@@ -14,6 +16,8 @@ from kreuzberg._mime_types import (
|
|
14
16
|
)
|
15
17
|
from kreuzberg._registry import ExtractorRegistry
|
16
18
|
from kreuzberg._types import ExtractionConfig
|
19
|
+
from kreuzberg._utils._document_cache import get_document_cache
|
20
|
+
from kreuzberg._utils._errors import create_error_context
|
17
21
|
from kreuzberg._utils._string import safe_decode
|
18
22
|
from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
|
19
23
|
from kreuzberg.exceptions import ValidationError
|
@@ -136,8 +140,6 @@ async def extract_file(
|
|
136
140
|
Raises:
|
137
141
|
ValidationError: If the file path or configuration is invalid.
|
138
142
|
"""
|
139
|
-
from kreuzberg._utils._document_cache import get_document_cache
|
140
|
-
|
141
143
|
cache = get_document_cache()
|
142
144
|
path = Path(file_path)
|
143
145
|
cached_result = cache.get(path, config)
|
@@ -194,8 +196,6 @@ async def batch_extract_file(
|
|
194
196
|
if not file_paths:
|
195
197
|
return []
|
196
198
|
|
197
|
-
import multiprocessing as mp
|
198
|
-
|
199
199
|
max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
|
200
200
|
semaphore = anyio.Semaphore(max_concurrency)
|
201
201
|
|
@@ -211,8 +211,6 @@ async def batch_extract_file(
|
|
211
211
|
)
|
212
212
|
results[index] = result
|
213
213
|
except Exception as e: # noqa: BLE001
|
214
|
-
from kreuzberg._utils._errors import create_error_context
|
215
|
-
|
216
214
|
error_result = ExtractionResult(
|
217
215
|
content=f"Error: {type(e).__name__}: {e!s}",
|
218
216
|
mime_type="text/plain",
|
@@ -251,8 +249,6 @@ async def batch_extract_bytes(
|
|
251
249
|
if not contents:
|
252
250
|
return []
|
253
251
|
|
254
|
-
import multiprocessing as mp
|
255
|
-
|
256
252
|
max_concurrency = min(len(contents), mp.cpu_count() * 2)
|
257
253
|
semaphore = anyio.Semaphore(max_concurrency)
|
258
254
|
|
@@ -264,8 +260,6 @@ async def batch_extract_bytes(
|
|
264
260
|
result = await extract_bytes(content, mime_type, config)
|
265
261
|
results[index] = result
|
266
262
|
except Exception as e: # noqa: BLE001
|
267
|
-
from kreuzberg._utils._errors import create_error_context
|
268
|
-
|
269
263
|
error_result = ExtractionResult(
|
270
264
|
content=f"Error: {type(e).__name__}: {e!s}",
|
271
265
|
mime_type="text/plain",
|
@@ -331,8 +325,6 @@ def extract_file_sync(
|
|
331
325
|
Raises:
|
332
326
|
ValidationError: If the file path or configuration is invalid.
|
333
327
|
"""
|
334
|
-
from kreuzberg._utils._document_cache import get_document_cache
|
335
|
-
|
336
328
|
cache = get_document_cache()
|
337
329
|
path = Path(file_path)
|
338
330
|
cached_result = cache.get(path, config)
|
@@ -389,9 +381,6 @@ def batch_extract_file_sync(
|
|
389
381
|
if len(file_paths) <= 1:
|
390
382
|
return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
|
391
383
|
|
392
|
-
import multiprocessing as mp
|
393
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
394
|
-
|
395
384
|
max_workers = min(len(file_paths), mp.cpu_count())
|
396
385
|
|
397
386
|
def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
|
@@ -402,8 +391,6 @@ def batch_extract_file_sync(
|
|
402
391
|
extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
|
403
392
|
)
|
404
393
|
except Exception as e: # noqa: BLE001
|
405
|
-
from kreuzberg._utils._errors import create_error_context
|
406
|
-
|
407
394
|
error_result = ExtractionResult(
|
408
395
|
content=f"Error: {type(e).__name__}: {e!s}",
|
409
396
|
mime_type="text/plain",
|
@@ -447,9 +434,6 @@ def batch_extract_bytes_sync(
|
|
447
434
|
extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
|
448
435
|
]
|
449
436
|
|
450
|
-
import multiprocessing as mp
|
451
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
452
|
-
|
453
437
|
max_workers = min(len(contents), mp.cpu_count())
|
454
438
|
|
455
439
|
def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
|
@@ -458,8 +442,6 @@ def batch_extract_bytes_sync(
|
|
458
442
|
try:
|
459
443
|
return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
|
460
444
|
except Exception as e: # noqa: BLE001
|
461
|
-
from kreuzberg._utils._errors import create_error_context
|
462
|
-
|
463
445
|
error_result = ExtractionResult(
|
464
446
|
content=f"Error: {type(e).__name__}: {e!s}",
|
465
447
|
mime_type="text/plain",
|
@@ -478,8 +460,8 @@ def batch_extract_bytes_sync(
|
|
478
460
|
return (index, error_result)
|
479
461
|
|
480
462
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
481
|
-
|
482
|
-
future_to_index = {executor.submit(extract_single,
|
463
|
+
# Avoid creating intermediate list, use enumerate directly
|
464
|
+
future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
|
483
465
|
|
484
466
|
results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
|
485
467
|
for future in as_completed(future_to_index):
|
@@ -0,0 +1,265 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.8.2
|
4
|
+
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
|
+
Project-URL: documentation, https://kreuzberg.dev
|
6
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
7
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
|
+
License: MIT
|
9
|
+
License-File: LICENSE
|
10
|
+
Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
22
|
+
Classifier: Topic :: Database
|
23
|
+
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
24
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Classifier: Topic :: Text Processing :: General
|
29
|
+
Classifier: Typing :: Typed
|
30
|
+
Requires-Python: >=3.10
|
31
|
+
Requires-Dist: anyio>=4.9.0
|
32
|
+
Requires-Dist: chardetng-py>=0.3.4
|
33
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
35
|
+
Requires-Dist: mcp>=1.11.0
|
36
|
+
Requires-Dist: msgspec>=0.18.0
|
37
|
+
Requires-Dist: playa-pdf>=0.6.1
|
38
|
+
Requires-Dist: psutil>=7.0.0
|
39
|
+
Requires-Dist: pypdfium2==4.30.0
|
40
|
+
Requires-Dist: python-calamine>=0.3.2
|
41
|
+
Requires-Dist: python-pptx>=1.0.2
|
42
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
43
|
+
Provides-Extra: additional-extensions
|
44
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
45
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
|
+
Provides-Extra: all
|
47
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
51
|
+
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
52
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
53
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
57
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
59
|
+
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
60
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
61
|
+
Provides-Extra: api
|
62
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
63
|
+
Provides-Extra: chunking
|
64
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
65
|
+
Provides-Extra: cli
|
66
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
67
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
68
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
69
|
+
Provides-Extra: easyocr
|
70
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
71
|
+
Provides-Extra: entity-extraction
|
72
|
+
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
73
|
+
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
74
|
+
Provides-Extra: gmft
|
75
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
76
|
+
Provides-Extra: langdetect
|
77
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
78
|
+
Provides-Extra: paddleocr
|
79
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
80
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
81
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
82
|
+
Description-Content-Type: text/markdown
|
83
|
+
|
84
|
+
# Kreuzberg
|
85
|
+
|
86
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
87
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
88
|
+
[Documentation](https://kreuzberg.dev/)
|
89
|
+
[Benchmarks](https://benchmarks.kreuzberg.dev/)
|
90
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
91
|
+
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
92
|
+
|
93
|
+
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
94
|
+
|
95
|
+
📖 **[Complete Documentation](https://kreuzberg.dev/)**
|
96
|
+
|
97
|
+
## Framework Overview
|
98
|
+
|
99
|
+
### Document Intelligence Capabilities
|
100
|
+
|
101
|
+
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
102
|
+
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
103
|
+
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
104
|
+
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
105
|
+
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
106
|
+
|
107
|
+
### Technical Architecture
|
108
|
+
|
109
|
+
- **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
|
110
|
+
- **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
|
111
|
+
- **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
|
112
|
+
- **API Design**: Synchronous and asynchronous APIs with consistent interfaces
|
113
|
+
- **Type Safety**: Complete type annotations throughout the codebase
|
114
|
+
|
115
|
+
### Open Source Foundation
|
116
|
+
|
117
|
+
Kreuzberg leverages established open source technologies:
|
118
|
+
|
119
|
+
- **Pandoc**: Universal document converter for robust format support
|
120
|
+
- **PDFium**: Google's PDF rendering engine for accurate PDF processing
|
121
|
+
- **Tesseract**: Google's OCR engine for text recognition
|
122
|
+
- **Python-docx/pptx**: Native Microsoft Office format support
|
123
|
+
|
124
|
+
## Quick Start
|
125
|
+
|
126
|
+
### Extract Text with CLI
|
127
|
+
|
128
|
+
```bash
|
129
|
+
# Extract text from any file to markdown
|
130
|
+
uvx kreuzberg extract document.pdf > output.md
|
131
|
+
|
132
|
+
# With all features (OCR, table extraction, etc.)
|
133
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
|
134
|
+
|
135
|
+
# Extract with rich metadata
|
136
|
+
uvx kreuzberg extract report.pdf --show-metadata --format json
|
137
|
+
```
|
138
|
+
|
139
|
+
### Python Usage
|
140
|
+
|
141
|
+
**Async (recommended for web apps):**
|
142
|
+
|
143
|
+
```python
|
144
|
+
from kreuzberg import extract_file
|
145
|
+
|
146
|
+
# In your async function
|
147
|
+
result = await extract_file("presentation.pptx")
|
148
|
+
print(result.content)
|
149
|
+
|
150
|
+
# Rich metadata extraction
|
151
|
+
print(f"Title: {result.metadata.title}")
|
152
|
+
print(f"Author: {result.metadata.author}")
|
153
|
+
print(f"Page count: {result.metadata.page_count}")
|
154
|
+
print(f"Created: {result.metadata.created_at}")
|
155
|
+
```
|
156
|
+
|
157
|
+
**Sync (for scripts and CLI tools):**
|
158
|
+
|
159
|
+
```python
|
160
|
+
from kreuzberg import extract_file_sync
|
161
|
+
|
162
|
+
result = extract_file_sync("report.docx")
|
163
|
+
print(result.content)
|
164
|
+
|
165
|
+
# Access rich metadata
|
166
|
+
print(f"Language: {result.metadata.language}")
|
167
|
+
print(f"Word count: {result.metadata.word_count}")
|
168
|
+
print(f"Keywords: {result.metadata.keywords}")
|
169
|
+
```
|
170
|
+
|
171
|
+
### Docker
|
172
|
+
|
173
|
+
```bash
|
174
|
+
# Run the REST API
|
175
|
+
docker run -p 8000:8000 goldziher/kreuzberg
|
176
|
+
|
177
|
+
# Extract via API
|
178
|
+
curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
|
179
|
+
```
|
180
|
+
|
181
|
+
📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
|
182
|
+
|
183
|
+
## Deployment Options
|
184
|
+
|
185
|
+
### 🤖 MCP Server (AI Integration)
|
186
|
+
|
187
|
+
**Add to Claude Desktop with one command:**
|
188
|
+
|
189
|
+
```bash
|
190
|
+
claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
|
191
|
+
```
|
192
|
+
|
193
|
+
**Or configure manually in `claude_desktop_config.json`:**
|
194
|
+
|
195
|
+
```json
|
196
|
+
{
|
197
|
+
"mcpServers": {
|
198
|
+
"kreuzberg": {
|
199
|
+
"command": "uvx",
|
200
|
+
"args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
|
201
|
+
}
|
202
|
+
}
|
203
|
+
}
|
204
|
+
```
|
205
|
+
|
206
|
+
**MCP capabilities:**
|
207
|
+
|
208
|
+
- Extract text from PDFs, images, Office docs, and more
|
209
|
+
- Full OCR support with multiple engines
|
210
|
+
- Table extraction and metadata parsing
|
211
|
+
|
212
|
+
📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
|
213
|
+
|
214
|
+
## Supported Formats
|
215
|
+
|
216
|
+
| Category | Formats |
|
217
|
+
| ----------------- | ------------------------------ |
|
218
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
219
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
220
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
221
|
+
| **Presentations** | PPTX, PPT, ODP |
|
222
|
+
| **Web** | HTML, XML, MHTML |
|
223
|
+
| **Archives** | Support via extraction |
|
224
|
+
|
225
|
+
## 📊 Performance Characteristics
|
226
|
+
|
227
|
+
[View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
|
228
|
+
|
229
|
+
### Technical Specifications
|
230
|
+
|
231
|
+
| Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
|
232
|
+
| ---------------------------- | -------------- | --------------- | ------------------ |
|
233
|
+
| **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
|
234
|
+
| **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
|
235
|
+
| **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
|
236
|
+
| **Installation size** | 71 MB | 71 MB | Smallest size |
|
237
|
+
| **Success rate** | 100% | 100% | Perfect |
|
238
|
+
| **Supported formats** | 18 | 18 | Comprehensive |
|
239
|
+
|
240
|
+
### Architecture Advantages
|
241
|
+
|
242
|
+
- **Native C extensions**: Built on PDFium and Tesseract for maximum performance
|
243
|
+
- **Async/await support**: True asynchronous processing with intelligent task scheduling
|
244
|
+
- **Memory efficiency**: Streaming architecture minimizes memory allocation
|
245
|
+
- **Process pooling**: Automatic multiprocessing for CPU-intensive operations
|
246
|
+
- **Optimized data flow**: Efficient data handling with minimal transformations
|
247
|
+
|
248
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
249
|
+
|
250
|
+
## Documentation
|
251
|
+
|
252
|
+
### Quick Links
|
253
|
+
|
254
|
+
- [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
|
255
|
+
- [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
|
256
|
+
- [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
|
257
|
+
- [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
|
258
|
+
- [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
|
259
|
+
- [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
|
260
|
+
- [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
|
261
|
+
- [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
|
262
|
+
|
263
|
+
## License
|
264
|
+
|
265
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
@@ -0,0 +1,53 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
|
2
|
+
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
|
+
kreuzberg/_config.py,sha256=_9JU88ChId8dWUjZ13ueo9_JoFekkyzuv7rZpFkrPZk,12966
|
5
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
+
kreuzberg/_entity_extraction.py,sha256=woNxARG27Z3T_l6w6N-dbt1PPe1IHptFMOZY_6etv54,7819
|
7
|
+
kreuzberg/_gmft.py,sha256=Q46CyBxRxY_oDGpSuXMOJ7qfR9LwuCKXnrl60wcPvU4,25286
|
8
|
+
kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
|
9
|
+
kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
|
10
|
+
kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
|
11
|
+
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
12
|
+
kreuzberg/_types.py,sha256=GisvL0ps2LCc0heKopFwSyrEbzH3WpDxaeev4vn59X4,14257
|
13
|
+
kreuzberg/cli.py,sha256=vTGS2TJlFTNMWp5LwZd3G2SS8u0m6bhQkH9n6a1oOoM,12439
|
14
|
+
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
15
|
+
kreuzberg/extraction.py,sha256=UmeEVN-eSile4HMxP0iqG9092BrsH5_zSZNVHhwy0ko,16993
|
16
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
|
19
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
|
21
|
+
kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
|
22
|
+
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
23
|
+
kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
|
24
|
+
kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
|
25
|
+
kreuzberg/_extractors/_pdf.py,sha256=d-hG_mhAMj22bQ35YuP2nq017z27_2Pp08r1qyHxlYI,16676
|
26
|
+
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
27
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=vPxEDAyH-gDoVXSg-A0guOjOfaWIuRI3i2NU8xPwhK8,13695
|
28
|
+
kreuzberg/_extractors/_structured.py,sha256=d0x6EyRimr8eWmr1qPb7HRWnrbKBuD-GpIrZd8XJp0o,5824
|
29
|
+
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
30
|
+
kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
|
31
|
+
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
32
|
+
kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
|
33
|
+
kreuzberg/_ocr/_easyocr.py,sha256=c2ndpDlIHvAI2WyvQUXLQ1hb6XynKeKARsXQcQ3ntJ0,17110
|
34
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=fab8a-3cvDgnt97qF-Km9ZfmkacFeKD_g15O8HXYRVc,17492
|
35
|
+
kreuzberg/_ocr/_tesseract.py,sha256=r1g_PCAXgJbZ0RPGn4aSxctZ0F9lLvI3zLGLEPAnviI,31455
|
36
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
kreuzberg/_utils/_cache.py,sha256=H2d6JOiTTAoJx5HPJoToCk4ik-ztTRNEJRrHgcSUTLs,15249
|
38
|
+
kreuzberg/_utils/_device.py,sha256=PC8YUPE95pzOyU7sU_icqNZpSfi6HZlEFfmWcV1Uees,10226
|
39
|
+
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
40
|
+
kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
|
41
|
+
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
42
|
+
kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
|
43
|
+
kreuzberg/_utils/_quality.py,sha256=-nKzj5n7yJDYrvl556oq2T5S5oKMEOrjpcRMlZ00Jqo,7668
|
44
|
+
kreuzberg/_utils/_serialization.py,sha256=cqqxqN2cmtndBhIr4v2wqiMwnNadnKhvuN7EUj3i18M,2290
|
45
|
+
kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6800
|
46
|
+
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
47
|
+
kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
|
48
|
+
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
49
|
+
kreuzberg-3.8.2.dist-info/METADATA,sha256=RiP64og5wOaf9gPZ7CwOsNYYx9GBnVMg8orgqZdncKA,11466
|
50
|
+
kreuzberg-3.8.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
51
|
+
kreuzberg-3.8.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
52
|
+
kreuzberg-3.8.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
53
|
+
kreuzberg-3.8.2.dist-info/RECORD,,
|
kreuzberg/_cli_config.py
DELETED
@@ -1,175 +0,0 @@
|
|
1
|
-
"""Configuration parsing for the CLI."""
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
import sys
|
6
|
-
from pathlib import Path
|
7
|
-
from typing import TYPE_CHECKING, Any
|
8
|
-
|
9
|
-
if sys.version_info >= (3, 11):
|
10
|
-
import tomllib
|
11
|
-
else:
|
12
|
-
import tomli as tomllib # type: ignore[import-not-found]
|
13
|
-
|
14
|
-
from kreuzberg._gmft import GMFTConfig
|
15
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
16
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
17
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
18
|
-
from kreuzberg._types import ExtractionConfig, OcrBackendType
|
19
|
-
from kreuzberg.exceptions import ValidationError
|
20
|
-
|
21
|
-
if TYPE_CHECKING:
|
22
|
-
from collections.abc import MutableMapping
|
23
|
-
|
24
|
-
|
25
|
-
def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
26
|
-
"""Load configuration from a TOML file.
|
27
|
-
|
28
|
-
Args:
|
29
|
-
config_path: Path to the configuration file.
|
30
|
-
|
31
|
-
Returns:
|
32
|
-
Dictionary containing the loaded configuration.
|
33
|
-
|
34
|
-
Raises:
|
35
|
-
ValidationError: If the file cannot be read or parsed.
|
36
|
-
"""
|
37
|
-
try:
|
38
|
-
with config_path.open("rb") as f:
|
39
|
-
data = tomllib.load(f)
|
40
|
-
except FileNotFoundError as e:
|
41
|
-
raise ValidationError(f"Configuration file not found: {config_path}") from e
|
42
|
-
except tomllib.TOMLDecodeError as e:
|
43
|
-
raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
|
44
|
-
|
45
|
-
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
46
|
-
|
47
|
-
|
48
|
-
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
49
|
-
"""Merge two configuration dictionaries recursively.
|
50
|
-
|
51
|
-
Args:
|
52
|
-
base: Base configuration dictionary.
|
53
|
-
override: Configuration dictionary to override base values.
|
54
|
-
|
55
|
-
Returns:
|
56
|
-
Merged configuration dictionary.
|
57
|
-
"""
|
58
|
-
result = base.copy()
|
59
|
-
for key, value in override.items():
|
60
|
-
if isinstance(value, dict) and key in result and isinstance(result[key], dict):
|
61
|
-
result[key] = merge_configs(result[key], value)
|
62
|
-
else:
|
63
|
-
result[key] = value
|
64
|
-
return result
|
65
|
-
|
66
|
-
|
67
|
-
def parse_ocr_backend_config(
|
68
|
-
config_dict: dict[str, Any], backend: OcrBackendType
|
69
|
-
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
70
|
-
"""Parse OCR backend-specific configuration.
|
71
|
-
|
72
|
-
Args:
|
73
|
-
config_dict: Configuration dictionary.
|
74
|
-
backend: The OCR backend type.
|
75
|
-
|
76
|
-
Returns:
|
77
|
-
Backend-specific configuration object or None.
|
78
|
-
"""
|
79
|
-
if backend not in config_dict:
|
80
|
-
return None
|
81
|
-
|
82
|
-
backend_config = config_dict[backend]
|
83
|
-
if not isinstance(backend_config, dict):
|
84
|
-
return None
|
85
|
-
|
86
|
-
if backend == "tesseract":
|
87
|
-
return TesseractConfig(**backend_config)
|
88
|
-
if backend == "easyocr":
|
89
|
-
return EasyOCRConfig(**backend_config)
|
90
|
-
if backend == "paddleocr":
|
91
|
-
return PaddleOCRConfig(**backend_config)
|
92
|
-
return None
|
93
|
-
|
94
|
-
|
95
|
-
def build_extraction_config( # noqa: C901, PLR0912
|
96
|
-
file_config: dict[str, Any],
|
97
|
-
cli_args: MutableMapping[str, Any],
|
98
|
-
) -> ExtractionConfig:
|
99
|
-
"""Build ExtractionConfig from file config and CLI arguments.
|
100
|
-
|
101
|
-
Args:
|
102
|
-
file_config: Configuration loaded from file.
|
103
|
-
cli_args: CLI arguments.
|
104
|
-
|
105
|
-
Returns:
|
106
|
-
ExtractionConfig instance.
|
107
|
-
"""
|
108
|
-
config_dict: dict[str, Any] = {}
|
109
|
-
|
110
|
-
if file_config:
|
111
|
-
for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
|
112
|
-
if field in file_config:
|
113
|
-
config_dict[field] = file_config[field]
|
114
|
-
|
115
|
-
for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
|
116
|
-
cli_key = field
|
117
|
-
if cli_key in cli_args and cli_args[cli_key] is not None:
|
118
|
-
config_dict[field] = cli_args[cli_key]
|
119
|
-
|
120
|
-
ocr_backend = config_dict.get("ocr_backend")
|
121
|
-
if ocr_backend and ocr_backend != "none":
|
122
|
-
ocr_config = None
|
123
|
-
|
124
|
-
if cli_args.get(f"{ocr_backend}_config"):
|
125
|
-
backend_args = cli_args[f"{ocr_backend}_config"]
|
126
|
-
if ocr_backend == "tesseract":
|
127
|
-
ocr_config = TesseractConfig(**backend_args)
|
128
|
-
elif ocr_backend == "easyocr":
|
129
|
-
ocr_config = EasyOCRConfig(**backend_args) # type: ignore[assignment]
|
130
|
-
elif ocr_backend == "paddleocr":
|
131
|
-
ocr_config = PaddleOCRConfig(**backend_args) # type: ignore[assignment]
|
132
|
-
|
133
|
-
if not ocr_config and file_config:
|
134
|
-
ocr_config = parse_ocr_backend_config(file_config, ocr_backend) # type: ignore[assignment]
|
135
|
-
|
136
|
-
if ocr_config:
|
137
|
-
config_dict["ocr_config"] = ocr_config
|
138
|
-
|
139
|
-
if config_dict.get("extract_tables"):
|
140
|
-
gmft_config = None
|
141
|
-
|
142
|
-
if cli_args.get("gmft_config"):
|
143
|
-
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
144
|
-
|
145
|
-
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
146
|
-
gmft_config = GMFTConfig(**file_config["gmft"])
|
147
|
-
|
148
|
-
if gmft_config:
|
149
|
-
config_dict["gmft_config"] = gmft_config
|
150
|
-
|
151
|
-
if config_dict.get("ocr_backend") == "none":
|
152
|
-
config_dict["ocr_backend"] = None
|
153
|
-
|
154
|
-
return ExtractionConfig(**config_dict)
|
155
|
-
|
156
|
-
|
157
|
-
def find_default_config() -> Path | None:
|
158
|
-
"""Find the default configuration file (pyproject.toml).
|
159
|
-
|
160
|
-
Returns:
|
161
|
-
Path to the configuration file or None if not found.
|
162
|
-
"""
|
163
|
-
current = Path.cwd()
|
164
|
-
while current != current.parent:
|
165
|
-
config_path = current / "pyproject.toml"
|
166
|
-
if config_path.exists():
|
167
|
-
try:
|
168
|
-
with config_path.open("rb") as f:
|
169
|
-
data = tomllib.load(f)
|
170
|
-
if "tool" in data and "kreuzberg" in data["tool"]:
|
171
|
-
return config_path
|
172
|
-
except Exception: # noqa: BLE001
|
173
|
-
pass
|
174
|
-
current = current.parent
|
175
|
-
return None
|