kreuzberg 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_config.py +404 -0
- kreuzberg/_entity_extraction.py +3 -3
- kreuzberg/_extractors/_pdf.py +22 -19
- kreuzberg/_extractors/_spread_sheet.py +2 -3
- kreuzberg/_extractors/_structured.py +10 -7
- kreuzberg/_gmft.py +8 -11
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_ocr/_easyocr.py +1 -1
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +2 -7
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +46 -24
- kreuzberg/_utils/_cache.py +15 -17
- kreuzberg/_utils/_device.py +10 -20
- kreuzberg/_utils/_errors.py +41 -38
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +3 -3
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +2 -2
- kreuzberg-3.8.2.dist-info/METADATA +265 -0
- kreuzberg-3.8.2.dist-info/RECORD +53 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg-3.8.1.dist-info/METADATA +0 -301
- kreuzberg-3.8.1.dist-info/RECORD +0 -53
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_quality.py
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
5
|
import re
|
6
|
+
from functools import reduce
|
6
7
|
from typing import Any
|
7
8
|
|
8
9
|
# Pre-compiled patterns for performance
|
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
|
|
102
103
|
if not text:
|
103
104
|
return text
|
104
105
|
|
105
|
-
# Remove script and style content
|
106
|
-
|
107
|
-
text = pattern.sub(" ", text)
|
106
|
+
# Remove script and style content by applying each pattern in turn with functools.reduce
|
107
|
+
text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
|
108
108
|
|
109
109
|
# Clean OCR artifacts
|
110
110
|
text = _clean_ocr_artifacts(text)
|
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
|
|
134
134
|
if total_chars == 0:
|
135
135
|
return 0.0
|
136
136
|
|
137
|
-
|
138
|
-
for pattern in _SCRIPT_PATTERNS.values()
|
139
|
-
matches = pattern.findall(text)
|
140
|
-
script_chars += sum(len(match) for match in matches)
|
137
|
+
# Sum the lengths of all matches across every script pattern in one expression
|
138
|
+
script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
|
141
139
|
|
142
140
|
return min(1.0, script_chars / total_chars)
|
143
141
|
|
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
|
|
147
145
|
if total_chars == 0:
|
148
146
|
return 0.0
|
149
147
|
|
150
|
-
|
151
|
-
for pattern in _NAVIGATION_PATTERNS.values()
|
152
|
-
matches = pattern.findall(text)
|
153
|
-
nav_chars += sum(len(match) for match in matches)
|
148
|
+
# Sum the lengths of all matches across every navigation pattern in one expression
|
149
|
+
nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
|
154
150
|
|
155
151
|
return min(1.0, nav_chars / total_chars)
|
156
152
|
|
@@ -2,16 +2,28 @@
|
|
2
2
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
|
-
from dataclasses import
|
6
|
-
from enum import Enum
|
5
|
+
from dataclasses import is_dataclass
|
7
6
|
from typing import Any, TypeVar, cast
|
8
7
|
|
8
|
+
import msgspec
|
9
9
|
from msgspec import MsgspecError
|
10
10
|
from msgspec.msgpack import decode, encode
|
11
11
|
|
12
12
|
T = TypeVar("T")
|
13
13
|
|
14
14
|
|
15
|
+
# Define dict method names in priority order
|
16
|
+
_DICT_METHOD_NAMES = (
|
17
|
+
"to_dict",
|
18
|
+
"as_dict",
|
19
|
+
"dict",
|
20
|
+
"model_dump",
|
21
|
+
"json",
|
22
|
+
"to_list",
|
23
|
+
"tolist",
|
24
|
+
)
|
25
|
+
|
26
|
+
|
15
27
|
def encode_hook(obj: Any) -> Any:
|
16
28
|
"""Custom encoder for complex objects."""
|
17
29
|
if callable(obj):
|
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
|
|
20
32
|
if isinstance(obj, Exception):
|
21
33
|
return {"message": str(obj), "type": type(obj).__name__}
|
22
34
|
|
23
|
-
for
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
"json",
|
29
|
-
"to_list",
|
30
|
-
"tolist",
|
31
|
-
):
|
32
|
-
if hasattr(obj, key):
|
33
|
-
method = getattr(obj, key) # Cache the attribute lookup
|
34
|
-
if callable(method):
|
35
|
-
return method()
|
35
|
+
# Call the first callable conversion method found, checking names in priority order
|
36
|
+
for attr_name in _DICT_METHOD_NAMES:
|
37
|
+
method = getattr(obj, attr_name, None)
|
38
|
+
if method is not None and callable(method):
|
39
|
+
return method()
|
36
40
|
|
37
41
|
if is_dataclass(obj) and not isinstance(obj, type):
|
38
|
-
|
42
|
+
# Use msgspec.to_builtins for more efficient conversion
|
43
|
+
return msgspec.to_builtins(obj)
|
39
44
|
|
40
45
|
if hasattr(obj, "save") and hasattr(obj, "format"):
|
41
46
|
return None
|
kreuzberg/_utils/_string.py
CHANGED
@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
|
|
28
28
|
@lru_cache(maxsize=128)
|
29
29
|
def _get_encoding_cache_key(data_hash: str, size: int) -> str:
|
30
30
|
"""Generate cache key for encoding detection."""
|
31
|
+
# The key is simply the data hash and size joined by a colon, built with an f-string
|
31
32
|
return f"{data_hash}:{size}"
|
32
33
|
|
33
34
|
|
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
|
|
104
105
|
if not text:
|
105
106
|
return 0.0
|
106
107
|
|
107
|
-
# Check for common encoding problems
|
108
|
-
replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
|
109
|
-
control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
|
110
108
|
total_chars = len(text)
|
111
|
-
|
112
109
|
if total_chars == 0:
|
113
110
|
return 0.0
|
114
111
|
|
112
|
+
# Check for common encoding problems - compile patterns once
|
113
|
+
replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
|
114
|
+
control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
|
115
|
+
|
115
116
|
# Penalize replacement and control characters
|
116
117
|
penalty = (replacement_count + control_count * 2) / total_chars
|
117
118
|
|
118
|
-
# Bonus for readable character ranges
|
119
|
+
# Bonus for readable character ranges - more efficient counting
|
120
|
+
# Count with a generator expression; every character in the text is examined
|
119
121
|
readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
120
122
|
readability_score = readable_chars / total_chars
|
121
123
|
|
122
124
|
# Check for suspicious Cyrillic that might be misencoded Hebrew
|
123
125
|
cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
|
124
|
-
if cyrillic_matches
|
125
|
-
|
126
|
+
if cyrillic_matches:
|
127
|
+
# Calculate total length more efficiently
|
128
|
+
cyrillic_length = sum(len(match) for match in cyrillic_matches)
|
129
|
+
if cyrillic_length > total_chars * 0.1:
|
130
|
+
penalty += 0.3 # Heavy penalty for likely mojibake
|
126
131
|
|
127
132
|
return max(0.0, min(1.0, readability_score - penalty))
|
128
133
|
|
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
|
|
164
169
|
|
165
170
|
# Split by double newlines to preserve paragraph breaks
|
166
171
|
paragraphs = text.split("\n\n")
|
167
|
-
|
172
|
+
|
173
|
+
result_paragraphs = []
|
168
174
|
|
169
175
|
for paragraph in paragraphs:
|
170
176
|
# Use pre-compiled patterns for better performance
|
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
|
|
173
179
|
# Clean up multiple newlines within paragraph (keep single newlines)
|
174
180
|
cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
|
175
181
|
|
176
|
-
#
|
177
|
-
lines = [
|
182
|
+
# Process lines efficiently - manual loop avoids double strip() calls
|
183
|
+
lines = []
|
184
|
+
for line in cleaned.split("\n"):
|
185
|
+
stripped_line = line.strip()
|
186
|
+
if stripped_line:
|
187
|
+
lines.append(stripped_line)
|
178
188
|
|
179
189
|
if lines:
|
180
|
-
|
190
|
+
result_paragraphs.append("\n".join(lines))
|
181
191
|
|
182
|
-
return "\n\n".join(
|
192
|
+
return "\n\n".join(result_paragraphs)
|
kreuzberg/_utils/_table.py
CHANGED
@@ -3,7 +3,6 @@
|
|
3
3
|
from __future__ import annotations
|
4
4
|
|
5
5
|
import csv
|
6
|
-
from io import StringIO
|
7
6
|
from typing import TYPE_CHECKING, Any
|
8
7
|
|
9
8
|
if TYPE_CHECKING:
|
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
|
|
23
22
|
if "df" not in table or table["df"] is None:
|
24
23
|
return ""
|
25
24
|
|
26
|
-
|
27
|
-
table["df"].to_csv(
|
28
|
-
return
|
25
|
+
# Use pandas to_csv() direct string return instead of StringIO
|
26
|
+
csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
|
27
|
+
return str(csv_output).strip()
|
29
28
|
|
30
29
|
|
31
30
|
def export_table_to_tsv(table: TableData) -> str:
|
kreuzberg/cli.py
CHANGED
@@ -18,7 +18,7 @@ except ImportError as e:
|
|
18
18
|
) from e
|
19
19
|
|
20
20
|
from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
|
21
|
-
from kreuzberg.
|
21
|
+
from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
|
22
22
|
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
23
23
|
|
24
24
|
DEFAULT_MAX_CHARACTERS = 4000
|
@@ -92,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
|
|
92
92
|
if config:
|
93
93
|
file_config = load_config_from_file(config)
|
94
94
|
else:
|
95
|
-
default_config =
|
95
|
+
default_config = find_config_file()
|
96
96
|
if default_config:
|
97
97
|
try:
|
98
98
|
file_config = load_config_from_file(default_config)
|
@@ -314,7 +314,7 @@ def extract( # noqa: PLR0913
|
|
314
314
|
def config(config: Path | None) -> None:
|
315
315
|
"""Show current configuration."""
|
316
316
|
try:
|
317
|
-
config_path = config or
|
317
|
+
config_path = config or find_config_file()
|
318
318
|
|
319
319
|
if config_path:
|
320
320
|
file_config = load_config_from_file(config_path)
|
kreuzberg/exceptions.py
CHANGED
@@ -7,6 +7,8 @@ from typing import Any
|
|
7
7
|
class KreuzbergError(Exception):
|
8
8
|
"""Base exception for all Kreuzberg errors."""
|
9
9
|
|
10
|
+
__slots__ = ("context",)
|
11
|
+
|
10
12
|
context: Any
|
11
13
|
"""The context of the error."""
|
12
14
|
|
@@ -43,14 +45,20 @@ class KreuzbergError(Exception):
|
|
43
45
|
class ParsingError(KreuzbergError):
|
44
46
|
"""Raised when a parsing error occurs."""
|
45
47
|
|
48
|
+
__slots__ = ()
|
49
|
+
|
46
50
|
|
47
51
|
class ValidationError(KreuzbergError):
|
48
52
|
"""Raised when a validation error occurs."""
|
49
53
|
|
54
|
+
__slots__ = ()
|
55
|
+
|
50
56
|
|
51
57
|
class MissingDependencyError(KreuzbergError):
|
52
58
|
"""Raised when a dependency is missing."""
|
53
59
|
|
60
|
+
__slots__ = ()
|
61
|
+
|
54
62
|
@classmethod
|
55
63
|
def create_for_package(
|
56
64
|
cls, *, dependency_group: str, functionality: str, package_name: str
|
@@ -79,3 +87,5 @@ class MissingDependencyError(KreuzbergError):
|
|
79
87
|
|
80
88
|
class OCRError(KreuzbergError):
|
81
89
|
"""Raised when an OCR error occurs."""
|
90
|
+
|
91
|
+
__slots__ = ()
|
kreuzberg/extraction.py
CHANGED
@@ -460,8 +460,8 @@ def batch_extract_bytes_sync(
|
|
460
460
|
return (index, error_result)
|
461
461
|
|
462
462
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
463
|
-
|
464
|
-
future_to_index = {executor.submit(extract_single,
|
463
|
+
# Avoid creating intermediate list, use enumerate directly
|
464
|
+
future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
|
465
465
|
|
466
466
|
results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
|
467
467
|
for future in as_completed(future_to_index):
|
@@ -0,0 +1,265 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.8.2
|
4
|
+
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
|
+
Project-URL: documentation, https://kreuzberg.dev
|
6
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
7
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
|
+
License: MIT
|
9
|
+
License-File: LICENSE
|
10
|
+
Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: Intended Audience :: Information Technology
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
22
|
+
Classifier: Topic :: Database
|
23
|
+
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
24
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Classifier: Topic :: Text Processing :: General
|
29
|
+
Classifier: Typing :: Typed
|
30
|
+
Requires-Python: >=3.10
|
31
|
+
Requires-Dist: anyio>=4.9.0
|
32
|
+
Requires-Dist: chardetng-py>=0.3.4
|
33
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
35
|
+
Requires-Dist: mcp>=1.11.0
|
36
|
+
Requires-Dist: msgspec>=0.18.0
|
37
|
+
Requires-Dist: playa-pdf>=0.6.1
|
38
|
+
Requires-Dist: psutil>=7.0.0
|
39
|
+
Requires-Dist: pypdfium2==4.30.0
|
40
|
+
Requires-Dist: python-calamine>=0.3.2
|
41
|
+
Requires-Dist: python-pptx>=1.0.2
|
42
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
43
|
+
Provides-Extra: additional-extensions
|
44
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
45
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
46
|
+
Provides-Extra: all
|
47
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
48
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
49
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
50
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
51
|
+
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
52
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
53
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
57
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
59
|
+
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
60
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
61
|
+
Provides-Extra: api
|
62
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
63
|
+
Provides-Extra: chunking
|
64
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
65
|
+
Provides-Extra: cli
|
66
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
67
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
68
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
69
|
+
Provides-Extra: easyocr
|
70
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
71
|
+
Provides-Extra: entity-extraction
|
72
|
+
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
73
|
+
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
74
|
+
Provides-Extra: gmft
|
75
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
76
|
+
Provides-Extra: langdetect
|
77
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
78
|
+
Provides-Extra: paddleocr
|
79
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
80
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
81
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
82
|
+
Description-Content-Type: text/markdown
|
83
|
+
|
84
|
+
# Kreuzberg
|
85
|
+
|
86
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
87
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
88
|
+
[Documentation](https://kreuzberg.dev/)
|
89
|
+
[Benchmarks](https://benchmarks.kreuzberg.dev/)
|
90
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
91
|
+
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
92
|
+
|
93
|
+
**A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
|
94
|
+
|
95
|
+
📖 **[Complete Documentation](https://kreuzberg.dev/)**
|
96
|
+
|
97
|
+
## Framework Overview
|
98
|
+
|
99
|
+
### Document Intelligence Capabilities
|
100
|
+
|
101
|
+
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
102
|
+
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
103
|
+
- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
104
|
+
- **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
|
105
|
+
- **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
|
106
|
+
|
107
|
+
### Technical Architecture
|
108
|
+
|
109
|
+
- **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
|
110
|
+
- **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
|
111
|
+
- **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
|
112
|
+
- **API Design**: Synchronous and asynchronous APIs with consistent interfaces
|
113
|
+
- **Type Safety**: Complete type annotations throughout the codebase
|
114
|
+
|
115
|
+
### Open Source Foundation
|
116
|
+
|
117
|
+
Kreuzberg leverages established open source technologies:
|
118
|
+
|
119
|
+
- **Pandoc**: Universal document converter for robust format support
|
120
|
+
- **PDFium**: Google's PDF rendering engine for accurate PDF processing
|
121
|
+
- **Tesseract**: Google's OCR engine for text recognition
|
122
|
+
- **Python-docx/pptx**: Native Microsoft Office format support
|
123
|
+
|
124
|
+
## Quick Start
|
125
|
+
|
126
|
+
### Extract Text with CLI
|
127
|
+
|
128
|
+
```bash
|
129
|
+
# Extract text from any file to markdown
|
130
|
+
uvx kreuzberg extract document.pdf > output.md
|
131
|
+
|
132
|
+
# With all features (OCR, table extraction, etc.)
|
133
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
|
134
|
+
|
135
|
+
# Extract with rich metadata
|
136
|
+
uvx kreuzberg extract report.pdf --show-metadata --format json
|
137
|
+
```
|
138
|
+
|
139
|
+
### Python Usage
|
140
|
+
|
141
|
+
**Async (recommended for web apps):**
|
142
|
+
|
143
|
+
```python
|
144
|
+
from kreuzberg import extract_file
|
145
|
+
|
146
|
+
# In your async function
|
147
|
+
result = await extract_file("presentation.pptx")
|
148
|
+
print(result.content)
|
149
|
+
|
150
|
+
# Rich metadata extraction
|
151
|
+
print(f"Title: {result.metadata.title}")
|
152
|
+
print(f"Author: {result.metadata.author}")
|
153
|
+
print(f"Page count: {result.metadata.page_count}")
|
154
|
+
print(f"Created: {result.metadata.created_at}")
|
155
|
+
```
|
156
|
+
|
157
|
+
**Sync (for scripts and CLI tools):**
|
158
|
+
|
159
|
+
```python
|
160
|
+
from kreuzberg import extract_file_sync
|
161
|
+
|
162
|
+
result = extract_file_sync("report.docx")
|
163
|
+
print(result.content)
|
164
|
+
|
165
|
+
# Access rich metadata
|
166
|
+
print(f"Language: {result.metadata.language}")
|
167
|
+
print(f"Word count: {result.metadata.word_count}")
|
168
|
+
print(f"Keywords: {result.metadata.keywords}")
|
169
|
+
```
|
170
|
+
|
171
|
+
### Docker
|
172
|
+
|
173
|
+
```bash
|
174
|
+
# Run the REST API
|
175
|
+
docker run -p 8000:8000 goldziher/kreuzberg
|
176
|
+
|
177
|
+
# Extract via API
|
178
|
+
curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
|
179
|
+
```
|
180
|
+
|
181
|
+
📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
|
182
|
+
|
183
|
+
## Deployment Options
|
184
|
+
|
185
|
+
### 🤖 MCP Server (AI Integration)
|
186
|
+
|
187
|
+
**Add to Claude Desktop with one command:**
|
188
|
+
|
189
|
+
```bash
|
190
|
+
claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
|
191
|
+
```
|
192
|
+
|
193
|
+
**Or configure manually in `claude_desktop_config.json`:**
|
194
|
+
|
195
|
+
```json
|
196
|
+
{
|
197
|
+
"mcpServers": {
|
198
|
+
"kreuzberg": {
|
199
|
+
"command": "uvx",
|
200
|
+
"args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
|
201
|
+
}
|
202
|
+
}
|
203
|
+
}
|
204
|
+
```
|
205
|
+
|
206
|
+
**MCP capabilities:**
|
207
|
+
|
208
|
+
- Extract text from PDFs, images, Office docs, and more
|
209
|
+
- Full OCR support with multiple engines
|
210
|
+
- Table extraction and metadata parsing
|
211
|
+
|
212
|
+
📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
|
213
|
+
|
214
|
+
## Supported Formats
|
215
|
+
|
216
|
+
| Category | Formats |
|
217
|
+
| ----------------- | ------------------------------ |
|
218
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
219
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
220
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
221
|
+
| **Presentations** | PPTX, PPT, ODP |
|
222
|
+
| **Web** | HTML, XML, MHTML |
|
223
|
+
| **Archives** | Support via extraction |
|
224
|
+
|
225
|
+
## 📊 Performance Characteristics
|
226
|
+
|
227
|
+
[View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
|
228
|
+
|
229
|
+
### Technical Specifications
|
230
|
+
|
231
|
+
| Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
|
232
|
+
| ---------------------------- | -------------- | --------------- | ------------------ |
|
233
|
+
| **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
|
234
|
+
| **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
|
235
|
+
| **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
|
236
|
+
| **Installation size** | 71 MB | 71 MB | Smallest size |
|
237
|
+
| **Success rate** | 100% | 100% | Perfect |
|
238
|
+
| **Supported formats** | 18 | 18 | Comprehensive |
|
239
|
+
|
240
|
+
### Architecture Advantages
|
241
|
+
|
242
|
+
- **Native C extensions**: Built on PDFium and Tesseract for maximum performance
|
243
|
+
- **Async/await support**: True asynchronous processing with intelligent task scheduling
|
244
|
+
- **Memory efficiency**: Streaming architecture minimizes memory allocation
|
245
|
+
- **Process pooling**: Automatic multiprocessing for CPU-intensive operations
|
246
|
+
- **Optimized data flow**: Efficient data handling with minimal transformations
|
247
|
+
|
248
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
249
|
+
|
250
|
+
## Documentation
|
251
|
+
|
252
|
+
### Quick Links
|
253
|
+
|
254
|
+
- [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
|
255
|
+
- [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
|
256
|
+
- [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
|
257
|
+
- [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
|
258
|
+
- [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
|
259
|
+
- [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
|
260
|
+
- [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
|
261
|
+
- [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
|
262
|
+
|
263
|
+
## License
|
264
|
+
|
265
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
@@ -0,0 +1,53 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
|
2
|
+
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
|
+
kreuzberg/_config.py,sha256=_9JU88ChId8dWUjZ13ueo9_JoFekkyzuv7rZpFkrPZk,12966
|
5
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
+
kreuzberg/_entity_extraction.py,sha256=woNxARG27Z3T_l6w6N-dbt1PPe1IHptFMOZY_6etv54,7819
|
7
|
+
kreuzberg/_gmft.py,sha256=Q46CyBxRxY_oDGpSuXMOJ7qfR9LwuCKXnrl60wcPvU4,25286
|
8
|
+
kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
|
9
|
+
kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
|
10
|
+
kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
|
11
|
+
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
12
|
+
kreuzberg/_types.py,sha256=GisvL0ps2LCc0heKopFwSyrEbzH3WpDxaeev4vn59X4,14257
|
13
|
+
kreuzberg/cli.py,sha256=vTGS2TJlFTNMWp5LwZd3G2SS8u0m6bhQkH9n6a1oOoM,12439
|
14
|
+
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
15
|
+
kreuzberg/extraction.py,sha256=UmeEVN-eSile4HMxP0iqG9092BrsH5_zSZNVHhwy0ko,16993
|
16
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
|
19
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
|
21
|
+
kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
|
22
|
+
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
23
|
+
kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
|
24
|
+
kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
|
25
|
+
kreuzberg/_extractors/_pdf.py,sha256=d-hG_mhAMj22bQ35YuP2nq017z27_2Pp08r1qyHxlYI,16676
|
26
|
+
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
27
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=vPxEDAyH-gDoVXSg-A0guOjOfaWIuRI3i2NU8xPwhK8,13695
|
28
|
+
kreuzberg/_extractors/_structured.py,sha256=d0x6EyRimr8eWmr1qPb7HRWnrbKBuD-GpIrZd8XJp0o,5824
|
29
|
+
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
30
|
+
kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
|
31
|
+
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
32
|
+
kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
|
33
|
+
kreuzberg/_ocr/_easyocr.py,sha256=c2ndpDlIHvAI2WyvQUXLQ1hb6XynKeKARsXQcQ3ntJ0,17110
|
34
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=fab8a-3cvDgnt97qF-Km9ZfmkacFeKD_g15O8HXYRVc,17492
|
35
|
+
kreuzberg/_ocr/_tesseract.py,sha256=r1g_PCAXgJbZ0RPGn4aSxctZ0F9lLvI3zLGLEPAnviI,31455
|
36
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
kreuzberg/_utils/_cache.py,sha256=H2d6JOiTTAoJx5HPJoToCk4ik-ztTRNEJRrHgcSUTLs,15249
|
38
|
+
kreuzberg/_utils/_device.py,sha256=PC8YUPE95pzOyU7sU_icqNZpSfi6HZlEFfmWcV1Uees,10226
|
39
|
+
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
40
|
+
kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
|
41
|
+
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
42
|
+
kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
|
43
|
+
kreuzberg/_utils/_quality.py,sha256=-nKzj5n7yJDYrvl556oq2T5S5oKMEOrjpcRMlZ00Jqo,7668
|
44
|
+
kreuzberg/_utils/_serialization.py,sha256=cqqxqN2cmtndBhIr4v2wqiMwnNadnKhvuN7EUj3i18M,2290
|
45
|
+
kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6800
|
46
|
+
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
47
|
+
kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
|
48
|
+
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
49
|
+
kreuzberg-3.8.2.dist-info/METADATA,sha256=RiP64og5wOaf9gPZ7CwOsNYYx9GBnVMg8orgqZdncKA,11466
|
50
|
+
kreuzberg-3.8.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
51
|
+
kreuzberg-3.8.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
52
|
+
kreuzberg-3.8.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
53
|
+
kreuzberg-3.8.2.dist-info/RECORD,,
|