kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_entity_extraction.py +1 -2
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +21 -36
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +81 -48
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +314 -7
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +91 -0
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +564 -4
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +180 -7
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +4 -22
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.7.0.dist-info/RECORD +0 -56
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
|
|
1
|
+
"""Table processing and export utilities."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import csv
|
6
|
+
from io import StringIO
|
7
|
+
from typing import TYPE_CHECKING, Any
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from kreuzberg._types import TableData
|
11
|
+
|
12
|
+
|
13
|
+
def export_table_to_csv(table: TableData, separator: str = ",") -> str:
    r"""Export a TableData object to CSV/TSV format.

    Only the trailing line terminator is removed from the serialized text.
    Stripping all surrounding whitespace would also remove a trailing
    separator (for example the tab that follows an empty last cell in TSV
    output) and so silently drop a field from the last row.

    Args:
        table: TableData object containing DataFrame
        separator: Field separator ("," for CSV, "\t" for TSV)

    Returns:
        String representation in CSV/TSV format, or an empty string when the
        table has no DataFrame.
    """
    df = table.get("df")
    if df is None:
        return ""

    output = StringIO()
    df.to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
    # Drop only the final newline; separators and cell whitespace are data.
    return output.getvalue().rstrip("\r\n")
|
29
|
+
|
30
|
+
|
31
|
+
def export_table_to_tsv(table: TableData) -> str:
    """Serialize a TableData object as tab-separated values.

    Convenience wrapper that delegates to ``export_table_to_csv`` with the
    field separator fixed to a tab character.

    Args:
        table: TableData object containing DataFrame

    Returns:
        String representation in TSV format
    """
    tab_separator = "\t"
    return export_table_to_csv(table, separator=tab_separator)
|
41
|
+
|
42
|
+
|
43
|
+
def enhance_table_markdown(table: TableData) -> str:
    """Generate enhanced markdown table with better formatting.

    The table is rendered as a header row, a separator row carrying alignment
    hints, and one row per DataFrame record. When the table has no DataFrame,
    or the DataFrame is empty, the table's ``text`` value is returned instead
    (an empty string if that is missing too).

    Args:
        table: TableData object

    Returns:
        Enhanced markdown table string
    """
    df = table.get("df")
    if df is None or df.empty:
        return table.get("text", "")

    # Header row. Literal pipes are escaped exactly as data cells are, since
    # an unescaped "|" in a column name would add a spurious column.
    headers = [str(col).strip().replace("|", "\\|") for col in df.columns]
    lines = [
        "| " + " | ".join(headers) + " |",
        # Separator row with alignment hints
        _generate_separator_row(df),
    ]

    # Decide once per float column whether it prints as integers or decimals
    float_col_formatting = _analyze_float_columns(df)

    # Data rows with proper formatting
    for _, row in df.iterrows():
        formatted_row = _format_table_row(row, df, float_col_formatting)
        lines.append("| " + " | ".join(formatted_row) + " |")

    return "\n".join(lines)
|
79
|
+
|
80
|
+
|
81
|
+
def _generate_separator_row(df: Any) -> str:
|
82
|
+
"""Generate separator row with proper alignment hints."""
|
83
|
+
separators = []
|
84
|
+
for col in df.columns:
|
85
|
+
# Check if column contains mostly numbers for right alignment
|
86
|
+
if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
|
87
|
+
separators.append("---:") # Right align numbers
|
88
|
+
else:
|
89
|
+
separators.append("---") # Left align text
|
90
|
+
return "| " + " | ".join(separators) + " |"
|
91
|
+
|
92
|
+
|
93
|
+
def _analyze_float_columns(df: Any) -> dict[str, str]:
|
94
|
+
"""Analyze float columns to determine formatting strategy."""
|
95
|
+
float_col_formatting = {}
|
96
|
+
for col in df.columns:
|
97
|
+
if str(df[col].dtype) == "float64":
|
98
|
+
non_null_values = df[col].dropna()
|
99
|
+
if len(non_null_values) > 0:
|
100
|
+
# If all non-null values are whole numbers, format as integers
|
101
|
+
all_integers = all(val.is_integer() for val in non_null_values)
|
102
|
+
float_col_formatting[col] = "int" if all_integers else "float"
|
103
|
+
else:
|
104
|
+
float_col_formatting[col] = "int"
|
105
|
+
return float_col_formatting
|
106
|
+
|
107
|
+
|
108
|
+
def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
|
109
|
+
"""Format a single table row with proper value formatting."""
|
110
|
+
formatted_row = []
|
111
|
+
for col_name, value in row.items():
|
112
|
+
if value is None or (isinstance(value, float) and str(value) == "nan"):
|
113
|
+
formatted_row.append("")
|
114
|
+
elif str(df[col_name].dtype) in ["int64", "int32"]:
|
115
|
+
# For integer columns, format as integers
|
116
|
+
formatted_row.append(str(int(value)))
|
117
|
+
elif isinstance(value, float):
|
118
|
+
# For float columns, use the determined formatting strategy
|
119
|
+
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
120
|
+
formatted_row.append(str(int(value)))
|
121
|
+
else:
|
122
|
+
formatted_row.append(f"{value:.2f}")
|
123
|
+
else:
|
124
|
+
# Clean up text values
|
125
|
+
clean_value = str(value).strip().replace("|", "\\|") # Escape pipes
|
126
|
+
formatted_row.append(clean_value)
|
127
|
+
return formatted_row
|
128
|
+
|
129
|
+
|
130
|
+
def _is_numeric_column(series: Any) -> bool:
|
131
|
+
"""Check if a pandas Series contains mostly numeric values."""
|
132
|
+
if len(series) == 0:
|
133
|
+
return False
|
134
|
+
|
135
|
+
try:
|
136
|
+
# Check if already numeric dtype first (fastest path)
|
137
|
+
if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
|
138
|
+
return True
|
139
|
+
|
140
|
+
# Sample-based approach for large series (>1000 rows)
|
141
|
+
sample_size = min(100, len(series))
|
142
|
+
if len(series) > 1000:
|
143
|
+
sample_series = series.dropna().sample(n=sample_size, random_state=42)
|
144
|
+
else:
|
145
|
+
sample_series = series.dropna()
|
146
|
+
|
147
|
+
if len(sample_series) == 0:
|
148
|
+
return False
|
149
|
+
|
150
|
+
# Optimized numeric conversion - avoid exception overhead
|
151
|
+
numeric_count = 0
|
152
|
+
for val in sample_series:
|
153
|
+
val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
|
154
|
+
# Quick check: if it contains only digits, decimal point, minus, plus, or e
|
155
|
+
if val_str and all(c in "0123456789.-+eE" for c in val_str):
|
156
|
+
try:
|
157
|
+
float(val_str)
|
158
|
+
numeric_count += 1
|
159
|
+
except (ValueError, TypeError):
|
160
|
+
pass
|
161
|
+
|
162
|
+
# Consider numeric if >70% of sampled values are numeric
|
163
|
+
return (numeric_count / len(sample_series)) > 0.7
|
164
|
+
|
165
|
+
except (ValueError, TypeError, ZeroDivisionError):
|
166
|
+
return False
|
167
|
+
|
168
|
+
|
169
|
+
def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
    """Generate summary statistics for extracted tables.

    The result has the same keys whether or not any tables were supplied, so
    callers can read the averages and the per-page counts without checking
    for emptiness first.

    Args:
        tables: List of TableData objects

    Returns:
        Dictionary with table statistics: ``table_count``, ``total_rows``,
        ``total_columns``, ``pages_with_tables``, ``avg_rows_per_table``,
        ``avg_columns_per_table`` and ``tables_by_page``.
    """
    if not tables:
        return {
            "table_count": 0,
            "total_rows": 0,
            "total_columns": 0,
            "pages_with_tables": 0,
            "avg_rows_per_table": 0,
            "avg_columns_per_table": 0,
            "tables_by_page": {},
        }

    total_rows = 0
    total_columns = 0
    tables_by_page: dict[Any, int] = {}

    for table in tables:
        df = table.get("df")
        if df is not None:
            total_rows += len(df)
            total_columns += len(df.columns)

        # Tables without a page number still count towards the totals
        if "page_number" in table:
            page_num = table["page_number"]
            tables_by_page[page_num] = tables_by_page.get(page_num, 0) + 1

    table_count = len(tables)
    return {
        "table_count": table_count,
        "total_rows": total_rows,
        "total_columns": total_columns,
        # The keys of tables_by_page are exactly the distinct pages seen
        "pages_with_tables": len(tables_by_page),
        "avg_rows_per_table": total_rows / table_count,
        "avg_columns_per_table": total_columns / table_count,
        "tables_by_page": dict(tables_by_page),
    }
|
214
|
+
|
215
|
+
|
216
|
+
def extract_table_structure_info(table: TableData) -> dict[str, Any]:
    """Extract structural information from a table.

    Columns are classified by position rather than by label, so duplicate
    column labels (where ``df[label]`` is a DataFrame, not a single column)
    do not break the numeric/text classification.

    Args:
        table: TableData object

    Returns:
        Dictionary with structural information. All counters stay at their
        zero defaults when the table has no DataFrame or the DataFrame is
        empty.
    """
    info: dict[str, Any] = {
        "has_headers": False,
        "row_count": 0,
        "column_count": 0,
        "numeric_columns": 0,
        "text_columns": 0,
        "empty_cells": 0,
        "data_density": 0.0,
    }

    df = table.get("df")
    if df is None or df.empty:
        return info

    row_count = len(df)
    column_count = len(df.columns)
    info["row_count"] = row_count
    info["column_count"] = column_count
    info["has_headers"] = column_count > 0

    # Analyze column types
    numeric_columns = sum(1 for position in range(column_count) if _is_numeric_column(df.iloc[:, position]))
    info["numeric_columns"] = numeric_columns
    info["text_columns"] = column_count - numeric_columns

    # Calculate data density
    total_cells = row_count * column_count
    if total_cells > 0:
        # Cast the pandas aggregate to a plain int first so that the density
        # below is a plain float as well
        empty_cells = int(df.isnull().sum().sum())
        info["empty_cells"] = empty_cells
        info["data_density"] = (total_cells - empty_cells) / total_cells

    return info
|
kreuzberg/cli.py
CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
4
4
|
|
5
5
|
import json
|
6
6
|
import sys
|
7
|
+
import traceback
|
7
8
|
from pathlib import Path
|
8
9
|
from typing import TYPE_CHECKING, Any
|
9
10
|
|
@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
|
|
211
212
|
else:
|
212
213
|
console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
|
213
214
|
if verbose:
|
214
|
-
import traceback
|
215
|
-
|
216
215
|
console.print("\n[dim]Traceback:[/dim]")
|
217
216
|
traceback.print_exc()
|
218
217
|
sys.exit(1)
|
kreuzberg/extraction.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import multiprocessing as mp
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
3
5
|
from pathlib import Path
|
4
6
|
from typing import TYPE_CHECKING, Any, Final, cast
|
5
7
|
|
@@ -14,6 +16,8 @@ from kreuzberg._mime_types import (
|
|
14
16
|
)
|
15
17
|
from kreuzberg._registry import ExtractorRegistry
|
16
18
|
from kreuzberg._types import ExtractionConfig
|
19
|
+
from kreuzberg._utils._document_cache import get_document_cache
|
20
|
+
from kreuzberg._utils._errors import create_error_context
|
17
21
|
from kreuzberg._utils._string import safe_decode
|
18
22
|
from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
|
19
23
|
from kreuzberg.exceptions import ValidationError
|
@@ -136,8 +140,6 @@ async def extract_file(
|
|
136
140
|
Raises:
|
137
141
|
ValidationError: If the file path or configuration is invalid.
|
138
142
|
"""
|
139
|
-
from kreuzberg._utils._document_cache import get_document_cache
|
140
|
-
|
141
143
|
cache = get_document_cache()
|
142
144
|
path = Path(file_path)
|
143
145
|
cached_result = cache.get(path, config)
|
@@ -194,8 +196,6 @@ async def batch_extract_file(
|
|
194
196
|
if not file_paths:
|
195
197
|
return []
|
196
198
|
|
197
|
-
import multiprocessing as mp
|
198
|
-
|
199
199
|
max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
|
200
200
|
semaphore = anyio.Semaphore(max_concurrency)
|
201
201
|
|
@@ -211,8 +211,6 @@ async def batch_extract_file(
|
|
211
211
|
)
|
212
212
|
results[index] = result
|
213
213
|
except Exception as e: # noqa: BLE001
|
214
|
-
from kreuzberg._utils._errors import create_error_context
|
215
|
-
|
216
214
|
error_result = ExtractionResult(
|
217
215
|
content=f"Error: {type(e).__name__}: {e!s}",
|
218
216
|
mime_type="text/plain",
|
@@ -251,8 +249,6 @@ async def batch_extract_bytes(
|
|
251
249
|
if not contents:
|
252
250
|
return []
|
253
251
|
|
254
|
-
import multiprocessing as mp
|
255
|
-
|
256
252
|
max_concurrency = min(len(contents), mp.cpu_count() * 2)
|
257
253
|
semaphore = anyio.Semaphore(max_concurrency)
|
258
254
|
|
@@ -264,8 +260,6 @@ async def batch_extract_bytes(
|
|
264
260
|
result = await extract_bytes(content, mime_type, config)
|
265
261
|
results[index] = result
|
266
262
|
except Exception as e: # noqa: BLE001
|
267
|
-
from kreuzberg._utils._errors import create_error_context
|
268
|
-
|
269
263
|
error_result = ExtractionResult(
|
270
264
|
content=f"Error: {type(e).__name__}: {e!s}",
|
271
265
|
mime_type="text/plain",
|
@@ -331,8 +325,6 @@ def extract_file_sync(
|
|
331
325
|
Raises:
|
332
326
|
ValidationError: If the file path or configuration is invalid.
|
333
327
|
"""
|
334
|
-
from kreuzberg._utils._document_cache import get_document_cache
|
335
|
-
|
336
328
|
cache = get_document_cache()
|
337
329
|
path = Path(file_path)
|
338
330
|
cached_result = cache.get(path, config)
|
@@ -389,9 +381,6 @@ def batch_extract_file_sync(
|
|
389
381
|
if len(file_paths) <= 1:
|
390
382
|
return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
|
391
383
|
|
392
|
-
import multiprocessing as mp
|
393
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
394
|
-
|
395
384
|
max_workers = min(len(file_paths), mp.cpu_count())
|
396
385
|
|
397
386
|
def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
|
@@ -402,8 +391,6 @@ def batch_extract_file_sync(
|
|
402
391
|
extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
|
403
392
|
)
|
404
393
|
except Exception as e: # noqa: BLE001
|
405
|
-
from kreuzberg._utils._errors import create_error_context
|
406
|
-
|
407
394
|
error_result = ExtractionResult(
|
408
395
|
content=f"Error: {type(e).__name__}: {e!s}",
|
409
396
|
mime_type="text/plain",
|
@@ -447,9 +434,6 @@ def batch_extract_bytes_sync(
|
|
447
434
|
extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
|
448
435
|
]
|
449
436
|
|
450
|
-
import multiprocessing as mp
|
451
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
452
|
-
|
453
437
|
max_workers = min(len(contents), mp.cpu_count())
|
454
438
|
|
455
439
|
def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
|
@@ -458,8 +442,6 @@ def batch_extract_bytes_sync(
|
|
458
442
|
try:
|
459
443
|
return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
|
460
444
|
except Exception as e: # noqa: BLE001
|
461
|
-
from kreuzberg._utils._errors import create_error_context
|
462
|
-
|
463
445
|
error_result = ExtractionResult(
|
464
446
|
content=f"Error: {type(e).__name__}: {e!s}",
|
465
447
|
mime_type="text/plain",
|
@@ -1,14 +1,16 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
4
|
-
Summary:
|
3
|
+
Version: 3.8.1
|
4
|
+
Summary: Advanced document intelligence framework for extracting structured content from PDFs, images, and office documents
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
7
|
License: MIT
|
8
8
|
License-File: LICENSE
|
9
|
-
Keywords: document-processing,entity-extraction,image-to-text,
|
9
|
+
Keywords: automation,content-extraction,data-processing,document-analysis,document-intelligence,document-processing,entity-extraction,image-to-text,information-extraction,ocr,pdf-extraction,rag,structured-data,table-extraction,text-extraction
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: Intended Audience :: Information Technology
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
12
14
|
Classifier: License :: OSI Approved :: MIT License
|
13
15
|
Classifier: Operating System :: OS Independent
|
14
16
|
Classifier: Programming Language :: Python :: 3 :: Only
|
@@ -16,16 +18,19 @@ Classifier: Programming Language :: Python :: 3.10
|
|
16
18
|
Classifier: Programming Language :: Python :: 3.11
|
17
19
|
Classifier: Programming Language :: Python :: 3.12
|
18
20
|
Classifier: Programming Language :: Python :: 3.13
|
21
|
+
Classifier: Topic :: Database
|
22
|
+
Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
|
23
|
+
Classifier: Topic :: Office/Business :: Office Suites
|
19
24
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
20
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
27
|
Classifier: Topic :: Text Processing :: General
|
22
|
-
Classifier: Topic :: Utilities
|
23
28
|
Classifier: Typing :: Typed
|
24
29
|
Requires-Python: >=3.10
|
25
30
|
Requires-Dist: anyio>=4.9.0
|
26
|
-
Requires-Dist:
|
31
|
+
Requires-Dist: chardetng-py>=0.3.4
|
27
32
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
33
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
29
34
|
Requires-Dist: mcp>=1.11.0
|
30
35
|
Requires-Dist: msgspec>=0.18.0
|
31
36
|
Requires-Dist: playa-pdf>=0.6.1
|
@@ -34,6 +39,9 @@ Requires-Dist: pypdfium2==4.30.0
|
|
34
39
|
Requires-Dist: python-calamine>=0.3.2
|
35
40
|
Requires-Dist: python-pptx>=1.0.2
|
36
41
|
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
42
|
+
Provides-Extra: additional-extensions
|
43
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
44
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
37
45
|
Provides-Extra: all
|
38
46
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
47
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
@@ -41,6 +49,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
|
41
49
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
42
50
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
43
51
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
52
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
44
53
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
45
54
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
46
55
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
@@ -77,22 +86,33 @@ Description-Content-Type: text/markdown
|
|
77
86
|
[](https://badge.fury.io/py/kreuzberg)
|
78
87
|
[](https://goldziher.github.io/kreuzberg/)
|
79
88
|
[](https://opensource.org/licenses/MIT)
|
89
|
+
[](https://github.com/Goldziher/kreuzberg)
|
80
90
|
|
81
|
-
**
|
91
|
+
**Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
|
82
92
|
|
83
93
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
84
94
|
|
85
|
-
## Why Kreuzberg?
|
95
|
+
## Why Choose Kreuzberg?
|
86
96
|
|
87
|
-
|
88
|
-
|
89
|
-
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
97
|
+
### ⚡ Proven Performance
|
98
|
+
|
99
|
+
[Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
|
100
|
+
|
101
|
+
### 🏗️ Production Engineering
|
102
|
+
|
103
|
+
Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
|
104
|
+
|
105
|
+
### 🔧 Developer Experience
|
106
|
+
|
107
|
+
Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
|
108
|
+
|
109
|
+
### 🚀 Flexible Deployment
|
110
|
+
|
111
|
+
Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
|
112
|
+
|
113
|
+
### 📄 Comprehensive Format Support
|
114
|
+
|
115
|
+
Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
|
96
116
|
|
97
117
|
## Quick Start
|
98
118
|
|
@@ -128,7 +148,7 @@ import asyncio
|
|
128
148
|
from kreuzberg import extract_file
|
129
149
|
|
130
150
|
async def main():
|
131
|
-
# Extract from
|
151
|
+
# Extract content from files
|
132
152
|
result = await extract_file("document.pdf")
|
133
153
|
print(result.content)
|
134
154
|
print(result.metadata)
|
@@ -197,7 +217,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
197
217
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
198
218
|
```
|
199
219
|
|
200
|
-
Available variants: `latest`, `
|
220
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
201
221
|
|
202
222
|
### 🌐 REST API
|
203
223
|
|
@@ -240,23 +260,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
240
260
|
| **Web** | HTML, XML, MHTML |
|
241
261
|
| **Archives** | Support via extraction |
|
242
262
|
|
243
|
-
## Performance
|
263
|
+
## 📊 Performance Comparison
|
244
264
|
|
245
|
-
|
265
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
|
246
266
|
|
247
|
-
|
|
248
|
-
| ------------- |
|
249
|
-
| **Kreuzberg** |
|
250
|
-
| Unstructured |
|
251
|
-
| MarkItDown |
|
252
|
-
| Docling |
|
267
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
268
|
+
| ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
|
269
|
+
| **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
|
270
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
271
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
272
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
253
273
|
|
254
|
-
\*
|
255
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
256
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
274
|
+
\*_Performance varies significantly with document complexity and size_
|
257
275
|
|
258
|
-
|
259
|
-
|
276
|
+
**Key strengths:**
|
277
|
+
|
278
|
+
- 6-126x faster processing than comparable frameworks
|
279
|
+
- Smallest installation footprint and memory usage
|
280
|
+
- Only framework with built-in async/await support
|
281
|
+
- Supports both CPU and GPU processing
|
282
|
+
- Built by software engineers for production reliability
|
283
|
+
|
284
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
260
285
|
|
261
286
|
## Documentation
|
262
287
|
|
@@ -264,34 +289,13 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
264
289
|
|
265
290
|
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
266
291
|
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
292
|
+
- [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
|
267
293
|
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
268
294
|
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
269
295
|
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
270
296
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
271
297
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
272
298
|
|
273
|
-
## Advanced Features
|
274
|
-
|
275
|
-
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
276
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
277
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
278
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
279
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
280
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
281
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
282
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
283
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
284
|
-
|
285
299
|
## License
|
286
300
|
|
287
301
|
MIT License - see [LICENSE](LICENSE) for details.
|
288
|
-
|
289
|
-
______________________________________________________________________
|
290
|
-
|
291
|
-
<div align="center">
|
292
|
-
|
293
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
294
|
-
|
295
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
296
|
-
|
297
|
-
</div>
|
@@ -0,0 +1,53 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
|
2
|
+
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
|
+
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
5
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
+
kreuzberg/_entity_extraction.py,sha256=nqpQPmR2Rf1vOwoQsjm22nPLDIcsXdYfMwCL3h8iUTQ,7802
|
7
|
+
kreuzberg/_gmft.py,sha256=Heovj2n2kgi7eHtvvRzpBgSLGyXjz8M9PAQMX-npd40,25295
|
8
|
+
kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
|
9
|
+
kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
|
10
|
+
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
11
|
+
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
12
|
+
kreuzberg/_types.py,sha256=R_0Xc2kq4nEwkruvkB3qfrLeJ996419hBQ_1C6Xrqjo,13388
|
13
|
+
kreuzberg/cli.py,sha256=H9xxh4-zhGLfbhya2iD-NcEs-BvajVttm6cSiNx3ANU,12452
|
14
|
+
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
15
|
+
kreuzberg/extraction.py,sha256=hY5d4oelwocX6eOBF0Bu3nHCcCbTL5JOIbaPCCFNKsU,16972
|
16
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
19
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
|
21
|
+
kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
|
22
|
+
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
23
|
+
kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
|
24
|
+
kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
|
25
|
+
kreuzberg/_extractors/_pdf.py,sha256=Deb1ZIcqDY18CHa7cJL4vO4S7gy09yXWNSuH7O7kSzY,16430
|
26
|
+
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
27
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=Nvyz7XT7C2ai4QeUashBeENQpuP5rs8SmKfumxEqlCg,13712
|
28
|
+
kreuzberg/_extractors/_structured.py,sha256=i3jAvhHZt_BsRGgZZfgcsUqlwAg_RNc8vsuecb04T0c,5581
|
29
|
+
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
30
|
+
kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
|
31
|
+
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
32
|
+
kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
|
33
|
+
kreuzberg/_ocr/_easyocr.py,sha256=sWyVnF7My4F1GU-IPSVtpaDJPYogw8N-NYxwuy-6loc,17098
|
34
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=nXfQq6t2a7O-IpbCZRv8BvzP_lEBLgyYwXI5-wjzec0,17480
|
35
|
+
kreuzberg/_ocr/_tesseract.py,sha256=RjJ_C8c74LmLN53sdDo8WPCpUYeJ6fmRwsQdp6dJYio,31490
|
36
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
|
+
kreuzberg/_utils/_cache.py,sha256=6T2K9BXWaPkEKphSFrfXtFFE7ck5q9CYV9NmAFS56e4,15204
|
38
|
+
kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
|
39
|
+
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
40
|
+
kreuzberg/_utils/_errors.py,sha256=4OseKJI5qscD9jHxpP8CtpPWNHAOdhrJwcg6dlQl2fk,6310
|
41
|
+
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
42
|
+
kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
|
43
|
+
kreuzberg/_utils/_quality.py,sha256=dgFLt40NSqB8Ciej5QcZQLiV4U7LcrGux0vXckiE31U,7568
|
44
|
+
kreuzberg/_utils/_serialization.py,sha256=Rt5zSkvzf1SVNDrI6F2Zvnkel24mQkD1QvP0WjgZUgk,2195
|
45
|
+
kreuzberg/_utils/_string.py,sha256=5YKu9EZlZQ-LkphXUq8fdwKQrX9jWACFEhMGfjIysf4,6381
|
46
|
+
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
47
|
+
kreuzberg/_utils/_table.py,sha256=C2skLtcyczxDEH33Qw2dOwnR15SGillvNEP-NzBG3R8,8156
|
48
|
+
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
49
|
+
kreuzberg-3.8.1.dist-info/METADATA,sha256=IqJ6RTcFlwkMN6JZIkb9c8O4rgTrPqIuzXWerD6He1I,11507
|
50
|
+
kreuzberg-3.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
51
|
+
kreuzberg-3.8.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
52
|
+
kreuzberg-3.8.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
53
|
+
kreuzberg-3.8.1.dist-info/RECORD,,
|