kreuzberg 3.7.0__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_base.py +40 -0
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +17 -18
- kreuzberg/_extractors/_pdf.py +68 -14
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_multiprocessing/__init__.py +2 -3
- kreuzberg/_ocr/__init__.py +30 -0
- kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
- kreuzberg/_ocr/_sync.py +566 -0
- kreuzberg/_ocr/_tesseract.py +6 -2
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +17 -2
- kreuzberg/_utils/_process_pool.py +178 -1
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +66 -50
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/RECORD +29 -28
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_string.py
CHANGED
@@ -1,39 +1,182 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import hashlib
|
4
|
+
import re
|
3
5
|
from contextlib import suppress
|
6
|
+
from functools import lru_cache
|
4
7
|
|
5
|
-
|
8
|
+
import chardetng_py
|
9
|
+
|
10
|
+
# Compile regex patterns once at module level for performance
|
11
|
+
_WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
|
12
|
+
_NEWLINES_PATTERN = re.compile(r"\n+")
|
13
|
+
_MOJIBAKE_PATTERNS = {
|
14
|
+
# Hebrew as Cyrillic patterns
|
15
|
+
"hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
|
16
|
+
# Control characters that shouldn't appear in text
|
17
|
+
"control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
|
18
|
+
# Unicode replacement characters
|
19
|
+
"replacement_chars": re.compile(r"\uFFFD+"),
|
20
|
+
# Isolated combining marks (likely encoding issues)
|
21
|
+
"isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
|
22
|
+
}
|
23
|
+
|
24
|
+
# Simple cache for encoding detection (in-memory, session-scoped)
|
25
|
+
_encoding_cache: dict[str, str] = {}
|
26
|
+
|
27
|
+
|
28
|
+
@lru_cache(maxsize=128)
|
29
|
+
def _get_encoding_cache_key(data_hash: str, size: int) -> str:
|
30
|
+
"""Generate cache key for encoding detection."""
|
31
|
+
return f"{data_hash}:{size}"
|
6
32
|
|
7
33
|
|
8
34
|
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
9
|
-
"""Decode a byte string safely
|
35
|
+
"""Decode a byte string safely with mojibake detection and correction.
|
10
36
|
|
11
37
|
Args:
|
12
38
|
byte_data: The byte string to decode.
|
13
39
|
encoding: The encoding to use when decoding the byte string.
|
14
40
|
|
15
41
|
Returns:
|
16
|
-
The decoded string.
|
42
|
+
The decoded string with mojibake detection and correction.
|
17
43
|
"""
|
18
44
|
if not byte_data:
|
19
45
|
return ""
|
20
46
|
|
21
|
-
|
47
|
+
# Try provided encoding first (fastest path)
|
48
|
+
if encoding:
|
49
|
+
with suppress(UnicodeDecodeError, LookupError):
|
50
|
+
decoded = byte_data.decode(encoding)
|
51
|
+
return _fix_mojibake(decoded)
|
22
52
|
|
23
|
-
|
53
|
+
# Check cache for similar content (performance optimization)
|
54
|
+
data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16] # Hash first 1KB
|
55
|
+
cache_key = _get_encoding_cache_key(data_hash, len(byte_data))
|
56
|
+
|
57
|
+
if cache_key in _encoding_cache:
|
58
|
+
cached_encoding = _encoding_cache[cache_key]
|
59
|
+
with suppress(UnicodeDecodeError, LookupError):
|
60
|
+
decoded = byte_data.decode(cached_encoding)
|
61
|
+
return _fix_mojibake(decoded)
|
62
|
+
|
63
|
+
# Use chardetng for better performance than charset-normalizer
|
64
|
+
detected_encoding = chardetng_py.detect(byte_data)
|
65
|
+
if detected_encoding:
|
24
66
|
with suppress(UnicodeDecodeError, LookupError):
|
25
|
-
|
67
|
+
decoded = byte_data.decode(detected_encoding)
|
68
|
+
# Cache successful encoding detection
|
69
|
+
if len(_encoding_cache) < 1000: # Prevent unlimited growth
|
70
|
+
_encoding_cache[cache_key] = detected_encoding
|
71
|
+
return _fix_mojibake(decoded)
|
72
|
+
|
73
|
+
# Try multiple encodings with confidence scoring
|
74
|
+
encodings_to_try = [
|
75
|
+
"utf-8",
|
76
|
+
"windows-1255", # Hebrew
|
77
|
+
"iso-8859-8", # Hebrew
|
78
|
+
"windows-1256", # Arabic
|
79
|
+
"iso-8859-6", # Arabic
|
80
|
+
"windows-1252", # Western European
|
81
|
+
"cp1251", # Cyrillic
|
82
|
+
]
|
26
83
|
|
84
|
+
best_result = None
|
85
|
+
best_confidence = 0.0
|
86
|
+
|
87
|
+
for enc in encodings_to_try:
|
88
|
+
with suppress(UnicodeDecodeError, LookupError):
|
89
|
+
decoded = byte_data.decode(enc)
|
90
|
+
confidence = _calculate_text_confidence(decoded)
|
91
|
+
if confidence > best_confidence:
|
92
|
+
best_confidence = confidence
|
93
|
+
best_result = decoded
|
94
|
+
|
95
|
+
if best_result and best_confidence > 0.5:
|
96
|
+
return _fix_mojibake(best_result)
|
97
|
+
|
98
|
+
# Final fallback
|
27
99
|
return byte_data.decode("latin-1", errors="replace")
|
28
100
|
|
29
101
|
|
102
|
+
def _calculate_text_confidence(text: str) -> float:
|
103
|
+
"""Calculate confidence score for decoded text quality."""
|
104
|
+
if not text:
|
105
|
+
return 0.0
|
106
|
+
|
107
|
+
# Check for common encoding problems
|
108
|
+
replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
|
109
|
+
control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
|
110
|
+
total_chars = len(text)
|
111
|
+
|
112
|
+
if total_chars == 0:
|
113
|
+
return 0.0
|
114
|
+
|
115
|
+
# Penalize replacement and control characters
|
116
|
+
penalty = (replacement_count + control_count * 2) / total_chars
|
117
|
+
|
118
|
+
# Bonus for readable character ranges
|
119
|
+
readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
120
|
+
readability_score = readable_chars / total_chars
|
121
|
+
|
122
|
+
# Check for suspicious Cyrillic that might be misencoded Hebrew
|
123
|
+
cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
|
124
|
+
if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
|
125
|
+
penalty += 0.3 # Heavy penalty for likely mojibake
|
126
|
+
|
127
|
+
return max(0.0, min(1.0, readability_score - penalty))
|
128
|
+
|
129
|
+
|
130
|
+
def _fix_mojibake(text: str) -> str:
|
131
|
+
"""Attempt to fix common mojibake patterns."""
|
132
|
+
if not text:
|
133
|
+
return text
|
134
|
+
|
135
|
+
# Remove control characters
|
136
|
+
text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)
|
137
|
+
|
138
|
+
# Remove replacement characters
|
139
|
+
text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)
|
140
|
+
|
141
|
+
# Remove isolated combining marks
|
142
|
+
text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)
|
143
|
+
|
144
|
+
# Try to fix Hebrew encoded as Cyrillic (common Windows-1255 -> CP1251 confusion)
|
145
|
+
if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
|
146
|
+
# This is a heuristic fix - in practice, you'd need actual character mapping
|
147
|
+
# For now, we flag it for manual review by keeping the text but adding a marker
|
148
|
+
pass
|
149
|
+
|
150
|
+
return text
|
151
|
+
|
152
|
+
|
30
153
|
def normalize_spaces(text: str) -> str:
|
31
|
-
"""Normalize
|
154
|
+
"""Normalize spaces while preserving line breaks and paragraph structure.
|
32
155
|
|
33
156
|
Args:
|
34
|
-
text: The text to
|
157
|
+
text: The text to normalize.
|
35
158
|
|
36
159
|
Returns:
|
37
|
-
The
|
160
|
+
The normalized text with proper spacing.
|
38
161
|
"""
|
39
|
-
|
162
|
+
if not text or not text.strip():
|
163
|
+
return ""
|
164
|
+
|
165
|
+
# Split by double newlines to preserve paragraph breaks
|
166
|
+
paragraphs = text.split("\n\n")
|
167
|
+
normalized_paragraphs = []
|
168
|
+
|
169
|
+
for paragraph in paragraphs:
|
170
|
+
# Use pre-compiled patterns for better performance
|
171
|
+
# Replace multiple whitespace (except newlines) with single space
|
172
|
+
cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
|
173
|
+
# Clean up multiple newlines within paragraph (keep single newlines)
|
174
|
+
cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
|
175
|
+
|
176
|
+
# Strip and filter empty lines efficiently
|
177
|
+
lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
|
178
|
+
|
179
|
+
if lines:
|
180
|
+
normalized_paragraphs.append("\n".join(lines))
|
181
|
+
|
182
|
+
return "\n\n".join(normalized_paragraphs)
|
kreuzberg/_utils/_sync.py
CHANGED
@@ -28,8 +28,11 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
|
|
28
28
|
Returns:
|
29
29
|
The result of the synchronous function.
|
30
30
|
"""
|
31
|
-
|
32
|
-
|
31
|
+
# Optimize: only create partial if we have kwargs
|
32
|
+
if kwargs:
|
33
|
+
handler = partial(sync_fn, **kwargs)
|
34
|
+
return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
35
|
+
return cast("T", await any_io_run_sync(sync_fn, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
33
36
|
|
34
37
|
|
35
38
|
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
|
@@ -0,0 +1,261 @@
|
|
1
|
+
"""Table processing and export utilities."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import csv
|
6
|
+
from io import StringIO
|
7
|
+
from typing import TYPE_CHECKING, Any
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from kreuzberg._types import TableData
|
11
|
+
|
12
|
+
|
13
|
+
def export_table_to_csv(table: TableData, separator: str = ",") -> str:
|
14
|
+
r"""Export a TableData object to CSV/TSV format.
|
15
|
+
|
16
|
+
Args:
|
17
|
+
table: TableData object containing DataFrame
|
18
|
+
separator: Field separator ("," for CSV, "\t" for TSV)
|
19
|
+
|
20
|
+
Returns:
|
21
|
+
String representation in CSV/TSV format
|
22
|
+
"""
|
23
|
+
if "df" not in table or table["df"] is None:
|
24
|
+
return ""
|
25
|
+
|
26
|
+
output = StringIO()
|
27
|
+
table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
|
28
|
+
return output.getvalue().strip()
|
29
|
+
|
30
|
+
|
31
|
+
def export_table_to_tsv(table: TableData) -> str:
|
32
|
+
"""Export a TableData object to TSV format.
|
33
|
+
|
34
|
+
Args:
|
35
|
+
table: TableData object containing DataFrame
|
36
|
+
|
37
|
+
Returns:
|
38
|
+
String representation in TSV format
|
39
|
+
"""
|
40
|
+
return export_table_to_csv(table, separator="\t")
|
41
|
+
|
42
|
+
|
43
|
+
def enhance_table_markdown(table: TableData) -> str:
|
44
|
+
"""Generate enhanced markdown table with better formatting.
|
45
|
+
|
46
|
+
Args:
|
47
|
+
table: TableData object
|
48
|
+
|
49
|
+
Returns:
|
50
|
+
Enhanced markdown table string
|
51
|
+
"""
|
52
|
+
if "df" not in table or table["df"] is None:
|
53
|
+
return table.get("text", "")
|
54
|
+
|
55
|
+
df = table["df"]
|
56
|
+
|
57
|
+
if df.empty:
|
58
|
+
return table.get("text", "")
|
59
|
+
|
60
|
+
# Create enhanced markdown with proper alignment
|
61
|
+
lines = []
|
62
|
+
|
63
|
+
# Header row
|
64
|
+
headers = [str(col).strip() for col in df.columns]
|
65
|
+
lines.append("| " + " | ".join(headers) + " |")
|
66
|
+
|
67
|
+
# Separator row with alignment hints
|
68
|
+
lines.append(_generate_separator_row(df))
|
69
|
+
|
70
|
+
# Analyze float columns to determine formatting strategy
|
71
|
+
float_col_formatting = _analyze_float_columns(df)
|
72
|
+
|
73
|
+
# Data rows with proper formatting
|
74
|
+
for _, row in df.iterrows():
|
75
|
+
formatted_row = _format_table_row(row, df, float_col_formatting)
|
76
|
+
lines.append("| " + " | ".join(formatted_row) + " |")
|
77
|
+
|
78
|
+
return "\n".join(lines)
|
79
|
+
|
80
|
+
|
81
|
+
def _generate_separator_row(df: Any) -> str:
|
82
|
+
"""Generate separator row with proper alignment hints."""
|
83
|
+
separators = []
|
84
|
+
for col in df.columns:
|
85
|
+
# Check if column contains mostly numbers for right alignment
|
86
|
+
if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
|
87
|
+
separators.append("---:") # Right align numbers
|
88
|
+
else:
|
89
|
+
separators.append("---") # Left align text
|
90
|
+
return "| " + " | ".join(separators) + " |"
|
91
|
+
|
92
|
+
|
93
|
+
def _analyze_float_columns(df: Any) -> dict[str, str]:
|
94
|
+
"""Analyze float columns to determine formatting strategy."""
|
95
|
+
float_col_formatting = {}
|
96
|
+
for col in df.columns:
|
97
|
+
if str(df[col].dtype) == "float64":
|
98
|
+
non_null_values = df[col].dropna()
|
99
|
+
if len(non_null_values) > 0:
|
100
|
+
# If all non-null values are whole numbers, format as integers
|
101
|
+
all_integers = all(val.is_integer() for val in non_null_values)
|
102
|
+
float_col_formatting[col] = "int" if all_integers else "float"
|
103
|
+
else:
|
104
|
+
float_col_formatting[col] = "int"
|
105
|
+
return float_col_formatting
|
106
|
+
|
107
|
+
|
108
|
+
def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
|
109
|
+
"""Format a single table row with proper value formatting."""
|
110
|
+
formatted_row = []
|
111
|
+
for col_name, value in row.items():
|
112
|
+
if value is None or (isinstance(value, float) and str(value) == "nan"):
|
113
|
+
formatted_row.append("")
|
114
|
+
elif str(df[col_name].dtype) in ["int64", "int32"]:
|
115
|
+
# For integer columns, format as integers
|
116
|
+
formatted_row.append(str(int(value)))
|
117
|
+
elif isinstance(value, float):
|
118
|
+
# For float columns, use the determined formatting strategy
|
119
|
+
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
120
|
+
formatted_row.append(str(int(value)))
|
121
|
+
else:
|
122
|
+
formatted_row.append(f"{value:.2f}")
|
123
|
+
else:
|
124
|
+
# Clean up text values
|
125
|
+
clean_value = str(value).strip().replace("|", "\\|") # Escape pipes
|
126
|
+
formatted_row.append(clean_value)
|
127
|
+
return formatted_row
|
128
|
+
|
129
|
+
|
130
|
+
def _is_numeric_column(series: Any) -> bool:
|
131
|
+
"""Check if a pandas Series contains mostly numeric values."""
|
132
|
+
if len(series) == 0:
|
133
|
+
return False
|
134
|
+
|
135
|
+
try:
|
136
|
+
# Check if already numeric dtype first (fastest path)
|
137
|
+
if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
|
138
|
+
return True
|
139
|
+
|
140
|
+
# Sample-based approach for large series (>1000 rows)
|
141
|
+
sample_size = min(100, len(series))
|
142
|
+
if len(series) > 1000:
|
143
|
+
sample_series = series.dropna().sample(n=sample_size, random_state=42)
|
144
|
+
else:
|
145
|
+
sample_series = series.dropna()
|
146
|
+
|
147
|
+
if len(sample_series) == 0:
|
148
|
+
return False
|
149
|
+
|
150
|
+
# Optimized numeric conversion - avoid exception overhead
|
151
|
+
numeric_count = 0
|
152
|
+
for val in sample_series:
|
153
|
+
val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
|
154
|
+
# Quick check: if it contains only digits, decimal point, minus, plus, or e
|
155
|
+
if val_str and all(c in "0123456789.-+eE" for c in val_str):
|
156
|
+
try:
|
157
|
+
float(val_str)
|
158
|
+
numeric_count += 1
|
159
|
+
except (ValueError, TypeError):
|
160
|
+
pass
|
161
|
+
|
162
|
+
# Consider numeric if >70% of sampled values are numeric
|
163
|
+
return (numeric_count / len(sample_series)) > 0.7
|
164
|
+
|
165
|
+
except (ValueError, TypeError, ZeroDivisionError):
|
166
|
+
return False
|
167
|
+
|
168
|
+
|
169
|
+
def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
|
170
|
+
"""Generate summary statistics for extracted tables.
|
171
|
+
|
172
|
+
Args:
|
173
|
+
tables: List of TableData objects
|
174
|
+
|
175
|
+
Returns:
|
176
|
+
Dictionary with table statistics
|
177
|
+
"""
|
178
|
+
if not tables:
|
179
|
+
return {
|
180
|
+
"table_count": 0,
|
181
|
+
"total_rows": 0,
|
182
|
+
"total_columns": 0,
|
183
|
+
"pages_with_tables": 0,
|
184
|
+
}
|
185
|
+
|
186
|
+
total_rows = 0
|
187
|
+
total_columns = 0
|
188
|
+
pages_with_tables = set()
|
189
|
+
tables_by_page = {}
|
190
|
+
|
191
|
+
for table in tables:
|
192
|
+
if "df" in table and table["df"] is not None:
|
193
|
+
df = table["df"]
|
194
|
+
total_rows += len(df)
|
195
|
+
total_columns += len(df.columns)
|
196
|
+
|
197
|
+
if "page_number" in table:
|
198
|
+
page_num = table["page_number"]
|
199
|
+
pages_with_tables.add(page_num)
|
200
|
+
|
201
|
+
if page_num not in tables_by_page:
|
202
|
+
tables_by_page[page_num] = 0
|
203
|
+
tables_by_page[page_num] += 1
|
204
|
+
|
205
|
+
return {
|
206
|
+
"table_count": len(tables),
|
207
|
+
"total_rows": total_rows,
|
208
|
+
"total_columns": total_columns,
|
209
|
+
"pages_with_tables": len(pages_with_tables),
|
210
|
+
"avg_rows_per_table": total_rows / len(tables) if tables else 0,
|
211
|
+
"avg_columns_per_table": total_columns / len(tables) if tables else 0,
|
212
|
+
"tables_by_page": dict(tables_by_page),
|
213
|
+
}
|
214
|
+
|
215
|
+
|
216
|
+
def extract_table_structure_info(table: TableData) -> dict[str, Any]:
|
217
|
+
"""Extract structural information from a table.
|
218
|
+
|
219
|
+
Args:
|
220
|
+
table: TableData object
|
221
|
+
|
222
|
+
Returns:
|
223
|
+
Dictionary with structural information
|
224
|
+
"""
|
225
|
+
info = {
|
226
|
+
"has_headers": False,
|
227
|
+
"row_count": 0,
|
228
|
+
"column_count": 0,
|
229
|
+
"numeric_columns": 0,
|
230
|
+
"text_columns": 0,
|
231
|
+
"empty_cells": 0,
|
232
|
+
"data_density": 0.0,
|
233
|
+
}
|
234
|
+
|
235
|
+
if "df" not in table or table["df"] is None:
|
236
|
+
return info
|
237
|
+
|
238
|
+
df = table["df"]
|
239
|
+
|
240
|
+
if df.empty:
|
241
|
+
return info
|
242
|
+
|
243
|
+
info["row_count"] = len(df)
|
244
|
+
info["column_count"] = len(df.columns)
|
245
|
+
info["has_headers"] = len(df.columns) > 0
|
246
|
+
|
247
|
+
# Analyze column types
|
248
|
+
for col in df.columns:
|
249
|
+
if _is_numeric_column(df[col]):
|
250
|
+
info["numeric_columns"] += 1
|
251
|
+
else:
|
252
|
+
info["text_columns"] += 1
|
253
|
+
|
254
|
+
# Calculate data density
|
255
|
+
total_cells = len(df) * len(df.columns)
|
256
|
+
if total_cells > 0:
|
257
|
+
empty_cells = df.isnull().sum().sum()
|
258
|
+
info["empty_cells"] = int(empty_cells)
|
259
|
+
info["data_density"] = (total_cells - empty_cells) / total_cells
|
260
|
+
|
261
|
+
return info
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.8.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -23,9 +23,9 @@ Classifier: Topic :: Utilities
|
|
23
23
|
Classifier: Typing :: Typed
|
24
24
|
Requires-Python: >=3.10
|
25
25
|
Requires-Dist: anyio>=4.9.0
|
26
|
-
Requires-Dist:
|
26
|
+
Requires-Dist: chardetng-py>=0.3.4
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
28
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
29
29
|
Requires-Dist: mcp>=1.11.0
|
30
30
|
Requires-Dist: msgspec>=0.18.0
|
31
31
|
Requires-Dist: playa-pdf>=0.6.1
|
@@ -34,6 +34,9 @@ Requires-Dist: pypdfium2==4.30.0
|
|
34
34
|
Requires-Dist: python-calamine>=0.3.2
|
35
35
|
Requires-Dist: python-pptx>=1.0.2
|
36
36
|
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
37
|
+
Provides-Extra: additional-extensions
|
38
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
39
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
37
40
|
Provides-Extra: all
|
38
41
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
42
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
@@ -41,6 +44,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
|
41
44
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
42
45
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
43
46
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
47
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
44
48
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
45
49
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
46
50
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
@@ -77,22 +81,51 @@ Description-Content-Type: text/markdown
|
|
77
81
|
[](https://badge.fury.io/py/kreuzberg)
|
78
82
|
[](https://goldziher.github.io/kreuzberg/)
|
79
83
|
[](https://opensource.org/licenses/MIT)
|
84
|
+
[](https://github.com/Goldziher/kreuzberg)
|
80
85
|
|
81
|
-
**High-performance
|
86
|
+
**High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
|
82
87
|
|
83
88
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
84
89
|
|
85
|
-
## Why Kreuzberg?
|
90
|
+
## Why Choose Kreuzberg?
|
86
91
|
|
87
|
-
|
88
|
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
-
|
92
|
-
-
|
93
|
-
|
94
|
-
|
95
|
-
|
92
|
+
### 🚀 Performance
|
93
|
+
|
94
|
+
- [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
|
95
|
+
- Minimal footprint: 71MB install vs 1GB+ for competitors
|
96
|
+
- Lowest memory usage (~530MB average) optimized for production workloads
|
97
|
+
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
98
|
+
|
99
|
+
### 🛠️ Engineering Quality
|
100
|
+
|
101
|
+
- Built by software engineers with modern Python best practices
|
102
|
+
- 95%+ test coverage with comprehensive test suite
|
103
|
+
- Thoroughly benchmarked and profiled for real-world performance
|
104
|
+
- Only framework offering true async/await support alongside sync APIs
|
105
|
+
- Robust error handling and detailed logging
|
106
|
+
|
107
|
+
### 🎯 Developer Experience
|
108
|
+
|
109
|
+
- Works out of the box with sane defaults, scales with your needs
|
110
|
+
- Native MCP server for AI tool integration (Claude Desktop, Cursor)
|
111
|
+
- Full type safety with excellent IDE support (completions)
|
112
|
+
- Comprehensive documentation including full API reference
|
113
|
+
|
114
|
+
### 🌍 Deployment Options
|
115
|
+
|
116
|
+
- Docker images for all architectures (AMD64, ARM64)
|
117
|
+
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
118
|
+
- CPU-only processing - no GPU requirements, lower energy consumption
|
119
|
+
- 100% local processing - no external API dependencies
|
120
|
+
- Multiple deployment modes: CLI, REST API, MCP server
|
121
|
+
|
122
|
+
### 🎯 Complete Solution
|
123
|
+
|
124
|
+
- Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
|
125
|
+
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
126
|
+
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
127
|
+
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
128
|
+
- Fully extensible: Add your own extractors
|
96
129
|
|
97
130
|
## Quick Start
|
98
131
|
|
@@ -197,7 +230,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
197
230
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
198
231
|
```
|
199
232
|
|
200
|
-
Available variants: `latest`, `
|
233
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
201
234
|
|
202
235
|
### 🌐 REST API
|
203
236
|
|
@@ -240,23 +273,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
240
273
|
| **Web** | HTML, XML, MHTML |
|
241
274
|
| **Archives** | Support via extraction |
|
242
275
|
|
243
|
-
## Performance
|
276
|
+
## 📊 Performance Comparison
|
244
277
|
|
245
|
-
|
278
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
246
279
|
|
247
|
-
|
|
248
|
-
| ------------- |
|
249
|
-
| **Kreuzberg** |
|
250
|
-
| Unstructured |
|
251
|
-
| MarkItDown |
|
252
|
-
| Docling |
|
280
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
281
|
+
| ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
|
282
|
+
| **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
|
283
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
284
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
285
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
253
286
|
|
254
|
-
\*
|
255
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
256
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
287
|
+
\*_Performance varies significantly with document complexity and size_
|
257
288
|
|
258
|
-
|
259
|
-
|
289
|
+
**Key strengths:**
|
290
|
+
|
291
|
+
- 2-3x faster processing than comparable frameworks
|
292
|
+
- Smallest installation footprint and memory usage
|
293
|
+
- Only framework with built-in async/await support
|
294
|
+
- CPU-only processing - no GPU dependencies
|
295
|
+
- Built by software engineers for production reliability
|
296
|
+
|
297
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
260
298
|
|
261
299
|
## Documentation
|
262
300
|
|
@@ -270,28 +308,6 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
270
308
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
271
309
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
272
310
|
|
273
|
-
## Advanced Features
|
274
|
-
|
275
|
-
- **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
|
276
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
277
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
278
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
279
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
280
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
281
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
282
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
283
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
284
|
-
|
285
311
|
## License
|
286
312
|
|
287
313
|
MIT License - see [LICENSE](LICENSE) for details.
|
288
|
-
|
289
|
-
______________________________________________________________________
|
290
|
-
|
291
|
-
<div align="center">
|
292
|
-
|
293
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
294
|
-
|
295
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
296
|
-
|
297
|
-
</div>
|