kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. kreuzberg/_entity_extraction.py +1 -2
  2. kreuzberg/_extractors/_base.py +39 -1
  3. kreuzberg/_extractors/_email.py +149 -0
  4. kreuzberg/_extractors/_html.py +15 -3
  5. kreuzberg/_extractors/_image.py +21 -36
  6. kreuzberg/_extractors/_pandoc.py +3 -14
  7. kreuzberg/_extractors/_pdf.py +81 -48
  8. kreuzberg/_extractors/_presentation.py +62 -10
  9. kreuzberg/_extractors/_spread_sheet.py +179 -4
  10. kreuzberg/_extractors/_structured.py +148 -0
  11. kreuzberg/_gmft.py +314 -7
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_ocr/__init__.py +10 -1
  14. kreuzberg/_ocr/_base.py +59 -0
  15. kreuzberg/_ocr/_easyocr.py +91 -0
  16. kreuzberg/_ocr/_paddleocr.py +89 -0
  17. kreuzberg/_ocr/_tesseract.py +564 -4
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +52 -4
  21. kreuzberg/_utils/_errors.py +3 -7
  22. kreuzberg/_utils/_process_pool.py +180 -7
  23. kreuzberg/_utils/_quality.py +237 -0
  24. kreuzberg/_utils/_serialization.py +4 -2
  25. kreuzberg/_utils/_string.py +153 -10
  26. kreuzberg/_utils/_sync.py +5 -2
  27. kreuzberg/_utils/_table.py +261 -0
  28. kreuzberg/cli.py +1 -2
  29. kreuzberg/extraction.py +4 -22
  30. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
  31. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  32. kreuzberg/_multiprocessing/__init__.py +0 -6
  33. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  34. kreuzberg/_multiprocessing/process_manager.py +0 -189
  35. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  36. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  37. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  38. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  39. kreuzberg-3.7.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
1
+ """Table processing and export utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ from io import StringIO
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING:
10
+ from kreuzberg._types import TableData
11
+
12
+
13
def export_table_to_csv(table: TableData, separator: str = ",") -> str:
    r"""Export a TableData object to CSV/TSV format.

    Args:
        table: TableData mapping; its ``"df"`` entry is serialized with
            ``to_csv`` (no index column, minimal quoting).
        separator: Field separator ("," for CSV, "\t" for TSV)

    Returns:
        The delimited text without its trailing line terminator, or an empty
        string when the table has no ``"df"`` entry or the entry is ``None``.
    """
    df = table.get("df")
    if df is None:
        return ""

    output = StringIO()
    df.to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
    # Trim only the final line terminator. A bare ``strip()`` also removes tabs
    # and spaces, which silently drops trailing empty fields from the last row
    # of a TSV export (and leading whitespace in the first header cell).
    return output.getvalue().rstrip("\r\n")
29
+
30
+
31
def export_table_to_tsv(table: TableData) -> str:
    """Serialize a table as tab-separated values.

    Thin convenience wrapper that delegates to ``export_table_to_csv`` with a
    tab character as the field separator.

    Args:
        table: TableData object containing DataFrame

    Returns:
        String representation in TSV format
    """
    tab = "\t"
    return export_table_to_csv(table, separator=tab)
41
+
42
+
43
def enhance_table_markdown(table: TableData) -> str:
    """Generate enhanced markdown table with better formatting.

    Builds a pipe-delimited markdown table from the table's ``"df"`` entry:
    a header row, a separator row carrying alignment hints, and one formatted
    row per DataFrame row.

    Args:
        table: TableData object

    Returns:
        Enhanced markdown table string. When the table has no ``"df"`` entry,
        the entry is ``None``, or the DataFrame is empty, the table's
        ``"text"`` entry is returned instead (empty string if absent).
    """
    df = table.get("df")
    if df is None or df.empty:
        return table.get("text", "")

    # Escape pipes in header cells the same way data cells are escaped;
    # an unescaped "|" in a column name would split the header into extra
    # cells and break the table.
    headers = [str(col).strip().replace("|", "\\|") for col in df.columns]

    # Decide once per float column whether it renders as integers or decimals.
    float_col_formatting = _analyze_float_columns(df)

    lines = [
        "| " + " | ".join(headers) + " |",
        _generate_separator_row(df),
    ]
    lines.extend(
        "| " + " | ".join(_format_table_row(row, df, float_col_formatting)) + " |"
        for _, row in df.iterrows()
    )
    return "\n".join(lines)
79
+
80
+
81
def _generate_separator_row(df: Any) -> str:
    """Build the markdown separator row, one alignment marker per column.

    Columns whose dtype is int64/float64, or that ``_is_numeric_column``
    classifies as numeric, get ``---:`` (right aligned); every other column
    gets ``---`` (left aligned).
    """
    markers = [
        "---:"
        if df[column].dtype in ["int64", "float64"] or _is_numeric_column(df[column])
        else "---"
        for column in df.columns
    ]
    joined = " | ".join(markers)
    return "| " + joined + " |"
91
+
92
+
93
def _analyze_float_columns(df: Any) -> dict[str, str]:
    """Pick a rendering strategy for every float64 column.

    Returns a mapping from column label to ``"int"`` when all non-null values
    in the column are whole numbers (or the column has no non-null values),
    and to ``"float"`` otherwise. Columns of any other dtype are left out.
    """
    strategies = {}
    for column in df.columns:
        if str(df[column].dtype) != "float64":
            continue
        present = df[column].dropna()
        # all() over an empty sequence is True, so an all-null column
        # falls into the "int" strategy as well.
        only_whole_numbers = all(number.is_integer() for number in present)
        strategies[column] = "int" if only_whole_numbers else "float"
    return strategies
106
+
107
+
108
def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
    """Render every cell of one row as a markdown-safe string.

    Rules, applied in order per cell:
    - ``None`` and float NaN become an empty string.
    - Cells in int64/int32 columns are rendered as integers.
    - Other float values are rendered as integers when the column's strategy
      in ``float_col_formatting`` is ``"int"``, otherwise with two decimals.
    - Anything else is stringified, stripped, and has ``|`` escaped.
    """
    cells = []
    for col_name, value in row.items():
        is_missing = value is None or (isinstance(value, float) and str(value) == "nan")
        if is_missing:
            cells.append("")
            continue

        # The column dtype is consulted rather than the value's type because
        # the row's values may not carry the column's integer type.
        if str(df[col_name].dtype) in ["int64", "int32"]:
            cells.append(str(int(value)))
            continue

        if not isinstance(value, float):
            # Text cell: trim and escape pipes so the cell cannot split.
            cells.append(str(value).strip().replace("|", "\\|"))
            continue

        if float_col_formatting.get(col_name) == "int":
            cells.append(str(int(value)))
        else:
            cells.append(f"{value:.2f}")
    return cells
128
+
129
+
130
def _is_numeric_column(series: Any) -> bool:
    """Check if a pandas Series contains mostly numeric values.

    A series with an int64/float64/int32/float32 dtype is numeric outright.
    Otherwise the non-null values (a fixed-seed sample of at most 100 of them
    when the series has more than 1000 rows) are stringified, stripped of
    ``,``, ``$`` and ``%``, and parsed with ``float``; the column counts as
    numeric when more than 70% of the inspected values parse.

    Returns:
        ``True`` for a mostly-numeric column; ``False`` otherwise, including
        for an empty or all-null series.
    """
    if len(series) == 0:
        return False

    try:
        # Check if already numeric dtype first (fastest path)
        if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
            return True

        non_null = series.dropna()
        if len(non_null) == 0:
            return False

        # Sample-based approach for large series (>1000 rows). The sample size
        # is capped at the number of non-null values: asking ``sample`` for
        # more rows than exist raises ValueError, which previously made every
        # large-but-sparse numeric column report as non-numeric.
        if len(series) > 1000:
            sample_size = min(100, len(non_null))
            sample_series = non_null.sample(n=sample_size, random_state=42)
        else:
            sample_series = non_null

        numeric_count = 0
        for val in sample_series:
            val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
            # Cheap character pre-filter so float() is attempted only on
            # strings that could plausibly be numbers.
            if val_str and all(c in "0123456789.-+eE" for c in val_str):
                try:
                    float(val_str)
                except (ValueError, TypeError):
                    continue
                numeric_count += 1

        # Consider numeric if >70% of sampled values are numeric
        return (numeric_count / len(sample_series)) > 0.7

    except (ValueError, TypeError, ZeroDivisionError):
        return False
167
+
168
+
169
def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
    """Generate summary statistics for extracted tables.

    Args:
        tables: List of TableData objects

    Returns:
        Dictionary with table statistics. The same keys are present whether
        or not ``tables`` is empty:

        - ``table_count``: number of tables given
        - ``total_rows`` / ``total_columns``: sums over tables whose ``"df"``
          entry is present and not ``None``
        - ``pages_with_tables``: number of distinct ``"page_number"`` values
        - ``avg_rows_per_table`` / ``avg_columns_per_table``: totals divided
          by ``table_count`` (0 when there are no tables)
        - ``tables_by_page``: mapping of page number to table count
    """
    total_rows = 0
    total_columns = 0
    tables_by_page: dict[Any, int] = {}

    for table in tables or []:
        df = table.get("df")
        if df is not None:
            total_rows += len(df)
            total_columns += len(df.columns)

        if "page_number" in table:
            page_num = table["page_number"]
            tables_by_page[page_num] = tables_by_page.get(page_num, 0) + 1

    table_count = len(tables) if tables else 0
    return {
        "table_count": table_count,
        "total_rows": total_rows,
        "total_columns": total_columns,
        # The distinct pages are exactly the keys of the per-page counter.
        "pages_with_tables": len(tables_by_page),
        "avg_rows_per_table": total_rows / table_count if table_count else 0,
        "avg_columns_per_table": total_columns / table_count if table_count else 0,
        "tables_by_page": tables_by_page,
    }
214
+
215
+
216
def extract_table_structure_info(table: TableData) -> dict[str, Any]:
    """Describe the shape and content mix of a single table.

    Args:
        table: TableData object

    Returns:
        Dictionary with the keys ``has_headers``, ``row_count``,
        ``column_count``, ``numeric_columns``, ``text_columns``,
        ``empty_cells`` and ``data_density``. All values stay at their
        zero/False defaults when the table has no ``"df"`` entry, the entry
        is ``None``, or the DataFrame is empty.
    """
    summary = {
        "has_headers": False,
        "row_count": 0,
        "column_count": 0,
        "numeric_columns": 0,
        "text_columns": 0,
        "empty_cells": 0,
        "data_density": 0.0,
    }

    frame = table["df"] if "df" in table else None
    if frame is None or frame.empty:
        return summary

    n_rows = len(frame)
    n_cols = len(frame.columns)
    summary["row_count"] = n_rows
    summary["column_count"] = n_cols
    summary["has_headers"] = n_cols > 0

    # Each column is counted as either numeric or text.
    for column in frame.columns:
        bucket = "numeric_columns" if _is_numeric_column(frame[column]) else "text_columns"
        summary[bucket] += 1

    # Data density is the share of cells that are not null.
    cell_total = n_rows * n_cols
    if cell_total > 0:
        missing = frame.isnull().sum().sum()
        summary["empty_cells"] = int(missing)
        summary["data_density"] = (cell_total - missing) / cell_total

    return summary
kreuzberg/cli.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import json
6
6
  import sys
7
+ import traceback
7
8
  from pathlib import Path
8
9
  from typing import TYPE_CHECKING, Any
9
10
 
@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
211
212
  else:
212
213
  console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
213
214
  if verbose:
214
- import traceback
215
-
216
215
  console.print("\n[dim]Traceback:[/dim]")
217
216
  traceback.print_exc()
218
217
  sys.exit(1)
kreuzberg/extraction.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import multiprocessing as mp
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
5
  from pathlib import Path
4
6
  from typing import TYPE_CHECKING, Any, Final, cast
5
7
 
@@ -14,6 +16,8 @@ from kreuzberg._mime_types import (
14
16
  )
15
17
  from kreuzberg._registry import ExtractorRegistry
16
18
  from kreuzberg._types import ExtractionConfig
19
+ from kreuzberg._utils._document_cache import get_document_cache
20
+ from kreuzberg._utils._errors import create_error_context
17
21
  from kreuzberg._utils._string import safe_decode
18
22
  from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
19
23
  from kreuzberg.exceptions import ValidationError
@@ -136,8 +140,6 @@ async def extract_file(
136
140
  Raises:
137
141
  ValidationError: If the file path or configuration is invalid.
138
142
  """
139
- from kreuzberg._utils._document_cache import get_document_cache
140
-
141
143
  cache = get_document_cache()
142
144
  path = Path(file_path)
143
145
  cached_result = cache.get(path, config)
@@ -194,8 +196,6 @@ async def batch_extract_file(
194
196
  if not file_paths:
195
197
  return []
196
198
 
197
- import multiprocessing as mp
198
-
199
199
  max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
200
200
  semaphore = anyio.Semaphore(max_concurrency)
201
201
 
@@ -211,8 +211,6 @@ async def batch_extract_file(
211
211
  )
212
212
  results[index] = result
213
213
  except Exception as e: # noqa: BLE001
214
- from kreuzberg._utils._errors import create_error_context
215
-
216
214
  error_result = ExtractionResult(
217
215
  content=f"Error: {type(e).__name__}: {e!s}",
218
216
  mime_type="text/plain",
@@ -251,8 +249,6 @@ async def batch_extract_bytes(
251
249
  if not contents:
252
250
  return []
253
251
 
254
- import multiprocessing as mp
255
-
256
252
  max_concurrency = min(len(contents), mp.cpu_count() * 2)
257
253
  semaphore = anyio.Semaphore(max_concurrency)
258
254
 
@@ -264,8 +260,6 @@ async def batch_extract_bytes(
264
260
  result = await extract_bytes(content, mime_type, config)
265
261
  results[index] = result
266
262
  except Exception as e: # noqa: BLE001
267
- from kreuzberg._utils._errors import create_error_context
268
-
269
263
  error_result = ExtractionResult(
270
264
  content=f"Error: {type(e).__name__}: {e!s}",
271
265
  mime_type="text/plain",
@@ -331,8 +325,6 @@ def extract_file_sync(
331
325
  Raises:
332
326
  ValidationError: If the file path or configuration is invalid.
333
327
  """
334
- from kreuzberg._utils._document_cache import get_document_cache
335
-
336
328
  cache = get_document_cache()
337
329
  path = Path(file_path)
338
330
  cached_result = cache.get(path, config)
@@ -389,9 +381,6 @@ def batch_extract_file_sync(
389
381
  if len(file_paths) <= 1:
390
382
  return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
391
383
 
392
- import multiprocessing as mp
393
- from concurrent.futures import ThreadPoolExecutor, as_completed
394
-
395
384
  max_workers = min(len(file_paths), mp.cpu_count())
396
385
 
397
386
  def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
@@ -402,8 +391,6 @@ def batch_extract_file_sync(
402
391
  extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
403
392
  )
404
393
  except Exception as e: # noqa: BLE001
405
- from kreuzberg._utils._errors import create_error_context
406
-
407
394
  error_result = ExtractionResult(
408
395
  content=f"Error: {type(e).__name__}: {e!s}",
409
396
  mime_type="text/plain",
@@ -447,9 +434,6 @@ def batch_extract_bytes_sync(
447
434
  extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
448
435
  ]
449
436
 
450
- import multiprocessing as mp
451
- from concurrent.futures import ThreadPoolExecutor, as_completed
452
-
453
437
  max_workers = min(len(contents), mp.cpu_count())
454
438
 
455
439
  def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
@@ -458,8 +442,6 @@ def batch_extract_bytes_sync(
458
442
  try:
459
443
  return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
460
444
  except Exception as e: # noqa: BLE001
461
- from kreuzberg._utils._errors import create_error_context
462
-
463
445
  error_result = ExtractionResult(
464
446
  content=f"Error: {type(e).__name__}: {e!s}",
465
447
  mime_type="text/plain",
@@ -1,14 +1,16 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.7.0
4
- Summary: A text extraction library supporting PDFs, images, office documents and more
3
+ Version: 3.8.1
4
+ Summary: Advanced document intelligence framework for extracting structured content from PDFs, images, and office documents
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
7
  License: MIT
8
8
  License-File: LICENSE
9
- Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
9
+ Keywords: automation,content-extraction,data-processing,document-analysis,document-intelligence,document-processing,entity-extraction,image-to-text,information-extraction,ocr,pdf-extraction,rag,structured-data,table-extraction,text-extraction
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: Intended Audience :: Science/Research
12
14
  Classifier: License :: OSI Approved :: MIT License
13
15
  Classifier: Operating System :: OS Independent
14
16
  Classifier: Programming Language :: Python :: 3 :: Only
@@ -16,16 +18,19 @@ Classifier: Programming Language :: Python :: 3.10
16
18
  Classifier: Programming Language :: Python :: 3.11
17
19
  Classifier: Programming Language :: Python :: 3.12
18
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
23
+ Classifier: Topic :: Office/Business :: Office Suites
19
24
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
20
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
27
  Classifier: Topic :: Text Processing :: General
22
- Classifier: Topic :: Utilities
23
28
  Classifier: Typing :: Typed
24
29
  Requires-Python: >=3.10
25
30
  Requires-Dist: anyio>=4.9.0
26
- Requires-Dist: charset-normalizer>=3.4.2
31
+ Requires-Dist: chardetng-py>=0.3.4
27
32
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
28
- Requires-Dist: html-to-markdown[lxml]>=1.6.0
33
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
29
34
  Requires-Dist: mcp>=1.11.0
30
35
  Requires-Dist: msgspec>=0.18.0
31
36
  Requires-Dist: playa-pdf>=0.6.1
@@ -34,6 +39,9 @@ Requires-Dist: pypdfium2==4.30.0
34
39
  Requires-Dist: python-calamine>=0.3.2
35
40
  Requires-Dist: python-pptx>=1.0.2
36
41
  Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
42
+ Provides-Extra: additional-extensions
43
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
44
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
37
45
  Provides-Extra: all
38
46
  Requires-Dist: click>=8.2.1; extra == 'all'
39
47
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
@@ -41,6 +49,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
41
49
  Requires-Dist: gmft>=0.4.2; extra == 'all'
42
50
  Requires-Dist: keybert>=0.9.0; extra == 'all'
43
51
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
52
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
44
53
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
45
54
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
46
55
  Requires-Dist: rich>=14.0.0; extra == 'all'
@@ -77,22 +86,33 @@ Description-Content-Type: text/markdown
77
86
  [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
78
87
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
79
88
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
89
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
80
90
 
81
- **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
91
+ **Advanced Document Intelligence for Modern Python Applications.** Transform PDFs, images, and office documents into structured data with production-grade performance. Built by engineers who understand that speed, reliability, and developer experience matter.
82
92
 
83
93
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
84
94
 
85
- ## Why Kreuzberg?
95
+ ## Why Choose Kreuzberg?
86
96
 
87
- - **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
88
- - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
89
- - **⚡ Dual APIs**: Only library with both sync and async support
90
- - **🔧 Zero Configuration**: Works out of the box with sane defaults
91
- - **🏠 Local Processing**: No cloud dependencies or external API calls
92
- - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
93
- - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
94
- - **🤖 AI Integration**: Native MCP server for Claude and other AI tools
95
- - **🐳 Production Ready**: CLI, REST API, MCP server, and Docker images included
97
+ ### Proven Performance
98
+
99
+ [Benchmarked](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) 6-126x faster than alternatives while using minimal resources. Process up to 14 files per second with 87MB install size and ~360MB memory usage. Optimized for production workloads and resource-constrained environments.
100
+
101
+ ### 🏗️ Production Engineering
102
+
103
+ Comprehensive test coverage (95%+), robust error handling, and true async/await support. Built with modern Python practices for reliability in production environments.
104
+
105
+ ### 🔧 Developer Experience
106
+
107
+ Works immediately with smart defaults, scales as you grow. Native MCP integration for AI tools, full type safety, and clear documentation.
108
+
109
+ ### 🚀 Flexible Deployment
110
+
111
+ Deploy on serverless platforms, containers, or traditional servers. Supports both CPU and GPU processing (via PaddleOCR and EasyOCR). No external API dependencies. Multiple deployment modes: CLI, REST API, MCP server.
112
+
113
+ ### 📄 Comprehensive Format Support
114
+
115
+ Extract from PDFs, images, Office documents, HTML, spreadsheets, and presentations. Multiple OCR engines with intelligent fallbacks, table extraction, and content preparation for RAG workflows.
96
116
 
97
117
  ## Quick Start
98
118
 
@@ -128,7 +148,7 @@ import asyncio
128
148
  from kreuzberg import extract_file
129
149
 
130
150
  async def main():
131
- # Extract from any document type
151
+ # Extract content from files
132
152
  result = await extract_file("document.pdf")
133
153
  print(result.content)
134
154
  print(result.metadata)
@@ -197,7 +217,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
197
217
  curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
198
218
  ```
199
219
 
200
- Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
220
+ Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
201
221
 
202
222
  ### 🌐 REST API
203
223
 
@@ -240,23 +260,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
240
260
  | **Web** | HTML, XML, MHTML |
241
261
  | **Archives** | Support via extraction |
242
262
 
243
- ## Performance
263
+ ## 📊 Performance Comparison
244
264
 
245
- **[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
265
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across ~100 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://goldziher.github.io/kreuzberg/performance-analysis/):
246
266
 
247
- | Library | Speed | Memory | Install Size | Dependencies | Success Rate |
248
- | ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
249
- | **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
250
- | Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
251
- | MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
252
- | Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
267
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
268
+ | ------------- | ------------ | ------ | ------------ | ------------ | ------------ |
269
+ | **Kreuzberg** | 14.4 files/s | 360MB | 87MB | 43 | 100% |
270
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
271
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
272
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
253
273
 
254
- \*_Can achieve 75% reliability with 15% performance trade-off when configured_
255
- †_Good on simple documents, struggles with large/complex files (>10MB)_
256
- ‡_Frequently fails/times out on medium files (>1MB)_
274
+ \*_Performance varies significantly with document complexity and size_
257
275
 
258
- > **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
259
- > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
276
+ **Key strengths:**
277
+
278
+ - 6-126x faster processing than comparable frameworks
279
+ - Smallest installation footprint and memory usage
280
+ - Only framework with built-in async/await support
281
+ - Supports both CPU and GPU processing
282
+ - Built by software engineers for production reliability
283
+
284
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
260
285
 
261
286
  ## Documentation
262
287
 
@@ -264,34 +289,13 @@ kreuzberg extract *.pdf --output-dir ./extracted/
264
289
 
265
290
  - [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
266
291
  - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
292
+ - [Performance Analysis](https://goldziher.github.io/kreuzberg/performance-analysis/) - Detailed benchmark results
267
293
  - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
268
294
  - [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
269
295
  - [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
270
296
  - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
271
297
  - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
272
298
 
273
- ## Advanced Features
274
-
275
- - **🤖 MCP Server**: Native integration with Claude Desktop and AI tools
276
- - **📊 Table Extraction**: Extract tables from PDFs with GMFT
277
- - **🧩 Content Chunking**: Split documents for RAG applications
278
- - **🎯 Custom Extractors**: Extend with your own document handlers
279
- - **🔧 Configuration**: Flexible TOML-based configuration
280
- - **🪝 Hooks**: Pre/post-processing customization
281
- - **🌍 Multi-language OCR**: 100+ languages supported
282
- - **⚙️ Metadata Extraction**: Rich document metadata
283
- - **🔄 Batch Processing**: Efficient bulk document processing
284
-
285
299
  ## License
286
300
 
287
301
  MIT License - see [LICENSE](LICENSE) for details.
288
-
289
- ______________________________________________________________________
290
-
291
- <div align="center">
292
-
293
- **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
294
-
295
- Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
296
-
297
- </div>
@@ -0,0 +1,53 @@
1
+ kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
2
+ kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
+ kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
+ kreuzberg/_entity_extraction.py,sha256=nqpQPmR2Rf1vOwoQsjm22nPLDIcsXdYfMwCL3h8iUTQ,7802
7
+ kreuzberg/_gmft.py,sha256=Heovj2n2kgi7eHtvvRzpBgSLGyXjz8M9PAQMX-npd40,25295
8
+ kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
9
+ kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
10
+ kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
11
+ kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
12
+ kreuzberg/_types.py,sha256=R_0Xc2kq4nEwkruvkB3qfrLeJ996419hBQ_1C6Xrqjo,13388
13
+ kreuzberg/cli.py,sha256=H9xxh4-zhGLfbhya2iD-NcEs-BvajVttm6cSiNx3ANU,12452
14
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
15
+ kreuzberg/extraction.py,sha256=hY5d4oelwocX6eOBF0Bu3nHCcCbTL5JOIbaPCCFNKsU,16972
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
19
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
21
+ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
22
+ kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
23
+ kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
24
+ kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
25
+ kreuzberg/_extractors/_pdf.py,sha256=Deb1ZIcqDY18CHa7cJL4vO4S7gy09yXWNSuH7O7kSzY,16430
26
+ kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
27
+ kreuzberg/_extractors/_spread_sheet.py,sha256=Nvyz7XT7C2ai4QeUashBeENQpuP5rs8SmKfumxEqlCg,13712
28
+ kreuzberg/_extractors/_structured.py,sha256=i3jAvhHZt_BsRGgZZfgcsUqlwAg_RNc8vsuecb04T0c,5581
29
+ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
30
+ kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
31
+ kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
32
+ kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
33
+ kreuzberg/_ocr/_easyocr.py,sha256=sWyVnF7My4F1GU-IPSVtpaDJPYogw8N-NYxwuy-6loc,17098
34
+ kreuzberg/_ocr/_paddleocr.py,sha256=nXfQq6t2a7O-IpbCZRv8BvzP_lEBLgyYwXI5-wjzec0,17480
35
+ kreuzberg/_ocr/_tesseract.py,sha256=RjJ_C8c74LmLN53sdDo8WPCpUYeJ6fmRwsQdp6dJYio,31490
36
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ kreuzberg/_utils/_cache.py,sha256=6T2K9BXWaPkEKphSFrfXtFFE7ck5q9CYV9NmAFS56e4,15204
38
+ kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
39
+ kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
40
+ kreuzberg/_utils/_errors.py,sha256=4OseKJI5qscD9jHxpP8CtpPWNHAOdhrJwcg6dlQl2fk,6310
41
+ kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
42
+ kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
43
+ kreuzberg/_utils/_quality.py,sha256=dgFLt40NSqB8Ciej5QcZQLiV4U7LcrGux0vXckiE31U,7568
44
+ kreuzberg/_utils/_serialization.py,sha256=Rt5zSkvzf1SVNDrI6F2Zvnkel24mQkD1QvP0WjgZUgk,2195
45
+ kreuzberg/_utils/_string.py,sha256=5YKu9EZlZQ-LkphXUq8fdwKQrX9jWACFEhMGfjIysf4,6381
46
+ kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
47
+ kreuzberg/_utils/_table.py,sha256=C2skLtcyczxDEH33Qw2dOwnR15SGillvNEP-NzBG3R8,8156
48
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
49
+ kreuzberg-3.8.1.dist-info/METADATA,sha256=IqJ6RTcFlwkMN6JZIkb9c8O4rgTrPqIuzXWerD6He1I,11507
50
+ kreuzberg-3.8.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
51
+ kreuzberg-3.8.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
52
+ kreuzberg-3.8.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
53
+ kreuzberg-3.8.1.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- """Multiprocessing utilities for kreuzberg."""
2
-
3
- from .process_manager import ProcessPoolManager
4
- from .tesseract_pool import TesseractProcessPool
5
-
6
- __all__ = ["ProcessPoolManager", "TesseractProcessPool"]