kreuzberg 3.8.1__py3-none-any.whl → 3.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import re
6
+ from functools import reduce
6
7
  from typing import Any
7
8
 
8
9
  # Pre-compiled patterns for performance
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
102
103
  if not text:
103
104
  return text
104
105
 
105
- # Remove script and style content
106
- for pattern in _SCRIPT_PATTERNS.values():
107
- text = pattern.sub(" ", text)
106
+ # Remove script and style content by applying each pattern's substitution in turn (folded with functools.reduce)
107
+ text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
108
108
 
109
109
  # Clean OCR artifacts
110
110
  text = _clean_ocr_artifacts(text)
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
134
134
  if total_chars == 0:
135
135
  return 0.0
136
136
 
137
- script_chars = 0
138
- for pattern in _SCRIPT_PATTERNS.values():
139
- matches = pattern.findall(text)
140
- script_chars += sum(len(match) for match in matches)
137
+ # Sum the lengths of all matches across every script pattern (one findall per pattern)
138
+ script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
141
139
 
142
140
  return min(1.0, script_chars / total_chars)
143
141
 
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
147
145
  if total_chars == 0:
148
146
  return 0.0
149
147
 
150
- nav_chars = 0
151
- for pattern in _NAVIGATION_PATTERNS.values():
152
- matches = pattern.findall(text)
153
- nav_chars += sum(len(match) for match in matches)
148
+ # Sum the lengths of all matches across every navigation pattern (one findall per pattern)
149
+ nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
154
150
 
155
151
  return min(1.0, nav_chars / total_chars)
156
152
 
@@ -2,16 +2,28 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from dataclasses import asdict, is_dataclass
6
- from enum import Enum
5
+ from dataclasses import is_dataclass
7
6
  from typing import Any, TypeVar, cast
8
7
 
8
+ import msgspec
9
9
  from msgspec import MsgspecError
10
10
  from msgspec.msgpack import decode, encode
11
11
 
12
12
  T = TypeVar("T")
13
13
 
14
14
 
15
+ # Define dict method names in priority order
16
+ _DICT_METHOD_NAMES = (
17
+ "to_dict",
18
+ "as_dict",
19
+ "dict",
20
+ "model_dump",
21
+ "json",
22
+ "to_list",
23
+ "tolist",
24
+ )
25
+
26
+
15
27
  def encode_hook(obj: Any) -> Any:
16
28
  """Custom encoder for complex objects."""
17
29
  if callable(obj):
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
20
32
  if isinstance(obj, Exception):
21
33
  return {"message": str(obj), "type": type(obj).__name__}
22
34
 
23
- for key in (
24
- "to_dict",
25
- "as_dict",
26
- "dict",
27
- "model_dump",
28
- "json",
29
- "to_list",
30
- "tolist",
31
- ):
32
- if hasattr(obj, key):
33
- method = getattr(obj, key) # Cache the attribute lookup
34
- if callable(method):
35
- return method()
35
+ # Try each conversion method name in priority order and return the result of the first callable one
36
+ for attr_name in _DICT_METHOD_NAMES:
37
+ method = getattr(obj, attr_name, None)
38
+ if method is not None and callable(method):
39
+ return method()
36
40
 
37
41
  if is_dataclass(obj) and not isinstance(obj, type):
38
- return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
42
+ # Use msgspec.to_builtins for more efficient conversion
43
+ return msgspec.to_builtins(obj)
39
44
 
40
45
  if hasattr(obj, "save") and hasattr(obj, "format"):
41
46
  return None
@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
28
28
  @lru_cache(maxsize=128)
29
29
  def _get_encoding_cache_key(data_hash: str, size: int) -> str:
30
30
  """Generate cache key for encoding detection."""
31
+ # Build the key with an f-string: "<data_hash>:<size>"
31
32
  return f"{data_hash}:{size}"
32
33
 
33
34
 
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
104
105
  if not text:
105
106
  return 0.0
106
107
 
107
- # Check for common encoding problems
108
- replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
109
- control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
110
108
  total_chars = len(text)
111
-
112
109
  if total_chars == 0:
113
110
  return 0.0
114
111
 
112
+ # Check for common encoding problems using the module-level _MOJIBAKE_PATTERNS
113
+ replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
114
+ control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
115
+
115
116
  # Penalize replacement and control characters
116
117
  penalty = (replacement_count + control_count * 2) / total_chars
117
118
 
118
- # Bonus for readable character ranges
119
+ # Bonus for readable character ranges - more efficient counting
120
+ # Count printable/whitespace characters with a generator expression (scans the whole text)
119
121
  readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
120
122
  readability_score = readable_chars / total_chars
121
123
 
122
124
  # Check for suspicious Cyrillic that might be misencoded Hebrew
123
125
  cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
124
- if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
125
- penalty += 0.3 # Heavy penalty for likely mojibake
126
+ if cyrillic_matches:
127
+ # Calculate total length more efficiently
128
+ cyrillic_length = sum(len(match) for match in cyrillic_matches)
129
+ if cyrillic_length > total_chars * 0.1:
130
+ penalty += 0.3 # Heavy penalty for likely mojibake
126
131
 
127
132
  return max(0.0, min(1.0, readability_score - penalty))
128
133
 
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
164
169
 
165
170
  # Split by double newlines to preserve paragraph breaks
166
171
  paragraphs = text.split("\n\n")
167
- normalized_paragraphs = []
172
+
173
+ result_paragraphs = []
168
174
 
169
175
  for paragraph in paragraphs:
170
176
  # Use pre-compiled patterns for better performance
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
173
179
  # Clean up multiple newlines within paragraph (keep single newlines)
174
180
  cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
175
181
 
176
- # Strip and filter empty lines efficiently
177
- lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
182
+ # Process lines efficiently - manual loop avoids double strip() calls
183
+ lines = []
184
+ for line in cleaned.split("\n"):
185
+ stripped_line = line.strip()
186
+ if stripped_line:
187
+ lines.append(stripped_line)
178
188
 
179
189
  if lines:
180
- normalized_paragraphs.append("\n".join(lines))
190
+ result_paragraphs.append("\n".join(lines))
181
191
 
182
- return "\n\n".join(normalized_paragraphs)
192
+ return "\n\n".join(result_paragraphs)
@@ -3,7 +3,6 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import csv
6
- from io import StringIO
7
6
  from typing import TYPE_CHECKING, Any
8
7
 
9
8
  if TYPE_CHECKING:
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
23
22
  if "df" not in table or table["df"] is None:
24
23
  return ""
25
24
 
26
- output = StringIO()
27
- table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
28
- return output.getvalue().strip()
25
+ # Use pandas to_csv() direct string return instead of StringIO
26
+ csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
27
+ return str(csv_output).strip()
29
28
 
30
29
 
31
30
  def export_table_to_tsv(table: TableData) -> str:
kreuzberg/cli.py CHANGED
@@ -18,7 +18,7 @@ except ImportError as e:
18
18
  ) from e
19
19
 
20
20
  from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
21
- from kreuzberg._cli_config import build_extraction_config, find_default_config, load_config_from_file
21
+ from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
22
22
  from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
23
23
 
24
24
  DEFAULT_MAX_CHARACTERS = 4000
@@ -92,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
92
92
  if config:
93
93
  file_config = load_config_from_file(config)
94
94
  else:
95
- default_config = find_default_config()
95
+ default_config = find_config_file()
96
96
  if default_config:
97
97
  try:
98
98
  file_config = load_config_from_file(default_config)
@@ -314,7 +314,7 @@ def extract( # noqa: PLR0913
314
314
  def config(config: Path | None) -> None:
315
315
  """Show current configuration."""
316
316
  try:
317
- config_path = config or find_default_config()
317
+ config_path = config or find_config_file()
318
318
 
319
319
  if config_path:
320
320
  file_config = load_config_from_file(config_path)
kreuzberg/exceptions.py CHANGED
@@ -7,6 +7,8 @@ from typing import Any
7
7
  class KreuzbergError(Exception):
8
8
  """Base exception for all Kreuzberg errors."""
9
9
 
10
+ __slots__ = ("context",)
11
+
10
12
  context: Any
11
13
  """The context of the error."""
12
14
 
@@ -43,14 +45,20 @@ class KreuzbergError(Exception):
43
45
  class ParsingError(KreuzbergError):
44
46
  """Raised when a parsing error occurs."""
45
47
 
48
+ __slots__ = ()
49
+
46
50
 
47
51
  class ValidationError(KreuzbergError):
48
52
  """Raised when a validation error occurs."""
49
53
 
54
+ __slots__ = ()
55
+
50
56
 
51
57
  class MissingDependencyError(KreuzbergError):
52
58
  """Raised when a dependency is missing."""
53
59
 
60
+ __slots__ = ()
61
+
54
62
  @classmethod
55
63
  def create_for_package(
56
64
  cls, *, dependency_group: str, functionality: str, package_name: str
@@ -79,3 +87,5 @@ class MissingDependencyError(KreuzbergError):
79
87
 
80
88
  class OCRError(KreuzbergError):
81
89
  """Raised when an OCR error occurs."""
90
+
91
+ __slots__ = ()
kreuzberg/extraction.py CHANGED
@@ -460,8 +460,8 @@ def batch_extract_bytes_sync(
460
460
  return (index, error_result)
461
461
 
462
462
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
463
- indexed_contents = list(enumerate(contents))
464
- future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
463
+ # Avoid creating intermediate list, use enumerate directly
464
+ future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
465
465
 
466
466
  results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
467
467
  for future in as_completed(future_to_index):
@@ -0,0 +1,265 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.8.2
4
+ Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
+ Project-URL: documentation, https://kreuzberg.dev
6
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
7
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
24
+ Classifier: Topic :: Office/Business :: Office Suites
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Classifier: Topic :: Text Processing :: General
29
+ Classifier: Typing :: Typed
30
+ Requires-Python: >=3.10
31
+ Requires-Dist: anyio>=4.9.0
32
+ Requires-Dist: chardetng-py>=0.3.4
33
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
35
+ Requires-Dist: mcp>=1.11.0
36
+ Requires-Dist: msgspec>=0.18.0
37
+ Requires-Dist: playa-pdf>=0.6.1
38
+ Requires-Dist: psutil>=7.0.0
39
+ Requires-Dist: pypdfium2==4.30.0
40
+ Requires-Dist: python-calamine>=0.3.2
41
+ Requires-Dist: python-pptx>=1.0.2
42
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
43
+ Provides-Extra: additional-extensions
44
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
+ Provides-Extra: all
47
+ Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
51
+ Requires-Dist: keybert>=0.9.0; extra == 'all'
52
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
53
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
+ Requires-Dist: rich>=14.0.0; extra == 'all'
57
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
59
+ Requires-Dist: spacy>=3.8.7; extra == 'all'
60
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
61
+ Provides-Extra: api
62
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
63
+ Provides-Extra: chunking
64
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
65
+ Provides-Extra: cli
66
+ Requires-Dist: click>=8.2.1; extra == 'cli'
67
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
68
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
69
+ Provides-Extra: easyocr
70
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
71
+ Provides-Extra: entity-extraction
72
+ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
73
+ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
74
+ Provides-Extra: gmft
75
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
76
+ Provides-Extra: langdetect
77
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
78
+ Provides-Extra: paddleocr
79
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
80
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
81
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
82
+ Description-Content-Type: text/markdown
83
+
84
+ # Kreuzberg
85
+
86
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
87
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
88
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
89
+ [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
90
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
91
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
92
+
93
+ **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
94
+
95
+ 📖 **[Complete Documentation](https://kreuzberg.dev/)**
96
+
97
+ ## Framework Overview
98
+
99
+ ### Document Intelligence Capabilities
100
+
101
+ - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
102
+ - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
103
+ - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
104
+ - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
105
+ - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
106
+
107
+ ### Technical Architecture
108
+
109
+ - **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
110
+ - **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
111
+ - **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
112
+ - **API Design**: Synchronous and asynchronous APIs with consistent interfaces
113
+ - **Type Safety**: Complete type annotations throughout the codebase
114
+
115
+ ### Open Source Foundation
116
+
117
+ Kreuzberg leverages established open source technologies:
118
+
119
+ - **Pandoc**: Universal document converter for robust format support
120
+ - **PDFium**: Google's PDF rendering engine for accurate PDF processing
121
+ - **Tesseract**: Google's OCR engine for text recognition
122
+ - **Python-docx/pptx**: Native Microsoft Office format support
123
+
124
+ ## Quick Start
125
+
126
+ ### Extract Text with CLI
127
+
128
+ ```bash
129
+ # Extract text from any file to markdown
130
+ uvx kreuzberg extract document.pdf > output.md
131
+
132
+ # With all features (OCR, table extraction, etc.)
133
+ uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
134
+
135
+ # Extract with rich metadata
136
+ uvx kreuzberg extract report.pdf --show-metadata --format json
137
+ ```
138
+
139
+ ### Python Usage
140
+
141
+ **Async (recommended for web apps):**
142
+
143
+ ```python
144
+ from kreuzberg import extract_file
145
+
146
+ # In your async function
147
+ result = await extract_file("presentation.pptx")
148
+ print(result.content)
149
+
150
+ # Rich metadata extraction
151
+ print(f"Title: {result.metadata.title}")
152
+ print(f"Author: {result.metadata.author}")
153
+ print(f"Page count: {result.metadata.page_count}")
154
+ print(f"Created: {result.metadata.created_at}")
155
+ ```
156
+
157
+ **Sync (for scripts and CLI tools):**
158
+
159
+ ```python
160
+ from kreuzberg import extract_file_sync
161
+
162
+ result = extract_file_sync("report.docx")
163
+ print(result.content)
164
+
165
+ # Access rich metadata
166
+ print(f"Language: {result.metadata.language}")
167
+ print(f"Word count: {result.metadata.word_count}")
168
+ print(f"Keywords: {result.metadata.keywords}")
169
+ ```
170
+
171
+ ### Docker
172
+
173
+ ```bash
174
+ # Run the REST API
175
+ docker run -p 8000:8000 goldziher/kreuzberg
176
+
177
+ # Extract via API
178
+ curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
179
+ ```
180
+
181
+ 📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
182
+
183
+ ## Deployment Options
184
+
185
+ ### 🤖 MCP Server (AI Integration)
186
+
187
+ **Add to Claude Desktop with one command:**
188
+
189
+ ```bash
190
+ claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
191
+ ```
192
+
193
+ **Or configure manually in `claude_desktop_config.json`:**
194
+
195
+ ```json
196
+ {
197
+ "mcpServers": {
198
+ "kreuzberg": {
199
+ "command": "uvx",
200
+ "args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
201
+ }
202
+ }
203
+ }
204
+ ```
205
+
206
+ **MCP capabilities:**
207
+
208
+ - Extract text from PDFs, images, Office docs, and more
209
+ - Full OCR support with multiple engines
210
+ - Table extraction and metadata parsing
211
+
212
+ 📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
213
+
214
+ ## Supported Formats
215
+
216
+ | Category | Formats |
217
+ | ----------------- | ------------------------------ |
218
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
219
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
220
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
221
+ | **Presentations** | PPTX, PPT, ODP |
222
+ | **Web** | HTML, XML, MHTML |
223
+ | **Archives** | Support via extraction |
224
+
225
+ ## 📊 Performance Characteristics
226
+
227
+ [View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
228
+
229
+ ### Technical Specifications
230
+
231
+ | Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
232
+ | ---------------------------- | -------------- | --------------- | ------------------ |
233
+ | **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
234
+ | **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
235
+ | **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
236
+ | **Installation size** | 71 MB | 71 MB | Smallest size |
237
+ | **Success rate** | 100% | 100% | Perfect |
238
+ | **Supported formats** | 18 | 18 | Comprehensive |
239
+
240
+ ### Architecture Advantages
241
+
242
+ - **Native C extensions**: Built on PDFium and Tesseract for maximum performance
243
+ - **Async/await support**: True asynchronous processing with intelligent task scheduling
244
+ - **Memory efficiency**: Streaming architecture minimizes memory allocation
245
+ - **Process pooling**: Automatic multiprocessing for CPU-intensive operations
246
+ - **Optimized data flow**: Efficient data handling with minimal transformations
247
+
248
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
249
+
250
+ ## Documentation
251
+
252
+ ### Quick Links
253
+
254
+ - [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
255
+ - [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
256
+ - [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
257
+ - [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
258
+ - [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
259
+ - [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
260
+ - [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
261
+ - [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
262
+
263
+ ## License
264
+
265
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,53 @@
1
+ kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
2
+ kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
+ kreuzberg/_config.py,sha256=_9JU88ChId8dWUjZ13ueo9_JoFekkyzuv7rZpFkrPZk,12966
5
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
+ kreuzberg/_entity_extraction.py,sha256=woNxARG27Z3T_l6w6N-dbt1PPe1IHptFMOZY_6etv54,7819
7
+ kreuzberg/_gmft.py,sha256=Q46CyBxRxY_oDGpSuXMOJ7qfR9LwuCKXnrl60wcPvU4,25286
8
+ kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
9
+ kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
10
+ kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
11
+ kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
12
+ kreuzberg/_types.py,sha256=GisvL0ps2LCc0heKopFwSyrEbzH3WpDxaeev4vn59X4,14257
13
+ kreuzberg/cli.py,sha256=vTGS2TJlFTNMWp5LwZd3G2SS8u0m6bhQkH9n6a1oOoM,12439
14
+ kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
15
+ kreuzberg/extraction.py,sha256=UmeEVN-eSile4HMxP0iqG9092BrsH5_zSZNVHhwy0ko,16993
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
19
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
21
+ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
22
+ kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
23
+ kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
24
+ kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
25
+ kreuzberg/_extractors/_pdf.py,sha256=d-hG_mhAMj22bQ35YuP2nq017z27_2Pp08r1qyHxlYI,16676
26
+ kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
27
+ kreuzberg/_extractors/_spread_sheet.py,sha256=vPxEDAyH-gDoVXSg-A0guOjOfaWIuRI3i2NU8xPwhK8,13695
28
+ kreuzberg/_extractors/_structured.py,sha256=d0x6EyRimr8eWmr1qPb7HRWnrbKBuD-GpIrZd8XJp0o,5824
29
+ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
30
+ kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
31
+ kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
32
+ kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
33
+ kreuzberg/_ocr/_easyocr.py,sha256=c2ndpDlIHvAI2WyvQUXLQ1hb6XynKeKARsXQcQ3ntJ0,17110
34
+ kreuzberg/_ocr/_paddleocr.py,sha256=fab8a-3cvDgnt97qF-Km9ZfmkacFeKD_g15O8HXYRVc,17492
35
+ kreuzberg/_ocr/_tesseract.py,sha256=r1g_PCAXgJbZ0RPGn4aSxctZ0F9lLvI3zLGLEPAnviI,31455
36
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ kreuzberg/_utils/_cache.py,sha256=H2d6JOiTTAoJx5HPJoToCk4ik-ztTRNEJRrHgcSUTLs,15249
38
+ kreuzberg/_utils/_device.py,sha256=PC8YUPE95pzOyU7sU_icqNZpSfi6HZlEFfmWcV1Uees,10226
39
+ kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
40
+ kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
41
+ kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
42
+ kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
43
+ kreuzberg/_utils/_quality.py,sha256=-nKzj5n7yJDYrvl556oq2T5S5oKMEOrjpcRMlZ00Jqo,7668
44
+ kreuzberg/_utils/_serialization.py,sha256=cqqxqN2cmtndBhIr4v2wqiMwnNadnKhvuN7EUj3i18M,2290
45
+ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6800
46
+ kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
47
+ kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
48
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
49
+ kreuzberg-3.8.2.dist-info/METADATA,sha256=RiP64og5wOaf9gPZ7CwOsNYYx9GBnVMg8orgqZdncKA,11466
50
+ kreuzberg-3.8.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
51
+ kreuzberg-3.8.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
52
+ kreuzberg-3.8.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
53
+ kreuzberg-3.8.2.dist-info/RECORD,,