kreuzberg-3.8.0-py3-none-any.whl → kreuzberg-3.8.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. kreuzberg/__init__.py +4 -0
  2. kreuzberg/_api/main.py +22 -1
  3. kreuzberg/_config.py +404 -0
  4. kreuzberg/_entity_extraction.py +4 -5
  5. kreuzberg/_extractors/_base.py +3 -5
  6. kreuzberg/_extractors/_image.py +18 -32
  7. kreuzberg/_extractors/_pandoc.py +3 -14
  8. kreuzberg/_extractors/_pdf.py +39 -57
  9. kreuzberg/_extractors/_spread_sheet.py +2 -3
  10. kreuzberg/_extractors/_structured.py +10 -7
  11. kreuzberg/_gmft.py +314 -10
  12. kreuzberg/_language_detection.py +1 -1
  13. kreuzberg/_mcp/server.py +58 -8
  14. kreuzberg/_ocr/__init__.py +1 -22
  15. kreuzberg/_ocr/_base.py +59 -0
  16. kreuzberg/_ocr/_easyocr.py +92 -1
  17. kreuzberg/_ocr/_paddleocr.py +90 -1
  18. kreuzberg/_ocr/_tesseract.py +556 -5
  19. kreuzberg/_playa.py +2 -3
  20. kreuzberg/_types.py +46 -24
  21. kreuzberg/_utils/_cache.py +35 -4
  22. kreuzberg/_utils/_device.py +10 -20
  23. kreuzberg/_utils/_errors.py +44 -45
  24. kreuzberg/_utils/_process_pool.py +2 -6
  25. kreuzberg/_utils/_quality.py +7 -11
  26. kreuzberg/_utils/_serialization.py +21 -16
  27. kreuzberg/_utils/_string.py +22 -12
  28. kreuzberg/_utils/_table.py +3 -4
  29. kreuzberg/cli.py +4 -5
  30. kreuzberg/exceptions.py +10 -0
  31. kreuzberg/extraction.py +6 -24
  32. kreuzberg-3.8.2.dist-info/METADATA +265 -0
  33. kreuzberg-3.8.2.dist-info/RECORD +53 -0
  34. kreuzberg/_cli_config.py +0 -175
  35. kreuzberg/_multiprocessing/__init__.py +0 -5
  36. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  37. kreuzberg/_ocr/_pool.py +0 -357
  38. kreuzberg/_ocr/_sync.py +0 -566
  39. kreuzberg-3.8.0.dist-info/METADATA +0 -313
  40. kreuzberg-3.8.0.dist-info/RECORD +0 -57
  41. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/exceptions.py CHANGED
@@ -7,6 +7,8 @@ from typing import Any
7
7
  class KreuzbergError(Exception):
8
8
  """Base exception for all Kreuzberg errors."""
9
9
 
10
+ __slots__ = ("context",)
11
+
10
12
  context: Any
11
13
  """The context of the error."""
12
14
 
@@ -43,14 +45,20 @@ class KreuzbergError(Exception):
43
45
  class ParsingError(KreuzbergError):
44
46
  """Raised when a parsing error occurs."""
45
47
 
48
+ __slots__ = ()
49
+
46
50
 
47
51
  class ValidationError(KreuzbergError):
48
52
  """Raised when a validation error occurs."""
49
53
 
54
+ __slots__ = ()
55
+
50
56
 
51
57
  class MissingDependencyError(KreuzbergError):
52
58
  """Raised when a dependency is missing."""
53
59
 
60
+ __slots__ = ()
61
+
54
62
  @classmethod
55
63
  def create_for_package(
56
64
  cls, *, dependency_group: str, functionality: str, package_name: str
@@ -79,3 +87,5 @@ class MissingDependencyError(KreuzbergError):
79
87
 
80
88
  class OCRError(KreuzbergError):
81
89
  """Raised when an OCR error occurs."""
90
+
91
+ __slots__ = ()
kreuzberg/extraction.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import multiprocessing as mp
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
5
  from pathlib import Path
4
6
  from typing import TYPE_CHECKING, Any, Final, cast
5
7
 
@@ -14,6 +16,8 @@ from kreuzberg._mime_types import (
14
16
  )
15
17
  from kreuzberg._registry import ExtractorRegistry
16
18
  from kreuzberg._types import ExtractionConfig
19
+ from kreuzberg._utils._document_cache import get_document_cache
20
+ from kreuzberg._utils._errors import create_error_context
17
21
  from kreuzberg._utils._string import safe_decode
18
22
  from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
19
23
  from kreuzberg.exceptions import ValidationError
@@ -136,8 +140,6 @@ async def extract_file(
136
140
  Raises:
137
141
  ValidationError: If the file path or configuration is invalid.
138
142
  """
139
- from kreuzberg._utils._document_cache import get_document_cache
140
-
141
143
  cache = get_document_cache()
142
144
  path = Path(file_path)
143
145
  cached_result = cache.get(path, config)
@@ -194,8 +196,6 @@ async def batch_extract_file(
194
196
  if not file_paths:
195
197
  return []
196
198
 
197
- import multiprocessing as mp
198
-
199
199
  max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
200
200
  semaphore = anyio.Semaphore(max_concurrency)
201
201
 
@@ -211,8 +211,6 @@ async def batch_extract_file(
211
211
  )
212
212
  results[index] = result
213
213
  except Exception as e: # noqa: BLE001
214
- from kreuzberg._utils._errors import create_error_context
215
-
216
214
  error_result = ExtractionResult(
217
215
  content=f"Error: {type(e).__name__}: {e!s}",
218
216
  mime_type="text/plain",
@@ -251,8 +249,6 @@ async def batch_extract_bytes(
251
249
  if not contents:
252
250
  return []
253
251
 
254
- import multiprocessing as mp
255
-
256
252
  max_concurrency = min(len(contents), mp.cpu_count() * 2)
257
253
  semaphore = anyio.Semaphore(max_concurrency)
258
254
 
@@ -264,8 +260,6 @@ async def batch_extract_bytes(
264
260
  result = await extract_bytes(content, mime_type, config)
265
261
  results[index] = result
266
262
  except Exception as e: # noqa: BLE001
267
- from kreuzberg._utils._errors import create_error_context
268
-
269
263
  error_result = ExtractionResult(
270
264
  content=f"Error: {type(e).__name__}: {e!s}",
271
265
  mime_type="text/plain",
@@ -331,8 +325,6 @@ def extract_file_sync(
331
325
  Raises:
332
326
  ValidationError: If the file path or configuration is invalid.
333
327
  """
334
- from kreuzberg._utils._document_cache import get_document_cache
335
-
336
328
  cache = get_document_cache()
337
329
  path = Path(file_path)
338
330
  cached_result = cache.get(path, config)
@@ -389,9 +381,6 @@ def batch_extract_file_sync(
389
381
  if len(file_paths) <= 1:
390
382
  return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
391
383
 
392
- import multiprocessing as mp
393
- from concurrent.futures import ThreadPoolExecutor, as_completed
394
-
395
384
  max_workers = min(len(file_paths), mp.cpu_count())
396
385
 
397
386
  def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
@@ -402,8 +391,6 @@ def batch_extract_file_sync(
402
391
  extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
403
392
  )
404
393
  except Exception as e: # noqa: BLE001
405
- from kreuzberg._utils._errors import create_error_context
406
-
407
394
  error_result = ExtractionResult(
408
395
  content=f"Error: {type(e).__name__}: {e!s}",
409
396
  mime_type="text/plain",
@@ -447,9 +434,6 @@ def batch_extract_bytes_sync(
447
434
  extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
448
435
  ]
449
436
 
450
- import multiprocessing as mp
451
- from concurrent.futures import ThreadPoolExecutor, as_completed
452
-
453
437
  max_workers = min(len(contents), mp.cpu_count())
454
438
 
455
439
  def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
@@ -458,8 +442,6 @@ def batch_extract_bytes_sync(
458
442
  try:
459
443
  return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
460
444
  except Exception as e: # noqa: BLE001
461
- from kreuzberg._utils._errors import create_error_context
462
-
463
445
  error_result = ExtractionResult(
464
446
  content=f"Error: {type(e).__name__}: {e!s}",
465
447
  mime_type="text/plain",
@@ -478,8 +460,8 @@ def batch_extract_bytes_sync(
478
460
  return (index, error_result)
479
461
 
480
462
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
481
- indexed_contents = list(enumerate(contents))
482
- future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
463
+ # Avoid creating intermediate list, use enumerate directly
464
+ future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
483
465
 
484
466
  results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
485
467
  for future in as_completed(future_to_index):
kreuzberg-3.8.2.dist-info/METADATA ADDED
@@ -0,0 +1,265 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.8.2
4
+ Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
+ Project-URL: documentation, https://kreuzberg.dev
6
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
7
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
24
+ Classifier: Topic :: Office/Business :: Office Suites
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Classifier: Topic :: Text Processing :: General
29
+ Classifier: Typing :: Typed
30
+ Requires-Python: >=3.10
31
+ Requires-Dist: anyio>=4.9.0
32
+ Requires-Dist: chardetng-py>=0.3.4
33
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
35
+ Requires-Dist: mcp>=1.11.0
36
+ Requires-Dist: msgspec>=0.18.0
37
+ Requires-Dist: playa-pdf>=0.6.1
38
+ Requires-Dist: psutil>=7.0.0
39
+ Requires-Dist: pypdfium2==4.30.0
40
+ Requires-Dist: python-calamine>=0.3.2
41
+ Requires-Dist: python-pptx>=1.0.2
42
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
43
+ Provides-Extra: additional-extensions
44
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
+ Provides-Extra: all
47
+ Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
51
+ Requires-Dist: keybert>=0.9.0; extra == 'all'
52
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
53
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
+ Requires-Dist: rich>=14.0.0; extra == 'all'
57
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
59
+ Requires-Dist: spacy>=3.8.7; extra == 'all'
60
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
61
+ Provides-Extra: api
62
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
63
+ Provides-Extra: chunking
64
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
65
+ Provides-Extra: cli
66
+ Requires-Dist: click>=8.2.1; extra == 'cli'
67
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
68
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
69
+ Provides-Extra: easyocr
70
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
71
+ Provides-Extra: entity-extraction
72
+ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
73
+ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
74
+ Provides-Extra: gmft
75
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
76
+ Provides-Extra: langdetect
77
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
78
+ Provides-Extra: paddleocr
79
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
80
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
81
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
82
+ Description-Content-Type: text/markdown
83
+
84
+ # Kreuzberg
85
+
86
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
87
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
88
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
89
+ [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
90
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
91
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
92
+
93
+ **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
94
+
95
+ 📖 **[Complete Documentation](https://kreuzberg.dev/)**
96
+
97
+ ## Framework Overview
98
+
99
+ ### Document Intelligence Capabilities
100
+
101
+ - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
102
+ - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
103
+ - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
104
+ - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
105
+ - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
106
+
107
+ ### Technical Architecture
108
+
109
+ - **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
110
+ - **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
111
+ - **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
112
+ - **API Design**: Synchronous and asynchronous APIs with consistent interfaces
113
+ - **Type Safety**: Complete type annotations throughout the codebase
114
+
115
+ ### Open Source Foundation
116
+
117
+ Kreuzberg leverages established open source technologies:
118
+
119
+ - **Pandoc**: Universal document converter for robust format support
120
+ - **PDFium**: Google's PDF rendering engine for accurate PDF processing
121
+ - **Tesseract**: Google's OCR engine for text recognition
122
+ - **Python-docx/pptx**: Native Microsoft Office format support
123
+
124
+ ## Quick Start
125
+
126
+ ### Extract Text with CLI
127
+
128
+ ```bash
129
+ # Extract text from any file to markdown
130
+ uvx kreuzberg extract document.pdf > output.md
131
+
132
+ # With all features (OCR, table extraction, etc.)
133
+ uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
134
+
135
+ # Extract with rich metadata
136
+ uvx kreuzberg extract report.pdf --show-metadata --format json
137
+ ```
138
+
139
+ ### Python Usage
140
+
141
+ **Async (recommended for web apps):**
142
+
143
+ ```python
144
+ from kreuzberg import extract_file
145
+
146
+ # In your async function
147
+ result = await extract_file("presentation.pptx")
148
+ print(result.content)
149
+
150
+ # Rich metadata extraction
151
+ print(f"Title: {result.metadata.title}")
152
+ print(f"Author: {result.metadata.author}")
153
+ print(f"Page count: {result.metadata.page_count}")
154
+ print(f"Created: {result.metadata.created_at}")
155
+ ```
156
+
157
+ **Sync (for scripts and CLI tools):**
158
+
159
+ ```python
160
+ from kreuzberg import extract_file_sync
161
+
162
+ result = extract_file_sync("report.docx")
163
+ print(result.content)
164
+
165
+ # Access rich metadata
166
+ print(f"Language: {result.metadata.language}")
167
+ print(f"Word count: {result.metadata.word_count}")
168
+ print(f"Keywords: {result.metadata.keywords}")
169
+ ```
170
+
171
+ ### Docker
172
+
173
+ ```bash
174
+ # Run the REST API
175
+ docker run -p 8000:8000 goldziher/kreuzberg
176
+
177
+ # Extract via API
178
+ curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
179
+ ```
180
+
181
+ 📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
182
+
183
+ ## Deployment Options
184
+
185
+ ### 🤖 MCP Server (AI Integration)
186
+
187
+ **Add to Claude Desktop with one command:**
188
+
189
+ ```bash
190
+ claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
191
+ ```
192
+
193
+ **Or configure manually in `claude_desktop_config.json`:**
194
+
195
+ ```json
196
+ {
197
+ "mcpServers": {
198
+ "kreuzberg": {
199
+ "command": "uvx",
200
+ "args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
201
+ }
202
+ }
203
+ }
204
+ ```
205
+
206
+ **MCP capabilities:**
207
+
208
+ - Extract text from PDFs, images, Office docs, and more
209
+ - Full OCR support with multiple engines
210
+ - Table extraction and metadata parsing
211
+
212
+ 📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
213
+
214
+ ## Supported Formats
215
+
216
+ | Category | Formats |
217
+ | ----------------- | ------------------------------ |
218
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
219
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
220
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
221
+ | **Presentations** | PPTX, PPT, ODP |
222
+ | **Web** | HTML, XML, MHTML |
223
+ | **Archives** | Support via extraction |
224
+
225
+ ## 📊 Performance Characteristics
226
+
227
+ [View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
228
+
229
+ ### Technical Specifications
230
+
231
+ | Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
232
+ | ---------------------------- | -------------- | --------------- | ------------------ |
233
+ | **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
234
+ | **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
235
+ | **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
236
+ | **Installation size** | 71 MB | 71 MB | Smallest size |
237
+ | **Success rate** | 100% | 100% | Perfect |
238
+ | **Supported formats** | 18 | 18 | Comprehensive |
239
+
240
+ ### Architecture Advantages
241
+
242
+ - **Native C extensions**: Built on PDFium and Tesseract for maximum performance
243
+ - **Async/await support**: True asynchronous processing with intelligent task scheduling
244
+ - **Memory efficiency**: Streaming architecture minimizes memory allocation
245
+ - **Process pooling**: Automatic multiprocessing for CPU-intensive operations
246
+ - **Optimized data flow**: Efficient data handling with minimal transformations
247
+
248
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
249
+
250
+ ## Documentation
251
+
252
+ ### Quick Links
253
+
254
+ - [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
255
+ - [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
256
+ - [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
257
+ - [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
258
+ - [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
259
+ - [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
260
+ - [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
261
+ - [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
262
+
263
+ ## License
264
+
265
+ MIT License - see [LICENSE](LICENSE) for details.
kreuzberg-3.8.2.dist-info/RECORD ADDED
@@ -0,0 +1,53 @@
1
+ kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
2
+ kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
+ kreuzberg/_config.py,sha256=_9JU88ChId8dWUjZ13ueo9_JoFekkyzuv7rZpFkrPZk,12966
5
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
+ kreuzberg/_entity_extraction.py,sha256=woNxARG27Z3T_l6w6N-dbt1PPe1IHptFMOZY_6etv54,7819
7
+ kreuzberg/_gmft.py,sha256=Q46CyBxRxY_oDGpSuXMOJ7qfR9LwuCKXnrl60wcPvU4,25286
8
+ kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
9
+ kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
10
+ kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
11
+ kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
12
+ kreuzberg/_types.py,sha256=GisvL0ps2LCc0heKopFwSyrEbzH3WpDxaeev4vn59X4,14257
13
+ kreuzberg/cli.py,sha256=vTGS2TJlFTNMWp5LwZd3G2SS8u0m6bhQkH9n6a1oOoM,12439
14
+ kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
15
+ kreuzberg/extraction.py,sha256=UmeEVN-eSile4HMxP0iqG9092BrsH5_zSZNVHhwy0ko,16993
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
19
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
21
+ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
22
+ kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
23
+ kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
24
+ kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
25
+ kreuzberg/_extractors/_pdf.py,sha256=d-hG_mhAMj22bQ35YuP2nq017z27_2Pp08r1qyHxlYI,16676
26
+ kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
27
+ kreuzberg/_extractors/_spread_sheet.py,sha256=vPxEDAyH-gDoVXSg-A0guOjOfaWIuRI3i2NU8xPwhK8,13695
28
+ kreuzberg/_extractors/_structured.py,sha256=d0x6EyRimr8eWmr1qPb7HRWnrbKBuD-GpIrZd8XJp0o,5824
29
+ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
30
+ kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
31
+ kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
32
+ kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
33
+ kreuzberg/_ocr/_easyocr.py,sha256=c2ndpDlIHvAI2WyvQUXLQ1hb6XynKeKARsXQcQ3ntJ0,17110
34
+ kreuzberg/_ocr/_paddleocr.py,sha256=fab8a-3cvDgnt97qF-Km9ZfmkacFeKD_g15O8HXYRVc,17492
35
+ kreuzberg/_ocr/_tesseract.py,sha256=r1g_PCAXgJbZ0RPGn4aSxctZ0F9lLvI3zLGLEPAnviI,31455
36
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ kreuzberg/_utils/_cache.py,sha256=H2d6JOiTTAoJx5HPJoToCk4ik-ztTRNEJRrHgcSUTLs,15249
38
+ kreuzberg/_utils/_device.py,sha256=PC8YUPE95pzOyU7sU_icqNZpSfi6HZlEFfmWcV1Uees,10226
39
+ kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
40
+ kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
41
+ kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
42
+ kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
43
+ kreuzberg/_utils/_quality.py,sha256=-nKzj5n7yJDYrvl556oq2T5S5oKMEOrjpcRMlZ00Jqo,7668
44
+ kreuzberg/_utils/_serialization.py,sha256=cqqxqN2cmtndBhIr4v2wqiMwnNadnKhvuN7EUj3i18M,2290
45
+ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6800
46
+ kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
47
+ kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
48
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
49
+ kreuzberg-3.8.2.dist-info/METADATA,sha256=RiP64og5wOaf9gPZ7CwOsNYYx9GBnVMg8orgqZdncKA,11466
50
+ kreuzberg-3.8.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
51
+ kreuzberg-3.8.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
52
+ kreuzberg-3.8.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
53
+ kreuzberg-3.8.2.dist-info/RECORD,,
kreuzberg/_cli_config.py DELETED
@@ -1,175 +0,0 @@
1
- """Configuration parsing for the CLI."""
2
-
3
- from __future__ import annotations
4
-
5
- import sys
6
- from pathlib import Path
7
- from typing import TYPE_CHECKING, Any
8
-
9
- if sys.version_info >= (3, 11):
10
- import tomllib
11
- else:
12
- import tomli as tomllib # type: ignore[import-not-found]
13
-
14
- from kreuzberg._gmft import GMFTConfig
15
- from kreuzberg._ocr._easyocr import EasyOCRConfig
16
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
17
- from kreuzberg._ocr._tesseract import TesseractConfig
18
- from kreuzberg._types import ExtractionConfig, OcrBackendType
19
- from kreuzberg.exceptions import ValidationError
20
-
21
- if TYPE_CHECKING:
22
- from collections.abc import MutableMapping
23
-
24
-
25
- def load_config_from_file(config_path: Path) -> dict[str, Any]:
26
- """Load configuration from a TOML file.
27
-
28
- Args:
29
- config_path: Path to the configuration file.
30
-
31
- Returns:
32
- Dictionary containing the loaded configuration.
33
-
34
- Raises:
35
- ValidationError: If the file cannot be read or parsed.
36
- """
37
- try:
38
- with config_path.open("rb") as f:
39
- data = tomllib.load(f)
40
- except FileNotFoundError as e:
41
- raise ValidationError(f"Configuration file not found: {config_path}") from e
42
- except tomllib.TOMLDecodeError as e:
43
- raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
44
-
45
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
46
-
47
-
48
- def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
49
- """Merge two configuration dictionaries recursively.
50
-
51
- Args:
52
- base: Base configuration dictionary.
53
- override: Configuration dictionary to override base values.
54
-
55
- Returns:
56
- Merged configuration dictionary.
57
- """
58
- result = base.copy()
59
- for key, value in override.items():
60
- if isinstance(value, dict) and key in result and isinstance(result[key], dict):
61
- result[key] = merge_configs(result[key], value)
62
- else:
63
- result[key] = value
64
- return result
65
-
66
-
67
- def parse_ocr_backend_config(
68
- config_dict: dict[str, Any], backend: OcrBackendType
69
- ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
70
- """Parse OCR backend-specific configuration.
71
-
72
- Args:
73
- config_dict: Configuration dictionary.
74
- backend: The OCR backend type.
75
-
76
- Returns:
77
- Backend-specific configuration object or None.
78
- """
79
- if backend not in config_dict:
80
- return None
81
-
82
- backend_config = config_dict[backend]
83
- if not isinstance(backend_config, dict):
84
- return None
85
-
86
- if backend == "tesseract":
87
- return TesseractConfig(**backend_config)
88
- if backend == "easyocr":
89
- return EasyOCRConfig(**backend_config)
90
- if backend == "paddleocr":
91
- return PaddleOCRConfig(**backend_config)
92
- return None
93
-
94
-
95
- def build_extraction_config( # noqa: C901, PLR0912
96
- file_config: dict[str, Any],
97
- cli_args: MutableMapping[str, Any],
98
- ) -> ExtractionConfig:
99
- """Build ExtractionConfig from file config and CLI arguments.
100
-
101
- Args:
102
- file_config: Configuration loaded from file.
103
- cli_args: CLI arguments.
104
-
105
- Returns:
106
- ExtractionConfig instance.
107
- """
108
- config_dict: dict[str, Any] = {}
109
-
110
- if file_config:
111
- for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
112
- if field in file_config:
113
- config_dict[field] = file_config[field]
114
-
115
- for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
116
- cli_key = field
117
- if cli_key in cli_args and cli_args[cli_key] is not None:
118
- config_dict[field] = cli_args[cli_key]
119
-
120
- ocr_backend = config_dict.get("ocr_backend")
121
- if ocr_backend and ocr_backend != "none":
122
- ocr_config = None
123
-
124
- if cli_args.get(f"{ocr_backend}_config"):
125
- backend_args = cli_args[f"{ocr_backend}_config"]
126
- if ocr_backend == "tesseract":
127
- ocr_config = TesseractConfig(**backend_args)
128
- elif ocr_backend == "easyocr":
129
- ocr_config = EasyOCRConfig(**backend_args) # type: ignore[assignment]
130
- elif ocr_backend == "paddleocr":
131
- ocr_config = PaddleOCRConfig(**backend_args) # type: ignore[assignment]
132
-
133
- if not ocr_config and file_config:
134
- ocr_config = parse_ocr_backend_config(file_config, ocr_backend) # type: ignore[assignment]
135
-
136
- if ocr_config:
137
- config_dict["ocr_config"] = ocr_config
138
-
139
- if config_dict.get("extract_tables"):
140
- gmft_config = None
141
-
142
- if cli_args.get("gmft_config"):
143
- gmft_config = GMFTConfig(**cli_args["gmft_config"])
144
-
145
- elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
146
- gmft_config = GMFTConfig(**file_config["gmft"])
147
-
148
- if gmft_config:
149
- config_dict["gmft_config"] = gmft_config
150
-
151
- if config_dict.get("ocr_backend") == "none":
152
- config_dict["ocr_backend"] = None
153
-
154
- return ExtractionConfig(**config_dict)
155
-
156
-
157
- def find_default_config() -> Path | None:
158
- """Find the default configuration file (pyproject.toml).
159
-
160
- Returns:
161
- Path to the configuration file or None if not found.
162
- """
163
- current = Path.cwd()
164
- while current != current.parent:
165
- config_path = current / "pyproject.toml"
166
- if config_path.exists():
167
- try:
168
- with config_path.open("rb") as f:
169
- data = tomllib.load(f)
170
- if "tool" in data and "kreuzberg" in data["tool"]:
171
- return config_path
172
- except Exception: # noqa: BLE001
173
- pass
174
- current = current.parent
175
- return None
kreuzberg/_multiprocessing/__init__.py DELETED
@@ -1,5 +0,0 @@
1
- """Multiprocessing utilities for kreuzberg."""
2
-
3
- from .gmft_isolated import extract_tables_isolated, extract_tables_isolated_async
4
-
5
- __all__ = ["extract_tables_isolated", "extract_tables_isolated_async"]