kreuzberg 3.8.1__py3-none-any.whl → 3.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,269 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.9.0
4
+ Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
+ Project-URL: documentation, https://kreuzberg.dev
6
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
7
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Information Technology
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Multimedia :: Graphics :: Capture :: Scanners
24
+ Classifier: Topic :: Office/Business :: Office Suites
25
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
26
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Classifier: Topic :: Text Processing :: General
29
+ Classifier: Typing :: Typed
30
+ Requires-Python: >=3.10
31
+ Requires-Dist: anyio>=4.9.0
32
+ Requires-Dist: chardetng-py>=0.3.4
33
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
35
+ Requires-Dist: mcp>=1.11.0
36
+ Requires-Dist: msgspec>=0.18.0
37
+ Requires-Dist: playa-pdf>=0.6.1
38
+ Requires-Dist: psutil>=7.0.0
39
+ Requires-Dist: pypdfium2==4.30.0
40
+ Requires-Dist: python-calamine>=0.3.2
41
+ Requires-Dist: python-pptx>=1.0.2
42
+ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
43
+ Provides-Extra: additional-extensions
44
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
+ Provides-Extra: all
47
+ Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
+ Requires-Dist: gmft>=0.4.2; extra == 'all'
51
+ Requires-Dist: keybert>=0.9.0; extra == 'all'
52
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
53
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
+ Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
+ Requires-Dist: rich>=14.0.0; extra == 'all'
57
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
+ Requires-Dist: setuptools>=80.9.0; extra == 'all'
59
+ Requires-Dist: spacy>=3.8.7; extra == 'all'
60
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
61
+ Provides-Extra: api
62
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
63
+ Provides-Extra: auto-classify-document-type
64
+ Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
65
+ Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
66
+ Provides-Extra: chunking
67
+ Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
68
+ Provides-Extra: cli
69
+ Requires-Dist: click>=8.2.1; extra == 'cli'
70
+ Requires-Dist: rich>=14.0.0; extra == 'cli'
71
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
72
+ Provides-Extra: easyocr
73
+ Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
74
+ Provides-Extra: entity-extraction
75
+ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
76
+ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
77
+ Provides-Extra: gmft
78
+ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
79
+ Provides-Extra: langdetect
80
+ Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
81
+ Provides-Extra: paddleocr
82
+ Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
83
+ Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
84
+ Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
85
+ Description-Content-Type: text/markdown
86
+
87
+ # Kreuzberg
88
+
89
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
90
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
91
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
92
+ [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
93
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
94
+ [![DeepSource](https://app.deepsource.com/gh/Goldziher/kreuzberg.svg/?label=code+coverage&show_trend=true&token=U8AW1VWWSLwVhrbtL8LmLBDN)](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
95
+
96
+ **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
97
+
98
+ 📖 **[Complete Documentation](https://kreuzberg.dev/)**
99
+
100
+ ## Framework Overview
101
+
102
+ ### Document Intelligence Capabilities
103
+
104
+ - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
105
+ - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
106
+ - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
107
+ - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
108
+ - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
109
+ - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
110
+
111
+ ### Technical Architecture
112
+
113
+ - **Performance**: Highest throughput among Python document processing frameworks (30+ docs/second)
114
+ - **Resource Efficiency**: 71MB installation, ~360MB runtime memory footprint
115
+ - **Extensibility**: Plugin architecture for custom extractors via the Extractor base class
116
+ - **API Design**: Synchronous and asynchronous APIs with consistent interfaces
117
+ - **Type Safety**: Complete type annotations throughout the codebase
118
+
119
+ ### Open Source Foundation
120
+
121
+ Kreuzberg leverages established open source technologies:
122
+
123
+ - **Pandoc**: Universal document converter for robust format support
124
+ - **PDFium**: Google's PDF rendering engine for accurate PDF processing
125
+ - **Tesseract**: Google's OCR engine for text recognition
126
+ - **Python-docx/pptx**: Native Microsoft Office format support
127
+
128
+ ## Quick Start
129
+
130
+ ### Extract Text with CLI
131
+
132
+ ```bash
133
+ # Extract text from any file to markdown
134
+ uvx kreuzberg extract document.pdf > output.md
135
+
136
+ # With all features (OCR, table extraction, etc.)
137
+ uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
138
+
139
+ # Extract with rich metadata
140
+ uvx kreuzberg extract report.pdf --show-metadata --format json
141
+ ```
142
+
143
+ ### Python Usage
144
+
145
+ **Async (recommended for web apps):**
146
+
147
+ ```python
148
+ from kreuzberg import extract_file
149
+
150
+ # In your async function
151
+ result = await extract_file("presentation.pptx")
152
+ print(result.content)
153
+
154
+ # Rich metadata extraction
155
+ print(f"Title: {result.metadata.title}")
156
+ print(f"Author: {result.metadata.author}")
157
+ print(f"Page count: {result.metadata.page_count}")
158
+ print(f"Created: {result.metadata.created_at}")
159
+ ```
160
+
161
+ **Sync (for scripts and CLI tools):**
162
+
163
+ ```python
164
+ from kreuzberg import extract_file_sync
165
+
166
+ result = extract_file_sync("report.docx")
167
+ print(result.content)
168
+
169
+ # Access rich metadata
170
+ print(f"Language: {result.metadata.language}")
171
+ print(f"Word count: {result.metadata.word_count}")
172
+ print(f"Keywords: {result.metadata.keywords}")
173
+ ```
174
+
175
+ ### Docker
176
+
177
+ ```bash
178
+ # Run the REST API
179
+ docker run -p 8000:8000 goldziher/kreuzberg
180
+
181
+ # Extract via API
182
+ curl -X POST -F "file=@document.pdf" http://localhost:8000/extract
183
+ ```
184
+
185
+ 📖 **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** • **[CLI Documentation](https://kreuzberg.dev/cli/)** • **[API Reference](https://kreuzberg.dev/api-reference/)**
186
+
187
+ ## Deployment Options
188
+
189
+ ### 🤖 MCP Server (AI Integration)
190
+
191
+ **Add to Claude Desktop with one command:**
192
+
193
+ ```bash
194
+ claude mcp add kreuzberg uvx -- --from "kreuzberg[all]" kreuzberg-mcp
195
+ ```
196
+
197
+ **Or configure manually in `claude_desktop_config.json`:**
198
+
199
+ ```json
200
+ {
201
+ "mcpServers": {
202
+ "kreuzberg": {
203
+ "command": "uvx",
204
+ "args": ["--from", "kreuzberg[all]", "kreuzberg-mcp"]
205
+ }
206
+ }
207
+ }
208
+ ```
209
+
210
+ **MCP capabilities:**
211
+
212
+ - Extract text from PDFs, images, Office docs, and more
213
+ - Full OCR support with multiple engines
214
+ - Table extraction and metadata parsing
215
+
216
+ 📖 **[MCP Documentation](https://kreuzberg.dev/user-guide/mcp-server/)**
217
+
218
+ ## Supported Formats
219
+
220
+ | Category | Formats |
221
+ | ----------------- | ------------------------------ |
222
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
223
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
224
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
225
+ | **Presentations** | PPTX, PPT, ODP |
226
+ | **Web** | HTML, XML, MHTML |
227
+ | **Archives** | Support via extraction |
228
+
229
+ ## 📊 Performance Characteristics
230
+
231
+ [View comprehensive benchmarks](https://benchmarks.kreuzberg.dev/) • [Benchmark methodology](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [**Detailed Analysis**](https://kreuzberg.dev/performance-analysis/)
232
+
233
+ ### Technical Specifications
234
+
235
+ | Metric | Kreuzberg Sync | Kreuzberg Async | Benchmarked |
236
+ | ---------------------------- | -------------- | --------------- | ------------------ |
237
+ | **Throughput (tiny files)** | 31.78 files/s | 23.94 files/s | Highest throughput |
238
+ | **Throughput (small files)** | 8.91 files/s | 9.31 files/s | Highest throughput |
239
+ | **Memory footprint** | 359.8 MB | 395.2 MB | Lowest usage |
240
+ | **Installation size** | 71 MB | 71 MB | Smallest size |
241
+ | **Success rate** | 100% | 100% | Perfect |
242
+ | **Supported formats** | 18 | 18 | Comprehensive |
243
+
244
+ ### Architecture Advantages
245
+
246
+ - **Native C extensions**: Built on PDFium and Tesseract for maximum performance
247
+ - **Async/await support**: True asynchronous processing with intelligent task scheduling
248
+ - **Memory efficiency**: Streaming architecture minimizes memory allocation
249
+ - **Process pooling**: Automatic multiprocessing for CPU-intensive operations
250
+ - **Optimized data flow**: Efficient data handling with minimal transformations
251
+
252
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
253
+
254
+ ## Documentation
255
+
256
+ ### Quick Links
257
+
258
+ - [Installation Guide](https://kreuzberg.dev/getting-started/installation/) - Setup and dependencies
259
+ - [User Guide](https://kreuzberg.dev/user-guide/) - Comprehensive usage guide
260
+ - [Performance Analysis](https://kreuzberg.dev/performance-analysis/) - Detailed benchmark results
261
+ - [API Reference](https://kreuzberg.dev/api-reference/) - Complete API documentation
262
+ - [Docker Guide](https://kreuzberg.dev/user-guide/docker/) - Container deployment
263
+ - [REST API](https://kreuzberg.dev/user-guide/api-server/) - HTTP endpoints
264
+ - [CLI Guide](https://kreuzberg.dev/cli/) - Command-line usage
265
+ - [OCR Configuration](https://kreuzberg.dev/user-guide/ocr-configuration/) - OCR engine setup
266
+
267
+ ## License
268
+
269
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,54 @@
1
+ kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
2
+ kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
+ kreuzberg/_chunker.py,sha256=QmYbPHPE36ztMT70xPwg_Y4NIftCDl0wyufg5X9lmTo,1932
4
+ kreuzberg/_config.py,sha256=EvrBFAawjfKgXu49tACi4CuMmmoIRt_EzbHayZqM_jU,12983
5
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
+ kreuzberg/_document_classification.py,sha256=8XVTKh8ohsb4mbKw2gPFr5OB6v4dWuzXhFE_63vHLrw,5189
7
+ kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR0c,7862
8
+ kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
9
+ kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
10
+ kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
11
+ kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
12
+ kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
13
+ kreuzberg/_types.py,sha256=Si-Kb58HgE4ckGyZnJFqbWRbCNbdyC_Y0-p75aQP838,15065
14
+ kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
15
+ kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
+ kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
17
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
20
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
+ kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
22
+ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
23
+ kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
24
+ kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
25
+ kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
26
+ kreuzberg/_extractors/_pdf.py,sha256=UlliWggWHuVwwJE-bRa7H9-_cieSa8kdrQP3x_GOxxY,17018
27
+ kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
28
+ kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
29
+ kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
30
+ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
31
+ kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
32
+ kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
33
+ kreuzberg/_ocr/_base.py,sha256=urvsLRgOmVYHjxil_IsSL69FmMnboklC4CHAjdBQLKQ,3893
34
+ kreuzberg/_ocr/_easyocr.py,sha256=pw2uDmULuMQ9T1Gl4axP_ev7-qwjLt1mJHHyZ34P_FI,17178
35
+ kreuzberg/_ocr/_paddleocr.py,sha256=s75aQJILXm1ZbacyZiLPXh6jEAg9tk2NYnwPnfSDrRU,17543
36
+ kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
37
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
39
+ kreuzberg/_utils/_device.py,sha256=arVrJOSp_2LbbN6lu_rMEUOezzRogdWdkF8d5q5Bg8U,10345
40
+ kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
41
+ kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
42
+ kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
43
+ kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
44
+ kreuzberg/_utils/_quality.py,sha256=-nKzj5n7yJDYrvl556oq2T5S5oKMEOrjpcRMlZ00Jqo,7668
45
+ kreuzberg/_utils/_serialization.py,sha256=cqqxqN2cmtndBhIr4v2wqiMwnNadnKhvuN7EUj3i18M,2290
46
+ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6800
47
+ kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
48
+ kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
49
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
+ kreuzberg-3.9.0.dist-info/METADATA,sha256=C83JYzqxhGHhrqWDUmo0eJwK_2szx9ZQt3cnkocgwBY,11876
51
+ kreuzberg-3.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
+ kreuzberg-3.9.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
+ kreuzberg-3.9.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
+ kreuzberg-3.9.0.dist-info/RECORD,,
kreuzberg/_cli_config.py DELETED
@@ -1,175 +0,0 @@
1
- """Configuration parsing for the CLI."""
2
-
3
- from __future__ import annotations
4
-
5
- import sys
6
- from pathlib import Path
7
- from typing import TYPE_CHECKING, Any
8
-
9
- if sys.version_info >= (3, 11):
10
- import tomllib
11
- else:
12
- import tomli as tomllib # type: ignore[import-not-found]
13
-
14
- from kreuzberg._gmft import GMFTConfig
15
- from kreuzberg._ocr._easyocr import EasyOCRConfig
16
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
17
- from kreuzberg._ocr._tesseract import TesseractConfig
18
- from kreuzberg._types import ExtractionConfig, OcrBackendType
19
- from kreuzberg.exceptions import ValidationError
20
-
21
- if TYPE_CHECKING:
22
- from collections.abc import MutableMapping
23
-
24
-
25
- def load_config_from_file(config_path: Path) -> dict[str, Any]:
26
- """Load configuration from a TOML file.
27
-
28
- Args:
29
- config_path: Path to the configuration file.
30
-
31
- Returns:
32
- Dictionary containing the loaded configuration.
33
-
34
- Raises:
35
- ValidationError: If the file cannot be read or parsed.
36
- """
37
- try:
38
- with config_path.open("rb") as f:
39
- data = tomllib.load(f)
40
- except FileNotFoundError as e:
41
- raise ValidationError(f"Configuration file not found: {config_path}") from e
42
- except tomllib.TOMLDecodeError as e:
43
- raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
44
-
45
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
46
-
47
-
48
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Merge two configuration dictionaries recursively.

    Values from ``override`` win. When both sides hold a dictionary under the same
    key, the two dictionaries are merged recursively instead of being replaced.

    Neither input is modified, and no nested dictionary of the result is shared
    with ``base`` or ``override``, so mutating the merged configuration cannot
    leak back into the inputs. Non-dictionary values (lists, config objects, ...)
    are carried over by reference.

    Args:
        base: Base configuration dictionary.
        override: Configuration dictionary to override base values.

    Returns:
        Merged configuration dictionary.
    """
    # Copy nested dictionaries of `base` too; a shallow `.copy()` would alias them.
    merged: dict[str, Any] = {
        key: merge_configs(value, {}) if isinstance(value, dict) else value for key, value in base.items()
    }

    for key, value in override.items():
        if isinstance(value, dict):
            existing = merged.get(key)
            # Merging into an empty dict doubles as a copy when there is nothing to merge with.
            merged[key] = merge_configs(existing if isinstance(existing, dict) else {}, value)
        else:
            merged[key] = value

    return merged
65
-
66
-
67
def parse_ocr_backend_config(
    config_dict: dict[str, Any], backend: OcrBackendType
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
    """Build the backend-specific OCR configuration object from a config mapping.

    The settings are looked up under the key named after ``backend`` and passed
    as keyword arguments to the matching configuration class.

    Args:
        config_dict: Configuration dictionary that may contain a table keyed by the backend name.
        backend: The OCR backend type.

    Returns:
        The configuration object for ``backend``, or None when the backend has no
        entry, its entry is not a dictionary, or the backend is not recognized.
    """
    section = config_dict.get(backend)
    # A missing key yields None here, which is rejected together with any non-table value.
    if not isinstance(section, dict):
        return None

    parsed: TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None
    if backend == "tesseract":
        parsed = TesseractConfig(**section)
    elif backend == "easyocr":
        parsed = EasyOCRConfig(**section)
    elif backend == "paddleocr":
        parsed = PaddleOCRConfig(**section)
    else:
        parsed = None
    return parsed
93
-
94
-
95
def build_extraction_config(  # noqa: C901, PLR0912
    file_config: dict[str, Any],
    cli_args: MutableMapping[str, Any],
) -> ExtractionConfig:
    """Combine file-based settings and CLI arguments into an ExtractionConfig.

    Plain option values are taken from the file first and then overridden by any
    CLI argument that is present and not None. Backend-specific OCR settings and
    GMFT settings prefer the CLI-provided values and fall back to the file.

    Args:
        file_config: Configuration loaded from file.
        cli_args: CLI arguments.

    Returns:
        ExtractionConfig instance.
    """
    option_names = ("force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend")
    settings: dict[str, Any] = {}

    if file_config:
        settings.update({name: file_config[name] for name in option_names if name in file_config})

    # CLI values take precedence, but an argument left at None means "not given".
    for name in option_names:
        if name in cli_args and cli_args[name] is not None:
            settings[name] = cli_args[name]

    backend = settings.get("ocr_backend")
    if backend and backend != "none":
        backend_config = None

        cli_backend_args = cli_args.get(f"{backend}_config")
        if cli_backend_args:
            if backend == "tesseract":
                backend_config = TesseractConfig(**cli_backend_args)
            elif backend == "easyocr":
                backend_config = EasyOCRConfig(**cli_backend_args)  # type: ignore[assignment]
            elif backend == "paddleocr":
                backend_config = PaddleOCRConfig(**cli_backend_args)  # type: ignore[assignment]

        # Nothing usable from the CLI: fall back to the backend table in the file.
        if not backend_config and file_config:
            backend_config = parse_ocr_backend_config(file_config, backend)  # type: ignore[assignment]

        if backend_config:
            settings["ocr_config"] = backend_config

    if settings.get("extract_tables"):
        cli_gmft_args = cli_args.get("gmft_config")
        if cli_gmft_args:
            table_config = GMFTConfig(**cli_gmft_args)
        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
            table_config = GMFTConfig(**file_config["gmft"])
        else:
            table_config = None

        if table_config:
            settings["gmft_config"] = table_config

    # The literal string "none" is how OCR gets switched off; the config object expects None.
    if backend == "none":
        settings["ocr_backend"] = None

    return ExtractionConfig(**settings)
155
-
156
-
157
- def find_default_config() -> Path | None:
158
- """Find the default configuration file (pyproject.toml).
159
-
160
- Returns:
161
- Path to the configuration file or None if not found.
162
- """
163
- current = Path.cwd()
164
- while current != current.parent:
165
- config_path = current / "pyproject.toml"
166
- if config_path.exists():
167
- try:
168
- with config_path.open("rb") as f:
169
- data = tomllib.load(f)
170
- if "tool" in data and "kreuzberg" in data["tool"]:
171
- return config_path
172
- except Exception: # noqa: BLE001
173
- pass
174
- current = current.parent
175
- return None