kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,235 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: kreuzberg
3
- Version: 3.3.0
4
- Summary: A text extraction library supporting PDFs, images, office documents and more
5
- Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
- Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
7
- License: MIT
8
- License-File: LICENSE
9
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Intended Audience :: Developers
12
- Classifier: License :: OSI Approved :: MIT License
13
- Classifier: Operating System :: OS Independent
14
- Classifier: Programming Language :: Python :: 3 :: Only
15
- Classifier: Programming Language :: Python :: 3.13
16
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
- Classifier: Topic :: Text Processing :: General
19
- Classifier: Topic :: Utilities
20
- Classifier: Typing :: Typed
21
- Requires-Python: >=3.13
22
- Requires-Dist: anyio>=4.9.0
23
- Requires-Dist: charset-normalizer>=3.4.2
24
- Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
25
- Requires-Dist: html-to-markdown>=1.4.0
26
- Requires-Dist: msgspec>=0.18.0
27
- Requires-Dist: playa-pdf>=0.6.1
28
- Requires-Dist: psutil>=7.0.0
29
- Requires-Dist: pypdfium2==4.30.0
30
- Requires-Dist: python-calamine>=0.3.2
31
- Requires-Dist: python-pptx>=1.0.2
32
- Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
33
- Provides-Extra: all
34
- Requires-Dist: click>=8.2.1; extra == 'all'
35
- Requires-Dist: easyocr>=1.7.2; extra == 'all'
36
- Requires-Dist: gmft>=0.4.2; extra == 'all'
37
- Requires-Dist: paddleocr>=3.1.0; extra == 'all'
38
- Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
39
- Requires-Dist: rich>=14.0.0; extra == 'all'
40
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
41
- Requires-Dist: setuptools>=80.9.0; extra == 'all'
42
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
43
- Provides-Extra: chunking
44
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
45
- Provides-Extra: cli
46
- Requires-Dist: click>=8.2.1; extra == 'cli'
47
- Requires-Dist: rich>=14.0.0; extra == 'cli'
48
- Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
49
- Provides-Extra: easyocr
50
- Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
51
- Provides-Extra: gmft
52
- Requires-Dist: gmft>=0.4.2; extra == 'gmft'
53
- Provides-Extra: paddleocr
54
- Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
55
- Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
56
- Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
57
- Description-Content-Type: text/markdown
58
-
59
- # Kreuzberg
60
-
61
- [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
62
- [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
63
- [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
64
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
65
-
66
- Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
67
-
68
- ## Why Kreuzberg?
69
-
70
- - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
71
- - **Local Processing**: No external API calls or cloud dependencies required
72
- - **Resource Efficient**: Lightweight processing without GPU requirements
73
- - **Format Support**: Comprehensive support for documents, images, and text formats
74
- - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
75
- - **Command Line Interface**: Powerful CLI for batch processing and automation
76
- - **Metadata Extraction**: Get document metadata alongside text content
77
- - **Table Extraction**: Extract tables from documents using the excellent GMFT library
78
- - **Modern Python**: Built with async/await, type hints, and a functional-first approach
79
- - **Permissive OSS**: MIT licensed with permissively licensed dependencies
80
-
81
- ## Quick Start
82
-
83
- ```bash
84
- pip install kreuzberg
85
-
86
- # Or install with CLI support
87
- pip install "kreuzberg[cli]"
88
- ```
89
-
90
- Install the system dependencies (Tesseract and Pandoc):
91
-
92
- ```bash
93
- # Ubuntu/Debian
94
- sudo apt-get install tesseract-ocr pandoc
95
-
96
- # macOS
97
- brew install tesseract pandoc
98
-
99
- # Windows
100
- choco install -y tesseract pandoc
101
- ```
102
-
103
- Tesseract is the default OCR engine. If you prefer not to use it, you can either use one of the two alternative OCR engines or run without OCR at all.
104
-
105
- ### Alternative OCR engines
106
-
107
- ```bash
108
- # Install with EasyOCR support
109
- pip install "kreuzberg[easyocr]"
110
-
111
- # Install with PaddleOCR support
112
- pip install "kreuzberg[paddleocr]"
113
- ```
114
-
115
- ## Quick Example
116
-
117
- ```python
118
- import asyncio
119
- from kreuzberg import extract_file
120
-
121
- async def main():
122
-     # Extract text from a PDF
123
-     result = await extract_file("document.pdf")
124
-     print(result.content)
125
-
126
-     # Extract text from an image
127
-     result = await extract_file("scan.jpg")
128
-     print(result.content)
129
-
130
-     # Extract text from a Word document
131
-     result = await extract_file("report.docx")
132
-     print(result.content)
133
-
134
- asyncio.run(main())
135
- ```
136
-
137
- ## Command Line Interface
138
-
139
- Kreuzberg includes a powerful CLI for processing documents from the command line:
140
-
141
- ```bash
142
- # Extract text from a file
143
- kreuzberg extract document.pdf
144
-
145
- # Extract with JSON output and metadata
146
- kreuzberg extract document.pdf --output-format json --show-metadata
147
-
148
- # Extract from stdin
149
- cat document.html | kreuzberg extract
150
-
151
- # Use specific OCR backend
152
- kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
153
-
154
- # Extract with configuration file
155
- kreuzberg extract document.pdf --config config.toml
156
- ```
157
-
158
- ### CLI Configuration
159
-
160
- Configure via `pyproject.toml`:
161
-
162
- ```toml
163
- [tool.kreuzberg]
164
- force_ocr = true
165
- chunk_content = false
166
- extract_tables = true
167
- max_chars = 4000
168
- ocr_backend = "tesseract"
169
-
170
- [tool.kreuzberg.tesseract]
171
- language = "eng+deu"
172
- psm = 3
173
- ```
174
-
175
- For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
176
-
177
- ## Documentation
178
-
179
- For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
180
-
181
- - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
182
- - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
183
- - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
184
- - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
185
- - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
186
- - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
187
- - [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
188
-
189
- ## Supported Formats
190
-
191
- Kreuzberg supports a wide range of document formats:
192
-
193
- - **Documents**: PDF, DOCX, RTF, TXT, EPUB, etc.
194
- - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
195
- - **Spreadsheets**: XLSX, XLS, CSV, etc.
196
- - **Presentations**: PPTX, PPT, etc.
197
- - **Web Content**: HTML, XML, etc.
198
-
199
- ## OCR Engines
200
-
201
- Kreuzberg supports multiple OCR engines:
202
-
203
- - **Tesseract** (Default): Lightweight, fast startup, requires system installation
204
- - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
205
- - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
206
-
207
- For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
208
-
209
- ## Performance
210
-
211
- Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
212
-
213
- | Operation | Sync Time | Async Time | Async Advantage |
214
- | ---------------------- | --------- | ---------- | ------------------ |
215
- | Simple text (Markdown) | 0.4ms | 17.5ms | **❌ 41x slower** |
216
- | HTML documents | 1.6ms | 1.1ms | **✅ 1.5x faster** |
217
- | Complex PDFs | 39.0s | 8.5s | **✅ 4.6x faster** |
218
- | OCR processing | 0.4s | 0.7s | **❌ 1.7x slower** |
219
- | Batch operations | 38.6s | 8.5s | **✅ 4.5x faster** |
220
-
221
- **Rule of thumb:**
222
-
223
- - Use **sync** for simple documents and CLI applications
224
- - Use **async** for complex PDFs, OCR, and batch processing
225
- - Use **batch operations** for multiple files
226
-
227
- For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
228
-
229
- ## Contributing
230
-
231
- We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.
232
-
233
- ## License
234
-
235
- This library is released under the MIT license.
@@ -1,48 +0,0 @@
1
- kreuzberg/__init__.py,sha256=jRm2U-loiKWwJpgOFgZ8Ev2mfz9sI1qJOZ2V3OoJUlg,1258
2
- kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
- kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
- kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
- kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
- kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
7
- kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
8
- kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
9
- kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
10
- kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
11
- kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
12
- kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
13
- kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
14
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
17
- kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
18
- kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
19
- kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
20
- kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
21
- kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
22
- kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
23
- kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
24
- kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
25
- kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
26
- kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
27
- kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
28
- kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
29
- kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
30
- kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
31
- kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
32
- kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
33
- kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
35
- kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
36
- kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
37
- kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
38
- kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
39
- kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
40
- kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
41
- kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
42
- kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
43
- kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
44
- kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
45
- kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
46
- kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
47
- kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
48
- kreuzberg-3.3.0.dist-info/RECORD,,