kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_base.py +40 -0
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +17 -18
- kreuzberg/_extractors/_pdf.py +68 -14
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_multiprocessing/__init__.py +2 -3
- kreuzberg/_ocr/__init__.py +30 -0
- kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
- kreuzberg/_ocr/_sync.py +566 -0
- kreuzberg/_ocr/_tesseract.py +6 -2
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +17 -2
- kreuzberg/_utils/_process_pool.py +178 -1
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
- kreuzberg-3.8.0.dist-info/RECORD +57 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg-3.6.2.dist-info/RECORD +0 -54
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.6.2
|
3
|
+
Version: 3.8.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -23,9 +23,10 @@ Classifier: Topic :: Utilities
|
|
23
23
|
Classifier: Typing :: Typed
|
24
24
|
Requires-Python: >=3.10
|
25
25
|
Requires-Dist: anyio>=4.9.0
|
26
|
-
Requires-Dist:
|
26
|
+
Requires-Dist: chardetng-py>=0.3.4
|
27
27
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
28
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
28
|
+
Requires-Dist: html-to-markdown[lxml]>=1.8.0
|
29
|
+
Requires-Dist: mcp>=1.11.0
|
29
30
|
Requires-Dist: msgspec>=0.18.0
|
30
31
|
Requires-Dist: playa-pdf>=0.6.1
|
31
32
|
Requires-Dist: psutil>=7.0.0
|
@@ -33,6 +34,9 @@ Requires-Dist: pypdfium2==4.30.0
|
|
33
34
|
Requires-Dist: python-calamine>=0.3.2
|
34
35
|
Requires-Dist: python-pptx>=1.0.2
|
35
36
|
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
37
|
+
Provides-Extra: additional-extensions
|
38
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
39
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
36
40
|
Provides-Extra: all
|
37
41
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
38
42
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
@@ -40,6 +44,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
|
40
44
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
41
45
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
42
46
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
47
|
+
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
43
48
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
44
49
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
45
50
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
@@ -76,21 +81,51 @@ Description-Content-Type: text/markdown
|
|
76
81
|
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
77
82
|
[Documentation](https://goldziher.github.io/kreuzberg/)
|
78
83
|
[License: MIT](https://opensource.org/licenses/MIT)
|
84
|
+
[GitHub repository](https://github.com/Goldziher/kreuzberg)
|
79
85
|
|
80
|
-
**High-performance
|
86
|
+
**High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
|
81
87
|
|
82
88
|
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
83
89
|
|
84
|
-
## Why Kreuzberg?
|
90
|
+
## Why Choose Kreuzberg?
|
85
91
|
|
86
|
-
|
87
|
-
|
88
|
-
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
-
|
92
|
-
|
93
|
-
|
92
|
+
### 🚀 Performance
|
93
|
+
|
94
|
+
- [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
|
95
|
+
- Minimal footprint: 71MB install vs 1GB+ for competitors
|
96
|
+
- Lowest memory usage (~530MB average) optimized for production workloads
|
97
|
+
- Edge and serverless ready - deploy anywhere without heavy dependencies
|
98
|
+
|
99
|
+
### 🛠️ Engineering Quality
|
100
|
+
|
101
|
+
- Built by software engineers with modern Python best practices
|
102
|
+
- 95%+ test coverage with comprehensive test suite
|
103
|
+
- Thoroughly benchmarked and profiled for real-world performance
|
104
|
+
- Only framework offering true async/await support alongside sync APIs
|
105
|
+
- Robust error handling and detailed logging
|
106
|
+
|
107
|
+
### 🎯 Developer Experience
|
108
|
+
|
109
|
+
- Works out of the box with sane defaults, scales with your needs
|
110
|
+
- Native MCP server for AI tool integration (Claude Desktop, Cursor)
|
111
|
+
- Full type safety with excellent IDE support (completions)
|
112
|
+
- Comprehensive documentation including full API reference
|
113
|
+
|
114
|
+
### 🌍 Deployment Options
|
115
|
+
|
116
|
+
- Docker images for all architectures (AMD64, ARM64)
|
117
|
+
- Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
|
118
|
+
- CPU-only processing - no GPU requirements, lower energy consumption
|
119
|
+
- 100% local processing - no external API dependencies
|
120
|
+
- Multiple deployment modes: CLI, REST API, MCP server
|
121
|
+
|
122
|
+
### 🎯 Complete Solution
|
123
|
+
|
124
|
+
- Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
|
125
|
+
- Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
|
126
|
+
- Advanced features: Table extraction, metadata extraction, content chunking for RAG
|
127
|
+
- Production tools: REST API, CLI tools, batch processing, custom extractors
|
128
|
+
- Fully extensible: Add your own extractors
|
94
129
|
|
95
130
|
## Quick Start
|
96
131
|
|
@@ -136,6 +171,55 @@ asyncio.run(main())
|
|
136
171
|
|
137
172
|
## Deployment Options
|
138
173
|
|
174
|
+
### 🤖 MCP Server (AI Integration)
|
175
|
+
|
176
|
+
**Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
|
177
|
+
|
178
|
+
```bash
|
179
|
+
# Install and run MCP server with all features (recommended)
|
180
|
+
pip install "kreuzberg[all]"
|
181
|
+
kreuzberg-mcp
|
182
|
+
|
183
|
+
# Or with uvx (recommended for Claude Desktop)
|
184
|
+
uvx --with "kreuzberg[all]" kreuzberg-mcp
|
185
|
+
|
186
|
+
# Basic installation (core features only)
|
187
|
+
pip install kreuzberg
|
188
|
+
kreuzberg-mcp
|
189
|
+
```
|
190
|
+
|
191
|
+
**Configure in Claude Desktop (`claude_desktop_config.json`):**
|
192
|
+
|
193
|
+
```json
|
194
|
+
{
|
195
|
+
"mcpServers": {
|
196
|
+
"kreuzberg": {
|
197
|
+
"command": "uvx",
|
198
|
+
"args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
|
199
|
+
}
|
200
|
+
}
|
201
|
+
}
|
202
|
+
```
|
203
|
+
|
204
|
+
**Basic configuration (core features only):**
|
205
|
+
|
206
|
+
```json
|
207
|
+
{
|
208
|
+
"mcpServers": {
|
209
|
+
"kreuzberg": {
|
210
|
+
"command": "uvx",
|
211
|
+
"args": ["kreuzberg-mcp"]
|
212
|
+
}
|
213
|
+
}
|
214
|
+
}
|
215
|
+
```
|
216
|
+
|
217
|
+
**Available MCP capabilities:**
|
218
|
+
|
219
|
+
- **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
|
220
|
+
- **Resources**: Configuration, supported formats, OCR backends
|
221
|
+
- **Prompts**: Extract-and-summarize, structured analysis workflows
|
222
|
+
|
139
223
|
### 🐳 Docker (Recommended)
|
140
224
|
|
141
225
|
```bash
|
@@ -146,7 +230,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
|
|
146
230
|
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
147
231
|
```
|
148
232
|
|
149
|
-
Available variants: `latest`, `
|
233
|
+
Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
|
150
234
|
|
151
235
|
### 🌐 REST API
|
152
236
|
|
@@ -189,23 +273,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
189
273
|
| **Web** | HTML, XML, MHTML |
|
190
274
|
| **Archives** | Support via extraction |
|
191
275
|
|
192
|
-
## Performance
|
276
|
+
## 📊 Performance Comparison
|
277
|
+
|
278
|
+
[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
|
193
279
|
|
194
|
-
|
280
|
+
| Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
|
281
|
+
| ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
|
282
|
+
| **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
|
283
|
+
| Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
|
284
|
+
| MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
|
285
|
+
| Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
|
195
286
|
|
196
|
-
|
197
|
-
| ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
|
198
|
-
| **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
|
199
|
-
| Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
|
200
|
-
| MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
|
201
|
-
| Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
|
287
|
+
\*_Performance varies significantly with document complexity and size_
|
202
288
|
|
203
|
-
|
204
|
-
†_Good on simple documents, struggles with large/complex files (>10MB)_
|
205
|
-
‡_Frequently fails/times out on medium files (>1MB)_
|
289
|
+
**Key strengths:**
|
206
290
|
|
207
|
-
|
208
|
-
|
291
|
+
- 2-3x faster processing than comparable frameworks
|
292
|
+
- Smallest installation footprint and memory usage
|
293
|
+
- Only framework with built-in async/await support
|
294
|
+
- CPU-only processing - no GPU dependencies
|
295
|
+
- Built by software engineers for production reliability
|
296
|
+
|
297
|
+
> **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
|
209
298
|
|
210
299
|
## Documentation
|
211
300
|
|
@@ -219,27 +308,6 @@ kreuzberg extract *.pdf --output-dir ./extracted/
|
|
219
308
|
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
220
309
|
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
221
310
|
|
222
|
-
## Advanced Features
|
223
|
-
|
224
|
-
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
225
|
-
- **🧩 Content Chunking**: Split documents for RAG applications
|
226
|
-
- **🎯 Custom Extractors**: Extend with your own document handlers
|
227
|
-
- **🔧 Configuration**: Flexible TOML-based configuration
|
228
|
-
- **🪝 Hooks**: Pre/post-processing customization
|
229
|
-
- **🌍 Multi-language OCR**: 100+ languages supported
|
230
|
-
- **⚙️ Metadata Extraction**: Rich document metadata
|
231
|
-
- **🔄 Batch Processing**: Efficient bulk document processing
|
232
|
-
|
233
311
|
## License
|
234
312
|
|
235
313
|
MIT License - see [LICENSE](LICENSE) for details.
|
236
|
-
|
237
|
-
______________________________________________________________________
|
238
|
-
|
239
|
-
<div align="center">
|
240
|
-
|
241
|
-
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
|
242
|
-
|
243
|
-
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
244
|
-
|
245
|
-
</div>
|
@@ -0,0 +1,57 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
|
2
|
+
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
|
+
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
|
+
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
5
|
+
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
+
kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
|
7
|
+
kreuzberg/_gmft.py,sha256=ZIEUu4Uy5zYNFEeDRbz1cLJhnCAStVsSzm1PQ3vDeO8,14828
|
8
|
+
kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
|
9
|
+
kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
|
10
|
+
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
11
|
+
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
12
|
+
kreuzberg/_types.py,sha256=R_0Xc2kq4nEwkruvkB3qfrLeJ996419hBQ_1C6Xrqjo,13388
|
13
|
+
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
14
|
+
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
15
|
+
kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
|
16
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
19
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
+
kreuzberg/_extractors/_base.py,sha256=ECEwBpxnIy_J9kGZGuqsaPCgLFfxRn7kn4hIf11gDJ8,4478
|
21
|
+
kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
|
22
|
+
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
23
|
+
kreuzberg/_extractors/_image.py,sha256=0kzOQTTeJacaA8I9833fFvVQSz6FtUe9Nuw1oy0ToD0,4939
|
24
|
+
kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
|
25
|
+
kreuzberg/_extractors/_pdf.py,sha256=giYG3aEdmsxT0tGWKBaMzHDPz74-jVmK4HZARDEBhsM,17108
|
26
|
+
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
27
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=Nvyz7XT7C2ai4QeUashBeENQpuP5rs8SmKfumxEqlCg,13712
|
28
|
+
kreuzberg/_extractors/_structured.py,sha256=i3jAvhHZt_BsRGgZZfgcsUqlwAg_RNc8vsuecb04T0c,5581
|
29
|
+
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
30
|
+
kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
|
31
|
+
kreuzberg/_multiprocessing/__init__.py,sha256=X2BtgKmWhF1rl0JYg2gvoSUaozKExfsWh-RRNvzNoOs,202
|
32
|
+
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
|
33
|
+
kreuzberg/_ocr/__init__.py,sha256=CC9Ob1t_ltTYUamK1ZtmkswfCYdn1B-Z0kPemsQU0xU,1439
|
34
|
+
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
35
|
+
kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
|
36
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
|
37
|
+
kreuzberg/_ocr/_pool.py,sha256=Yb0l_GxnPsIWn3NA2FuBYEC8ipIqgwaYglUt0ltqSvk,10948
|
38
|
+
kreuzberg/_ocr/_sync.py,sha256=cdLiH9hYqygzqW3LkibhrE6C8atin7mfTv_k3JJFE0k,18287
|
39
|
+
kreuzberg/_ocr/_tesseract.py,sha256=KtenEIGL63gRhdH2hxOEVM89locAETGo2bNjQMXjTwY,13266
|
40
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
41
|
+
kreuzberg/_utils/_cache.py,sha256=CtpSmEggWoIPDZ9_Nl0i5pr7wtPyci8EVT-ajYsARGI,13609
|
42
|
+
kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
|
43
|
+
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
44
|
+
kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
|
45
|
+
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
46
|
+
kreuzberg/_utils/_process_pool.py,sha256=E3bHOO67TeoLUBjtw5HoY9gyFl621VaImYI-_itQ96c,8653
|
47
|
+
kreuzberg/_utils/_quality.py,sha256=dgFLt40NSqB8Ciej5QcZQLiV4U7LcrGux0vXckiE31U,7568
|
48
|
+
kreuzberg/_utils/_serialization.py,sha256=Rt5zSkvzf1SVNDrI6F2Zvnkel24mQkD1QvP0WjgZUgk,2195
|
49
|
+
kreuzberg/_utils/_string.py,sha256=5YKu9EZlZQ-LkphXUq8fdwKQrX9jWACFEhMGfjIysf4,6381
|
50
|
+
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
51
|
+
kreuzberg/_utils/_table.py,sha256=C2skLtcyczxDEH33Qw2dOwnR15SGillvNEP-NzBG3R8,8156
|
52
|
+
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
53
|
+
kreuzberg-3.8.0.dist-info/METADATA,sha256=d1N7v0EvJA-22g071Dctler5zF11WlKGTgLjGpsV8iw,11422
|
54
|
+
kreuzberg-3.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
55
|
+
kreuzberg-3.8.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
56
|
+
kreuzberg-3.8.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
57
|
+
kreuzberg-3.8.0.dist-info/RECORD,,
|
@@ -1,189 +0,0 @@
|
|
1
|
-
"""Process pool manager for resource-aware multiprocessing."""
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
import multiprocessing as mp
|
6
|
-
from concurrent.futures import ProcessPoolExecutor
|
7
|
-
from typing import TYPE_CHECKING, Any, TypeVar
|
8
|
-
|
9
|
-
import anyio
|
10
|
-
import psutil
|
11
|
-
from typing_extensions import Self
|
12
|
-
|
13
|
-
if TYPE_CHECKING:
|
14
|
-
import types
|
15
|
-
from collections.abc import Callable
|
16
|
-
|
17
|
-
T = TypeVar("T")
|
18
|
-
|
19
|
-
|
20
|
-
class ProcessPoolManager:
|
21
|
-
"""Resource-aware process pool manager for CPU-intensive tasks."""
|
22
|
-
|
23
|
-
def __init__(
|
24
|
-
self,
|
25
|
-
max_processes: int | None = None,
|
26
|
-
memory_limit_gb: float | None = None,
|
27
|
-
) -> None:
|
28
|
-
"""Initialize the process pool manager.
|
29
|
-
|
30
|
-
Args:
|
31
|
-
max_processes: Maximum number of processes. Defaults to CPU count.
|
32
|
-
memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
|
33
|
-
"""
|
34
|
-
self.max_processes = max_processes or mp.cpu_count()
|
35
|
-
|
36
|
-
if memory_limit_gb is None:
|
37
|
-
available_memory = psutil.virtual_memory().available
|
38
|
-
self.memory_limit_bytes = int(available_memory * 0.75) # Use 75% of available # ~keep
|
39
|
-
else:
|
40
|
-
self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
|
41
|
-
|
42
|
-
self._executor: ProcessPoolExecutor | None = None
|
43
|
-
self._active_tasks = 0
|
44
|
-
|
45
|
-
def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
|
46
|
-
"""Calculate optimal number of workers based on memory constraints.
|
47
|
-
|
48
|
-
Args:
|
49
|
-
task_memory_mb: Estimated memory usage per task in MB.
|
50
|
-
|
51
|
-
Returns:
|
52
|
-
Optimal number of workers.
|
53
|
-
"""
|
54
|
-
task_memory_bytes = task_memory_mb * 1024**2
|
55
|
-
memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
|
56
|
-
|
57
|
-
return min(self.max_processes, memory_based_limit)
|
58
|
-
|
59
|
-
def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
|
60
|
-
"""Ensure process pool executor is initialized."""
|
61
|
-
if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
|
62
|
-
if self._executor is not None:
|
63
|
-
self._executor.shutdown(wait=False)
|
64
|
-
|
65
|
-
workers = max_workers or self.max_processes
|
66
|
-
self._executor = ProcessPoolExecutor(max_workers=workers)
|
67
|
-
|
68
|
-
return self._executor
|
69
|
-
|
70
|
-
async def submit_task(
|
71
|
-
self,
|
72
|
-
func: Callable[..., T],
|
73
|
-
*args: Any,
|
74
|
-
task_memory_mb: float = 100,
|
75
|
-
) -> T:
|
76
|
-
"""Submit a task to the process pool.
|
77
|
-
|
78
|
-
Args:
|
79
|
-
func: Function to execute.
|
80
|
-
*args: Positional arguments for the function.
|
81
|
-
task_memory_mb: Estimated memory usage in MB.
|
82
|
-
|
83
|
-
Returns:
|
84
|
-
Result of the function execution.
|
85
|
-
"""
|
86
|
-
workers = self.get_optimal_workers(task_memory_mb)
|
87
|
-
self._ensure_executor(workers)
|
88
|
-
|
89
|
-
self._active_tasks += 1
|
90
|
-
|
91
|
-
try:
|
92
|
-
return await anyio.to_thread.run_sync(func, *args)
|
93
|
-
finally:
|
94
|
-
self._active_tasks -= 1
|
95
|
-
|
96
|
-
async def submit_batch(
|
97
|
-
self,
|
98
|
-
func: Callable[..., T],
|
99
|
-
arg_batches: list[tuple[Any, ...]],
|
100
|
-
task_memory_mb: float = 100,
|
101
|
-
max_concurrent: int | None = None,
|
102
|
-
) -> list[T]:
|
103
|
-
"""Submit a batch of tasks to the process pool.
|
104
|
-
|
105
|
-
Args:
|
106
|
-
func: Function to execute.
|
107
|
-
arg_batches: List of argument tuples for each task.
|
108
|
-
task_memory_mb: Estimated memory usage per task in MB.
|
109
|
-
max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
|
110
|
-
|
111
|
-
Returns:
|
112
|
-
List of results in the same order as input.
|
113
|
-
"""
|
114
|
-
if not arg_batches:
|
115
|
-
return []
|
116
|
-
|
117
|
-
workers = self.get_optimal_workers(task_memory_mb)
|
118
|
-
max_concurrent = max_concurrent or workers
|
119
|
-
|
120
|
-
self._ensure_executor(workers)
|
121
|
-
|
122
|
-
semaphore = anyio.CapacityLimiter(max_concurrent)
|
123
|
-
|
124
|
-
async def submit_single(args: tuple[Any, ...]) -> T:
|
125
|
-
async with semaphore:
|
126
|
-
self._active_tasks += 1
|
127
|
-
try:
|
128
|
-
return await anyio.to_thread.run_sync(func, *args)
|
129
|
-
finally:
|
130
|
-
self._active_tasks -= 1
|
131
|
-
|
132
|
-
async with anyio.create_task_group() as tg:
|
133
|
-
results: list[T] = [None] * len(arg_batches) # type: ignore[list-item]
|
134
|
-
|
135
|
-
async def run_task(idx: int, args: tuple[Any, ...]) -> None:
|
136
|
-
results[idx] = await submit_single(args)
|
137
|
-
|
138
|
-
for idx, args in enumerate(arg_batches):
|
139
|
-
tg.start_soon(run_task, idx, args)
|
140
|
-
|
141
|
-
return results
|
142
|
-
|
143
|
-
def get_system_info(self) -> dict[str, Any]:
|
144
|
-
"""Get current system resource information."""
|
145
|
-
memory = psutil.virtual_memory()
|
146
|
-
cpu_percent = psutil.cpu_percent(interval=1)
|
147
|
-
|
148
|
-
return {
|
149
|
-
"cpu_count": mp.cpu_count(),
|
150
|
-
"cpu_percent": cpu_percent,
|
151
|
-
"memory_total": memory.total,
|
152
|
-
"memory_available": memory.available,
|
153
|
-
"memory_percent": memory.percent,
|
154
|
-
"active_tasks": self._active_tasks,
|
155
|
-
"max_processes": self.max_processes,
|
156
|
-
"memory_limit": self.memory_limit_bytes,
|
157
|
-
}
|
158
|
-
|
159
|
-
def shutdown(self, wait: bool = True) -> None:
|
160
|
-
"""Shutdown the process pool."""
|
161
|
-
if self._executor is not None:
|
162
|
-
self._executor.shutdown(wait=wait)
|
163
|
-
self._executor = None
|
164
|
-
|
165
|
-
def __enter__(self) -> Self:
|
166
|
-
"""Context manager entry."""
|
167
|
-
return self
|
168
|
-
|
169
|
-
def __exit__(
|
170
|
-
self,
|
171
|
-
exc_type: type[BaseException] | None,
|
172
|
-
exc_val: BaseException | None,
|
173
|
-
exc_tb: types.TracebackType | None,
|
174
|
-
) -> None:
|
175
|
-
"""Context manager exit."""
|
176
|
-
self.shutdown()
|
177
|
-
|
178
|
-
async def __aenter__(self) -> Self:
|
179
|
-
"""Async context manager entry."""
|
180
|
-
return self
|
181
|
-
|
182
|
-
async def __aexit__(
|
183
|
-
self,
|
184
|
-
exc_type: type[BaseException] | None,
|
185
|
-
exc_val: BaseException | None,
|
186
|
-
exc_tb: types.TracebackType | None,
|
187
|
-
) -> None:
|
188
|
-
"""Async context manager exit."""
|
189
|
-
self.shutdown()
|