kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. kreuzberg/_extractors/_base.py +40 -0
  2. kreuzberg/_extractors/_email.py +149 -0
  3. kreuzberg/_extractors/_html.py +15 -3
  4. kreuzberg/_extractors/_image.py +17 -18
  5. kreuzberg/_extractors/_pdf.py +68 -14
  6. kreuzberg/_extractors/_presentation.py +62 -10
  7. kreuzberg/_extractors/_spread_sheet.py +179 -4
  8. kreuzberg/_extractors/_structured.py +148 -0
  9. kreuzberg/_gmft.py +2 -2
  10. kreuzberg/_mcp/__init__.py +5 -0
  11. kreuzberg/_mcp/server.py +227 -0
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_multiprocessing/__init__.py +2 -3
  14. kreuzberg/_ocr/__init__.py +30 -0
  15. kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
  16. kreuzberg/_ocr/_sync.py +566 -0
  17. kreuzberg/_ocr/_tesseract.py +6 -2
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +17 -2
  21. kreuzberg/_utils/_process_pool.py +178 -1
  22. kreuzberg/_utils/_quality.py +237 -0
  23. kreuzberg/_utils/_serialization.py +4 -2
  24. kreuzberg/_utils/_string.py +153 -10
  25. kreuzberg/_utils/_sync.py +5 -2
  26. kreuzberg/_utils/_table.py +261 -0
  27. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
  28. kreuzberg-3.8.0.dist-info/RECORD +57 -0
  29. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
  30. kreuzberg/_multiprocessing/process_manager.py +0 -189
  31. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  32. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  33. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  34. kreuzberg-3.6.2.dist-info/RECORD +0 -54
  35. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.6.2
3
+ Version: 3.8.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
6
6
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
@@ -23,9 +23,10 @@ Classifier: Topic :: Utilities
23
23
  Classifier: Typing :: Typed
24
24
  Requires-Python: >=3.10
25
25
  Requires-Dist: anyio>=4.9.0
26
- Requires-Dist: charset-normalizer>=3.4.2
26
+ Requires-Dist: chardetng-py>=0.3.4
27
27
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
28
- Requires-Dist: html-to-markdown[lxml]>=1.6.0
28
+ Requires-Dist: html-to-markdown[lxml]>=1.8.0
29
+ Requires-Dist: mcp>=1.11.0
29
30
  Requires-Dist: msgspec>=0.18.0
30
31
  Requires-Dist: playa-pdf>=0.6.1
31
32
  Requires-Dist: psutil>=7.0.0
@@ -33,6 +34,9 @@ Requires-Dist: pypdfium2==4.30.0
33
34
  Requires-Dist: python-calamine>=0.3.2
34
35
  Requires-Dist: python-pptx>=1.0.2
35
36
  Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
37
+ Provides-Extra: additional-extensions
38
+ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
39
+ Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
36
40
  Provides-Extra: all
37
41
  Requires-Dist: click>=8.2.1; extra == 'all'
38
42
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
@@ -40,6 +44,7 @@ Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
40
44
  Requires-Dist: gmft>=0.4.2; extra == 'all'
41
45
  Requires-Dist: keybert>=0.9.0; extra == 'all'
42
46
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
47
+ Requires-Dist: mailparse>=1.0.15; extra == 'all'
43
48
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
44
49
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
45
50
  Requires-Dist: rich>=14.0.0; extra == 'all'
@@ -76,21 +81,51 @@ Description-Content-Type: text/markdown
76
81
  [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
77
82
  [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
78
83
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
84
+ [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
79
85
 
80
- **High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
86
+ **High-performance Open Source Document Intelligence framework for Python.** Built by engineers for production workloads - extract text from any document with excellent performance and minimal complexity.
81
87
 
82
88
  📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
83
89
 
84
- ## Why Kreuzberg?
90
+ ## Why Choose Kreuzberg?
85
91
 
86
- - **🚀 Fastest Performance**: [35+ files/second](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - the fastest text extraction library
87
- - **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+) with lowest memory usage (~530MB)
88
- - **⚡ Dual APIs**: Only library with both sync and async support
89
- - **🔧 Zero Configuration**: Works out of the box with sane defaults
90
- - **🏠 Local Processing**: No cloud dependencies or external API calls
91
- - **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
92
- - **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
93
- - **🐳 Production Ready**: CLI, REST API, and Docker images included
92
+ ### 🚀 Performance
93
+
94
+ - [benchmarked as the fastest framework](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) - 2-3x faster than the nearest alternatives
95
+ - Minimal footprint: 71MB install vs 1GB+ for competitors
96
+ - Lowest memory usage (~530MB average) optimized for production workloads
97
+ - Edge and serverless ready - deploy anywhere without heavy dependencies
98
+
99
+ ### 🛠️ Engineering Quality
100
+
101
+ - Built by software engineers with modern Python best practices
102
+ - 95%+ test coverage with comprehensive test suite
103
+ - Thoroughly benchmarked and profiled for real-world performance
104
+ - Only framework offering true async/await support alongside sync APIs
105
+ - Robust error handling and detailed logging
106
+
107
+ ### 🎯 Developer Experience
108
+
109
+ - Works out of the box with sane defaults, scales with your needs
110
+ - Native MCP server for AI tool integration (Claude Desktop, Cursor)
111
+ - Full type safety with excellent IDE support (completions)
112
+ - Comprehensive documentation including full API reference
113
+
114
+ ### 🌍 Deployment Options
115
+
116
+ - Docker images for all architectures (AMD64, ARM64)
117
+ - Cloud native - AWS Lambda, Google Cloud Functions, Azure Functions
118
+ - CPU-only processing - no GPU requirements, lower energy consumption
119
+ - 100% local processing - no external API dependencies
120
+ - Multiple deployment modes: CLI, REST API, MCP server
121
+
122
+ ### 🎯 Complete Solution
123
+
124
+ - Universal format support: PDFs, images, Office docs, HTML, spreadsheets, presentations
125
+ - Multiple OCR engines: Tesseract, EasyOCR, PaddleOCR with intelligent fallbacks
126
+ - Advanced features: Table extraction, metadata extraction, content chunking for RAG
127
+ - Production tools: REST API, CLI tools, batch processing, custom extractors
128
+ - Fully extensible: Add your own extractors
94
129
 
95
130
  ## Quick Start
96
131
 
@@ -136,6 +171,55 @@ asyncio.run(main())
136
171
 
137
172
  ## Deployment Options
138
173
 
174
+ ### 🤖 MCP Server (AI Integration)
175
+
176
+ **Connect directly to Claude Desktop, Cursor, and other AI tools with the Model Context Protocol:**
177
+
178
+ ```bash
179
+ # Install and run MCP server with all features (recommended)
180
+ pip install "kreuzberg[all]"
181
+ kreuzberg-mcp
182
+
183
+ # Or with uvx (recommended for Claude Desktop)
184
+ uvx --with "kreuzberg[all]" kreuzberg-mcp
185
+
186
+ # Basic installation (core features only)
187
+ pip install kreuzberg
188
+ kreuzberg-mcp
189
+ ```
190
+
191
+ **Configure in Claude Desktop (`claude_desktop_config.json`):**
192
+
193
+ ```json
194
+ {
195
+ "mcpServers": {
196
+ "kreuzberg": {
197
+ "command": "uvx",
198
+ "args": ["--with", "kreuzberg[all]", "kreuzberg-mcp"]
199
+ }
200
+ }
201
+ }
202
+ ```
203
+
204
+ **Basic configuration (core features only):**
205
+
206
+ ```json
207
+ {
208
+ "mcpServers": {
209
+ "kreuzberg": {
210
+ "command": "uvx",
211
+ "args": ["kreuzberg-mcp"]
212
+ }
213
+ }
214
+ }
215
+ ```
216
+
217
+ **Available MCP capabilities:**
218
+
219
+ - **Tools**: `extract_document`, `extract_bytes`, `extract_simple`
220
+ - **Resources**: Configuration, supported formats, OCR backends
221
+ - **Prompts**: Extract-and-summarize, structured analysis workflows
222
+
139
223
  ### 🐳 Docker (Recommended)
140
224
 
141
225
  ```bash
@@ -146,7 +230,7 @@ docker run -p 8000:8000 goldziher/kreuzberg:latest
146
230
  curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
147
231
  ```
148
232
 
149
- Available variants: `latest`, `3.6.1`, `3.6.1-easyocr`, `3.6.1-paddle`, `3.6.1-gmft`, `3.6.1-all`
233
+ Available variants: `latest`, `v3.8.0`, `v3.8.0-easyocr`, `v3.8.0-paddle`, `v3.8.0-gmft`, `v3.8.0-all`
150
234
 
151
235
  ### 🌐 REST API
152
236
 
@@ -189,23 +273,28 @@ kreuzberg extract *.pdf --output-dir ./extracted/
189
273
  | **Web** | HTML, XML, MHTML |
190
274
  | **Archives** | Support via extraction |
191
275
 
192
- ## Performance
276
+ ## 📊 Performance Comparison
277
+
278
+ [Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/) across 94 real-world documents • [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
193
279
 
194
- **[Comprehensive benchmarks](https://goldziher.github.io/python-text-extraction-libs-benchmarks/)** across 94 real-world documents (~210MB) [View source](https://github.com/Goldziher/python-text-extraction-libs-benchmarks):
280
+ | Framework | Speed | Memory | Install Size | Dependencies | Success Rate |
281
+ | ------------- | ----------- | ------ | ------------ | ------------ | ------------ |
282
+ | **Kreuzberg** | 35+ files/s | 530MB | 71MB | 20 | High |
283
+ | Unstructured | ~12 files/s | ~1GB | 146MB | 54 | 88%+ |
284
+ | MarkItDown | ~15 files/s | ~1.5GB | 251MB | 25 | 80%\* |
285
+ | Docling | ~1 file/min | ~5GB | 1,032MB | 88 | 45%\* |
195
286
 
196
- | Library | Speed | Memory | Install Size | Dependencies | Success Rate |
197
- | ------------- | --------------- | --------- | ------------ | ------------ | ------------ |
198
- | **Kreuzberg** | **35+ files/s** | **530MB** | **71MB** | **20** | High\* |
199
- | Unstructured | Moderate | ~1GB | 146MB | 54 | 88%+ |
200
- | MarkItDown | Good† | ~1.5GB | 251MB | 25 | 80%† |
201
- | Docling | 60+ min/file‡ | ~5GB | 1,032MB | 88 | Low‡ |
287
+ \*_Performance varies significantly with document complexity and size_
202
288
 
203
- \*_Can achieve 75% reliability with 15% performance trade-off when configured_
204
- †_Good on simple documents, struggles with large/complex files (>10MB)_
205
- ‡_Frequently fails/times out on medium files (>1MB)_
289
+ **Key strengths:**
206
290
 
207
- > **Benchmark details**: Tested across PDFs, Word docs, HTML, images, spreadsheets in 6 languages (English, Hebrew, German, Chinese, Japanese, Korean)
208
- > **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
291
+ - 2-3x faster processing than comparable frameworks
292
+ - Smallest installation footprint and memory usage
293
+ - Only framework with built-in async/await support
294
+ - CPU-only processing - no GPU dependencies
295
+ - Built by software engineers for production reliability
296
+
297
+ > **Benchmark details**: Tests include PDFs, Word docs, HTML, images, and spreadsheets in multiple languages (English, Hebrew, German, Chinese, Japanese, Korean) on standardized hardware.
209
298
 
210
299
  ## Documentation
211
300
 
@@ -219,27 +308,6 @@ kreuzberg extract *.pdf --output-dir ./extracted/
219
308
  - [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
220
309
  - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
221
310
 
222
- ## Advanced Features
223
-
224
- - **📊 Table Extraction**: Extract tables from PDFs with GMFT
225
- - **🧩 Content Chunking**: Split documents for RAG applications
226
- - **🎯 Custom Extractors**: Extend with your own document handlers
227
- - **🔧 Configuration**: Flexible TOML-based configuration
228
- - **🪝 Hooks**: Pre/post-processing customization
229
- - **🌍 Multi-language OCR**: 100+ languages supported
230
- - **⚙️ Metadata Extraction**: Rich document metadata
231
- - **🔄 Batch Processing**: Efficient bulk document processing
232
-
233
311
  ## License
234
312
 
235
313
  MIT License - see [LICENSE](LICENSE) for details.
236
-
237
- ______________________________________________________________________
238
-
239
- <div align="center">
240
-
241
- **[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Benchmarks](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) • [Discord](https://discord.gg/pXxagNK2zN)**
242
-
243
- Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
244
-
245
- </div>
@@ -0,0 +1,57 @@
1
+ kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
2
+ kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
+ kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
+ kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
7
+ kreuzberg/_gmft.py,sha256=ZIEUu4Uy5zYNFEeDRbz1cLJhnCAStVsSzm1PQ3vDeO8,14828
8
+ kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
9
+ kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
10
+ kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
11
+ kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
12
+ kreuzberg/_types.py,sha256=R_0Xc2kq4nEwkruvkB3qfrLeJ996419hBQ_1C6Xrqjo,13388
13
+ kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
14
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
15
+ kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
19
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ kreuzberg/_extractors/_base.py,sha256=ECEwBpxnIy_J9kGZGuqsaPCgLFfxRn7kn4hIf11gDJ8,4478
21
+ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
22
+ kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
23
+ kreuzberg/_extractors/_image.py,sha256=0kzOQTTeJacaA8I9833fFvVQSz6FtUe9Nuw1oy0ToD0,4939
24
+ kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
25
+ kreuzberg/_extractors/_pdf.py,sha256=giYG3aEdmsxT0tGWKBaMzHDPz74-jVmK4HZARDEBhsM,17108
26
+ kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
27
+ kreuzberg/_extractors/_spread_sheet.py,sha256=Nvyz7XT7C2ai4QeUashBeENQpuP5rs8SmKfumxEqlCg,13712
28
+ kreuzberg/_extractors/_structured.py,sha256=i3jAvhHZt_BsRGgZZfgcsUqlwAg_RNc8vsuecb04T0c,5581
29
+ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
30
+ kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
31
+ kreuzberg/_multiprocessing/__init__.py,sha256=X2BtgKmWhF1rl0JYg2gvoSUaozKExfsWh-RRNvzNoOs,202
32
+ kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
33
+ kreuzberg/_ocr/__init__.py,sha256=CC9Ob1t_ltTYUamK1ZtmkswfCYdn1B-Z0kPemsQU0xU,1439
34
+ kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
35
+ kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
36
+ kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
37
+ kreuzberg/_ocr/_pool.py,sha256=Yb0l_GxnPsIWn3NA2FuBYEC8ipIqgwaYglUt0ltqSvk,10948
38
+ kreuzberg/_ocr/_sync.py,sha256=cdLiH9hYqygzqW3LkibhrE6C8atin7mfTv_k3JJFE0k,18287
39
+ kreuzberg/_ocr/_tesseract.py,sha256=KtenEIGL63gRhdH2hxOEVM89locAETGo2bNjQMXjTwY,13266
40
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
+ kreuzberg/_utils/_cache.py,sha256=CtpSmEggWoIPDZ9_Nl0i5pr7wtPyci8EVT-ajYsARGI,13609
42
+ kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
43
+ kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
44
+ kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
45
+ kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
46
+ kreuzberg/_utils/_process_pool.py,sha256=E3bHOO67TeoLUBjtw5HoY9gyFl621VaImYI-_itQ96c,8653
47
+ kreuzberg/_utils/_quality.py,sha256=dgFLt40NSqB8Ciej5QcZQLiV4U7LcrGux0vXckiE31U,7568
48
+ kreuzberg/_utils/_serialization.py,sha256=Rt5zSkvzf1SVNDrI6F2Zvnkel24mQkD1QvP0WjgZUgk,2195
49
+ kreuzberg/_utils/_string.py,sha256=5YKu9EZlZQ-LkphXUq8fdwKQrX9jWACFEhMGfjIysf4,6381
50
+ kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
51
+ kreuzberg/_utils/_table.py,sha256=C2skLtcyczxDEH33Qw2dOwnR15SGillvNEP-NzBG3R8,8156
52
+ kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
53
+ kreuzberg-3.8.0.dist-info/METADATA,sha256=d1N7v0EvJA-22g071Dctler5zF11WlKGTgLjGpsV8iw,11422
54
+ kreuzberg-3.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
55
+ kreuzberg-3.8.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
56
+ kreuzberg-3.8.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
57
+ kreuzberg-3.8.0.dist-info/RECORD,,
@@ -1,2 +1,3 @@
1
1
  [console_scripts]
2
2
  kreuzberg = kreuzberg.cli:cli
3
+ kreuzberg-mcp = kreuzberg._mcp.server:main
@@ -1,189 +0,0 @@
1
- """Process pool manager for resource-aware multiprocessing."""
2
-
3
- from __future__ import annotations
4
-
5
- import multiprocessing as mp
6
- from concurrent.futures import ProcessPoolExecutor
7
- from typing import TYPE_CHECKING, Any, TypeVar
8
-
9
- import anyio
10
- import psutil
11
- from typing_extensions import Self
12
-
13
- if TYPE_CHECKING:
14
- import types
15
- from collections.abc import Callable
16
-
17
- T = TypeVar("T")
18
-
19
-
20
- class ProcessPoolManager:
21
- """Resource-aware process pool manager for CPU-intensive tasks."""
22
-
23
- def __init__(
24
- self,
25
- max_processes: int | None = None,
26
- memory_limit_gb: float | None = None,
27
- ) -> None:
28
- """Initialize the process pool manager.
29
-
30
- Args:
31
- max_processes: Maximum number of processes. Defaults to CPU count.
32
- memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
33
- """
34
- self.max_processes = max_processes or mp.cpu_count()
35
-
36
- if memory_limit_gb is None:
37
- available_memory = psutil.virtual_memory().available
38
- self.memory_limit_bytes = int(available_memory * 0.75) # Use 75% of available # ~keep
39
- else:
40
- self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
41
-
42
- self._executor: ProcessPoolExecutor | None = None
43
- self._active_tasks = 0
44
-
45
- def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
46
- """Calculate optimal number of workers based on memory constraints.
47
-
48
- Args:
49
- task_memory_mb: Estimated memory usage per task in MB.
50
-
51
- Returns:
52
- Optimal number of workers.
53
- """
54
- task_memory_bytes = task_memory_mb * 1024**2
55
- memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
56
-
57
- return min(self.max_processes, memory_based_limit)
58
-
59
- def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
60
- """Ensure process pool executor is initialized."""
61
- if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
62
- if self._executor is not None:
63
- self._executor.shutdown(wait=False)
64
-
65
- workers = max_workers or self.max_processes
66
- self._executor = ProcessPoolExecutor(max_workers=workers)
67
-
68
- return self._executor
69
-
70
- async def submit_task(
71
- self,
72
- func: Callable[..., T],
73
- *args: Any,
74
- task_memory_mb: float = 100,
75
- ) -> T:
76
- """Submit a task to the process pool.
77
-
78
- Args:
79
- func: Function to execute.
80
- *args: Positional arguments for the function.
81
- task_memory_mb: Estimated memory usage in MB.
82
-
83
- Returns:
84
- Result of the function execution.
85
- """
86
- workers = self.get_optimal_workers(task_memory_mb)
87
- self._ensure_executor(workers)
88
-
89
- self._active_tasks += 1
90
-
91
- try:
92
- return await anyio.to_thread.run_sync(func, *args)
93
- finally:
94
- self._active_tasks -= 1
95
-
96
- async def submit_batch(
97
- self,
98
- func: Callable[..., T],
99
- arg_batches: list[tuple[Any, ...]],
100
- task_memory_mb: float = 100,
101
- max_concurrent: int | None = None,
102
- ) -> list[T]:
103
- """Submit a batch of tasks to the process pool.
104
-
105
- Args:
106
- func: Function to execute.
107
- arg_batches: List of argument tuples for each task.
108
- task_memory_mb: Estimated memory usage per task in MB.
109
- max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
110
-
111
- Returns:
112
- List of results in the same order as input.
113
- """
114
- if not arg_batches:
115
- return []
116
-
117
- workers = self.get_optimal_workers(task_memory_mb)
118
- max_concurrent = max_concurrent or workers
119
-
120
- self._ensure_executor(workers)
121
-
122
- semaphore = anyio.CapacityLimiter(max_concurrent)
123
-
124
- async def submit_single(args: tuple[Any, ...]) -> T:
125
- async with semaphore:
126
- self._active_tasks += 1
127
- try:
128
- return await anyio.to_thread.run_sync(func, *args)
129
- finally:
130
- self._active_tasks -= 1
131
-
132
- async with anyio.create_task_group() as tg:
133
- results: list[T] = [None] * len(arg_batches) # type: ignore[list-item]
134
-
135
- async def run_task(idx: int, args: tuple[Any, ...]) -> None:
136
- results[idx] = await submit_single(args)
137
-
138
- for idx, args in enumerate(arg_batches):
139
- tg.start_soon(run_task, idx, args)
140
-
141
- return results
142
-
143
- def get_system_info(self) -> dict[str, Any]:
144
- """Get current system resource information."""
145
- memory = psutil.virtual_memory()
146
- cpu_percent = psutil.cpu_percent(interval=1)
147
-
148
- return {
149
- "cpu_count": mp.cpu_count(),
150
- "cpu_percent": cpu_percent,
151
- "memory_total": memory.total,
152
- "memory_available": memory.available,
153
- "memory_percent": memory.percent,
154
- "active_tasks": self._active_tasks,
155
- "max_processes": self.max_processes,
156
- "memory_limit": self.memory_limit_bytes,
157
- }
158
-
159
- def shutdown(self, wait: bool = True) -> None:
160
- """Shutdown the process pool."""
161
- if self._executor is not None:
162
- self._executor.shutdown(wait=wait)
163
- self._executor = None
164
-
165
- def __enter__(self) -> Self:
166
- """Context manager entry."""
167
- return self
168
-
169
- def __exit__(
170
- self,
171
- exc_type: type[BaseException] | None,
172
- exc_val: BaseException | None,
173
- exc_tb: types.TracebackType | None,
174
- ) -> None:
175
- """Context manager exit."""
176
- self.shutdown()
177
-
178
- async def __aenter__(self) -> Self:
179
- """Async context manager entry."""
180
- return self
181
-
182
- async def __aexit__(
183
- self,
184
- exc_type: type[BaseException] | None,
185
- exc_val: BaseException | None,
186
- exc_tb: types.TracebackType | None,
187
- ) -> None:
188
- """Async context manager exit."""
189
- self.shutdown()