kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +11 -1
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pdf.py +2 -3
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +0 -21
- kreuzberg/_ocr/_easyocr.py +51 -19
- kreuzberg/_ocr/_tesseract.py +14 -3
- kreuzberg/_types.py +111 -40
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/extraction.py +2 -2
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +12 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_sync.py
CHANGED
@@ -1,19 +1,16 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import asyncio
|
4
3
|
from functools import partial
|
5
4
|
from inspect import isawaitable, iscoroutinefunction
|
6
|
-
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
5
|
+
from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar, cast
|
7
6
|
|
8
7
|
import anyio
|
9
|
-
from anyio import create_task_group
|
8
|
+
from anyio import CapacityLimiter, create_task_group
|
10
9
|
from anyio.to_thread import run_sync as any_io_run_sync
|
11
10
|
|
12
11
|
if TYPE_CHECKING: # pragma: no cover
|
13
12
|
from collections.abc import Awaitable, Callable
|
14
13
|
|
15
|
-
from typing import ParamSpec
|
16
|
-
|
17
14
|
T = TypeVar("T")
|
18
15
|
P = ParamSpec("P")
|
19
16
|
|
@@ -57,24 +54,26 @@ async def run_taskgroup_batched(
|
|
57
54
|
return []
|
58
55
|
|
59
56
|
if len(async_tasks) <= batch_size or not use_semaphore:
|
60
|
-
|
57
|
+
batch_results: list[Any] = []
|
61
58
|
for i in range(0, len(async_tasks), batch_size):
|
62
59
|
batch = async_tasks[i : i + batch_size]
|
63
|
-
|
64
|
-
return
|
60
|
+
batch_results.extend(await run_taskgroup(*batch))
|
61
|
+
return batch_results
|
65
62
|
|
66
|
-
|
63
|
+
limiter = CapacityLimiter(batch_size)
|
64
|
+
results: list[tuple[int, Any]] = []
|
67
65
|
|
68
|
-
async def run_with_semaphore(task: Awaitable[Any], index: int) ->
|
69
|
-
async with
|
66
|
+
async def run_with_semaphore(task: Awaitable[Any], index: int) -> None:
|
67
|
+
async with limiter:
|
70
68
|
result = await task
|
71
|
-
|
69
|
+
results.append((index, result))
|
72
70
|
|
73
|
-
|
74
|
-
|
71
|
+
async with create_task_group() as tg:
|
72
|
+
for i, task in enumerate(async_tasks):
|
73
|
+
tg.start_soon(run_with_semaphore, task, i)
|
75
74
|
|
76
|
-
|
77
|
-
return [result for _, result in
|
75
|
+
results.sort(key=lambda x: x[0])
|
76
|
+
return [result for _, result in results]
|
78
77
|
|
79
78
|
|
80
79
|
async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
|
kreuzberg/extraction.py
CHANGED
@@ -47,7 +47,7 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
|
|
47
47
|
return cached_result
|
48
48
|
|
49
49
|
if cache.is_processing(path, config):
|
50
|
-
event = cache.mark_processing(path, config)
|
50
|
+
event = cache.mark_processing(path, config) # pragma: no cover
|
51
51
|
await anyio.to_thread.run_sync(event.wait) # pragma: no cover
|
52
52
|
|
53
53
|
return cache.get(path, config) # pragma: no cover
|
@@ -362,7 +362,7 @@ def extract_file_sync(
|
|
362
362
|
return cached_result
|
363
363
|
|
364
364
|
if cache.is_processing(path, config):
|
365
|
-
event = cache.mark_processing(path, config)
|
365
|
+
event = cache.mark_processing(path, config) # pragma: no cover
|
366
366
|
event.wait() # pragma: no cover
|
367
367
|
|
368
368
|
# Try cache again after waiting for other process to complete # ~keep
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.16.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
35
|
Requires-Dist: mcp>=1.14.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: numpy>=2.0.0
|
@@ -109,7 +109,7 @@ Description-Content-Type: text/markdown
|
|
109
109
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
110
110
|
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
111
111
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
112
|
-
- **Format Support**:
|
112
|
+
- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
113
113
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
114
114
|
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
115
115
|
|
@@ -227,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
|
|
227
227
|
|
228
228
|
## Supported Formats
|
229
229
|
|
230
|
-
| Category
|
231
|
-
|
|
232
|
-
| **Documents**
|
233
|
-
| **Images**
|
234
|
-
| **Spreadsheets**
|
235
|
-
| **Presentations**
|
236
|
-
| **Web**
|
237
|
-
| **
|
230
|
+
| Category | Formats |
|
231
|
+
| ------------------- | ------------------------------ |
|
232
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
233
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
234
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
235
|
+
| **Presentations** | PPTX, PPT, ODP |
|
236
|
+
| **Web** | HTML, XML, MHTML |
|
237
|
+
| **Structured Data** | JSON, YAML, TOML |
|
238
|
+
| **Archives** | Support via extraction |
|
238
239
|
|
239
240
|
## 📊 Performance Characteristics
|
240
241
|
|
@@ -1,46 +1,47 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256
|
1
|
+
kreuzberg/__init__.py,sha256=EE6ENEjyKlt0o6QN1cG3Z_1isCtminVOjQT7ii5eBHA,1575
|
2
2
|
kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
|
3
3
|
kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
|
4
|
-
kreuzberg/_config.py,sha256=
|
4
|
+
kreuzberg/_config.py,sha256=H4jUAL0fNY-YE61GbGq5UtAUtXHbZA4-9W3YwcT_hu8,12988
|
5
5
|
kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
6
|
-
kreuzberg/_document_classification.py,sha256=
|
6
|
+
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
7
|
kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
|
8
|
-
kreuzberg/_gmft.py,sha256=
|
8
|
+
kreuzberg/_gmft.py,sha256=XI8vdBG0tdEVwFiabVieCuvxM5esqTSiFtsEwJ0YT5g,20787
|
9
9
|
kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
|
10
10
|
kreuzberg/_mime_types.py,sha256=-05mBS5AoF4LUmfB_WyLoce0y4peiOyOf2JucF714WQ,8602
|
11
11
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
12
|
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
|
-
kreuzberg/_types.py,sha256=
|
13
|
+
kreuzberg/_types.py,sha256=Xht1_TcvsbIpdmLYMy6Pa_HpbQuF9MBOo-BrKkZ7cLA,47358
|
14
14
|
kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
|
15
15
|
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
16
|
-
kreuzberg/extraction.py,sha256=
|
16
|
+
kreuzberg/extraction.py,sha256=5TuuRqLRmboLaTS0x9eZ2lrYOHKJBSHuTT_U-5nn6ek,17829
|
17
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
20
|
-
kreuzberg/_api/main.py,sha256=
|
20
|
+
kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
|
21
21
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
kreuzberg/_extractors/_base.py,sha256=39E7R7hV6C2uMJdQKLBVSWK3tN-mtK0LaayU10-8Fqo,11191
|
23
|
-
kreuzberg/_extractors/_email.py,sha256=
|
24
|
-
kreuzberg/_extractors/_html.py,sha256=
|
23
|
+
kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
|
24
|
+
kreuzberg/_extractors/_html.py,sha256=zZ9WZmmoIG9B5dGF25ulm_GmW9RsYFI1HddDUUp3hOE,6351
|
25
25
|
kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
|
26
26
|
kreuzberg/_extractors/_pandoc.py,sha256=CPEJxKTZdfyb7jPacZkiAsR2NEGL6KyiHzOr88tprJY,24142
|
27
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
28
|
-
kreuzberg/_extractors/_presentation.py,sha256=
|
29
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=
|
30
|
-
kreuzberg/_extractors/_structured.py,sha256=
|
27
|
+
kreuzberg/_extractors/_pdf.py,sha256=78gPO7m8nPFIOskqqRpUfyOhKUk6f5rjJ0cZDnL9Vdk,23224
|
28
|
+
kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
|
29
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
|
30
|
+
kreuzberg/_extractors/_structured.py,sha256=YkTOfSQJOe127ZURrAYAomNrIkKoAYC4gt0P9ypY3RY,8919
|
31
31
|
kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
|
32
|
-
kreuzberg/_mcp/server.py,sha256=
|
32
|
+
kreuzberg/_mcp/server.py,sha256=vJWCXbBiv0ktIPZeLedSWZEwKF46p6642H6lxhTnjek,16723
|
33
33
|
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
34
34
|
kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
|
35
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
35
|
+
kreuzberg/_ocr/_easyocr.py,sha256=7bkMM_zN0h7ZiX0-VHxxnwNOhQloI-dlOOibpRc-vNs,15710
|
36
36
|
kreuzberg/_ocr/_paddleocr.py,sha256=XyYc3gtmnvOGfQ0qBQYFphJa1kSv5hZ_LJ0weD2hQ08,15006
|
37
37
|
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
38
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
38
|
+
kreuzberg/_ocr/_tesseract.py,sha256=BjTKE6ilUpSEKarHdgP3PbsE6I89JeqgDtpQ-XHniBA,51452
|
39
39
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
40
|
kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14110
|
41
41
|
kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
|
42
42
|
kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
|
43
43
|
kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
|
44
|
+
kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
|
44
45
|
kreuzberg/_utils/_image_preprocessing.py,sha256=arl4UDDiD_Z6SKM-jTXENaOaaHZBVFTsueb6DcpFXOo,10934
|
45
46
|
kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
|
46
47
|
kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
|
@@ -48,13 +49,13 @@ kreuzberg/_utils/_process_pool.py,sha256=fqlxNsxDoqS28BLrZeDBH743HdaUBuGPYFH5hjS
|
|
48
49
|
kreuzberg/_utils/_quality.py,sha256=FCVh9KieWUYgT1klLxudbslzKuqbOTBbTsHbvIuru7M,5510
|
49
50
|
kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
|
50
51
|
kreuzberg/_utils/_resource_managers.py,sha256=N3-VeHDj6sKBeg3UL-PqRtKGExUBoVcEB5UuQ8FncY8,2079
|
51
|
-
kreuzberg/_utils/_serialization.py,sha256=
|
52
|
+
kreuzberg/_utils/_serialization.py,sha256=G-kxtCPDPGFqBMyHfzvAPo-bNUmPdaXYdeg1dnBLfN4,1789
|
52
53
|
kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
|
53
|
-
kreuzberg/_utils/_sync.py,sha256=
|
54
|
+
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
54
55
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
55
56
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
56
|
-
kreuzberg-3.
|
57
|
-
kreuzberg-3.
|
58
|
-
kreuzberg-3.
|
59
|
-
kreuzberg-3.
|
60
|
-
kreuzberg-3.
|
57
|
+
kreuzberg-3.16.0.dist-info/METADATA,sha256=d1sUA7WBl0VcXHX0jPGzTHeXmj7yyJzTWjzHUmT-Dp4,12319
|
58
|
+
kreuzberg-3.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
59
|
+
kreuzberg-3.16.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
60
|
+
kreuzberg-3.16.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
61
|
+
kreuzberg-3.16.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|