kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_utils/_sync.py CHANGED
@@ -1,19 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
- import asyncio
4
3
  from functools import partial
5
4
  from inspect import isawaitable, iscoroutinefunction
6
- from typing import TYPE_CHECKING, Any, TypeVar, cast
5
+ from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar, cast
7
6
 
8
7
  import anyio
9
- from anyio import create_task_group
8
+ from anyio import CapacityLimiter, create_task_group
10
9
  from anyio.to_thread import run_sync as any_io_run_sync
11
10
 
12
11
  if TYPE_CHECKING: # pragma: no cover
13
12
  from collections.abc import Awaitable, Callable
14
13
 
15
- from typing import ParamSpec
16
-
17
14
  T = TypeVar("T")
18
15
  P = ParamSpec("P")
19
16
 
@@ -57,24 +54,26 @@ async def run_taskgroup_batched(
57
54
  return []
58
55
 
59
56
  if len(async_tasks) <= batch_size or not use_semaphore:
60
- results: list[Any] = []
57
+ batch_results: list[Any] = []
61
58
  for i in range(0, len(async_tasks), batch_size):
62
59
  batch = async_tasks[i : i + batch_size]
63
- results.extend(await run_taskgroup(*batch))
64
- return results
60
+ batch_results.extend(await run_taskgroup(*batch))
61
+ return batch_results
65
62
 
66
- semaphore = asyncio.Semaphore(batch_size)
63
+ limiter = CapacityLimiter(batch_size)
64
+ results: list[tuple[int, Any]] = []
67
65
 
68
- async def run_with_semaphore(task: Awaitable[Any], index: int) -> tuple[int, Any]:
69
- async with semaphore:
66
+ async def run_with_semaphore(task: Awaitable[Any], index: int) -> None:
67
+ async with limiter:
70
68
  result = await task
71
- return (index, result)
69
+ results.append((index, result))
72
70
 
73
- indexed_tasks = [run_with_semaphore(task, i) for i, task in enumerate(async_tasks)]
74
- indexed_results = await asyncio.gather(*indexed_tasks)
71
+ async with create_task_group() as tg:
72
+ for i, task in enumerate(async_tasks):
73
+ tg.start_soon(run_with_semaphore, task, i)
75
74
 
76
- indexed_results.sort(key=lambda x: x[0])
77
- return [result for _, result in indexed_results]
75
+ results.sort(key=lambda x: x[0])
76
+ return [result for _, result in results]
78
77
 
79
78
 
80
79
  async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
kreuzberg/extraction.py CHANGED
@@ -47,7 +47,7 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
47
47
  return cached_result
48
48
 
49
49
  if cache.is_processing(path, config):
50
- event = cache.mark_processing(path, config)
50
+ event = cache.mark_processing(path, config) # pragma: no cover
51
51
  await anyio.to_thread.run_sync(event.wait) # pragma: no cover
52
52
 
53
53
  return cache.get(path, config) # pragma: no cover
@@ -362,7 +362,7 @@ def extract_file_sync(
362
362
  return cached_result
363
363
 
364
364
  if cache.is_processing(path, config):
365
- event = cache.mark_processing(path, config)
365
+ event = cache.mark_processing(path, config) # pragma: no cover
366
366
  event.wait() # pragma: no cover
367
367
 
368
368
  # Try cache again after waiting for other process to complete # ~keep
kreuzberg-3.15.0.dist-info/METADATA → kreuzberg-3.16.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.15.0
3
+ Version: 3.16.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.11.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.13.0
35
35
  Requires-Dist: mcp>=1.14.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: numpy>=2.0.0
@@ -109,7 +109,7 @@ Description-Content-Type: text/markdown
109
109
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
110
110
  - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
111
111
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
112
- - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
112
+ - **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
113
113
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
114
114
  - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
115
115
 
@@ -227,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
227
227
 
228
228
  ## Supported Formats
229
229
 
230
- | Category | Formats |
231
- | ----------------- | ------------------------------ |
232
- | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
233
- | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
234
- | **Spreadsheets** | XLSX, XLS, CSV, ODS |
235
- | **Presentations** | PPTX, PPT, ODP |
236
- | **Web** | HTML, XML, MHTML |
237
- | **Archives** | Support via extraction |
230
+ | Category | Formats |
231
+ | ------------------- | ------------------------------ |
232
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
233
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
234
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
235
+ | **Presentations** | PPTX, PPT, ODP |
236
+ | **Web** | HTML, XML, MHTML |
237
+ | **Structured Data** | JSON, YAML, TOML |
238
+ | **Archives** | Support via extraction |
238
239
 
239
240
  ## 📊 Performance Characteristics
240
241
 
kreuzberg-3.15.0.dist-info/RECORD → kreuzberg-3.16.0.dist-info/RECORD CHANGED
@@ -1,46 +1,47 @@
1
- kreuzberg/__init__.py,sha256=-IHDHXKE7q43MBr_KklpqvhNPjJRhX3qFpMge8kuViE,1467
1
+ kreuzberg/__init__.py,sha256=EE6ENEjyKlt0o6QN1cG3Z_1isCtminVOjQT7ii5eBHA,1575
2
2
  kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
3
3
  kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
4
- kreuzberg/_config.py,sha256=2LI5z9gXniqO4afrMmbZfMdhlT2701O5OlGKkrMo-bM,12385
4
+ kreuzberg/_config.py,sha256=H4jUAL0fNY-YE61GbGq5UtAUtXHbZA4-9W3YwcT_hu8,12988
5
5
  kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
- kreuzberg/_document_classification.py,sha256=zgBjqiHCqhtz74JLtt_V8kk6HQTkK5egGWdAGk9dOEQ,5672
6
+ kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
7
  kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
8
- kreuzberg/_gmft.py,sha256=a7KDXbZM0PxyFpAIjM0xMRvxzoMo4fTQuGlFNa8uXBU,20502
8
+ kreuzberg/_gmft.py,sha256=XI8vdBG0tdEVwFiabVieCuvxM5esqTSiFtsEwJ0YT5g,20787
9
9
  kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
10
10
  kreuzberg/_mime_types.py,sha256=-05mBS5AoF4LUmfB_WyLoce0y4peiOyOf2JucF714WQ,8602
11
11
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
12
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
- kreuzberg/_types.py,sha256=7hj2KWohuSKQ9cJd_VCuSeciuyuOC5MdSkS1s5QaPOg,44870
13
+ kreuzberg/_types.py,sha256=Xht1_TcvsbIpdmLYMy6Pa_HpbQuF9MBOo-BrKkZ7cLA,47358
14
14
  kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
15
15
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
- kreuzberg/extraction.py,sha256=gDkwuj_omQ8OCx4RALD0NjasxMhZLhIju7odK7wMwDM,17789
16
+ kreuzberg/extraction.py,sha256=5TuuRqLRmboLaTS0x9eZ2lrYOHKJBSHuTT_U-5nn6ek,17829
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
20
- kreuzberg/_api/main.py,sha256=_r2R_-4zBkyJBn0bcPWogVEDICxWWt5_FFiQIF-r4N4,15463
20
+ kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
21
21
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  kreuzberg/_extractors/_base.py,sha256=39E7R7hV6C2uMJdQKLBVSWK3tN-mtK0LaayU10-8Fqo,11191
23
- kreuzberg/_extractors/_email.py,sha256=8tsHycVBQ2KSSqp2TZ9a0O1Yxjwe0YvE2GVxUajCVz4,8478
24
- kreuzberg/_extractors/_html.py,sha256=7fzNr7-BJ4IND7PWTlEIiqfeKDUb_ZjWO3KDdU3umgI,5151
23
+ kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
24
+ kreuzberg/_extractors/_html.py,sha256=zZ9WZmmoIG9B5dGF25ulm_GmW9RsYFI1HddDUUp3hOE,6351
25
25
  kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
26
26
  kreuzberg/_extractors/_pandoc.py,sha256=CPEJxKTZdfyb7jPacZkiAsR2NEGL6KyiHzOr88tprJY,24142
27
- kreuzberg/_extractors/_pdf.py,sha256=MKfihJcveulfkMQc-s5VUCgvK1aw8EyCbUMRwJo_KoM,23225
28
- kreuzberg/_extractors/_presentation.py,sha256=MZd4Ft2g5oIrEZ1h3ZWsQTW_VpHI2yi4g4Tdh5iw_7I,10466
29
- kreuzberg/_extractors/_spread_sheet.py,sha256=Q2uXvotwqvWiYkIPrtnVL2Ci9ZA7fmTgN6tDN_huwdE,12801
30
- kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
27
+ kreuzberg/_extractors/_pdf.py,sha256=78gPO7m8nPFIOskqqRpUfyOhKUk6f5rjJ0cZDnL9Vdk,23224
28
+ kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
29
+ kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
30
+ kreuzberg/_extractors/_structured.py,sha256=YkTOfSQJOe127ZURrAYAomNrIkKoAYC4gt0P9ypY3RY,8919
31
31
  kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
32
- kreuzberg/_mcp/server.py,sha256=n_bfNPSU_SvXVJ5z05oKVj2sFv2uRYoe3ZZzyVOHQOI,17608
32
+ kreuzberg/_mcp/server.py,sha256=vJWCXbBiv0ktIPZeLedSWZEwKF46p6642H6lxhTnjek,16723
33
33
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
34
34
  kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
35
- kreuzberg/_ocr/_easyocr.py,sha256=6Naqy9JvL96Mm9gw4s-4nRsubd0Z0t8Zn6VC_HInUfc,14577
35
+ kreuzberg/_ocr/_easyocr.py,sha256=7bkMM_zN0h7ZiX0-VHxxnwNOhQloI-dlOOibpRc-vNs,15710
36
36
  kreuzberg/_ocr/_paddleocr.py,sha256=XyYc3gtmnvOGfQ0qBQYFphJa1kSv5hZ_LJ0weD2hQ08,15006
37
37
  kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
38
- kreuzberg/_ocr/_tesseract.py,sha256=fq4qdrzPss9ZaIneUxmwq9x3sFJe8FEi__DLOa1AXN4,50945
38
+ kreuzberg/_ocr/_tesseract.py,sha256=BjTKE6ilUpSEKarHdgP3PbsE6I89JeqgDtpQ-XHniBA,51452
39
39
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
40
  kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14110
41
41
  kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
42
42
  kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
43
43
  kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
44
+ kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
44
45
  kreuzberg/_utils/_image_preprocessing.py,sha256=arl4UDDiD_Z6SKM-jTXENaOaaHZBVFTsueb6DcpFXOo,10934
45
46
  kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
46
47
  kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
@@ -48,13 +49,13 @@ kreuzberg/_utils/_process_pool.py,sha256=fqlxNsxDoqS28BLrZeDBH743HdaUBuGPYFH5hjS
48
49
  kreuzberg/_utils/_quality.py,sha256=FCVh9KieWUYgT1klLxudbslzKuqbOTBbTsHbvIuru7M,5510
49
50
  kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
50
51
  kreuzberg/_utils/_resource_managers.py,sha256=N3-VeHDj6sKBeg3UL-PqRtKGExUBoVcEB5UuQ8FncY8,2079
51
- kreuzberg/_utils/_serialization.py,sha256=97iIgdcxdbym-BEvy0J6HAduBCUXyCGwhuEHCT_l7I4,1513
52
+ kreuzberg/_utils/_serialization.py,sha256=G-kxtCPDPGFqBMyHfzvAPo-bNUmPdaXYdeg1dnBLfN4,1789
52
53
  kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
53
- kreuzberg/_utils/_sync.py,sha256=O4ukJfo8hIr72kaoRvvJjbkBeorIw0SUfkovv0YXa7k,3170
54
+ kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
54
55
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
55
56
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
56
- kreuzberg-3.15.0.dist-info/METADATA,sha256=-4oGIVQAYBB8BSPbwA_MA1LK-ZROaCxwX6g-re4ZtCQ,12246
57
- kreuzberg-3.15.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
58
- kreuzberg-3.15.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
59
- kreuzberg-3.15.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
60
- kreuzberg-3.15.0.dist-info/RECORD,,
57
+ kreuzberg-3.16.0.dist-info/METADATA,sha256=d1sUA7WBl0VcXHX0jPGzTHeXmj7yyJzTWjzHUmT-Dp4,12319
58
+ kreuzberg-3.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
59
+ kreuzberg-3.16.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
60
+ kreuzberg-3.16.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
61
+ kreuzberg-3.16.0.dist-info/RECORD,,