kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/extraction.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import multiprocessing as mp
4
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Final, cast
6
+ from typing import TYPE_CHECKING, Final, cast
7
7
 
8
8
  import anyio
9
9
 
@@ -30,6 +30,31 @@ if TYPE_CHECKING:
30
30
  DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
31
31
 
32
32
 
33
+ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult | None:
34
+ """Handle cache lookup and coordination with other processes.
35
+
36
+ Args:
37
+ path: Path to the file being processed
38
+ config: Extraction configuration
39
+
40
+ Returns:
41
+ Cached result if available, None otherwise
42
+ """
43
+ cache = get_document_cache()
44
+
45
+ cached_result = cache.get(path, config)
46
+ if cached_result is not None:
47
+ return cached_result
48
+
49
+ if cache.is_processing(path, config):
50
+ event = cache.mark_processing(path, config) # pragma: no cover
51
+ await anyio.to_thread.run_sync(event.wait) # pragma: no cover
52
+
53
+ return cache.get(path, config) # pragma: no cover
54
+
55
+ return None
56
+
57
+
33
58
  def _validate_and_post_process_helper(
34
59
  result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
35
60
  ) -> ExtractionResult:
@@ -102,9 +127,9 @@ def _handle_chunk_content(
102
127
  mime_type: str,
103
128
  config: ExtractionConfig,
104
129
  content: str,
105
- ) -> Any:
130
+ ) -> list[str]:
106
131
  chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
107
- return chunker.chunks(content)
132
+ return list(chunker.chunks(content))
108
133
 
109
134
 
110
135
  async def extract_bytes(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
@@ -153,19 +178,9 @@ async def extract_file(
153
178
  path = Path(file_path)
154
179
 
155
180
  if config.use_cache:
156
- cached_result = cache.get(path, config)
181
+ cached_result = await _handle_cache_async(path, config)
157
182
  if cached_result is not None:
158
183
  return cached_result
159
-
160
- if cache.is_processing(path, config):
161
- event = cache.mark_processing(path, config)
162
- await anyio.to_thread.run_sync(event.wait) # pragma: no cover
163
-
164
- # Try cache again after waiting for other process to complete # ~keep
165
- cached_result = cache.get(path, config) # pragma: no cover
166
- if cached_result is not None: # pragma: no cover
167
- return cached_result
168
-
169
184
  cache.mark_processing(path, config)
170
185
 
171
186
  try:
@@ -227,11 +242,11 @@ async def batch_extract_file(
227
242
  error_result = ExtractionResult(
228
243
  content=f"Error: {type(e).__name__}: {e!s}",
229
244
  mime_type="text/plain",
230
- metadata={ # type: ignore[typeddict-unknown-key]
245
+ metadata={
231
246
  "error": f"{type(e).__name__}: {e!s}",
232
247
  "error_context": create_error_context(
233
248
  operation="batch_extract_file",
234
- file_path=path,
249
+ file_path=str(path),
235
250
  error=e,
236
251
  index=index,
237
252
  ),
@@ -276,7 +291,7 @@ async def batch_extract_bytes(
276
291
  error_result = ExtractionResult(
277
292
  content=f"Error: {type(e).__name__}: {e!s}",
278
293
  mime_type="text/plain",
279
- metadata={ # type: ignore[typeddict-unknown-key]
294
+ metadata={
280
295
  "error": f"{type(e).__name__}: {e!s}",
281
296
  "error_context": create_error_context(
282
297
  operation="batch_extract_bytes",
@@ -347,7 +362,7 @@ def extract_file_sync(
347
362
  return cached_result
348
363
 
349
364
  if cache.is_processing(path, config):
350
- event = cache.mark_processing(path, config)
365
+ event = cache.mark_processing(path, config) # pragma: no cover
351
366
  event.wait() # pragma: no cover
352
367
 
353
368
  # Try cache again after waiting for other process to complete # ~keep
@@ -400,31 +415,31 @@ def batch_extract_file_sync(
400
415
 
401
416
  max_workers = min(len(file_paths), mp.cpu_count())
402
417
 
403
- def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
418
+ def extract_single(index: int, file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
404
419
  """Extract single file with index for ordering."""
405
420
  try:
406
421
  return (
407
- file_paths.index(file_path),
422
+ index,
408
423
  extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
409
424
  )
410
425
  except Exception as e: # noqa: BLE001
411
426
  error_result = ExtractionResult(
412
427
  content=f"Error: {type(e).__name__}: {e!s}",
413
428
  mime_type="text/plain",
414
- metadata={ # type: ignore[typeddict-unknown-key]
429
+ metadata={
415
430
  "error": f"{type(e).__name__}: {e!s}",
416
431
  "error_context": create_error_context(
417
432
  operation="batch_extract_file_sync",
418
- file_path=file_path,
433
+ file_path=str(file_path),
419
434
  error=e,
420
435
  ),
421
436
  },
422
437
  chunks=[],
423
438
  )
424
- return (file_paths.index(file_path), error_result)
439
+ return (index, error_result)
425
440
 
426
441
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
427
- future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
442
+ future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
428
443
 
429
444
  results: list[ExtractionResult | None] = [None] * len(file_paths)
430
445
  for future in as_completed(future_to_index):
@@ -453,16 +468,15 @@ def batch_extract_bytes_sync(
453
468
 
454
469
  max_workers = min(len(contents), mp.cpu_count())
455
470
 
456
- def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
471
+ def extract_single(index: int, content: bytes, mime_type: str) -> tuple[int, ExtractionResult]:
457
472
  """Extract single content with index for ordering."""
458
- index, (content, mime_type) = index_and_content
459
473
  try:
460
474
  return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
461
475
  except Exception as e: # noqa: BLE001
462
476
  error_result = ExtractionResult(
463
477
  content=f"Error: {type(e).__name__}: {e!s}",
464
478
  mime_type="text/plain",
465
- metadata={ # type: ignore[typeddict-unknown-key]
479
+ metadata={
466
480
  "error": f"{type(e).__name__}: {e!s}",
467
481
  "error_context": create_error_context(
468
482
  operation="batch_extract_bytes_sync",
@@ -477,7 +491,9 @@ def batch_extract_bytes_sync(
477
491
  return (index, error_result)
478
492
 
479
493
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
480
- future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
494
+ future_to_index = {
495
+ executor.submit(extract_single, i, content, mime_type): i for i, (content, mime_type) in enumerate(contents)
496
+ }
481
497
 
482
498
  results: list[ExtractionResult | None] = [None] * len(contents)
483
499
  for future in as_completed(future_to_index):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.14.1
3
+ Version: 3.16.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.11.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.13.0
35
35
  Requires-Dist: mcp>=1.14.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: numpy>=2.0.0
@@ -107,8 +107,9 @@ Description-Content-Type: text/markdown
107
107
  ### Document Intelligence Capabilities
108
108
 
109
109
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
110
+ - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
110
111
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
111
- - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
112
+ - **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
112
113
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
113
114
  - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
114
115
 
@@ -226,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
226
227
 
227
228
  ## Supported Formats
228
229
 
229
- | Category | Formats |
230
- | ----------------- | ------------------------------ |
231
- | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
232
- | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
233
- | **Spreadsheets** | XLSX, XLS, CSV, ODS |
234
- | **Presentations** | PPTX, PPT, ODP |
235
- | **Web** | HTML, XML, MHTML |
236
- | **Archives** | Support via extraction |
230
+ | Category | Formats |
231
+ | ------------------- | ------------------------------ |
232
+ | **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
233
+ | **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
234
+ | **Spreadsheets** | XLSX, XLS, CSV, ODS |
235
+ | **Presentations** | PPTX, PPT, ODP |
236
+ | **Web** | HTML, XML, MHTML |
237
+ | **Structured Data** | JSON, YAML, TOML |
238
+ | **Archives** | Support via extraction |
237
239
 
238
240
  ## 📊 Performance Characteristics
239
241
 
@@ -0,0 +1,61 @@
1
+ kreuzberg/__init__.py,sha256=EE6ENEjyKlt0o6QN1cG3Z_1isCtminVOjQT7ii5eBHA,1575
2
+ kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
3
+ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
4
+ kreuzberg/_config.py,sha256=H4jUAL0fNY-YE61GbGq5UtAUtXHbZA4-9W3YwcT_hu8,12988
5
+ kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
+ kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
+ kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
8
+ kreuzberg/_gmft.py,sha256=XI8vdBG0tdEVwFiabVieCuvxM5esqTSiFtsEwJ0YT5g,20787
9
+ kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
10
+ kreuzberg/_mime_types.py,sha256=-05mBS5AoF4LUmfB_WyLoce0y4peiOyOf2JucF714WQ,8602
11
+ kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
+ kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
+ kreuzberg/_types.py,sha256=Xht1_TcvsbIpdmLYMy6Pa_HpbQuF9MBOo-BrKkZ7cLA,47358
14
+ kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
15
+ kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
+ kreuzberg/extraction.py,sha256=5TuuRqLRmboLaTS0x9eZ2lrYOHKJBSHuTT_U-5nn6ek,17829
17
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
20
+ kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
21
+ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ kreuzberg/_extractors/_base.py,sha256=39E7R7hV6C2uMJdQKLBVSWK3tN-mtK0LaayU10-8Fqo,11191
23
+ kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
24
+ kreuzberg/_extractors/_html.py,sha256=zZ9WZmmoIG9B5dGF25ulm_GmW9RsYFI1HddDUUp3hOE,6351
25
+ kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
26
+ kreuzberg/_extractors/_pandoc.py,sha256=CPEJxKTZdfyb7jPacZkiAsR2NEGL6KyiHzOr88tprJY,24142
27
+ kreuzberg/_extractors/_pdf.py,sha256=78gPO7m8nPFIOskqqRpUfyOhKUk6f5rjJ0cZDnL9Vdk,23224
28
+ kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
29
+ kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
30
+ kreuzberg/_extractors/_structured.py,sha256=YkTOfSQJOe127ZURrAYAomNrIkKoAYC4gt0P9ypY3RY,8919
31
+ kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
32
+ kreuzberg/_mcp/server.py,sha256=vJWCXbBiv0ktIPZeLedSWZEwKF46p6642H6lxhTnjek,16723
33
+ kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
34
+ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
35
+ kreuzberg/_ocr/_easyocr.py,sha256=7bkMM_zN0h7ZiX0-VHxxnwNOhQloI-dlOOibpRc-vNs,15710
36
+ kreuzberg/_ocr/_paddleocr.py,sha256=XyYc3gtmnvOGfQ0qBQYFphJa1kSv5hZ_LJ0weD2hQ08,15006
37
+ kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
38
+ kreuzberg/_ocr/_tesseract.py,sha256=BjTKE6ilUpSEKarHdgP3PbsE6I89JeqgDtpQ-XHniBA,51452
39
+ kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
+ kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14110
41
+ kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
42
+ kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
43
+ kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
44
+ kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
45
+ kreuzberg/_utils/_image_preprocessing.py,sha256=arl4UDDiD_Z6SKM-jTXENaOaaHZBVFTsueb6DcpFXOo,10934
46
+ kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
47
+ kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
48
+ kreuzberg/_utils/_process_pool.py,sha256=fqlxNsxDoqS28BLrZeDBH743HdaUBuGPYFH5hjSajIg,7493
49
+ kreuzberg/_utils/_quality.py,sha256=FCVh9KieWUYgT1klLxudbslzKuqbOTBbTsHbvIuru7M,5510
50
+ kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
51
+ kreuzberg/_utils/_resource_managers.py,sha256=N3-VeHDj6sKBeg3UL-PqRtKGExUBoVcEB5UuQ8FncY8,2079
52
+ kreuzberg/_utils/_serialization.py,sha256=G-kxtCPDPGFqBMyHfzvAPo-bNUmPdaXYdeg1dnBLfN4,1789
53
+ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
54
+ kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
55
+ kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
56
+ kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
57
+ kreuzberg-3.16.0.dist-info/METADATA,sha256=d1sUA7WBl0VcXHX0jPGzTHeXmj7yyJzTWjzHUmT-Dp4,12319
58
+ kreuzberg-3.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
59
+ kreuzberg-3.16.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
60
+ kreuzberg-3.16.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
61
+ kreuzberg-3.16.0.dist-info/RECORD,,
@@ -1,58 +0,0 @@
1
- kreuzberg/__init__.py,sha256=Oh_NTp8wf0BlvD8CSBad2A493nEWH4jTE0x8v7v1Y9w,1341
2
- kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
3
- kreuzberg/_chunker.py,sha256=tr9_KUYTSLauFois3MsB-A-0hGcTT8hTQFrqNRTii-I,1373
4
- kreuzberg/_config.py,sha256=2LI5z9gXniqO4afrMmbZfMdhlT2701O5OlGKkrMo-bM,12385
5
- kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
- kreuzberg/_document_classification.py,sha256=Mz_s2GJGsEl7MQ-67BPoGYCZibTy9Sw0PScUZKBjKOA,5736
7
- kreuzberg/_entity_extraction.py,sha256=5YpPnqoJ5aiHd_sy4bN4-Ngiq79RhCV6yaUQE8joGXo,3503
8
- kreuzberg/_gmft.py,sha256=a7KDXbZM0PxyFpAIjM0xMRvxzoMo4fTQuGlFNa8uXBU,20502
9
- kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
10
- kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
11
- kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
- kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
- kreuzberg/_types.py,sha256=BEMTnA8fvHL0dDCnjq7g9Jjd2Ze8NFq988YkMH4zQ9g,39163
14
- kreuzberg/cli.py,sha256=Ob0IfqWcaiM09pFdC6wTpdSeql0SGZDxBxfrEhJAGmo,13501
15
- kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
- kreuzberg/extraction.py,sha256=qT-Ziw5FmMqcPT88VrglikL1RASSJCf5W7xP6L9Vi5s,17673
17
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- kreuzberg/_api/main.py,sha256=8g_8j8Dp2e70_yYYUUrJNC5Ku9fuyNgyjUuIgJTRUW8,12500
20
- kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
22
- kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
23
- kreuzberg/_extractors/_html.py,sha256=NyQKChNLvaSUC_5x1qTYlIQGwL4lEbgUF7BgH9ejEVY,1583
24
- kreuzberg/_extractors/_image.py,sha256=lFPoxAf7_Zbx-1t8W4vU2bhHauiNGOAFbZxr_2gNUsw,3991
25
- kreuzberg/_extractors/_pandoc.py,sha256=-Ai4S1cXs7F6yeonb_7Y7_ZoWHn29E2oP1WlPtM-4HM,22505
26
- kreuzberg/_extractors/_pdf.py,sha256=naJ_AgtAgtGIjAqiU4_G7lgftKWhUjZDLVILSG2AyVc,18757
27
- kreuzberg/_extractors/_presentation.py,sha256=ULGkt7dzeA9sYSEhpAucKZmkdv9EubzeZtOjoLP3Z2E,6994
28
- kreuzberg/_extractors/_spread_sheet.py,sha256=eBAx_OwoyRqMzmD4Z07UlOBwcXckymgvj_0o7di6thA,12715
29
- kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
30
- kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
31
- kreuzberg/_mcp/server.py,sha256=YPMJp6xnZ3DC32NEdX5Gqf3vwxsHZxXxUxZ6jghpv6I,5688
32
- kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
33
- kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
34
- kreuzberg/_ocr/_easyocr.py,sha256=XbgpGt5tkE4xHleIGvV1cHlpOQTp43rSXBO1CyIyKTg,14599
35
- kreuzberg/_ocr/_paddleocr.py,sha256=hfc6Zi2eSUYTVVF9y9D1P2_pLiLXPfFRoJ6QDJ6oZag,15017
36
- kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
37
- kreuzberg/_ocr/_tesseract.py,sha256=QEKK_PDZnNiZRgpklOgMXB-cObJy6C-HuxL6Gza5Z3c,49136
38
- kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- kreuzberg/_utils/_cache.py,sha256=qeyI6rJOQlKtdHjJeOjUxx31eItak_drrNn8Cf8HbN8,13956
40
- kreuzberg/_utils/_device.py,sha256=UxGkSTN3Up-Zn43CSyvf8CozW2xAF05Cm01LWA2FZmg,8263
41
- kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
42
- kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
43
- kreuzberg/_utils/_image_preprocessing.py,sha256=2u0A28M07F9XlYebTG5salOUVEE3YT3m8fiR8Z2ZM8E,12326
44
- kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
45
- kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
46
- kreuzberg/_utils/_process_pool.py,sha256=7p8Co1w-Tvh2MUdxMcPMpvOikumrb0nN2ApQVytV-_c,6726
47
- kreuzberg/_utils/_quality.py,sha256=f7NbyZysyJQD8jKCNWhogvluU9A7GdEYhMsDBeMbGAA,5412
48
- kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
49
- kreuzberg/_utils/_serialization.py,sha256=97iIgdcxdbym-BEvy0J6HAduBCUXyCGwhuEHCT_l7I4,1513
50
- kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
51
- kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
52
- kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
53
- kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
54
- kreuzberg-3.14.1.dist-info/METADATA,sha256=4sG9L9AtvBHFxjv84obrcaYNToc_sO0-AHnnpo1-ZGY,12127
55
- kreuzberg-3.14.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
- kreuzberg-3.14.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
57
- kreuzberg-3.14.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
58
- kreuzberg-3.14.1.dist-info/RECORD,,