kreuzberg 3.20.1__tar.gz → 3.20.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/PKG-INFO +2 -2
  2. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_gmft.py +36 -4
  3. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_types.py +10 -0
  4. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/pyproject.toml +2 -2
  5. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/README.md +0 -0
  6. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/__init__.py +0 -0
  7. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/__main__.py +0 -0
  8. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_api/__init__.py +0 -0
  9. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_api/_config_cache.py +0 -0
  10. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_api/main.py +0 -0
  11. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_chunker.py +0 -0
  12. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_config.py +0 -0
  13. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_constants.py +0 -0
  14. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_document_classification.py +0 -0
  15. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_entity_extraction.py +0 -0
  16. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_error_handling.py +0 -0
  17. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/__init__.py +0 -0
  18. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_base.py +0 -0
  19. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_email.py +0 -0
  20. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_html.py +0 -0
  21. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_image.py +0 -0
  22. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_pandoc.py +0 -0
  23. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_pdf.py +0 -0
  24. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_presentation.py +0 -0
  25. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_spread_sheet.py +0 -0
  26. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_extractors/_structured.py +0 -0
  27. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_language_detection.py +0 -0
  28. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_mcp/__init__.py +0 -0
  29. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_mcp/server.py +0 -0
  30. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_mime_types.py +0 -0
  31. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_ocr/__init__.py +0 -0
  32. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_ocr/_base.py +0 -0
  33. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_ocr/_easyocr.py +0 -0
  34. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_ocr/_paddleocr.py +0 -0
  35. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_ocr/_table_extractor.py +0 -0
  36. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_ocr/_tesseract.py +0 -0
  37. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_playa.py +0 -0
  38. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_registry.py +0 -0
  39. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/__init__.py +0 -0
  40. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/_reducer.py +0 -0
  41. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/_stopwords.py +0 -0
  42. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
  43. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
  44. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
  45. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
  46. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
  47. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
  48. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
  49. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
  50. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
  51. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
  52. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
  53. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
  54. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
  55. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
  56. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
  57. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
  58. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
  59. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
  60. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
  61. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
  62. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
  63. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
  64. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
  65. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
  66. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
  67. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
  68. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
  69. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
  70. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
  71. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
  72. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
  73. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
  74. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
  75. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
  76. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
  77. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
  78. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
  79. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
  80. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
  81. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
  82. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
  83. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
  84. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
  85. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
  86. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
  87. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
  88. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
  89. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
  90. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
  91. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
  92. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
  93. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
  94. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
  95. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
  96. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
  97. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
  98. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
  99. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
  100. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
  101. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
  102. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
  103. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
  104. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
  105. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
  106. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/__init__.py +0 -0
  107. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_cache.py +0 -0
  108. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_device.py +0 -0
  109. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_document_cache.py +0 -0
  110. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_errors.py +0 -0
  111. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_image_preprocessing.py +0 -0
  112. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_ocr_cache.py +0 -0
  113. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_pdf_lock.py +0 -0
  114. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_process_pool.py +0 -0
  115. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_quality.py +0 -0
  116. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_ref.py +0 -0
  117. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_resource_managers.py +0 -0
  118. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_serialization.py +0 -0
  119. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_string.py +0 -0
  120. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_sync.py +0 -0
  121. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_table.py +0 -0
  122. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/_utils/_tmp.py +0 -0
  123. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/cli.py +0 -0
  124. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/exceptions.py +0 -0
  125. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/extraction.py +0 -0
  126. {kreuzberg-3.20.1 → kreuzberg-3.20.2}/kreuzberg/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: kreuzberg
3
- Version: 3.20.1
3
+ Version: 3.20.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
6
6
  Author: Na'aman Hirschfeld
@@ -28,7 +28,7 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: anyio>=4.11.0
29
29
  Requires-Dist: chardetng-py>=0.3.5
30
30
  Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
31
- Requires-Dist: html-to-markdown>=2.1.0
31
+ Requires-Dist: html-to-markdown>=2.1.2
32
32
  Requires-Dist: langcodes>=3.5.0
33
33
  Requires-Dist: mcp>=1.17.0
34
34
  Requires-Dist: msgspec>=0.18.0
@@ -193,9 +193,14 @@ async def extract_tables(
193
193
  await run_sync(doc.close)
194
194
 
195
195
  except ImportError as e: # pragma: no cover
196
- raise MissingDependencyError.create_for_package(
196
+ error = MissingDependencyError.create_for_package(
197
197
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
198
- ) from e
198
+ )
199
+ error.context = {
200
+ "file_path": str(Path(file_path)),
201
+ "error_message": str(e),
202
+ }
203
+ raise error from e
199
204
  finally:
200
205
  table_cache.mark_complete(**cache_kwargs)
201
206
 
@@ -294,9 +299,14 @@ def extract_tables_sync(
294
299
  doc.close() # type: ignore[no-untyped-call]
295
300
 
296
301
  except ImportError as e: # pragma: no cover
297
- raise MissingDependencyError.create_for_package(
302
+ error = MissingDependencyError.create_for_package(
298
303
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
299
- ) from e
304
+ )
305
+ error.context = {
306
+ "file_path": str(Path(file_path)),
307
+ "error_message": str(e),
308
+ }
309
+ raise error from e
300
310
 
301
311
 
302
312
  def _extract_tables_in_process(
@@ -449,6 +459,17 @@ def _extract_tables_isolated(
449
459
  return tables
450
460
 
451
461
  error_info = result
462
+ if error_info.get("type") == "ImportError":
463
+ error = MissingDependencyError.create_for_package(
464
+ dependency_group="gmft", functionality="table extraction", package_name="gmft"
465
+ )
466
+ error.context = {
467
+ "file_path": str(Path(file_path)),
468
+ "error_message": error_info["error"],
469
+ "traceback": error_info.get("traceback"),
470
+ }
471
+ raise error from ImportError(error_info["error"])
472
+
452
473
  raise ParsingError(
453
474
  f"GMFT table extraction failed: {error_info['error']}",
454
475
  context={
@@ -536,6 +557,17 @@ async def _extract_tables_isolated_async(
536
557
  return tables
537
558
 
538
559
  error_info = result
560
+ if error_info.get("type") == "ImportError":
561
+ error = MissingDependencyError.create_for_package(
562
+ dependency_group="gmft", functionality="table extraction", package_name="gmft"
563
+ )
564
+ error.context = {
565
+ "file_path": str(Path(file_path)),
566
+ "error_message": error_info["error"],
567
+ "traceback": error_info.get("traceback"),
568
+ }
569
+ raise error from ImportError(error_info["error"])
570
+
539
571
  raise ParsingError(
540
572
  f"GMFT table extraction failed: {error_info['error']}",
541
573
  context={
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
+ import warnings
4
5
  from collections.abc import Awaitable, Callable, Mapping
5
6
  from dataclasses import asdict, dataclass, field
6
7
  from enum import Enum
@@ -262,6 +263,15 @@ class PaddleOCRConfig(ConfigDict):
262
263
 
263
264
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
264
265
  class GMFTConfig(ConfigDict):
266
+ def __post_init__(self) -> None:
267
+ warnings.warn(
268
+ "GMFTConfig is deprecated and will be removed in Kreuzberg v4.0. "
269
+ "Install `kreuzberg[gmft]` only if you still rely on GMFT. "
270
+ "Future versions use native TATR-based table extraction via TableExtractionConfig.",
271
+ FutureWarning,
272
+ stacklevel=2,
273
+ )
274
+
265
275
  verbosity: int = 0
266
276
  """
267
277
  Verbosity level for logging.
@@ -4,7 +4,7 @@ requires = [ "uv-build" ]
4
4
 
5
5
  [project]
6
6
  name = "kreuzberg"
7
- version = "3.20.1"
7
+ version = "3.20.2"
8
8
  description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
9
9
  readme = "README.md"
10
10
  keywords = [
@@ -59,7 +59,7 @@ dependencies = [
59
59
  "anyio>=4.11.0",
60
60
  "chardetng-py>=0.3.5",
61
61
  "exceptiongroup>=1.2.2; python_version<'3.11'",
62
- "html-to-markdown>=2.1.0",
62
+ "html-to-markdown>=2.1.2",
63
63
  "langcodes>=3.5.0",
64
64
  "mcp>=1.17.0",
65
65
  "msgspec>=0.18.0",
File without changes
File without changes