kreuzberg 3.20.1__py3-none-any.whl → 3.20.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_gmft.py CHANGED
@@ -193,9 +193,14 @@ async def extract_tables(
193
193
  await run_sync(doc.close)
194
194
 
195
195
  except ImportError as e: # pragma: no cover
196
- raise MissingDependencyError.create_for_package(
196
+ error = MissingDependencyError.create_for_package(
197
197
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
198
- ) from e
198
+ )
199
+ error.context = {
200
+ "file_path": str(Path(file_path)),
201
+ "error_message": str(e),
202
+ }
203
+ raise error from e
199
204
  finally:
200
205
  table_cache.mark_complete(**cache_kwargs)
201
206
 
@@ -294,9 +299,14 @@ def extract_tables_sync(
294
299
  doc.close() # type: ignore[no-untyped-call]
295
300
 
296
301
  except ImportError as e: # pragma: no cover
297
- raise MissingDependencyError.create_for_package(
302
+ error = MissingDependencyError.create_for_package(
298
303
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
299
- ) from e
304
+ )
305
+ error.context = {
306
+ "file_path": str(Path(file_path)),
307
+ "error_message": str(e),
308
+ }
309
+ raise error from e
300
310
 
301
311
 
302
312
  def _extract_tables_in_process(
@@ -449,6 +459,17 @@ def _extract_tables_isolated(
449
459
  return tables
450
460
 
451
461
  error_info = result
462
+ if error_info.get("type") == "ImportError":
463
+ error = MissingDependencyError.create_for_package(
464
+ dependency_group="gmft", functionality="table extraction", package_name="gmft"
465
+ )
466
+ error.context = {
467
+ "file_path": str(Path(file_path)),
468
+ "error_message": error_info["error"],
469
+ "traceback": error_info.get("traceback"),
470
+ }
471
+ raise error from ImportError(error_info["error"])
472
+
452
473
  raise ParsingError(
453
474
  f"GMFT table extraction failed: {error_info['error']}",
454
475
  context={
@@ -536,6 +557,17 @@ async def _extract_tables_isolated_async(
536
557
  return tables
537
558
 
538
559
  error_info = result
560
+ if error_info.get("type") == "ImportError":
561
+ error = MissingDependencyError.create_for_package(
562
+ dependency_group="gmft", functionality="table extraction", package_name="gmft"
563
+ )
564
+ error.context = {
565
+ "file_path": str(Path(file_path)),
566
+ "error_message": error_info["error"],
567
+ "traceback": error_info.get("traceback"),
568
+ }
569
+ raise error from ImportError(error_info["error"])
570
+
539
571
  raise ParsingError(
540
572
  f"GMFT table extraction failed: {error_info['error']}",
541
573
  context={
kreuzberg/_types.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
+ import warnings
4
5
  from collections.abc import Awaitable, Callable, Mapping
5
6
  from dataclasses import asdict, dataclass, field
6
7
  from enum import Enum
@@ -262,6 +263,15 @@ class PaddleOCRConfig(ConfigDict):
262
263
 
263
264
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
264
265
  class GMFTConfig(ConfigDict):
266
+ def __post_init__(self) -> None:
267
+ warnings.warn(
268
+ "GMFTConfig is deprecated and will be removed in Kreuzberg v4.0. "
269
+ "Install `kreuzberg[gmft]` only if you still rely on GMFT. "
270
+ "Future versions use native TATR-based table extraction via TableExtractionConfig.",
271
+ FutureWarning,
272
+ stacklevel=2,
273
+ )
274
+
265
275
  verbosity: int = 0
266
276
  """
267
277
  Verbosity level for logging.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: kreuzberg
3
- Version: 3.20.1
3
+ Version: 3.20.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
6
6
  Author: Na'aman Hirschfeld
@@ -28,7 +28,7 @@ Classifier: Typing :: Typed
28
28
  Requires-Dist: anyio>=4.11.0
29
29
  Requires-Dist: chardetng-py>=0.3.5
30
30
  Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
31
- Requires-Dist: html-to-markdown>=2.1.0
31
+ Requires-Dist: html-to-markdown>=2.1.2
32
32
  Requires-Dist: langcodes>=3.5.0
33
33
  Requires-Dist: mcp>=1.17.0
34
34
  Requires-Dist: msgspec>=0.18.0
@@ -19,7 +19,7 @@ kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o
19
19
  kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
20
20
  kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
21
21
  kreuzberg/_extractors/_structured.py,sha256=thpXhsBnvaHzGQX4sy6eVHowFv0yaYxLGHwxx4DouCI,8947
22
- kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
22
+ kreuzberg/_gmft.py,sha256=0jmCTEs9EV78awlpCvmfLWAEiiq9n9Qx3n3FnCboTgg,22119
23
23
  kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD2Ks,1143
24
24
  kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
25
25
  kreuzberg/_mcp/server.py,sha256=71MhjiFDwgFROdGejf0djgO1eG370qudWmZsN59CUeA,16743
@@ -99,7 +99,7 @@ kreuzberg/_token_reduction/stopwords/vi_stopwords.json,sha256=UOyAEKBwMcQV65QGpQ
99
99
  kreuzberg/_token_reduction/stopwords/yo_stopwords.json,sha256=60liY89h7KReEvHEPxe-hCWLPuqr4U89aQDCi7iRCfo,651
100
100
  kreuzberg/_token_reduction/stopwords/zh_stopwords.json,sha256=rouSTCkXun90Q1aCvLjHyt4I7pGrtlcruDpNVybpAMI,8934
101
101
  kreuzberg/_token_reduction/stopwords/zu_stopwords.json,sha256=hfm4E2EDI_VWyR0GUOVjcMQA7ZDH7FsV4FUMcns1H28,324
102
- kreuzberg/_types.py,sha256=eh4bZFG3jIw5GhfC3u4R0aa_y9niKZDI4O93j0MCZGw,53672
102
+ kreuzberg/_types.py,sha256=qh4S9PhIZkvueTCgvO1d3lKmy5pWYfuhHpoGcoIEyYY,54061
103
103
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
104
104
  kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14110
105
105
  kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
@@ -121,7 +121,7 @@ kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
121
121
  kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
122
122
  kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
123
123
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
- kreuzberg-3.20.1.dist-info/WHEEL,sha256=X16MKk8bp2DRsAuyteHJ-9qOjzmnY0x1aj0P1ftqqWA,78
125
- kreuzberg-3.20.1.dist-info/entry_points.txt,sha256=nFqXF3_dNlhHuvtNX26GKKutjOjH2a7TUiaYSu2eOAk,92
126
- kreuzberg-3.20.1.dist-info/METADATA,sha256=FIG_Res9xL_wupiTlRzQ6sdofF-UeJ_z9_8uArvjCO0,11923
127
- kreuzberg-3.20.1.dist-info/RECORD,,
124
+ kreuzberg-3.20.2.dist-info/WHEEL,sha256=X16MKk8bp2DRsAuyteHJ-9qOjzmnY0x1aj0P1ftqqWA,78
125
+ kreuzberg-3.20.2.dist-info/entry_points.txt,sha256=nFqXF3_dNlhHuvtNX26GKKutjOjH2a7TUiaYSu2eOAk,92
126
+ kreuzberg-3.20.2.dist-info/METADATA,sha256=vR4TfFvm5tx66RV7dS0CXRCTmCFksHb8Jyhy-3FHNUQ,11923
127
+ kreuzberg-3.20.2.dist-info/RECORD,,