kreuzberg 3.11.0__py3-none-any.whl → 3.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_gmft.py CHANGED
@@ -444,14 +444,26 @@ def _extract_tables_in_process(
444
444
  cropped_image.save(img_bytes, format="PNG")
445
445
  img_bytes.seek(0)
446
446
 
447
- results.append(
448
- {
449
- "cropped_image_bytes": img_bytes.getvalue(),
450
- "page_number": cropped_table.page.page_number,
451
- "text": data_frame.to_markdown(),
452
- "df_csv": data_frame.to_csv(index=False),
453
- }
454
- )
447
+ if data_frame.empty:
448
+ results.append(
449
+ {
450
+ "cropped_image_bytes": img_bytes.getvalue(),
451
+ "page_number": cropped_table.page.page_number,
452
+ "text": data_frame.to_markdown(),
453
+ "df_columns": data_frame.columns.tolist(),
454
+ "df_csv": None,
455
+ }
456
+ )
457
+ else:
458
+ results.append(
459
+ {
460
+ "cropped_image_bytes": img_bytes.getvalue(),
461
+ "page_number": cropped_table.page.page_number,
462
+ "text": data_frame.to_markdown(),
463
+ "df_columns": None,
464
+ "df_csv": data_frame.to_csv(index=False),
465
+ }
466
+ )
455
467
 
456
468
  result_queue.put((True, results))
457
469
 
@@ -532,7 +544,10 @@ def _extract_tables_isolated(
532
544
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
533
545
  import pandas as pd # noqa: PLC0415
534
546
 
535
- df = pd.read_csv(StringIO(table_dict["df_csv"]))
547
+ if table_dict["df_csv"] is None:
548
+ df = pd.DataFrame(columns=table_dict["df_columns"])
549
+ else:
550
+ df = pd.read_csv(StringIO(table_dict["df_csv"]))
536
551
 
537
552
  tables.append(
538
553
  TableData(
@@ -638,7 +653,10 @@ async def _extract_tables_isolated_async(
638
653
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
639
654
  import pandas as pd # noqa: PLC0415
640
655
 
641
- df = pd.read_csv(StringIO(table_dict["df_csv"]))
656
+ if table_dict["df_csv"] is None:
657
+ df = pd.DataFrame(columns=table_dict["df_columns"])
658
+ else:
659
+ df = pd.read_csv(StringIO(table_dict["df_csv"]))
642
660
 
643
661
  tables.append(
644
662
  TableData(
@@ -4,7 +4,6 @@ import warnings
4
4
  from dataclasses import dataclass
5
5
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
6
6
 
7
- import numpy as np
8
7
  from PIL import Image
9
8
 
10
9
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -188,6 +187,9 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
188
187
 
189
188
  kwargs.pop("language", None)
190
189
  kwargs.pop("use_gpu", None)
190
+ kwargs.pop("device", None)
191
+ kwargs.pop("gpu_memory_limit", None)
192
+ kwargs.pop("fallback_to_cpu", None)
191
193
 
192
194
  try:
193
195
  result = await run_sync(
@@ -455,11 +457,16 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
455
457
  Raises:
456
458
  OCRError: If OCR processing fails.
457
459
  """
460
+ import numpy as np # noqa: PLC0415
461
+
458
462
  self._init_easyocr_sync(**kwargs)
459
463
 
460
464
  beam_width = kwargs.pop("beam_width")
461
465
  kwargs.pop("language", None)
462
466
  kwargs.pop("use_gpu", None)
467
+ kwargs.pop("device", None)
468
+ kwargs.pop("gpu_memory_limit", None)
469
+ kwargs.pop("fallback_to_cpu", None)
463
470
 
464
471
  try:
465
472
  result = self._reader.readtext(
@@ -7,7 +7,6 @@ from importlib.util import find_spec
7
7
  from pathlib import Path
8
8
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
9
9
 
10
- import numpy as np
11
10
  from PIL import Image
12
11
 
13
12
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -380,6 +379,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
380
379
  Raises:
381
380
  OCRError: If OCR processing fails.
382
381
  """
382
+ import numpy as np # noqa: PLC0415
383
+
383
384
  self._init_paddle_ocr_sync(**kwargs)
384
385
 
385
386
  if image.mode != "RGB":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.11.0
3
+ Version: 3.11.2
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,13 +28,13 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.9.0
31
+ Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
- Requires-Dist: mcp>=1.12.3
35
+ Requires-Dist: mcp>=1.13.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
- Requires-Dist: playa-pdf>=0.6.4
37
+ Requires-Dist: playa-pdf>=0.7.0
38
38
  Requires-Dist: psutil>=7.0.0
39
39
  Requires-Dist: pypdfium2==4.30.0
40
40
  Requires-Dist: python-calamine>=0.3.2
@@ -50,19 +50,19 @@ Requires-Dist: easyocr>=1.7.2; extra == 'all'
50
50
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
51
51
  Requires-Dist: gmft>=0.4.2; extra == 'all'
52
52
  Requires-Dist: keybert>=0.9.0; extra == 'all'
53
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
53
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
54
54
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
55
- Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
+ Requires-Dist: paddleocr>=3.1.1; extra == 'all'
56
56
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
57
57
  Requires-Dist: pandas>=2.3.1; extra == 'all'
58
- Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
58
+ Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
59
59
  Requires-Dist: rich>=14.1.0; extra == 'all'
60
60
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
61
61
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
62
62
  Requires-Dist: spacy>=3.8.7; extra == 'all'
63
63
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
64
64
  Provides-Extra: api
65
- Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
65
+ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
66
66
  Provides-Extra: chunking
67
67
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
68
68
  Provides-Extra: cli
@@ -70,7 +70,7 @@ Requires-Dist: click>=8.2.1; extra == 'cli'
70
70
  Requires-Dist: rich>=14.1.0; extra == 'cli'
71
71
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
72
72
  Provides-Extra: crypto
73
- Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
73
+ Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
74
74
  Provides-Extra: document-classification
75
75
  Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
76
76
  Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
@@ -84,7 +84,7 @@ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
84
84
  Provides-Extra: langdetect
85
85
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
86
86
  Provides-Extra: paddleocr
87
- Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
87
+ Requires-Dist: paddleocr>=3.1.1; extra == 'paddleocr'
88
88
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
89
89
  Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
90
90
  Description-Content-Type: text/markdown
@@ -5,7 +5,7 @@ kreuzberg/_config.py,sha256=Au521UiR7vcQs_8_hhoWIfmDDMJIrDM3XZUB_qHfCmo,14035
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
6
  kreuzberg/_document_classification.py,sha256=qFGmwvUMhnNAvNNJO7E-huPx-Ps-_DWxdNxsozIzgaw,6870
7
7
  kreuzberg/_entity_extraction.py,sha256=Oa1T-9mptimpOHtcda-GtrVYH9PFy7DSJj3thJZUD7k,7902
8
- kreuzberg/_gmft.py,sha256=HdQ7Xpuixxl2Y0jY8C3KfyQEU0mN4yQdqErWCv4TnFY,25573
8
+ kreuzberg/_gmft.py,sha256=6P4gSSmU39puaYAKmdGr9ALf0USYTwRDuvvhG1LmI24,26441
9
9
  kreuzberg/_language_detection.py,sha256=_Ng2aHgPxOHFgd507gVNiIGVmnxxbpgYwsO0bD0yTzg,3315
10
10
  kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
11
11
  kreuzberg/_playa.py,sha256=_IPrUSWwSfDQlWXOpKlauV0D9MhGrujGP5kmQ0U3L0g,12188
@@ -31,8 +31,8 @@ kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
31
31
  kreuzberg/_mcp/server.py,sha256=Dxed80MqZsYCFyYo0QdArpKE4H8DhpKY34fijdzV5uw,8731
32
32
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
33
33
  kreuzberg/_ocr/_base.py,sha256=IkONqwG6zxZoVMni1JlYugBoyONahlRny7J2_7Dy69c,3953
34
- kreuzberg/_ocr/_easyocr.py,sha256=dWfoj5fPIGqJPGTVeZ0W59TrW3DpNwF0bcfgt6FwQUw,17238
35
- kreuzberg/_ocr/_paddleocr.py,sha256=Is_iJQaSUeCMfCvg5RnuG_pmBRjBt0b3dCBPY1IAc3A,17583
34
+ kreuzberg/_ocr/_easyocr.py,sha256=eU4MA_B_-cvq_IhpCeYUruL_kqcfm8maNZKP7zvVQHI,17512
35
+ kreuzberg/_ocr/_paddleocr.py,sha256=I7ns6L56a2Ol460Bge6e0hpc2AkkwDepLcpCsABj5Dc,17609
36
36
  kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
37
37
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
47
47
  kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
48
48
  kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
49
49
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
- kreuzberg-3.11.0.dist-info/METADATA,sha256=pvyRM3TAmXE3TnYaNOZ1chD_IQTgWn254wxnqDsy6EM,12135
51
- kreuzberg-3.11.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
- kreuzberg-3.11.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
- kreuzberg-3.11.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
- kreuzberg-3.11.0.dist-info/RECORD,,
50
+ kreuzberg-3.11.2.dist-info/METADATA,sha256=J-UWkai5WTH0ECDX_kcp-1H45Qxa1rFgYlgcRbdE4zE,12136
51
+ kreuzberg-3.11.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
+ kreuzberg-3.11.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
+ kreuzberg-3.11.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
+ kreuzberg-3.11.2.dist-info/RECORD,,