kreuzberg 3.11.1__py3-none-any.whl → 3.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_gmft.py +28 -10
- {kreuzberg-3.11.1.dist-info → kreuzberg-3.11.3.dist-info}/METADATA +8 -8
- {kreuzberg-3.11.1.dist-info → kreuzberg-3.11.3.dist-info}/RECORD +6 -6
- {kreuzberg-3.11.1.dist-info → kreuzberg-3.11.3.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.1.dist-info → kreuzberg-3.11.3.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.1.dist-info → kreuzberg-3.11.3.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_gmft.py
CHANGED
@@ -444,14 +444,26 @@ def _extract_tables_in_process(
|
|
444
444
|
cropped_image.save(img_bytes, format="PNG")
|
445
445
|
img_bytes.seek(0)
|
446
446
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
447
|
+
if data_frame.empty:
|
448
|
+
results.append(
|
449
|
+
{
|
450
|
+
"cropped_image_bytes": img_bytes.getvalue(),
|
451
|
+
"page_number": cropped_table.page.page_number,
|
452
|
+
"text": data_frame.to_markdown(),
|
453
|
+
"df_columns": data_frame.columns.tolist(),
|
454
|
+
"df_csv": None,
|
455
|
+
}
|
456
|
+
)
|
457
|
+
else:
|
458
|
+
results.append(
|
459
|
+
{
|
460
|
+
"cropped_image_bytes": img_bytes.getvalue(),
|
461
|
+
"page_number": cropped_table.page.page_number,
|
462
|
+
"text": data_frame.to_markdown(),
|
463
|
+
"df_columns": None,
|
464
|
+
"df_csv": data_frame.to_csv(index=False),
|
465
|
+
}
|
466
|
+
)
|
455
467
|
|
456
468
|
result_queue.put((True, results))
|
457
469
|
|
@@ -532,7 +544,10 @@ def _extract_tables_isolated(
|
|
532
544
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
533
545
|
import pandas as pd # noqa: PLC0415
|
534
546
|
|
535
|
-
|
547
|
+
if table_dict["df_csv"] is None:
|
548
|
+
df = pd.DataFrame(columns=table_dict["df_columns"])
|
549
|
+
else:
|
550
|
+
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
536
551
|
|
537
552
|
tables.append(
|
538
553
|
TableData(
|
@@ -638,7 +653,10 @@ async def _extract_tables_isolated_async(
|
|
638
653
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
639
654
|
import pandas as pd # noqa: PLC0415
|
640
655
|
|
641
|
-
|
656
|
+
if table_dict["df_csv"] is None:
|
657
|
+
df = pd.DataFrame(columns=table_dict["df_columns"])
|
658
|
+
else:
|
659
|
+
df = pd.read_csv(StringIO(table_dict["df_csv"]))
|
642
660
|
|
643
661
|
tables.append(
|
644
662
|
TableData(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.11.1
|
3
|
+
Version: 3.11.3
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.10.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
-
Requires-Dist: mcp>=1.
|
35
|
+
Requires-Dist: mcp>=1.13.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: playa-pdf>=0.7.0
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
@@ -52,9 +52,9 @@ Requires-Dist: gmft>=0.4.2; extra == 'all'
|
|
52
52
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
53
53
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
54
54
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
55
|
-
Requires-Dist: paddleocr>=3.
|
56
|
-
Requires-Dist: paddlepaddle>=3.1.
|
57
|
-
Requires-Dist: pandas>=2.3.
|
55
|
+
Requires-Dist: paddleocr>=3.2.0; extra == 'all'
|
56
|
+
Requires-Dist: paddlepaddle>=3.1.1; extra == 'all'
|
57
|
+
Requires-Dist: pandas>=2.3.2; extra == 'all'
|
58
58
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
59
59
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
60
60
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
@@ -73,7 +73,7 @@ Provides-Extra: crypto
|
|
73
73
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
74
74
|
Provides-Extra: document-classification
|
75
75
|
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
76
|
-
Requires-Dist: pandas>=2.3.
|
76
|
+
Requires-Dist: pandas>=2.3.2; extra == 'document-classification'
|
77
77
|
Provides-Extra: easyocr
|
78
78
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
79
79
|
Provides-Extra: entity-extraction
|
@@ -84,8 +84,8 @@ Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
|
84
84
|
Provides-Extra: langdetect
|
85
85
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
86
86
|
Provides-Extra: paddleocr
|
87
|
-
Requires-Dist: paddleocr>=3.
|
88
|
-
Requires-Dist: paddlepaddle>=3.1.
|
87
|
+
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
88
|
+
Requires-Dist: paddlepaddle>=3.1.1; extra == 'paddleocr'
|
89
89
|
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
90
90
|
Description-Content-Type: text/markdown
|
91
91
|
|
@@ -5,7 +5,7 @@ kreuzberg/_config.py,sha256=Au521UiR7vcQs_8_hhoWIfmDDMJIrDM3XZUB_qHfCmo,14035
|
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
6
|
kreuzberg/_document_classification.py,sha256=qFGmwvUMhnNAvNNJO7E-huPx-Ps-_DWxdNxsozIzgaw,6870
|
7
7
|
kreuzberg/_entity_extraction.py,sha256=Oa1T-9mptimpOHtcda-GtrVYH9PFy7DSJj3thJZUD7k,7902
|
8
|
-
kreuzberg/_gmft.py,sha256=
|
8
|
+
kreuzberg/_gmft.py,sha256=6P4gSSmU39puaYAKmdGr9ALf0USYTwRDuvvhG1LmI24,26441
|
9
9
|
kreuzberg/_language_detection.py,sha256=_Ng2aHgPxOHFgd507gVNiIGVmnxxbpgYwsO0bD0yTzg,3315
|
10
10
|
kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
|
11
11
|
kreuzberg/_playa.py,sha256=_IPrUSWwSfDQlWXOpKlauV0D9MhGrujGP5kmQ0U3L0g,12188
|
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
|
|
47
47
|
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
48
48
|
kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
|
49
49
|
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
50
|
-
kreuzberg-3.11.
|
51
|
-
kreuzberg-3.11.
|
52
|
-
kreuzberg-3.11.
|
53
|
-
kreuzberg-3.11.
|
54
|
-
kreuzberg-3.11.
|
50
|
+
kreuzberg-3.11.3.dist-info/METADATA,sha256=JOaIdR8UamST2qu5b0zJnk8qvyg5ly8LI7DKaFszUEs,12136
|
51
|
+
kreuzberg-3.11.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
52
|
+
kreuzberg-3.11.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
53
|
+
kreuzberg-3.11.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
54
|
+
kreuzberg-3.11.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|