kreuzberg 3.17.3__py3-none-any.whl → 3.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +45 -3
- kreuzberg/_entity_extraction.py +108 -18
- kreuzberg/_error_handling.py +182 -0
- kreuzberg/_extractors/_base.py +2 -2
- kreuzberg/_extractors/_html.py +2 -2
- kreuzberg/_extractors/_pdf.py +33 -54
- kreuzberg/_extractors/_structured.py +1 -1
- kreuzberg/_language_detection.py +2 -0
- kreuzberg/_ocr/_tesseract.py +28 -6
- kreuzberg/_types.py +18 -0
- kreuzberg/cli.py +36 -22
- kreuzberg/extraction.py +251 -107
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/METADATA +7 -4
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/RECORD +17 -16
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.19.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
28
|
Classifier: Topic :: Text Processing :: General
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.
|
31
|
+
Requires-Dist: anyio>=4.11.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.16.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.
|
36
|
+
Requires-Dist: mcp>=1.15.0
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
38
38
|
Requires-Dist: numpy>=2.0.0
|
39
39
|
Requires-Dist: playa-pdf>=0.7.0
|
@@ -42,6 +42,7 @@ Requires-Dist: psutil>=7.1.0
|
|
42
42
|
Requires-Dist: pypdfium2==4.30.0
|
43
43
|
Requires-Dist: python-calamine>=0.5.3
|
44
44
|
Requires-Dist: python-pptx>=1.0.2
|
45
|
+
Requires-Dist: transformers>=4.30.0
|
45
46
|
Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
|
46
47
|
Provides-Extra: additional-extensions
|
47
48
|
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
@@ -63,6 +64,7 @@ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
|
|
63
64
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
64
65
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
65
66
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
67
|
+
Requires-Dist: transformers>=4.25.0; extra == 'all'
|
66
68
|
Provides-Extra: api
|
67
69
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
68
70
|
Provides-Extra: chunking
|
@@ -82,6 +84,7 @@ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
|
82
84
|
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
83
85
|
Provides-Extra: gmft
|
84
86
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
87
|
+
Requires-Dist: transformers>=4.25.0; extra == 'gmft'
|
85
88
|
Provides-Extra: langdetect
|
86
89
|
Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
|
87
90
|
Provides-Extra: paddleocr
|
@@ -4,30 +4,31 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
|
|
4
4
|
kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
|
5
5
|
kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
6
6
|
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
|
-
kreuzberg/_entity_extraction.py,sha256=
|
7
|
+
kreuzberg/_entity_extraction.py,sha256=Ks-1gZIYDqgg2uJerd0FH_lYhjIwS0f0bMVhR9M59jA,7518
|
8
|
+
kreuzberg/_error_handling.py,sha256=Isr9yrY4JRKOmUVaUOky_LZ7tGVZAm8jxRD3qGbkc1g,5604
|
8
9
|
kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
|
9
|
-
kreuzberg/_language_detection.py,sha256=
|
10
|
+
kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD2Ks,1143
|
10
11
|
kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
|
11
12
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
13
|
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
|
-
kreuzberg/_types.py,sha256=
|
14
|
-
kreuzberg/cli.py,sha256=
|
14
|
+
kreuzberg/_types.py,sha256=6oBsmUUihVr4hJJrYeuWoUVzCP_-eciCrBVvGQHQTDI,49920
|
15
|
+
kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
|
15
16
|
kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
|
16
|
-
kreuzberg/extraction.py,sha256=
|
17
|
+
kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
|
17
18
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
19
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
20
|
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
20
|
-
kreuzberg/_api/main.py,sha256=
|
21
|
+
kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
|
21
22
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
-
kreuzberg/_extractors/_base.py,sha256=
|
23
|
+
kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
|
23
24
|
kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
|
24
|
-
kreuzberg/_extractors/_html.py,sha256=
|
25
|
+
kreuzberg/_extractors/_html.py,sha256=vNAgBrfok-16SOkhhsy10unqVwAczlTL_2KEn2X6S98,6315
|
25
26
|
kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
|
26
27
|
kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
|
27
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
28
|
+
kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
|
28
29
|
kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
|
29
30
|
kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
|
30
|
-
kreuzberg/_extractors/_structured.py,sha256=
|
31
|
+
kreuzberg/_extractors/_structured.py,sha256=thpXhsBnvaHzGQX4sy6eVHowFv0yaYxLGHwxx4DouCI,8947
|
31
32
|
kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
|
32
33
|
kreuzberg/_mcp/server.py,sha256=71MhjiFDwgFROdGejf0djgO1eG370qudWmZsN59CUeA,16743
|
33
34
|
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
@@ -35,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
|
|
35
36
|
kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
|
36
37
|
kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
|
37
38
|
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
38
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
39
|
+
kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
|
39
40
|
kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
|
40
41
|
kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
|
41
42
|
kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
|
@@ -121,8 +122,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
121
122
|
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
122
123
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
123
124
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
124
|
-
kreuzberg-3.
|
125
|
-
kreuzberg-3.
|
126
|
-
kreuzberg-3.
|
127
|
-
kreuzberg-3.
|
128
|
-
kreuzberg-3.
|
125
|
+
kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
|
126
|
+
kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
127
|
+
kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
128
|
+
kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
129
|
+
kreuzberg-3.19.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|