unstructured-ingest 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/neo4j.py +12 -4
- unstructured_ingest/v2/processes/connectors/utils.py +5 -2
- {unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/METADATA +22 -22
- {unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/RECORD +9 -9
- {unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.5.12" # pragma: no cover
|
|
1
|
+
__version__ = "0.5.13" # pragma: no cover
|
|
@@ -154,7 +154,7 @@ class Neo4jUploadStager(UploadStager):
|
|
|
154
154
|
self._add_entities(element, graph, element_node)
|
|
155
155
|
|
|
156
156
|
if self._is_chunk(element):
|
|
157
|
-
for origin_element in format_and_truncate_orig_elements(element):
|
|
157
|
+
for origin_element in format_and_truncate_orig_elements(element, include_text=True):
|
|
158
158
|
origin_element_node = self._create_element_node(origin_element)
|
|
159
159
|
|
|
160
160
|
graph.add_edge(
|
|
@@ -327,13 +327,16 @@ class Neo4jUploader(Uploader):
|
|
|
327
327
|
async def _create_vector_index(
|
|
328
328
|
self, client: AsyncDriver, dimensions: int, similarity_function: SimilarityFunction
|
|
329
329
|
) -> None:
|
|
330
|
+
import neo4j.exceptions
|
|
331
|
+
|
|
330
332
|
label = Label.CHUNK
|
|
331
333
|
logger.info(
|
|
332
334
|
f"Creating index on nodes labeled '{label.value}' if it does not already exist."
|
|
333
335
|
)
|
|
334
336
|
index_name = f"{label.value.lower()}_vector"
|
|
335
|
-
|
|
336
|
-
|
|
337
|
+
try:
|
|
338
|
+
await client.execute_query(
|
|
339
|
+
f"""
|
|
337
340
|
CREATE VECTOR INDEX {index_name} IF NOT EXISTS
|
|
338
341
|
FOR (n:{label.value}) ON n.embedding
|
|
339
342
|
OPTIONS {{indexConfig: {{
|
|
@@ -341,7 +344,12 @@ class Neo4jUploader(Uploader):
|
|
|
341
344
|
`vector.dimensions`: {dimensions}}}
|
|
342
345
|
}}
|
|
343
346
|
"""
|
|
344
|
-
|
|
347
|
+
)
|
|
348
|
+
except neo4j.exceptions.ClientError as e:
|
|
349
|
+
if e.code == "Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists":
|
|
350
|
+
logger.info(f"Index on nodes labeled '{label.value}' already exists.")
|
|
351
|
+
else:
|
|
352
|
+
raise
|
|
345
353
|
|
|
346
354
|
async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
|
|
347
355
|
logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
|
|
@@ -31,7 +31,9 @@ def conform_string_to_dict(value: Any) -> dict:
|
|
|
31
31
|
raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def format_and_truncate_orig_elements(
|
|
34
|
+
def format_and_truncate_orig_elements(
|
|
35
|
+
element: dict, include_text: bool = False
|
|
36
|
+
) -> list[dict[str, Any]]:
|
|
35
37
|
"""
|
|
36
38
|
This function is used to format and truncate the orig_elements field in the metadata.
|
|
37
39
|
This is used to remove the text field and other larger fields from the orig_elements
|
|
@@ -42,7 +44,8 @@ def format_and_truncate_orig_elements(element: dict) -> list[dict[str, Any]]:
|
|
|
42
44
|
orig_elements = []
|
|
43
45
|
if raw_orig_elements is not None:
|
|
44
46
|
for element in elements_from_base64_gzipped_json(raw_orig_elements):
|
|
45
|
-
|
|
47
|
+
if not include_text:
|
|
48
|
+
element.pop("text", None)
|
|
46
49
|
for prop in (
|
|
47
50
|
"image_base64",
|
|
48
51
|
"text_as_html",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.5.12
|
|
3
|
+
Version: 0.5.13
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
-
Requires-Dist: dataclasses_json
|
|
26
25
|
Requires-Dist: python-dateutil
|
|
27
|
-
Requires-Dist: click
|
|
28
|
-
Requires-Dist: pydantic>=2.7
|
|
29
|
-
Requires-Dist: tqdm
|
|
26
|
+
Requires-Dist: dataclasses_json
|
|
30
27
|
Requires-Dist: pandas
|
|
28
|
+
Requires-Dist: pydantic>=2.7
|
|
29
|
+
Requires-Dist: click
|
|
31
30
|
Requires-Dist: opentelemetry-sdk
|
|
31
|
+
Requires-Dist: tqdm
|
|
32
32
|
Provides-Extra: remote
|
|
33
33
|
Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
|
|
34
34
|
Provides-Extra: csv
|
|
@@ -66,23 +66,23 @@ Requires-Dist: pyairtable; extra == "airtable"
|
|
|
66
66
|
Provides-Extra: astradb
|
|
67
67
|
Requires-Dist: astrapy; extra == "astradb"
|
|
68
68
|
Provides-Extra: azure
|
|
69
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
70
69
|
Requires-Dist: adlfs; extra == "azure"
|
|
70
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
71
71
|
Provides-Extra: azure-ai-search
|
|
72
72
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
73
73
|
Provides-Extra: biomed
|
|
74
|
-
Requires-Dist: bs4; extra == "biomed"
|
|
75
74
|
Requires-Dist: requests; extra == "biomed"
|
|
75
|
+
Requires-Dist: bs4; extra == "biomed"
|
|
76
76
|
Provides-Extra: box
|
|
77
|
-
Requires-Dist: fsspec; extra == "box"
|
|
78
77
|
Requires-Dist: boxfs; extra == "box"
|
|
78
|
+
Requires-Dist: fsspec; extra == "box"
|
|
79
79
|
Provides-Extra: chroma
|
|
80
80
|
Requires-Dist: chromadb; extra == "chroma"
|
|
81
81
|
Provides-Extra: clarifai
|
|
82
82
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
83
83
|
Provides-Extra: confluence
|
|
84
|
-
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
85
84
|
Requires-Dist: requests; extra == "confluence"
|
|
85
|
+
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
86
86
|
Provides-Extra: couchbase
|
|
87
87
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
88
88
|
Provides-Extra: delta-table
|
|
@@ -98,19 +98,19 @@ Requires-Dist: duckdb; extra == "duckdb"
|
|
|
98
98
|
Provides-Extra: elasticsearch
|
|
99
99
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
100
100
|
Provides-Extra: gcs
|
|
101
|
-
Requires-Dist: bs4; extra == "gcs"
|
|
102
101
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
102
|
+
Requires-Dist: bs4; extra == "gcs"
|
|
103
103
|
Requires-Dist: fsspec; extra == "gcs"
|
|
104
104
|
Provides-Extra: github
|
|
105
|
-
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
106
105
|
Requires-Dist: requests; extra == "github"
|
|
106
|
+
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
107
107
|
Provides-Extra: gitlab
|
|
108
108
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
109
109
|
Provides-Extra: google-drive
|
|
110
110
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
111
111
|
Provides-Extra: hubspot
|
|
112
|
-
Requires-Dist: urllib3; extra == "hubspot"
|
|
113
112
|
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
113
|
+
Requires-Dist: urllib3; extra == "hubspot"
|
|
114
114
|
Provides-Extra: jira
|
|
115
115
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
116
116
|
Provides-Extra: kafka
|
|
@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
|
|
|
124
124
|
Provides-Extra: mongodb
|
|
125
125
|
Requires-Dist: pymongo; extra == "mongodb"
|
|
126
126
|
Provides-Extra: neo4j
|
|
127
|
+
Requires-Dist: cymple; extra == "neo4j"
|
|
127
128
|
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
128
129
|
Requires-Dist: networkx; extra == "neo4j"
|
|
129
|
-
Requires-Dist: cymple; extra == "neo4j"
|
|
130
130
|
Provides-Extra: notion
|
|
131
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
132
|
-
Requires-Dist: httpx; extra == "notion"
|
|
133
131
|
Requires-Dist: notion-client; extra == "notion"
|
|
132
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
134
133
|
Requires-Dist: backoff; extra == "notion"
|
|
134
|
+
Requires-Dist: httpx; extra == "notion"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
136
137
|
Requires-Dist: msal; extra == "onedrive"
|
|
137
138
|
Requires-Dist: bs4; extra == "onedrive"
|
|
138
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
139
139
|
Provides-Extra: opensearch
|
|
140
140
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
141
141
|
Provides-Extra: outlook
|
|
142
|
-
Requires-Dist: msal; extra == "outlook"
|
|
143
142
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
143
|
+
Requires-Dist: msal; extra == "outlook"
|
|
144
144
|
Provides-Extra: pinecone
|
|
145
145
|
Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
|
|
146
146
|
Provides-Extra: postgres
|
|
@@ -155,8 +155,8 @@ Provides-Extra: s3
|
|
|
155
155
|
Requires-Dist: fsspec; extra == "s3"
|
|
156
156
|
Requires-Dist: s3fs; extra == "s3"
|
|
157
157
|
Provides-Extra: sharepoint
|
|
158
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
159
158
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
159
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
160
160
|
Provides-Extra: salesforce
|
|
161
161
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
162
162
|
Provides-Extra: sftp
|
|
@@ -178,22 +178,22 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
|
178
178
|
Provides-Extra: singlestore
|
|
179
179
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
180
180
|
Provides-Extra: vectara
|
|
181
|
-
Requires-Dist: httpx; extra == "vectara"
|
|
182
181
|
Requires-Dist: requests; extra == "vectara"
|
|
183
182
|
Requires-Dist: aiofiles; extra == "vectara"
|
|
183
|
+
Requires-Dist: httpx; extra == "vectara"
|
|
184
184
|
Provides-Extra: vastdb
|
|
185
185
|
Requires-Dist: ibis; extra == "vastdb"
|
|
186
186
|
Requires-Dist: vastdb; extra == "vastdb"
|
|
187
187
|
Requires-Dist: pyarrow; extra == "vastdb"
|
|
188
188
|
Provides-Extra: zendesk
|
|
189
|
-
Requires-Dist: httpx; extra == "zendesk"
|
|
190
189
|
Requires-Dist: bs4; extra == "zendesk"
|
|
191
190
|
Requires-Dist: aiofiles; extra == "zendesk"
|
|
191
|
+
Requires-Dist: httpx; extra == "zendesk"
|
|
192
192
|
Provides-Extra: embed-huggingface
|
|
193
193
|
Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
194
194
|
Provides-Extra: embed-octoai
|
|
195
|
-
Requires-Dist: openai; extra == "embed-octoai"
|
|
196
195
|
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
196
|
+
Requires-Dist: openai; extra == "embed-octoai"
|
|
197
197
|
Provides-Extra: embed-vertexai
|
|
198
198
|
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
199
199
|
Provides-Extra: embed-voyageai
|
|
@@ -201,8 +201,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
201
201
|
Provides-Extra: embed-mixedbreadai
|
|
202
202
|
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
203
203
|
Provides-Extra: openai
|
|
204
|
-
Requires-Dist: openai; extra == "openai"
|
|
205
204
|
Requires-Dist: tiktoken; extra == "openai"
|
|
205
|
+
Requires-Dist: openai; extra == "openai"
|
|
206
206
|
Provides-Extra: bedrock
|
|
207
207
|
Requires-Dist: boto3; extra == "bedrock"
|
|
208
208
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
111
111
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
112
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
113
113
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
114
|
-
unstructured_ingest/__version__.py,sha256=
|
|
114
|
+
unstructured_ingest/__version__.py,sha256=SoXJvWNbP_dJpjjbYHn-IaGnsULa9X8yicnEFO_W3yI,43
|
|
115
115
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
116
116
|
unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
|
|
117
117
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -442,7 +442,7 @@ unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOG
|
|
|
442
442
|
unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
|
|
443
443
|
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
|
|
444
444
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
|
|
445
|
-
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=
|
|
445
|
+
unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=vxf6Xuh-OMS09Y-mIF0PIwrFauqRtoI7vjeLBXsFwTk,18744
|
|
446
446
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
|
|
447
447
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
448
448
|
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
|
|
@@ -450,7 +450,7 @@ unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bW
|
|
|
450
450
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
451
451
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
|
|
452
452
|
unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
|
|
453
|
-
unstructured_ingest/v2/processes/connectors/utils.py,sha256=
|
|
453
|
+
unstructured_ingest/v2/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
|
|
454
454
|
unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
|
|
455
455
|
unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
456
456
|
unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
|
|
@@ -575,9 +575,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnV
|
|
|
575
575
|
unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=XMNocKJ3FHDfy36p_KHhH7ALi0-ji6NhGuQNCV2E4vY,699
|
|
576
576
|
unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=MNyI6SUuxZHf_6zONoC6jR2f9BvyTYoMyGKDOhl4kgs,7897
|
|
577
577
|
unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=vQHZa5YYiDPXXPRAPMnPXhh0QzXeiBVx_YIWskZBQIc,15465
|
|
578
|
-
unstructured_ingest-0.5.12.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
579
|
-
unstructured_ingest-0.5.
|
|
580
|
-
unstructured_ingest-0.5.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
581
|
-
unstructured_ingest-0.5.12.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
582
|
-
unstructured_ingest-0.5.12.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
583
|
-
unstructured_ingest-0.5.12.dist-info/RECORD,,
|
|
578
|
+
unstructured_ingest-0.5.13.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
579
|
+
unstructured_ingest-0.5.13.dist-info/METADATA,sha256=K95xEzr8Tq75w90-ivlwvfFhRkNRTPnNmtIiRXDXhjs,8465
|
|
580
|
+
unstructured_ingest-0.5.13.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
581
|
+
unstructured_ingest-0.5.13.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
582
|
+
unstructured_ingest-0.5.13.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
583
|
+
unstructured_ingest-0.5.13.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.5.12.dist-info → unstructured_ingest-0.5.13.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|