unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +20 -32
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/METADATA +12 -12
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/RECORD +8 -8
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.3.7" # pragma: no cover
|
|
1
|
+
__version__ = "0.3.8" # pragma: no cover
|
|
@@ -6,7 +6,10 @@ from typing import TYPE_CHECKING, Any, Optional
|
|
|
6
6
|
from pydantic import Field, Secret
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.error import DestinationConnectionError
|
|
9
|
-
from unstructured_ingest.utils.data_prep import
|
|
9
|
+
from unstructured_ingest.utils.data_prep import (
|
|
10
|
+
flatten_dict,
|
|
11
|
+
generator_batching_wbytes,
|
|
12
|
+
)
|
|
10
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
14
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
12
15
|
from unstructured_ingest.v2.interfaces import (
|
|
@@ -148,8 +151,10 @@ class PineconeUploadStager(UploadStager):
|
|
|
148
151
|
|
|
149
152
|
metadata[RECORD_ID_LABEL] = file_data.identifier
|
|
150
153
|
|
|
154
|
+
# To support more optimal deletes, a prefix is suggested for each record:
|
|
155
|
+
# https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
|
|
151
156
|
return {
|
|
152
|
-
"id": get_enhanced_element_id(element_dict=element_dict, file_data=file_data),
|
|
157
|
+
"id": f"{file_data.identifier}#{get_enhanced_element_id(element_dict=element_dict, file_data=file_data)}", # noqa:E501
|
|
153
158
|
"values": embeddings,
|
|
154
159
|
"metadata": metadata,
|
|
155
160
|
}
|
|
@@ -215,18 +220,6 @@ class PineconeUploader(Uploader):
|
|
|
215
220
|
f"from pinecone index: {resp}"
|
|
216
221
|
)
|
|
217
222
|
|
|
218
|
-
def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
|
|
219
|
-
while True:
|
|
220
|
-
query_results = index.query(**query_params)
|
|
221
|
-
matches = query_results.get("matches", [])
|
|
222
|
-
if not matches:
|
|
223
|
-
break
|
|
224
|
-
ids = [match["id"] for match in matches]
|
|
225
|
-
delete_params = {"ids": ids}
|
|
226
|
-
if namespace := self.upload_config.namespace:
|
|
227
|
-
delete_params["namespace"] = namespace
|
|
228
|
-
index.delete(**delete_params)
|
|
229
|
-
|
|
230
223
|
def serverless_delete_by_record_id(self, file_data: FileData) -> None:
|
|
231
224
|
logger.debug(
|
|
232
225
|
f"deleting any content with metadata "
|
|
@@ -234,26 +227,21 @@ class PineconeUploader(Uploader):
|
|
|
234
227
|
f"from pinecone serverless index"
|
|
235
228
|
)
|
|
236
229
|
index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
query_params = {
|
|
245
|
-
"filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
|
|
246
|
-
"vector": [0] * dimension,
|
|
247
|
-
"top_k": top_k,
|
|
248
|
-
}
|
|
230
|
+
list_kwargs = {"prefix": f"{file_data.identifier}#"}
|
|
231
|
+
deleted_ids = 0
|
|
232
|
+
if namespace := self.upload_config.namespace:
|
|
233
|
+
list_kwargs["namespace"] = namespace
|
|
234
|
+
for ids in index.list(**list_kwargs):
|
|
235
|
+
deleted_ids += len(ids)
|
|
236
|
+
delete_kwargs = {"ids": ids}
|
|
249
237
|
if namespace := self.upload_config.namespace:
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
238
|
+
delete_resp = delete_kwargs["namespace"] = namespace
|
|
239
|
+
# delete_resp should be an empty dict if there were no errors
|
|
240
|
+
if delete_resp:
|
|
241
|
+
logger.error(f"failed to delete batch of ids: {delete_resp}")
|
|
242
|
+
index.delete(**delete_kwargs)
|
|
255
243
|
logger.info(
|
|
256
|
-
f"deleted {
|
|
244
|
+
f"deleted {deleted_ids} records with metadata "
|
|
257
245
|
f"{self.upload_config.record_id_key}={file_data.identifier} "
|
|
258
246
|
f"from pinecone index"
|
|
259
247
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.3.7
|
|
3
|
+
Version: 0.3.8
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.13
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
+
Requires-Dist: click
|
|
25
26
|
Requires-Dist: pandas
|
|
26
|
-
Requires-Dist: tqdm
|
|
27
27
|
Requires-Dist: pydantic>=2.7
|
|
28
28
|
Requires-Dist: opentelemetry-sdk
|
|
29
|
+
Requires-Dist: tqdm
|
|
29
30
|
Requires-Dist: dataclasses-json
|
|
30
|
-
Requires-Dist: click
|
|
31
31
|
Requires-Dist: python-dateutil
|
|
32
32
|
Provides-Extra: airtable
|
|
33
33
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
|
|
|
51
51
|
Provides-Extra: clarifai
|
|
52
52
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
53
53
|
Provides-Extra: confluence
|
|
54
|
-
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
55
54
|
Requires-Dist: requests; extra == "confluence"
|
|
55
|
+
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
56
56
|
Provides-Extra: couchbase
|
|
57
57
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
58
58
|
Provides-Extra: csv
|
|
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
|
|
|
60
60
|
Provides-Extra: databricks-volumes
|
|
61
61
|
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
62
62
|
Provides-Extra: delta-table
|
|
63
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
64
63
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
64
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
65
65
|
Provides-Extra: discord
|
|
66
66
|
Requires-Dist: discord-py; extra == "discord"
|
|
67
67
|
Provides-Extra: doc
|
|
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
|
|
|
69
69
|
Provides-Extra: docx
|
|
70
70
|
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
71
71
|
Provides-Extra: dropbox
|
|
72
|
-
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
73
72
|
Requires-Dist: fsspec; extra == "dropbox"
|
|
73
|
+
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
74
74
|
Provides-Extra: duckdb
|
|
75
75
|
Requires-Dist: duckdb; extra == "duckdb"
|
|
76
76
|
Provides-Extra: elasticsearch
|
|
@@ -89,9 +89,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
|
89
89
|
Provides-Extra: epub
|
|
90
90
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
91
91
|
Provides-Extra: gcs
|
|
92
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
93
92
|
Requires-Dist: fsspec; extra == "gcs"
|
|
94
93
|
Requires-Dist: bs4; extra == "gcs"
|
|
94
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
95
95
|
Provides-Extra: github
|
|
96
96
|
Requires-Dist: requests; extra == "github"
|
|
97
97
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
@@ -119,16 +119,16 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
119
119
|
Provides-Extra: msg
|
|
120
120
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
121
121
|
Provides-Extra: notion
|
|
122
|
-
Requires-Dist:
|
|
122
|
+
Requires-Dist: httpx; extra == "notion"
|
|
123
123
|
Requires-Dist: backoff; extra == "notion"
|
|
124
124
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
125
|
-
Requires-Dist:
|
|
125
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
126
126
|
Provides-Extra: odt
|
|
127
127
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
128
128
|
Provides-Extra: onedrive
|
|
129
129
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
130
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
131
130
|
Requires-Dist: bs4; extra == "onedrive"
|
|
131
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
132
132
|
Provides-Extra: openai
|
|
133
133
|
Requires-Dist: tiktoken; extra == "openai"
|
|
134
134
|
Requires-Dist: openai; extra == "openai"
|
|
@@ -160,8 +160,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
|
|
|
160
160
|
Provides-Extra: rtf
|
|
161
161
|
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
162
162
|
Provides-Extra: s3
|
|
163
|
-
Requires-Dist: s3fs; extra == "s3"
|
|
164
163
|
Requires-Dist: fsspec; extra == "s3"
|
|
164
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
165
165
|
Provides-Extra: salesforce
|
|
166
166
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
167
167
|
Provides-Extra: sftp
|
|
@@ -175,8 +175,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
|
|
|
175
175
|
Provides-Extra: slack
|
|
176
176
|
Requires-Dist: slack-sdk[optional]; extra == "slack"
|
|
177
177
|
Provides-Extra: snowflake
|
|
178
|
-
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
179
178
|
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
179
|
+
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
180
180
|
Provides-Extra: togetherai
|
|
181
181
|
Requires-Dist: together; extra == "togetherai"
|
|
182
182
|
Provides-Extra: tsv
|
|
@@ -87,7 +87,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
87
87
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
88
88
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
89
89
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
90
|
-
unstructured_ingest/__version__.py,sha256=
|
|
90
|
+
unstructured_ingest/__version__.py,sha256=91iYzXMGhyMPaHI-Y8piVjcrk2AXRekqcknDzOkyQzk,42
|
|
91
91
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
92
92
|
unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
|
|
93
93
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -415,7 +415,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqA
|
|
|
415
415
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
|
|
416
416
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
|
|
417
417
|
unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
|
|
418
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
418
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=BLi9wQzoAnt61m2vOa0xGvmR04kBH_tw9EV9xIw2O_Y,11629
|
|
419
419
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
|
|
420
420
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
|
|
421
421
|
unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
|
|
@@ -469,9 +469,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
|
|
|
469
469
|
unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
|
|
470
470
|
unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
|
|
471
471
|
unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
|
|
472
|
-
unstructured_ingest-0.3.
|
|
473
|
-
unstructured_ingest-0.3.
|
|
474
|
-
unstructured_ingest-0.3.
|
|
475
|
-
unstructured_ingest-0.3.
|
|
476
|
-
unstructured_ingest-0.3.
|
|
477
|
-
unstructured_ingest-0.3.
|
|
472
|
+
unstructured_ingest-0.3.8.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
473
|
+
unstructured_ingest-0.3.8.dist-info/METADATA,sha256=HZRSyzQt9UJplhIiXEEsDrZGljY77nMG57SWRR0F3v8,7457
|
|
474
|
+
unstructured_ingest-0.3.8.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
475
|
+
unstructured_ingest-0.3.8.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
476
|
+
unstructured_ingest-0.3.8.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
|
|
477
|
+
unstructured_ingest-0.3.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.8.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|