unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.3.7" # pragma: no cover
1
+ __version__ = "0.3.8" # pragma: no cover
@@ -6,7 +6,10 @@ from typing import TYPE_CHECKING, Any, Optional
6
6
  from pydantic import Field, Secret
7
7
 
8
8
  from unstructured_ingest.error import DestinationConnectionError
9
- from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
9
+ from unstructured_ingest.utils.data_prep import (
10
+ flatten_dict,
11
+ generator_batching_wbytes,
12
+ )
10
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
11
14
  from unstructured_ingest.v2.constants import RECORD_ID_LABEL
12
15
  from unstructured_ingest.v2.interfaces import (
@@ -148,8 +151,10 @@ class PineconeUploadStager(UploadStager):
148
151
 
149
152
  metadata[RECORD_ID_LABEL] = file_data.identifier
150
153
 
154
+ # To support more optimal deletes, a prefix is suggested for each record:
155
+ # https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
151
156
  return {
152
- "id": get_enhanced_element_id(element_dict=element_dict, file_data=file_data),
157
+ "id": f"{file_data.identifier}#{get_enhanced_element_id(element_dict=element_dict, file_data=file_data)}", # noqa:E501
153
158
  "values": embeddings,
154
159
  "metadata": metadata,
155
160
  }
@@ -215,18 +220,6 @@ class PineconeUploader(Uploader):
215
220
  f"from pinecone index: {resp}"
216
221
  )
217
222
 
218
- def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
219
- while True:
220
- query_results = index.query(**query_params)
221
- matches = query_results.get("matches", [])
222
- if not matches:
223
- break
224
- ids = [match["id"] for match in matches]
225
- delete_params = {"ids": ids}
226
- if namespace := self.upload_config.namespace:
227
- delete_params["namespace"] = namespace
228
- index.delete(**delete_params)
229
-
230
223
  def serverless_delete_by_record_id(self, file_data: FileData) -> None:
231
224
  logger.debug(
232
225
  f"deleting any content with metadata "
@@ -234,26 +227,21 @@ class PineconeUploader(Uploader):
234
227
  f"from pinecone serverless index"
235
228
  )
236
229
  index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
237
- index_stats = index.describe_index_stats()
238
- dimension = index_stats["dimension"]
239
- total_vectors = index_stats["total_vector_count"]
240
- if total_vectors == 0:
241
- return
242
- while total_vectors > 0:
243
- top_k = min(total_vectors, MAX_QUERY_RESULTS)
244
- query_params = {
245
- "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
246
- "vector": [0] * dimension,
247
- "top_k": top_k,
248
- }
230
+ list_kwargs = {"prefix": f"{file_data.identifier}#"}
231
+ deleted_ids = 0
232
+ if namespace := self.upload_config.namespace:
233
+ list_kwargs["namespace"] = namespace
234
+ for ids in index.list(**list_kwargs):
235
+ deleted_ids += len(ids)
236
+ delete_kwargs = {"ids": ids}
249
237
  if namespace := self.upload_config.namespace:
250
- query_params["namespace"] = namespace
251
- self.delete_by_query(index=index, query_params=query_params)
252
- index_stats = index.describe_index_stats()
253
- total_vectors = index_stats["total_vector_count"]
254
-
238
+ delete_resp = delete_kwargs["namespace"] = namespace
239
+ # delete_resp should be an empty dict if there were no errors
240
+ if delete_resp:
241
+ logger.error(f"failed to delete batch of ids: {delete_resp}")
242
+ index.delete(**delete_kwargs)
255
243
  logger.info(
256
- f"deleted {total_vectors} records with metadata "
244
+ f"deleted {deleted_ids} records with metadata "
257
245
  f"{self.upload_config.record_id_key}={file_data.identifier} "
258
246
  f"from pinecone index"
259
247
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.3.7
3
+ Version: 0.3.8
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
+ Requires-Dist: click
25
26
  Requires-Dist: pandas
26
- Requires-Dist: tqdm
27
27
  Requires-Dist: pydantic>=2.7
28
28
  Requires-Dist: opentelemetry-sdk
29
+ Requires-Dist: tqdm
29
30
  Requires-Dist: dataclasses-json
30
- Requires-Dist: click
31
31
  Requires-Dist: python-dateutil
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
@@ -51,8 +51,8 @@ Requires-Dist: chromadb; extra == "chroma"
51
51
  Provides-Extra: clarifai
52
52
  Requires-Dist: clarifai; extra == "clarifai"
53
53
  Provides-Extra: confluence
54
- Requires-Dist: atlassian-python-api; extra == "confluence"
55
54
  Requires-Dist: requests; extra == "confluence"
55
+ Requires-Dist: atlassian-python-api; extra == "confluence"
56
56
  Provides-Extra: couchbase
57
57
  Requires-Dist: couchbase; extra == "couchbase"
58
58
  Provides-Extra: csv
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
60
60
  Provides-Extra: databricks-volumes
61
61
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
62
62
  Provides-Extra: delta-table
63
- Requires-Dist: boto3; extra == "delta-table"
64
63
  Requires-Dist: deltalake; extra == "delta-table"
64
+ Requires-Dist: boto3; extra == "delta-table"
65
65
  Provides-Extra: discord
66
66
  Requires-Dist: discord-py; extra == "discord"
67
67
  Provides-Extra: doc
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
69
69
  Provides-Extra: docx
70
70
  Requires-Dist: unstructured[docx]; extra == "docx"
71
71
  Provides-Extra: dropbox
72
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
73
72
  Requires-Dist: fsspec; extra == "dropbox"
73
+ Requires-Dist: dropboxdrivefs; extra == "dropbox"
74
74
  Provides-Extra: duckdb
75
75
  Requires-Dist: duckdb; extra == "duckdb"
76
76
  Provides-Extra: elasticsearch
@@ -89,9 +89,9 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
89
89
  Provides-Extra: epub
90
90
  Requires-Dist: unstructured[epub]; extra == "epub"
91
91
  Provides-Extra: gcs
92
- Requires-Dist: gcsfs; extra == "gcs"
93
92
  Requires-Dist: fsspec; extra == "gcs"
94
93
  Requires-Dist: bs4; extra == "gcs"
94
+ Requires-Dist: gcsfs; extra == "gcs"
95
95
  Provides-Extra: github
96
96
  Requires-Dist: requests; extra == "github"
97
97
  Requires-Dist: pygithub>1.58.0; extra == "github"
@@ -119,16 +119,16 @@ Requires-Dist: pymongo; extra == "mongodb"
119
119
  Provides-Extra: msg
120
120
  Requires-Dist: unstructured[msg]; extra == "msg"
121
121
  Provides-Extra: notion
122
- Requires-Dist: notion-client; extra == "notion"
122
+ Requires-Dist: httpx; extra == "notion"
123
123
  Requires-Dist: backoff; extra == "notion"
124
124
  Requires-Dist: htmlBuilder; extra == "notion"
125
- Requires-Dist: httpx; extra == "notion"
125
+ Requires-Dist: notion-client; extra == "notion"
126
126
  Provides-Extra: odt
127
127
  Requires-Dist: unstructured[odt]; extra == "odt"
128
128
  Provides-Extra: onedrive
129
129
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
130
- Requires-Dist: msal; extra == "onedrive"
131
130
  Requires-Dist: bs4; extra == "onedrive"
131
+ Requires-Dist: msal; extra == "onedrive"
132
132
  Provides-Extra: openai
133
133
  Requires-Dist: tiktoken; extra == "openai"
134
134
  Requires-Dist: openai; extra == "openai"
@@ -160,8 +160,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
160
160
  Provides-Extra: rtf
161
161
  Requires-Dist: unstructured[rtf]; extra == "rtf"
162
162
  Provides-Extra: s3
163
- Requires-Dist: s3fs; extra == "s3"
164
163
  Requires-Dist: fsspec; extra == "s3"
164
+ Requires-Dist: s3fs; extra == "s3"
165
165
  Provides-Extra: salesforce
166
166
  Requires-Dist: simple-salesforce; extra == "salesforce"
167
167
  Provides-Extra: sftp
@@ -175,8 +175,8 @@ Requires-Dist: singlestoredb; extra == "singlestore"
175
175
  Provides-Extra: slack
176
176
  Requires-Dist: slack-sdk[optional]; extra == "slack"
177
177
  Provides-Extra: snowflake
178
- Requires-Dist: psycopg2-binary; extra == "snowflake"
179
178
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
179
+ Requires-Dist: psycopg2-binary; extra == "snowflake"
180
180
  Provides-Extra: togetherai
181
181
  Requires-Dist: together; extra == "togetherai"
182
182
  Provides-Extra: tsv
@@ -87,7 +87,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
87
87
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
88
88
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
89
89
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
90
- unstructured_ingest/__version__.py,sha256=ZszFecZmmLOry5GLIAeiaTytadll5VqC2yU3HLAZtUI,42
90
+ unstructured_ingest/__version__.py,sha256=91iYzXMGhyMPaHI-Y8piVjcrk2AXRekqcknDzOkyQzk,42
91
91
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
92
92
  unstructured_ingest/interfaces.py,sha256=OYVUP0bzBJpT-Lz92BDyz_hLBvyfxkuSwWHhUdnUayA,31493
93
93
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -415,7 +415,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=3sV0Yv2vYMLyxszKCqA
415
415
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=XLuprTCY0D9tAh_qn81MjJrDN9YaNqMlKe7BJl3eTZc,14998
416
416
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=heZMtOIrCySi552ldIk8iH0pSRXZ0W2LeD-CcNOwCFQ,15979
417
417
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
418
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=J6bj5zWu2AlxAbbjx8h7hpUTbSEkA9z1zoTd-yNUKXM,12037
418
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=BLi9wQzoAnt61m2vOa0xGvmR04kBH_tw9EV9xIw2O_Y,11629
419
419
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
420
420
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=Ndn2Wm7RupfjAtlLxxQwJueeE0V8aGMbNVPuFq9nqdQ,19730
421
421
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -469,9 +469,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-
469
469
  unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
470
470
  unstructured_ingest/v2/processes/connectors/weaviate/local.py,sha256=LuTBKPseVewsz8VqxRPRLfGEm3BeI9nBZxpy7ZU5tOA,2201
471
471
  unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=dBDC_M8GVKupl7i9UMRCZyRIUv6gTkq8bJE_SILydAc,11291
472
- unstructured_ingest-0.3.7.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
473
- unstructured_ingest-0.3.7.dist-info/METADATA,sha256=xkBy4eCRGyZu-F4K3unehR7IsrZiJbiapobhGAXaaqM,7457
474
- unstructured_ingest-0.3.7.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
475
- unstructured_ingest-0.3.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
476
- unstructured_ingest-0.3.7.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
477
- unstructured_ingest-0.3.7.dist-info/RECORD,,
472
+ unstructured_ingest-0.3.8.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
473
+ unstructured_ingest-0.3.8.dist-info/METADATA,sha256=HZRSyzQt9UJplhIiXEEsDrZGljY77nMG57SWRR0F3v8,7457
474
+ unstructured_ingest-0.3.8.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
475
+ unstructured_ingest-0.3.8.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
476
+ unstructured_ingest-0.3.8.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
477
+ unstructured_ingest-0.3.8.dist-info/RECORD,,