unstructured-ingest 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (64)
  1. test/integration/chunkers/test_chunkers.py +0 -11
  2. test/integration/connectors/conftest.py +11 -1
  3. test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
  4. test/integration/connectors/duckdb/conftest.py +14 -0
  5. test/integration/connectors/duckdb/test_duckdb.py +51 -44
  6. test/integration/connectors/duckdb/test_motherduck.py +37 -48
  7. test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
  8. test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
  9. test/integration/connectors/sql/test_postgres.py +102 -91
  10. test/integration/connectors/sql/test_singlestore.py +111 -99
  11. test/integration/connectors/sql/test_snowflake.py +142 -117
  12. test/integration/connectors/sql/test_sqlite.py +86 -75
  13. test/integration/connectors/test_astradb.py +22 -1
  14. test/integration/connectors/test_azure_ai_search.py +25 -3
  15. test/integration/connectors/test_chroma.py +120 -0
  16. test/integration/connectors/test_confluence.py +4 -4
  17. test/integration/connectors/test_delta_table.py +1 -0
  18. test/integration/connectors/test_kafka.py +4 -4
  19. test/integration/connectors/test_milvus.py +21 -0
  20. test/integration/connectors/test_mongodb.py +3 -3
  21. test/integration/connectors/test_neo4j.py +236 -0
  22. test/integration/connectors/test_pinecone.py +25 -1
  23. test/integration/connectors/test_qdrant.py +25 -2
  24. test/integration/connectors/test_s3.py +9 -6
  25. test/integration/connectors/utils/docker.py +6 -0
  26. test/integration/connectors/utils/validation/__init__.py +0 -0
  27. test/integration/connectors/utils/validation/destination.py +88 -0
  28. test/integration/connectors/utils/validation/equality.py +75 -0
  29. test/integration/connectors/utils/{validation.py → validation/source.py} +15 -91
  30. test/integration/connectors/utils/validation/utils.py +36 -0
  31. unstructured_ingest/__version__.py +1 -1
  32. unstructured_ingest/utils/chunking.py +11 -0
  33. unstructured_ingest/utils/data_prep.py +36 -0
  34. unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
  35. unstructured_ingest/v2/interfaces/uploader.py +11 -2
  36. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  37. unstructured_ingest/v2/processes/connectors/astradb.py +8 -30
  38. unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
  39. unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
  40. unstructured_ingest/v2/processes/connectors/couchbase.py +42 -52
  41. unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
  42. unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
  43. unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
  44. unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
  45. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +5 -30
  46. unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
  47. unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
  48. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
  49. unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
  50. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
  51. unstructured_ingest/v2/processes/connectors/local.py +13 -2
  52. unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
  53. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -8
  54. unstructured_ingest/v2/processes/connectors/neo4j.py +381 -0
  55. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -65
  56. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
  57. unstructured_ingest/v2/processes/connectors/sql/sql.py +41 -40
  58. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
  59. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/METADATA +21 -17
  60. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/RECORD +64 -56
  61. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/LICENSE.md +0 -0
  62. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/WHEEL +0 -0
  63. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/entry_points.txt +0 -0
  64. {unstructured_ingest-0.3.7.dist-info → unstructured_ingest-0.3.9.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/mongodb.py
@@ -1,9 +1,7 @@
-import json
 import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, replace
 from datetime import datetime
-from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
@@ -332,18 +330,16 @@ class MongoDBUploader(Uploader):
             f"deleted {delete_results.deleted_count} records from collection {collection.name}"
         )
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing {len(elements_dict)} objects to destination "
+            f"writing {len(data)} objects to destination "
             f"db, {self.upload_config.database}, "
             f"collection {self.upload_config.collection} "
             f"at {self.connection_config.host}",
         )
         # This would typically live in the stager but since no other manipulation
         # is done, setting the record id field in the uploader
-        for element in elements_dict:
+        for element in data:
             element[self.upload_config.record_id_key] = file_data.identifier
         with self.connection_config.get_client() as client:
             db = client[self.upload_config.database]
@@ -352,7 +348,7 @@ class MongoDBUploader(Uploader):
                 self.delete_by_record_id(file_data=file_data, collection=collection)
             else:
                 logger.warning("criteria for deleting previous content not met, skipping")
-            for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
+            for chunk in batch_generator(data, self.upload_config.batch_size):
                 collection.insert_many(chunk)
 
 
unstructured_ingest/v2/processes/connectors/neo4j.py (new file)
@@ -0,0 +1,381 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import uuid
+from collections import defaultdict
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
+
+import networkx as nx
+from pydantic import BaseModel, ConfigDict, Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.logger import logger
+from unstructured_ingest.utils.chunking import elements_from_base64_gzipped_json
+from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from neo4j import AsyncDriver, Auth
+
+CONNECTOR_TYPE = "neo4j"
+
+
+class Neo4jAccessConfig(AccessConfig):
+    password: str
+
+
+class Neo4jConnectionConfig(ConnectionConfig):
+    access_config: Secret[Neo4jAccessConfig]
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
+    username: str
+    uri: str = Field(description="Neo4j Connection URI <scheme>://<host>:<port>")
+    database: str = Field(description="Name of the target database")
+
+    @requires_dependencies(["neo4j"], extras="neo4j")
+    @asynccontextmanager
+    async def get_client(self) -> AsyncGenerator["AsyncDriver", None]:
+        from neo4j import AsyncGraphDatabase
+
+        driver = AsyncGraphDatabase.driver(**self._get_driver_parameters())
+        logger.info(f"Created driver connecting to the database '{self.database}' at {self.uri}.")
+        try:
+            yield driver
+        finally:
+            await driver.close()
+            logger.info(
+                f"Closed driver connecting to the database '{self.database}' at {self.uri}."
+            )
+
+    def _get_driver_parameters(self) -> dict:
+        return {
+            "uri": self.uri,
+            "auth": self._get_auth(),
+            "database": self.database,
+        }
+
+    @requires_dependencies(["neo4j"], extras="neo4j")
+    def _get_auth(self) -> "Auth":
+        from neo4j import Auth
+
+        return Auth("basic", self.username, self.access_config.get_secret_value().password)
+
+
+class Neo4jUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class Neo4jUploadStager(UploadStager):
+    upload_stager_config: Neo4jUploadStagerConfig = Field(
+        default_factory=Neo4jUploadStagerConfig, validate_default=True
+    )
+
+    def run(  # type: ignore
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with elements_filepath.open() as file:
+            elements = json.load(file)
+
+        nx_graph = self._create_lexical_graph(
+            elements, self._create_document_node(file_data=file_data)
+        )
+        output_filepath = Path(output_dir) / f"{output_filename}.json"
+        output_filepath.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(output_filepath, "w") as file:
+            json.dump(_GraphData.from_nx(nx_graph).model_dump(), file, indent=4)
+
+        return output_filepath
+
+    def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> nx.Graph:
+        graph = nx.MultiDiGraph()
+        graph.add_node(document_node)
+
+        previous_node: Optional[_Node] = None
+        for element in elements:
+            element_node = self._create_element_node(element)
+            order_relationship = (
+                Relationship.NEXT_CHUNK if self._is_chunk(element) else Relationship.NEXT_ELEMENT
+            )
+            if previous_node:
+                graph.add_edge(element_node, previous_node, relationship=order_relationship)
+
+            previous_node = element_node
+            graph.add_edge(element_node, document_node, relationship=Relationship.PART_OF_DOCUMENT)
+
+            if self._is_chunk(element):
+                origin_element_nodes = [
+                    self._create_element_node(origin_element)
+                    for origin_element in self._get_origin_elements(element)
+                ]
+                graph.add_edges_from(
+                    [
+                        (origin_element_node, element_node)
+                        for origin_element_node in origin_element_nodes
+                    ],
+                    relationship=Relationship.PART_OF_CHUNK,
+                )
+                graph.add_edges_from(
+                    [
+                        (origin_element_node, document_node)
+                        for origin_element_node in origin_element_nodes
+                    ],
+                    relationship=Relationship.PART_OF_DOCUMENT,
+                )
+
+        return graph
+
+    # TODO(Filip Knefel): Ensure _is_chunk is as reliable as possible, consider different checks
+    def _is_chunk(self, element: dict) -> bool:
+        return "orig_elements" in element.get("metadata", {})
+
+    def _create_document_node(self, file_data: FileData) -> _Node:
+        properties = {}
+        if file_data.source_identifiers:
+            properties["name"] = file_data.source_identifiers.filename
+        if file_data.metadata.date_created:
+            properties["date_created"] = file_data.metadata.date_created
+        if file_data.metadata.date_modified:
+            properties["date_modified"] = file_data.metadata.date_modified
+        return _Node(id_=file_data.identifier, properties=properties, labels=[Label.DOCUMENT])
+
+    def _create_element_node(self, element: dict) -> _Node:
+        properties = {"id": element["element_id"], "text": element["text"]}
+
+        if embeddings := element.get("embeddings"):
+            properties["embeddings"] = embeddings
+
+        label = Label.CHUNK if self._is_chunk(element) else Label.UNSTRUCTURED_ELEMENT
+        return _Node(id_=element["element_id"], properties=properties, labels=[label])
+
+    def _get_origin_elements(self, chunk_element: dict) -> list[dict]:
+        orig_elements = chunk_element.get("metadata", {}).get("orig_elements")
+        return elements_from_base64_gzipped_json(raw_s=orig_elements)
+
+
+class _GraphData(BaseModel):
+    nodes: list[_Node]
+    edges: list[_Edge]
+
+    @classmethod
+    def from_nx(cls, nx_graph: nx.MultiDiGraph) -> _GraphData:
+        nodes = list(nx_graph.nodes())
+        edges = [
+            _Edge(
+                source_id=u.id_,
+                destination_id=v.id_,
+                relationship=Relationship(data_dict["relationship"]),
+            )
+            for u, v, data_dict in nx_graph.edges(data=True)
+        ]
+        return _GraphData(nodes=nodes, edges=edges)
+
+
+class _Node(BaseModel):
+    model_config = ConfigDict(use_enum_values=True)
+
+    id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    labels: list[Label] = Field(default_factory=list)
+    properties: dict = Field(default_factory=dict)
+
+    def __hash__(self):
+        return hash(self.id_)
+
+
+class _Edge(BaseModel):
+    model_config = ConfigDict(use_enum_values=True)
+
+    source_id: str
+    destination_id: str
+    relationship: Relationship
+
+
+class Label(str, Enum):
+    UNSTRUCTURED_ELEMENT = "UnstructuredElement"
+    CHUNK = "Chunk"
+    DOCUMENT = "Document"
+
+
+class Relationship(str, Enum):
+    PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
+    PART_OF_CHUNK = "PART_OF_CHUNK"
+    NEXT_CHUNK = "NEXT_CHUNK"
+    NEXT_ELEMENT = "NEXT_ELEMENT"
+
+
+class Neo4jUploaderConfig(UploaderConfig):
+    batch_size: int = Field(
+        default=100, description="Maximal number of nodes/relationships created per transaction."
+    )
+
+
+@dataclass
+class Neo4jUploader(Uploader):
+    upload_config: Neo4jUploaderConfig
+    connection_config: Neo4jConnectionConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    @DestinationConnectionError.wrap
+    def precheck(self) -> None:
+        async def verify_auth():
+            async with self.connection_config.get_client() as client:
+                await client.verify_connectivity()
+
+        asyncio.run(verify_auth())
+
+    def is_async(self):
+        return True
+
+    async def run_async(self, path: Path, file_data: FileData, **kwargs) -> None:  # type: ignore
+        with path.open() as file:
+            staged_data = json.load(file)
+
+        graph_data = _GraphData.model_validate(staged_data)
+        async with self.connection_config.get_client() as client:
+            await self._create_uniqueness_constraints(client)
+            await self._delete_old_data_if_exists(file_data, client=client)
+            await self._merge_graph(graph_data=graph_data, client=client)
+
+    async def _create_uniqueness_constraints(self, client: AsyncDriver) -> None:
+        for label in Label:
+            logger.info(
+                f"Adding id uniqueness constraint for nodes labeled '{label}'"
+                " if it does not already exist."
+            )
+            constraint_name = f"{label.lower()}_id"
+            await client.execute_query(
+                f"""
+                CREATE CONSTRAINT {constraint_name} IF NOT EXISTS
+                FOR (n: {label}) REQUIRE n.id IS UNIQUE
+                """
+            )
+
+    async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
+        logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
+        _, summary, _ = await client.execute_query(
+            f"""
+            MATCH (n: {Label.DOCUMENT} {{id: $identifier}})
+            MATCH (n)--(m: {Label.CHUNK}|{Label.UNSTRUCTURED_ELEMENT})
+            DETACH DELETE m""",
+            identifier=file_data.identifier,
+        )
+        logger.info(
+            f"Deleted {summary.counters.nodes_deleted} nodes"
+            f" and {summary.counters.relationships_deleted} relationships."
+        )
+
+    async def _merge_graph(self, graph_data: _GraphData, client: AsyncDriver) -> None:
+        nodes_by_labels: defaultdict[tuple[Label, ...], list[_Node]] = defaultdict(list)
+        for node in graph_data.nodes:
+            nodes_by_labels[tuple(node.labels)].append(node)
+
+        logger.info(f"Merging {len(graph_data.nodes)} graph nodes.")
+        # NOTE: Processed in parallel as there's no overlap between accessed nodes
+        await self._execute_queries(
+            [
+                self._create_nodes_query(nodes_batch, labels)
+                for labels, nodes in nodes_by_labels.items()
+                for nodes_batch in batch_generator(nodes, batch_size=self.upload_config.batch_size)
+            ],
+            client=client,
+            in_parallel=True,
+        )
+        logger.info(f"Finished merging {len(graph_data.nodes)} graph nodes.")
+
+        edges_by_relationship: defaultdict[Relationship, list[_Edge]] = defaultdict(list)
+        for edge in graph_data.edges:
+            edges_by_relationship[edge.relationship].append(edge)
+
+        logger.info(f"Merging {len(graph_data.edges)} graph relationships (edges).")
+        # NOTE: Processed sequentially to avoid queries locking node access to one another
+        await self._execute_queries(
+            [
+                self._create_edges_query(edges_batch, relationship)
+                for relationship, edges in edges_by_relationship.items()
+                for edges_batch in batch_generator(edges, batch_size=self.upload_config.batch_size)
+            ],
+            client=client,
+        )
+        logger.info(f"Finished merging {len(graph_data.edges)} graph relationships (edges).")
+
+    @staticmethod
+    async def _execute_queries(
+        queries_with_parameters: list[tuple[str, dict]],
+        client: AsyncDriver,
+        in_parallel: bool = False,
+    ) -> None:
+        if in_parallel:
+            logger.info(f"Executing {len(queries_with_parameters)} queries in parallel.")
+            await asyncio.gather(
+                *[
+                    client.execute_query(query, parameters_=parameters)
+                    for query, parameters in queries_with_parameters
+                ]
+            )
+            logger.info("Finished executing parallel queries.")
+        else:
+            logger.info(f"Executing {len(queries_with_parameters)} queries sequentially.")
+            for i, (query, parameters) in enumerate(queries_with_parameters):
+                logger.info(f"Query #{i} started.")
+                await client.execute_query(query, parameters_=parameters)
+                logger.info(f"Query #{i} finished.")
+            logger.info(
+                f"Finished executing all ({len(queries_with_parameters)}) sequential queries."
+            )
+
+    @staticmethod
+    def _create_nodes_query(nodes: list[_Node], labels: tuple[Label, ...]) -> tuple[str, dict]:
+        labels_string = ", ".join(labels)
+        logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{labels_string}'.")
+        query_string = f"""
+            UNWIND $nodes AS node
+            MERGE (n: {labels_string} {{id: node.id}})
+            SET n += node.properties
+            """
+        parameters = {"nodes": [{"id": node.id_, "properties": node.properties} for node in nodes]}
+        return query_string, parameters
+
+    @staticmethod
+    def _create_edges_query(edges: list[_Edge], relationship: Relationship) -> tuple[str, dict]:
+        logger.info(f"Preparing MERGE query for {len(edges)} {relationship} relationships.")
+        query_string = f"""
+            UNWIND $edges AS edge
+            MATCH (u {{id: edge.source}})
+            MATCH (v {{id: edge.destination}})
+            MERGE (u)-[:{relationship}]->(v)
+            """
+        parameters = {
+            "edges": [
+                {"source": edge.source_id, "destination": edge.destination_id} for edge in edges
+            ]
+        }
+        return query_string, parameters
+
+
+neo4j_destination_entry = DestinationRegistryEntry(
+    connection_config=Neo4jConnectionConfig,
+    uploader=Neo4jUploader,
+    uploader_config=Neo4jUploaderConfig,
+)
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -1,12 +1,14 @@
 import json
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.error import DestinationConnectionError
-from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
+from unstructured_ingest.utils.data_prep import (
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
@@ -148,39 +150,14 @@ class PineconeUploadStager(UploadStager):
 
         metadata[RECORD_ID_LABEL] = file_data.identifier
 
+        # To support more optimal deletes, a prefix is suggested for each record:
+        # https://docs.pinecone.io/guides/data/manage-rag-documents#delete-all-records-for-a-parent-document
         return {
-            "id": get_enhanced_element_id(element_dict=element_dict, file_data=file_data),
+            "id": f"{file_data.identifier}#{get_enhanced_element_id(element_dict=element_dict, file_data=file_data)}",  # noqa:E501
             "values": embeddings,
             "metadata": metadata,
         }
 
-    def run(
-        self,
-        file_data: FileData,
-        elements_filepath: Path,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any,
-    ) -> Path:
-        with open(elements_filepath) as elements_file:
-            elements_contents = json.load(elements_file)
-
-        conformed_elements = [
-            self.conform_dict(element_dict=element, file_data=file_data)
-            for element in elements_contents
-        ]
-
-        if Path(output_filename).suffix != ".json":
-            output_filename = f"{output_filename}.json"
-        else:
-            output_filename = f"{Path(output_filename).stem}.json"
-        output_path = Path(output_dir) / Path(f"{output_filename}")
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
-        return output_path
-
 
 
 @dataclass
 class PineconeUploader(Uploader):
@@ -215,18 +192,6 @@ class PineconeUploader(Uploader):
             f"from pinecone index: {resp}"
         )
 
-    def delete_by_query(self, index: "PineconeIndex", query_params: dict) -> None:
-        while True:
-            query_results = index.query(**query_params)
-            matches = query_results.get("matches", [])
-            if not matches:
-                break
-            ids = [match["id"] for match in matches]
-            delete_params = {"ids": ids}
-            if namespace := self.upload_config.namespace:
-                delete_params["namespace"] = namespace
-            index.delete(**delete_params)
-
     def serverless_delete_by_record_id(self, file_data: FileData) -> None:
         logger.debug(
             f"deleting any content with metadata "
@@ -234,26 +199,21 @@ class PineconeUploader(Uploader):
             f"from pinecone serverless index"
         )
         index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
-        index_stats = index.describe_index_stats()
-        dimension = index_stats["dimension"]
-        total_vectors = index_stats["total_vector_count"]
-        if total_vectors == 0:
-            return
-        while total_vectors > 0:
-            top_k = min(total_vectors, MAX_QUERY_RESULTS)
-            query_params = {
-                "filter": {self.upload_config.record_id_key: {"$eq": file_data.identifier}},
-                "vector": [0] * dimension,
-                "top_k": top_k,
-            }
+        list_kwargs = {"prefix": f"{file_data.identifier}#"}
+        deleted_ids = 0
+        if namespace := self.upload_config.namespace:
+            list_kwargs["namespace"] = namespace
+        for ids in index.list(**list_kwargs):
+            deleted_ids += len(ids)
+            delete_kwargs = {"ids": ids}
             if namespace := self.upload_config.namespace:
-                query_params["namespace"] = namespace
-            self.delete_by_query(index=index, query_params=query_params)
-            index_stats = index.describe_index_stats()
-            total_vectors = index_stats["total_vector_count"]
-
+                delete_resp = delete_kwargs["namespace"] = namespace
+            # delete_resp should be an empty dict if there were no errors
+            if delete_resp:
+                logger.error(f"failed to delete batch of ids: {delete_resp}")
+            index.delete(**delete_kwargs)
         logger.info(
-            f"deleted {total_vectors} records with metadata "
+            f"deleted {deleted_ids} records with metadata "
             f"{self.upload_config.record_id_key}={file_data.identifier} "
             f"from pinecone index"
         )
@@ -290,11 +250,9 @@ class PineconeUploader(Uploader):
             raise DestinationConnectionError(f"http error: {api_error}") from api_error
         logger.debug(f"results: {results}")
 
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        with path.open("r") as file:
-            elements_dict = json.load(file)
+    def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
         logger.info(
-            f"writing a total of {len(elements_dict)} elements via"
+            f"writing a total of {len(data)} elements via"
             f" document batches to destination"
            f" index named {self.connection_config.index_name}"
         )
@@ -307,7 +265,7 @@
             self.pod_delete_by_record_id(file_data=file_data)
         else:
             raise ValueError(f"unexpected spec type in index description: {index_description}")
-        self.upsert_batches_async(elements_dict=elements_dict)
+        self.upsert_batches_async(elements_dict=data)
 
 
 pinecone_destination_entry = DestinationRegistryEntry(