unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (59)
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_kafka.py +116 -16
  4. test/integration/connectors/test_pinecone.py +161 -0
  5. test/integration/connectors/test_s3.py +23 -0
  6. test/unit/v2/__init__.py +0 -0
  7. test/unit/v2/chunkers/__init__.py +0 -0
  8. test/unit/v2/chunkers/test_chunkers.py +49 -0
  9. test/unit/v2/connectors/__init__.py +0 -0
  10. test/unit/v2/embedders/__init__.py +0 -0
  11. test/unit/v2/embedders/test_bedrock.py +36 -0
  12. test/unit/v2/embedders/test_huggingface.py +48 -0
  13. test/unit/v2/embedders/test_mixedbread.py +37 -0
  14. test/unit/v2/embedders/test_octoai.py +35 -0
  15. test/unit/v2/embedders/test_openai.py +35 -0
  16. test/unit/v2/embedders/test_togetherai.py +37 -0
  17. test/unit/v2/embedders/test_vertexai.py +37 -0
  18. test/unit/v2/embedders/test_voyageai.py +38 -0
  19. test/unit/v2/partitioners/__init__.py +0 -0
  20. test/unit/v2/partitioners/test_partitioner.py +63 -0
  21. test/unit/v2/utils/__init__.py +0 -0
  22. test/unit/v2/utils/data_generator.py +32 -0
  23. unstructured_ingest/__version__.py +1 -1
  24. unstructured_ingest/cli/cmds/__init__.py +2 -2
  25. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  26. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  27. unstructured_ingest/runner/writers/__init__.py +2 -2
  28. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  29. unstructured_ingest/v2/constants.py +2 -0
  30. unstructured_ingest/v2/processes/connectors/__init__.py +4 -4
  31. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  32. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  33. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  34. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  35. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  36. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  37. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  38. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  39. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  40. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  41. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  42. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +78 -23
  43. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  44. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  45. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  46. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  47. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  48. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  49. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  50. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +20 -19
  51. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +58 -37
  52. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  53. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  54. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  55. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  56. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  57. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  58. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  59. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/kafka/cloud.py

@@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, Optional
 
 from pydantic import Field, Secret, SecretStr
 
-from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaAccessConfig,
     KafkaConnectionConfig,
@@ -12,6 +15,8 @@ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaDownloaderConfig,
     KafkaIndexer,
     KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
 )
 
 if TYPE_CHECKING:
@@ -41,7 +46,21 @@ class CloudKafkaConnectionConfig(KafkaConnectionConfig):
             "group.id": "default_group_id",
             "enable.auto.commit": "false",
             "auto.offset.reset": "earliest",
-            "message.max.bytes": 10485760,
+            "sasl.username": access_config.api_key,
+            "sasl.password": access_config.secret,
+            "sasl.mechanism": "PLAIN",
+            "security.protocol": "SASL_SSL",
+        }
+
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+        access_config = self.access_config.get_secret_value()
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
             "sasl.username": access_config.api_key,
             "sasl.password": access_config.secret,
             "sasl.mechanism": "PLAIN",
@@ -73,6 +92,17 @@ class CloudKafkaDownloader(KafkaDownloader):
     connector_type: str = CONNECTOR_TYPE
 
 
+class CloudKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class CloudKafkaUploader(KafkaUploader):
+    connection_config: CloudKafkaConnectionConfig
+    upload_config: CloudKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
 kafka_cloud_source_entry = SourceRegistryEntry(
     connection_config=CloudKafkaConnectionConfig,
     indexer=CloudKafkaIndexer,
@@ -80,3 +110,9 @@ kafka_cloud_source_entry = SourceRegistryEntry(
     downloader=CloudKafkaDownloader,
     downloader_config=CloudKafkaDownloaderConfig,
 )
+
+kafka_cloud_destination_entry = DestinationRegistryEntry(
+    connection_config=CloudKafkaConnectionConfig,
+    uploader=CloudKafkaUploader,
+    uploader_config=CloudKafkaUploaderConfig,
+)
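
With the producer configuration and destination entry above, the cloud Kafka connector can now be used as a write target as well as a source. A minimal usage sketch, assuming a CloudKafkaAccessConfig class with api_key and secret fields (only its usage is visible in the hunk above); the broker address and credentials are placeholders:

from unstructured_ingest.v2.processes.connectors.kafka.cloud import (
    CloudKafkaAccessConfig,  # assumed name; the hunk only shows access_config.api_key / .secret
    CloudKafkaConnectionConfig,
    CloudKafkaUploader,
    CloudKafkaUploaderConfig,
)

connection_config = CloudKafkaConnectionConfig(
    bootstrap_server="pkc-xxxxx.us-east-1.aws.confluent.cloud",  # placeholder broker
    port=9092,
    access_config=CloudKafkaAccessConfig(api_key="...", secret="..."),
)
uploader = CloudKafkaUploader(
    connection_config=connection_config,
    upload_config=CloudKafkaUploaderConfig(topic="ingest-output", batch_size=100),
)
uploader.precheck()  # lists topics over SASL_SSL to validate the connection before writing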

unstructured_ingest/v2/processes/connectors/kafka/kafka.py

@@ -1,3 +1,4 @@
+import json
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -5,32 +6,33 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, ContextManager, Generator, Optional
 
-from pydantic import Secret
+from pydantic import Field, Secret
 
 from unstructured_ingest.error import (
+    DestinationConnectionError,
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
+from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
+    Uploader,
+    UploaderConfig,
 )
 from unstructured_ingest.v2.logger import logger
-from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
 
 if TYPE_CHECKING:
-    from confluent_kafka import Consumer
-
-CONNECTOR_TYPE = "kafka"
+    from confluent_kafka import Consumer, Producer
 
 
 class KafkaAccessConfig(AccessConfig, ABC):
@@ -39,7 +41,6 @@ class KafkaAccessConfig(AccessConfig, ABC):
 
 class KafkaConnectionConfig(ConnectionConfig, ABC):
     access_config: Secret[KafkaAccessConfig]
-    timeout: Optional[float] = 1.0
     bootstrap_server: str
     port: int
 
@@ -47,6 +48,10 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
     def get_consumer_configuration(self) -> dict:
         pass
 
+    @abstractmethod
+    def get_producer_configuration(self) -> dict:
+        pass
+
     @contextmanager
     @requires_dependencies(["confluent_kafka"], extras="kafka")
     def get_consumer(self) -> ContextManager["Consumer"]:
@@ -59,20 +64,27 @@ class KafkaConnectionConfig(ConnectionConfig, ABC):
         finally:
             consumer.close()
 
+    @requires_dependencies(["confluent_kafka"], extras="kafka")
+    def get_producer(self) -> "Producer":
+        from confluent_kafka import Producer
+
+        producer = Producer(self.get_producer_configuration())
+        return producer
+
 
 class KafkaIndexerConfig(IndexerConfig):
-    topic: str
+    topic: str = Field(description="which topic to consume from")
     num_messages_to_consume: Optional[int] = 100
+    timeout: Optional[float] = Field(default=1.0, description="polling timeout")
 
     def update_consumer(self, consumer: "Consumer") -> None:
         consumer.subscribe([self.topic])
 
 
 @dataclass
-class KafkaIndexer(Indexer):
+class KafkaIndexer(Indexer, ABC):
     connection_config: KafkaConnectionConfig
     index_config: KafkaIndexerConfig
-    connector_type: str = CONNECTOR_TYPE
 
     @contextmanager
     def get_consumer(self) -> ContextManager["Consumer"]:
@@ -90,7 +102,7 @@ class KafkaIndexer(Indexer):
         num_messages_to_consume = self.index_config.num_messages_to_consume
         with self.get_consumer() as consumer:
             while messages_consumed < num_messages_to_consume and empty_polls < max_empty_polls:
-                msg = consumer.poll(timeout=self.connection_config.timeout)
+                msg = consumer.poll(timeout=self.index_config.timeout)
                 if msg is None:
                     logger.debug("No Kafka messages found")
                     empty_polls += 1
@@ -139,13 +151,13 @@ class KafkaIndexer(Indexer):
         for message in self.generate_messages():
             yield self.generate_file_data(message)
 
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         raise NotImplementedError()
 
     def precheck(self):
         try:
             with self.get_consumer() as consumer:
-                cluster_meta = consumer.list_topics(timeout=self.connection_config.timeout)
+                cluster_meta = consumer.list_topics(timeout=self.index_config.timeout)
                 current_topics = [
                     topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
                 ]
@@ -160,14 +172,13 @@ class KafkaDownloaderConfig(DownloaderConfig):
 
 
 @dataclass
-class KafkaDownloader(Downloader):
+class KafkaDownloader(Downloader, ABC):
     connection_config: KafkaConnectionConfig
     download_config: KafkaDownloaderConfig = field(default_factory=KafkaDownloaderConfig)
-    connector_type: str = CONNECTOR_TYPE
     version: Optional[str] = None
     source_url: Optional[str] = None
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         source_identifiers = file_data.source_identifiers
         if source_identifiers is None:
             raise ValueError("FileData is missing source_identifiers")
@@ -187,10 +198,54 @@ class KafkaDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
 
-kafka_source_entry = SourceRegistryEntry(
-    connection_config=KafkaConnectionConfig,
-    indexer=KafkaIndexer,
-    indexer_config=KafkaIndexerConfig,
-    downloader=KafkaDownloader,
-    downloader_config=KafkaDownloaderConfig,
-)
+class KafkaUploaderConfig(UploaderConfig):
+    batch_size: int = Field(default=100, description="Batch size")
+    topic: str = Field(description="which topic to write to")
+    timeout: Optional[float] = Field(
+        default=10.0, description="Timeout in seconds to flush batch of messages"
+    )
+
+
+@dataclass
+class KafkaUploader(Uploader, ABC):
+    connection_config: KafkaConnectionConfig
+    upload_config: KafkaUploaderConfig
+
+    def precheck(self):
+        try:
+            with self.connection_config.get_consumer() as consumer:
+                cluster_meta = consumer.list_topics(timeout=self.upload_config.timeout)
+                current_topics = [
+                    topic for topic in cluster_meta.topics if topic != "__consumer_offsets"
+                ]
+                logger.info(f"successfully checked available topics: {current_topics}")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def produce_batch(self, elements: list[dict]) -> None:
+        from confluent_kafka.error import KafkaException
+
+        producer = self.connection_config.get_producer()
+        failed_producer = False
+
+        def acked(err, msg):
+            if err is not None:
+                logger.error("Failed to deliver message: %s: %s" % (str(msg), str(err)))
+
+        for element in elements:
+            producer.produce(
+                topic=self.upload_config.topic,
+                value=json.dumps(element),
+                callback=acked,
+            )
+
+        producer.flush(timeout=self.upload_config.timeout)
+        if failed_producer:
+            raise KafkaException("failed to produce all messages in batch")
+
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+        with path.open("r") as elements_file:
+            elements = json.load(elements_file)
+        for element_batch in batch_generator(elements, batch_size=self.upload_config.batch_size):
+            self.produce_batch(elements=element_batch)
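
The new KafkaUploader reads the staged elements JSON, batches it with batch_generator, and hands each batch to a confluent_kafka Producer with a per-message delivery callback. A standalone sketch of that produce/callback/flush pattern using confluent_kafka directly (broker and topic are placeholders):

import json

from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})

def acked(err, msg):
    # Invoked once per message when the broker acknowledges or rejects delivery.
    if err is not None:
        print(f"delivery failed for topic {msg.topic()}: {err}")

for element in [{"text": "hello"}, {"text": "world"}]:
    producer.produce(topic="ingest-output", value=json.dumps(element), callback=acked)

# flush() blocks until outstanding messages are delivered or the timeout elapses;
# delivery callbacks fire during this call.
producer.flush(timeout=10.0)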

unstructured_ingest/v2/processes/connectors/kafka/local.py

@@ -1,10 +1,12 @@
-import socket
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+    SourceRegistryEntry,
+)
 from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaAccessConfig,
     KafkaConnectionConfig,
@@ -12,6 +14,8 @@ from unstructured_ingest.v2.processes.connectors.kafka.kafka import (
     KafkaDownloaderConfig,
     KafkaIndexer,
     KafkaIndexerConfig,
+    KafkaUploader,
+    KafkaUploaderConfig,
 )
 
 if TYPE_CHECKING:
@@ -35,11 +39,18 @@ class LocalKafkaConnectionConfig(KafkaConnectionConfig):
 
         conf = {
             "bootstrap.servers": f"{bootstrap}:{port}",
-            "client.id": socket.gethostname(),
             "group.id": "default_group_id",
             "enable.auto.commit": "false",
             "auto.offset.reset": "earliest",
-            "message.max.bytes": 10485760,
+        }
+        return conf
+
+    def get_producer_configuration(self) -> dict:
+        bootstrap = self.bootstrap_server
+        port = self.port
+
+        conf = {
+            "bootstrap.servers": f"{bootstrap}:{port}",
         }
         return conf
 
@@ -66,6 +77,17 @@ class LocalKafkaDownloader(KafkaDownloader):
     connector_type: str = CONNECTOR_TYPE
 
 
+class LocalKafkaUploaderConfig(KafkaUploaderConfig):
+    pass
+
+
+@dataclass
+class LocalKafkaUploader(KafkaUploader):
+    connection_config: LocalKafkaConnectionConfig
+    upload_config: LocalKafkaUploaderConfig
+    connector_type: str = CONNECTOR_TYPE
+
+
 kafka_local_source_entry = SourceRegistryEntry(
     connection_config=LocalKafkaConnectionConfig,
     indexer=LocalKafkaIndexer,
@@ -73,3 +95,9 @@ kafka_local_source_entry = SourceRegistryEntry(
     downloader=LocalKafkaDownloader,
     downloader_config=LocalKafkaDownloaderConfig,
 )
+
+kafka_local_destination_entry = DestinationRegistryEntry(
+    connection_config=LocalKafkaConnectionConfig,
+    uploader=LocalKafkaUploader,
+    uploader_config=LocalKafkaUploaderConfig,
+)

unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -28,7 +28,6 @@ from unstructured_ingest.v2.interfaces import (
     SourceIdentifiers,
     Uploader,
     UploaderConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -220,7 +219,7 @@ class OnedriveDownloader(Downloader):
         return self.download_dir / Path(rel_path)
 
     @SourceConnectionError.wrap
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         file = self._fetch_file(file_data=file_data)
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
@@ -233,7 +232,7 @@ class OnedriveDownloader(Downloader):
         else:
             with download_path.open(mode="wb") as f:
                 file.download(f).execute_query()
-        return DownloadResponse(file_data=file_data, path=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
 
 
 class OnedriveUploaderConfig(UploaderConfig):

unstructured_ingest/v2/processes/connectors/outlook.py

@@ -15,10 +15,10 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -191,7 +191,7 @@ class OutlookDownloader(Downloader):
     connection_config: OutlookConnectionConfig
     download_config: OutlookDownloaderConfig = field(default_factory=OutlookDownloaderConfig)
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None:

unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -23,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis
 
 if TYPE_CHECKING:
     from pinecone import Index as PineconeIndex
+    from pinecone import Pinecone
 
 
 CONNECTOR_TYPE = "pinecone"
@@ -43,16 +45,19 @@ class PineconeConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self, **index_kwargs) -> "PineconeIndex":
+    def get_client(self, **index_kwargs) -> "Pinecone":
         from pinecone import Pinecone
 
         from unstructured_ingest import __version__ as unstructured_version
 
-        pc = Pinecone(
+        return Pinecone(
             api_key=self.access_config.get_secret_value().pinecone_api_key,
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )
 
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
+        pc = self.get_client()
+
         index = pc.Index(name=self.index_name, **index_kwargs)
         logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index
@@ -106,7 +111,7 @@ class PineconeUploadStager(UploadStager):
         default_factory=lambda: PineconeUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         embeddings = element_dict.pop("embeddings", None)
         metadata: dict[str, Any] = element_dict.pop("metadata", {})
         data_source = metadata.pop("data_source", {})
@@ -121,19 +126,23 @@ class PineconeUploadStager(UploadStager):
             }
         )
 
+        metadata = flatten_dict(
+            pinecone_metadata,
+            separator="-",
+            flatten_lists=True,
+            remove_none=True,
+        )
+        metadata[RECORD_ID_LABEL] = file_data.identifier
+
         return {
             "id": str(uuid.uuid4()),
             "values": embeddings,
-            "metadata": flatten_dict(
-                pinecone_metadata,
-                separator="-",
-                flatten_lists=True,
-                remove_none=True,
-            ),
+            "metadata": metadata,
         }
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -143,10 +152,15 @@ class PineconeUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
 
         conformed_elements = [
-            self.conform_dict(element_dict=element) for element in elements_contents
+            self.conform_dict(element_dict=element, file_data=file_data)
+            for element in elements_contents
        ]
 
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        if Path(output_filename).suffix != ".json":
+            output_filename = f"{output_filename}.json"
+        else:
+            output_filename = f"{Path(output_filename).stem}.json"
+        output_path = Path(output_dir) / Path(f"{output_filename}")
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(output_path, "w") as output_file:
@@ -167,6 +181,55 @@ class PineconeUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
+    def pod_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone pod index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        delete_kwargs = {"filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}}}
+        if namespace := self.upload_config.namespace:
+            delete_kwargs["namespace"] = namespace
+
+        resp = index.delete(**delete_kwargs)
+        logger.debug(
+            f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone index: {resp}"
+        )
+
+    def serverless_delete_by_record_id(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone serverless index"
+        )
+        index = self.connection_config.get_index(pool_threads=MAX_POOL_THREADS)
+        index_stats = index.describe_index_stats()
+        total_vectors = index_stats["total_vector_count"]
+        if total_vectors == 0:
+            return
+        dimension = index_stats["dimension"]
+        query_params = {
+            "filter": {RECORD_ID_LABEL: {"$eq": file_data.identifier}},
+            "vector": [0] * dimension,
+            "top_k": total_vectors,
+        }
+        if namespace := self.upload_config.namespace:
+            query_params["namespace"] = namespace
+        while True:
+            query_results = index.query(**query_params)
+            matches = query_results.get("matches", [])
+            if not matches:
+                break
+            ids = [match["id"] for match in matches]
+            delete_params = {"ids": ids}
+            if namespace := self.upload_config.namespace:
+                delete_params["namespace"] = namespace
+            index.delete(**delete_params)
+        logger.debug(
+            f"deleted any content with metadata {RECORD_ID_LABEL}={file_data.identifier} "
+            f"from pinecone index"
+        )
+
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException
@@ -208,7 +271,15 @@ class PineconeUploader(Uploader):
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )
-
+        # Determine if serverless or pod based index
+        pinecone_client = self.connection_config.get_client()
+        index_description = pinecone_client.describe_index(name=self.connection_config.index_name)
+        if "serverless" in index_description.get("spec"):
+            self.serverless_delete_by_record_id(file_data=file_data)
+        elif "pod" in index_description.get("spec"):
+            self.pod_delete_by_record_id(file_data=file_data)
+        else:
+            raise ValueError(f"unexpected spec type in index description: {index_description}")
        self.upsert_batches_async(elements_dict=elements_dict)
 
 
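
The uploader now stamps every vector's metadata with the record id of its originating file and clears stale vectors for that record before upserting, using a metadata-filtered delete on pod indexes and a query-then-delete-by-id loop on serverless indexes (which the diff works around rather than filtering deletes directly). A sketch of both paths against the pinecone client; the index name is a placeholder and "record_id" stands in for RECORD_ID_LABEL, whose value lives in unstructured_ingest/v2/constants.py and is not shown in this diff:

from pinecone import Pinecone

pc = Pinecone(api_key="...")
index = pc.Index(name="my-index")  # placeholder index name
record_id = "example-file-identifier"

# Pod-based index: delete directly by metadata filter.
index.delete(filter={"record_id": {"$eq": record_id}})

# Serverless index: query for matching ids, then delete by id.
stats = index.describe_index_stats()
if stats["total_vector_count"]:
    matches = index.query(
        vector=[0] * stats["dimension"],
        top_k=stats["total_vector_count"],
        filter={"record_id": {"$eq": record_id}},
    ).get("matches", [])
    if matches:
        index.delete(ids=[match["id"] for match in matches])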

unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -21,7 +21,6 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -426,7 +425,7 @@ class SharepointDownloader(Downloader):
             f.write(etree.tostring(document, encoding="unicode", pretty_print=True))
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
         if not content_type:
             raise ValueError(
@@ -436,6 +435,8 @@ class SharepointDownloader(Downloader):
             return self.get_document(file_data=file_data)
         elif content_type == SharepointContentType.SITEPAGE.value:
             return self.get_site_page(file_data=file_data)
+        else:
+            raise ValueError(f"content type not recognized: {content_type}")
 
 
 sharepoint_source_entry = SourceRegistryEntry(

unstructured_ingest/v2/processes/connectors/slack.py

@@ -16,9 +16,9 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     Indexer,
     IndexerConfig,
-    download_responses,
 )
 from unstructured_ingest.v2.interfaces.file_data import (
     FileData,
@@ -161,7 +161,7 @@ class SlackDownloader(Downloader):
     def run(self, file_data, **kwargs):
         raise NotImplementedError
 
-    async def run_async(self, file_data: FileData, **kwargs) -> download_responses:
+    async def run_async(self, file_data: FileData, **kwargs) -> DownloadResponse:
         # NOTE: Indexer should provide source identifiers required to generate the download path
         download_path = self.get_download_path(file_data)
         if download_path is None:

unstructured_ingest/v2/processes/connectors/sql/postgres.py

@@ -98,20 +98,28 @@ class PostgresDownloader(SQLDownloader):
     download_config: PostgresDownloaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["psycopg2"], extras="postgres")
     def query_db(self, file_data: FileData) -> tuple[list[tuple], list[str]]:
+        from psycopg2 import sql
+
         table_name = file_data.additional_metadata["table_name"]
         id_column = file_data.additional_metadata["id_column"]
-        ids = file_data.additional_metadata["ids"]
+        ids = tuple(file_data.additional_metadata["ids"])
+
         with self.connection_config.get_cursor() as cursor:
-            fields = ",".join(self.download_config.fields) if self.download_config.fields else "*"
-            query = "SELECT {fields} FROM {table_name} WHERE {id_column} in ({ids})".format(
+            fields = (
+                sql.SQL(",").join(sql.Identifier(field) for field in self.download_config.fields)
+                if self.download_config.fields
+                else sql.SQL("*")
+            )
+
+            query = sql.SQL("SELECT {fields} FROM {table_name} WHERE {id_column} IN %s").format(
                 fields=fields,
-                table_name=table_name,
-                id_column=id_column,
-                ids=",".join([str(i) for i in ids]),
+                table_name=sql.Identifier(table_name),
+                id_column=sql.Identifier(id_column),
             )
-            logger.debug(f"running query: {query}")
-            cursor.execute(query)
+            logger.debug(f"running query: {cursor.mogrify(query, (ids,))}")
+            cursor.execute(query, (ids,))
             rows = cursor.fetchall()
             columns = [col[0] for col in cursor.description]
             return rows, columns
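
The rewritten query_db composes identifiers with psycopg2's sql module and passes the id values through the driver's parameter mechanism instead of string formatting, which quotes table and column names correctly and keeps values out of the SQL text. A small standalone sketch of the same pattern (connection details, table, and column names are placeholders):

import psycopg2
from psycopg2 import sql

conn = psycopg2.connect("dbname=example user=example")
ids = (1, 2, 3)

query = sql.SQL("SELECT {fields} FROM {table} WHERE {id_column} IN %s").format(
    fields=sql.SQL(",").join(sql.Identifier(c) for c in ["id", "text"]),
    table=sql.Identifier("elements"),
    id_column=sql.Identifier("id"),
)

with conn.cursor() as cursor:
    print(cursor.mogrify(query, (ids,)))  # the fully interpolated statement, useful for logging
    cursor.execute(query, (ids,))
    rows = cursor.fetchall()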