unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Note: this version of unstructured-ingest has been flagged as potentially problematic.

Files changed (59):
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_kafka.py +116 -16
  4. test/integration/connectors/test_pinecone.py +161 -0
  5. test/integration/connectors/test_s3.py +23 -0
  6. test/unit/v2/__init__.py +0 -0
  7. test/unit/v2/chunkers/__init__.py +0 -0
  8. test/unit/v2/chunkers/test_chunkers.py +49 -0
  9. test/unit/v2/connectors/__init__.py +0 -0
  10. test/unit/v2/embedders/__init__.py +0 -0
  11. test/unit/v2/embedders/test_bedrock.py +36 -0
  12. test/unit/v2/embedders/test_huggingface.py +48 -0
  13. test/unit/v2/embedders/test_mixedbread.py +37 -0
  14. test/unit/v2/embedders/test_octoai.py +35 -0
  15. test/unit/v2/embedders/test_openai.py +35 -0
  16. test/unit/v2/embedders/test_togetherai.py +37 -0
  17. test/unit/v2/embedders/test_vertexai.py +37 -0
  18. test/unit/v2/embedders/test_voyageai.py +38 -0
  19. test/unit/v2/partitioners/__init__.py +0 -0
  20. test/unit/v2/partitioners/test_partitioner.py +63 -0
  21. test/unit/v2/utils/__init__.py +0 -0
  22. test/unit/v2/utils/data_generator.py +32 -0
  23. unstructured_ingest/__version__.py +1 -1
  24. unstructured_ingest/cli/cmds/__init__.py +2 -2
  25. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  26. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  27. unstructured_ingest/runner/writers/__init__.py +2 -2
  28. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  29. unstructured_ingest/v2/constants.py +2 -0
  30. unstructured_ingest/v2/processes/connectors/__init__.py +4 -4
  31. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  32. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  33. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  34. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  35. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  36. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  37. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  38. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  39. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  40. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  41. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  42. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +78 -23
  43. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  44. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  45. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  46. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  47. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  48. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  49. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  50. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +20 -19
  51. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +58 -37
  52. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  53. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  54. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  55. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  56. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  57. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  58. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  59. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/__init__.py
@@ -14,8 +14,8 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry, astra_db_source_entry
-from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
-from .azure_cognitive_search import azure_cognitive_search_destination_entry
+from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
+from .azure_ai_search import azure_ai_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
 from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
@@ -97,8 +97,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_
 
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
-    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
-    entry=azure_cognitive_search_destination_entry,
+    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
+    entry=azure_ai_search_destination_entry,
 )
 
 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)

unstructured_ingest/v2/processes/connectors/airtable.py
@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(

unstructured_ingest/v2/processes/connectors/astradb.py
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")
 
 
@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -173,7 +165,7 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace or self.index_config.namespace,
+            keyspace=self.index_config.keyspace,
         )
 
     def precheck(self) -> None:
@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
                 additional_metadata={
                     "ids": list(batch),
                     "collection_name": self.index_config.collection_name,
-                    "keyspace": self.index_config.keyspace or self.index_config.namespace,
+                    "keyspace": self.index_config.keyspace,
                 },
             )
             yield fd
@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }
 
@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
@@ -346,7 +344,7 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+                keyspace=self.upload_config.keyspace,
             )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)
 
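
The Astra DB changes above drop the deprecated namespace and embedding_dimension options and make uploads idempotent per source document: the stager now tags every element with the new RECORD_ID_LABEL key, and the uploader deletes rows carrying the same record id before re-inserting. A minimal sketch of that delete-then-insert pattern, assuming an already-connected Astra DB collection object exposing the delete_many/insert_many calls used in the diff; the RECORD_ID_LABEL value ("record_id"), the elements list, and the record_id argument are illustrative assumptions:

    RECORD_ID_LABEL = "record_id"  # assumed value of unstructured_ingest.v2.constants.RECORD_ID_LABEL

    def upsert_elements(collection, elements: list[dict], record_id: str, batch_size: int = 20) -> None:
        # Remove anything written for this document on a previous run.
        collection.delete_many(filter={RECORD_ID_LABEL: {"$eq": record_id}})
        # Re-insert the freshly staged elements in batches, mirroring the uploader's loop.
        for start in range(0, len(elements), batch_size):
            collection.insert_many(elements[start : start + batch_size])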

unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py}
@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient
 
+CONNECTOR_TYPE = "azure_ai_search"
 
-CONNECTOR_TYPE = "azure_cognitive_search"
 
-
-class AzureCognitiveSearchAccessConfig(AccessConfig):
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )
 
 
-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"
@@ -45,10 +46,10 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[AzureCognitiveSearchAccessConfig]
+    access_config: Secret[AzureAISearchAccessConfig]
 
-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
-    def generate_client(self) -> "SearchClient":
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
@@ -56,27 +57,43 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().azure_cognitive_search_key
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )
 
 
-class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-class AzureCognitiveSearchUploaderConfig(UploaderConfig):
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
-        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
+        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
@@ -84,6 +101,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
        """
 
        data["id"] = str(uuid.uuid4())
+       data[RECORD_ID_LABEL] = file_data.identifier
 
        if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
            data["metadata"]["coordinates"]["points"] = json.dumps(points)
@@ -124,6 +142,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -132,23 +151,59 @@
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
 
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config: AzureCognitiveSearchUploaderConfig
-    connection_config: AzureCognitiveSearchConnectionConfig
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
+                ),
+            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -156,7 +211,7 @@ class AzureCognitiveSearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.generate_client().upload_documents(
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )
 
@@ -174,24 +229,42 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.azure_cognitive_search_key}: "
+                        f"{error.azure_ai_search_key}: "
                        f"[{error.status_code}] {error.error_message}"
                        for error in errors
                    ],
                ),
            )
 
+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.generate_client()
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -201,17 +274,21 @@ class AzureCognitiveSearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")
 
         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
            self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-azure_cognitive_search_destination_entry = DestinationRegistryEntry(
-    connection_config=AzureCognitiveSearchConnectionConfig,
-    uploader=AzureCognitiveSearchUploader,
-    uploader_config=AzureCognitiveSearchUploaderConfig,
-    upload_stager=AzureCognitiveSearchUploadStager,
-    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
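
The renamed Azure AI Search uploader only deletes previously written content when the target index has a key field and a filterable field matching record_id_key (see can_delete and get_index_key above); otherwise it logs a warning and skips the cleanup. A sketch of an index definition that would satisfy those checks, using the azure-search-documents SDK; the endpoint, key, index name, and field list are placeholder assumptions, and a real index for unstructured output would carry many more fields:

    from azure.core.credentials import AzureKeyCredential
    from azure.search.documents.indexes import SearchIndexClient
    from azure.search.documents.indexes.models import (
        SearchableField,
        SearchFieldDataType,
        SearchIndex,
        SimpleField,
    )

    index_client = SearchIndexClient(
        endpoint="https://<service_name>.search.windows.net",  # placeholder
        credential=AzureKeyCredential("<admin-key>"),           # placeholder
    )
    fields = [
        # Key field: get_index_key() returns this field's name.
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="content", type=SearchFieldDataType.String),
        # Filterable record_id field: required for can_delete()/delete_by_record_id().
        SimpleField(name="record_id", type=SearchFieldDataType.String, filterable=True),
    ]
    index_client.create_index(SearchIndex(name="my-ingest-index", fields=fields))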

unstructured_ingest/v2/processes/connectors/confluence.py
@@ -11,12 +11,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -154,7 +154,7 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE
 
-    def run(self, file_data: FileData, **kwargs) -> download_responses:
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
         doc_id = file_data.identifier
         try:
             client = self.connection_config.get_client()

unstructured_ingest/v2/processes/connectors/couchbase.py
@@ -205,6 +205,7 @@ class CouchbaseIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",

unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
 from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 import pandas as pd
 from pydantic import Field, Secret
@@ -94,7 +95,7 @@ class DeltaTableUploader(Uploader):
     connection_config: DeltaTableConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    @requires_dependencies(["boto3"], extras="delta-table")
     def precheck(self):
         secrets = self.connection_config.access_config.get_secret_value()
         if (
@@ -102,13 +103,24 @@ class DeltaTableUploader(Uploader):
             and secrets.aws_access_key_id
             and secrets.aws_secret_access_key
         ):
-            from fsspec import get_filesystem_class
+            from boto3 import client
+
+            url = urlparse(self.connection_config.table_uri)
+            bucket_name = url.netloc
+            dir_path = url.path.lstrip("/")
 
             try:
-                fs = get_filesystem_class("s3")(
-                    key=secrets.aws_access_key_id, secret=secrets.aws_secret_access_key
+                s3_client = client(
+                    "s3",
+                    aws_access_key_id=secrets.aws_access_key_id,
+                    aws_secret_access_key=secrets.aws_secret_access_key,
                 )
-                fs.write_bytes(path=self.connection_config.table_uri, value=b"")
+                s3_client.put_object(Bucket=bucket_name, Key=dir_path, Body=b"")
+
+                response = s3_client.get_bucket_location(Bucket=bucket_name)
+
+                if self.connection_config.aws_region != response.get("LocationConstraint"):
+                    raise ValueError("Wrong AWS Region was provided.")
 
             except Exception as e:
                 logger.error(f"failed to validate connection: {e}", exc_info=True)
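
The delta table precheck now validates S3 access with boto3 directly instead of writing through fsspec/s3fs, and it additionally confirms that the configured aws_region matches the bucket's actual location. A standalone sketch of that check with placeholder credentials and table URI; note that get_bucket_location reports the LocationConstraint as None for buckets in us-east-1:

    from urllib.parse import urlparse

    import boto3

    table_uri = "s3://my-bucket/path/to/table"  # placeholder table location
    aws_region = "us-east-2"                    # placeholder for the connector's aws_region setting
    url = urlparse(table_uri)

    s3 = boto3.client(
        "s3",
        aws_access_key_id="<aws-access-key-id>",
        aws_secret_access_key="<aws-secret-access-key>",
    )
    # Prove write access by putting an empty object at the table path.
    s3.put_object(Bucket=url.netloc, Key=url.path.lstrip("/"), Body=b"")
    # Verify the configured region matches where the bucket actually lives.
    if aws_region != s3.get_bucket_location(Bucket=url.netloc).get("LocationConstraint"):
        raise ValueError("Wrong AWS Region was provided.")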

unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -191,6 +191,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
+                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
+import os
 import random
+import shutil
+import tempfile
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -207,12 +210,35 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
+    def handle_directory_download(self, lpath: Path) -> None:
+        # If the object's name contains certain characters (i.e. '?'), it
+        # gets downloaded into a new directory of the same name. This
+        # reconciles that with what is expected, which is to download it
+        # as a file that is not within a directory.
+        if not lpath.is_dir():
+            return
+        desired_name = lpath.name
+        files_in_dir = [file for file in lpath.iterdir() if file.is_file()]
+        if not files_in_dir:
+            raise ValueError(f"no files in {lpath}")
+        if len(files_in_dir) > 1:
+            raise ValueError(
+                "Multiple files in {}: {}".format(lpath, ", ".join([str(f) for f in files_in_dir]))
+            )
+        file = files_in_dir[0]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_location = os.path.join(temp_dir, desired_name)
+            shutil.copyfile(src=file, dst=temp_location)
+            shutil.rmtree(lpath)
+            shutil.move(src=temp_location, dst=lpath)
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -224,6 +250,7 @@ class FsspecDownloader(Downloader):
         try:
             rpath = file_data.additional_metadata["original_file_path"]
             await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")

unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -19,12 +19,12 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
@@ -294,7 +294,7 @@ class GoogleDriveDownloader(Downloader):
             _, downloaded = downloader.next_chunk()
         return downloaded
 
-    def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
+    def _write_file(self, file_data: FileData, file_contents: io.BytesIO) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
@@ -303,7 +303,7 @@ class GoogleDriveDownloader(Downloader):
         return self.generate_download_response(file_data=file_data, download_path=download_path)
 
     @requires_dependencies(["googleapiclient"], extras="google-drive")
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         from googleapiclient.http import MediaIoBaseDownload
 
         logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")

unstructured_ingest/v2/processes/connectors/kafka/__init__.py
@@ -1,13 +1,17 @@
 from __future__ import annotations
 
 from unstructured_ingest.v2.processes.connector_registry import (
+    add_destination_entry,
     add_source_entry,
 )
 
 from .cloud import CONNECTOR_TYPE as CLOUD_CONNECTOR
-from .cloud import kafka_cloud_source_entry
+from .cloud import kafka_cloud_destination_entry, kafka_cloud_source_entry
 from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR
-from .local import kafka_local_source_entry
+from .local import kafka_local_destination_entry, kafka_local_source_entry
 
 add_source_entry(source_type=LOCAL_CONNECTOR, entry=kafka_local_source_entry)
+add_destination_entry(destination_type=LOCAL_CONNECTOR, entry=kafka_local_destination_entry)
+
 add_source_entry(source_type=CLOUD_CONNECTOR, entry=kafka_cloud_source_entry)
+add_destination_entry(destination_type=CLOUD_CONNECTOR, entry=kafka_cloud_destination_entry)