unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (44)
  1. test/integration/connectors/sql/test_postgres.py +3 -3
  2. test/integration/connectors/sql/test_singlestore.py +3 -3
  3. test/integration/connectors/sql/test_sqlite.py +3 -3
  4. test/integration/connectors/test_astradb.py +40 -0
  5. test/integration/connectors/test_kafka.py +2 -2
  6. test/integration/connectors/test_mongodb.py +4 -1
  7. test/integration/connectors/utils/validation/source.py +31 -11
  8. unstructured_ingest/__version__.py +1 -1
  9. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  10. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  12. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  13. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  15. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  16. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  17. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  18. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  19. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  21. unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
  22. unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
  23. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  24. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  25. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  26. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  27. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  28. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  29. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  30. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  31. unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
  32. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  33. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  34. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  36. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  37. unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
  38. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  39. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
  40. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
  41. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/astradb.py

@@ -1,13 +1,11 @@
-import copy
 import csv
 import hashlib
-import sys
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest import __name__ as integration_name
 from unstructured_ingest.__version__ import __version__ as integration_version
@@ -22,6 +20,8 @@ from unstructured_ingest.utils.string_and_date_utils import truncate_string_byte
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -53,6 +53,15 @@ CONNECTOR_TYPE = "astradb"
 MAX_CONTENT_PARAM_BYTE_SIZE = 8000
 
 
+class AstraDBAdditionalMetadata(BaseModel):
+    collection_name: str
+    keyspace: Optional[str] = None
+
+
+class AstraDBBatchFileData(BatchFileData):
+    additional_metadata: AstraDBAdditionalMetadata
+
+
 class AstraDBAccessConfig(AccessConfig):
     token: str = Field(description="Astra DB Token with access to the database.")
     api_endpoint: str = Field(description="The API endpoint for the Astra DB.")
@@ -179,9 +188,6 @@ class AstraDBIndexer(Indexer):
 
     def _get_doc_ids(self) -> set[str]:
         """Fetches all document ids in an index"""
-        # Initialize set of ids
-        ids = set()
-
         # Get the collection
         collection = self.get_collection()
 
@@ -194,31 +200,26 @@ class AstraDBIndexer(Indexer):
             astra_db_docs.append(result)
 
         # Create file data for each astra record
-        for astra_record in astra_db_docs:
-            ids.add(astra_record["_id"])
+        ids = sorted([astra_record["_id"] for astra_record in astra_db_docs])
 
-        return ids
+        return set(ids)
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[AstraDBBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)
 
         for batch in id_batches:
-            # Make sure the hash is always a positive number to create identified
-            identified = str(hash(batch) + sys.maxsize + 1)
-            fd = FileData(
-                identifier=identified,
+            fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     date_processed=str(time()),
                 ),
-                additional_metadata={
-                    "ids": list(batch),
-                    "collection_name": self.index_config.collection_name,
-                    "keyspace": self.index_config.keyspace,
-                },
+                additional_metadata=AstraDBAdditionalMetadata(
+                    collection_name=self.index_config.collection_name,
+                    keyspace=self.index_config.keyspace,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
             yield fd
 
@@ -247,7 +248,9 @@ class AstraDBDownloader(Downloader):
             writer.writerow(astra_result.keys())
             writer.writerow(astra_result.values())
 
-    def generate_download_response(self, result: dict, file_data: FileData) -> DownloadResponse:
+    def generate_download_response(
+        self, result: dict, file_data: AstraDBBatchFileData
+    ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(record_id=record_id)
         filename = f"{filename_id}.csv"  # csv to preserve column info
@@ -255,7 +258,7 @@ class AstraDBDownloader(Downloader):
         logger.debug(f"Downloading results from record {record_id} as csv to {download_path}")
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.write_astra_result_to_csv(astra_result=result, download_path=download_path)
+            self.write_astra_result_to_csv(astra_result=result, download_path=str(download_path))
         except Exception as e:
             logger.error(
                 f"failed to download from record {record_id} to {download_path}: {e}",
@@ -264,14 +267,12 @@ class AstraDBDownloader(Downloader):
            raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
 
         # modify input file_data for download_response
-        copied_file_data = copy.deepcopy(file_data)
-        copied_file_data.identifier = filename
-        copied_file_data.doc_type = "file"
-        copied_file_data.metadata.date_processed = str(time())
-        copied_file_data.metadata.record_locator = {"document_id": record_id}
-        copied_file_data.additional_metadata.pop("ids", None)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.record_locator = {"document_id": record_id}
         return super().generate_download_response(
-            file_data=copied_file_data, download_path=download_path
+            file_data=cast_file_data, download_path=download_path
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -279,9 +280,10 @@ class AstraDBDownloader(Downloader):
 
     async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
         # Get metadata from file_data
-        ids: list[str] = file_data.additional_metadata["ids"]
-        collection_name: str = file_data.additional_metadata["collection_name"]
-        keyspace: str = file_data.additional_metadata["keyspace"]
+        astra_file_data = AstraDBBatchFileData.cast(file_data=file_data)
+        ids: list[str] = [item.identifier for item in astra_file_data.batch_items]
+        collection_name: str = astra_file_data.additional_metadata.collection_name
+        keyspace: str = astra_file_data.additional_metadata.keyspace
 
         # Retrieve results from async collection
         download_responses = []
@@ -292,7 +294,7 @@
         )
         async for result in async_astra_collection.find({"_id": {"$in": ids}}):
             download_responses.append(
-                self.generate_download_response(result=result, file_data=file_data)
+                self.generate_download_response(result=result, file_data=astra_file_data)
             )
         return download_responses
 
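The astradb hunks above drop the hash-derived batch identifier and the free-form additional_metadata dict in favor of a typed AstraDBBatchFileData built from BatchItem entries. The sketch below illustrates the pattern with simplified, hypothetical stand-ins for BatchFileData and BatchItem; the real models live in unstructured_ingest/v2/interfaces/file_data.py (also changed in this release, but not shown here) and carry more fields.

    # Hypothetical, simplified stand-ins for the BatchFileData/BatchItem
    # interfaces referenced by the diff above.
    from typing import Optional

    from pydantic import BaseModel


    class BatchItem(BaseModel):
        identifier: str


    class BatchFileData(BaseModel):
        connector_type: str
        batch_items: list[BatchItem]


    class AstraDBAdditionalMetadata(BaseModel):
        collection_name: str
        keyspace: Optional[str] = None


    class AstraDBBatchFileData(BatchFileData):
        additional_metadata: AstraDBAdditionalMetadata


    # Indexer side: one typed batch per group of record ids, replacing the old
    # hash-based identifier plus an "ids" list inside a plain dict.
    batch = ["doc-1", "doc-2", "doc-3"]
    fd = AstraDBBatchFileData(
        connector_type="astradb",
        additional_metadata=AstraDBAdditionalMetadata(collection_name="my_collection"),
        batch_items=[BatchItem(identifier=b) for b in batch],
    )

    # Downloader side: recover ids and collection info from typed fields
    # instead of dict lookups.
    ids = [item.identifier for item in fd.batch_items]
    assert ids == batch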
unstructured_ingest/v2/processes/connectors/couchbase.py

@@ -1,5 +1,4 @@
 import hashlib
-import sys
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -7,7 +6,7 @@ from datetime import timedelta
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, List
 
-from pydantic import Field, Secret
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.error import (
     DestinationConnectionError,
@@ -18,6 +17,8 @@ from unstructured_ingest.utils.data_prep import batch_generator, flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -40,11 +41,20 @@ from unstructured_ingest.v2.processes.connector_registry import (
 
 if TYPE_CHECKING:
     from couchbase.cluster import Cluster
+    from couchbase.collection import Collection
 
 CONNECTOR_TYPE = "couchbase"
 SERVER_API_VERSION = "1"
 
 
+class CouchbaseAdditionalMetadata(BaseModel):
+    bucket: str
+
+
+class CouchbaseBatchFileData(BatchFileData):
+    additional_metadata: CouchbaseAdditionalMetadata
+
+
 class CouchbaseAccessConfig(AccessConfig):
     password: str = Field(description="The password for the Couchbase server")
 
@@ -180,31 +190,21 @@ class CouchbaseIndexer(Indexer):
                 if attempts == max_attempts:
                     raise SourceConnectionError(f"failed to get document ids: {e}")
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[CouchbaseBatchFileData, None, None]:
         ids = self._get_doc_ids()
-
-        id_batches = [
-            ids[i * self.index_config.batch_size : (i + 1) * self.index_config.batch_size]
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            identified = str(hash(tuple(batch)) + sys.maxsize + 1)
-            yield FileData(
-                identifier=identified,
+            yield CouchbaseBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.connection_string}/"
                     f"{self.connection_config.bucket}",
                     date_processed=str(time.time()),
                 ),
-                additional_metadata={
-                    "ids": list(batch),
-                    "bucket": self.connection_config.bucket,
-                },
+                additional_metadata=CouchbaseAdditionalMetadata(
+                    bucket=self.connection_config.bucket
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
 
 
@@ -241,7 +241,7 @@ class CouchbaseDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, bucket: str, file_data: FileData
+        self, result: dict, bucket: str, file_data: CouchbaseBatchFileData
    ) -> DownloadResponse:
         record_id = result[self.download_config.collection_id]
         filename_id = self.get_identifier(bucket=bucket, record_id=record_id)
@@ -261,28 +261,25 @@ class CouchbaseDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-        return DownloadResponse(
-            file_data=FileData(
-                identifier=filename_id,
-                connector_type=CONNECTOR_TYPE,
-                metadata=FileDataSourceMetadata(
-                    version=None,
-                    date_processed=str(time.time()),
-                    record_locator={
-                        "connection_string": self.connection_config.connection_string,
-                        "bucket": bucket,
-                        "scope": self.connection_config.scope,
-                        "collection": self.connection_config.collection,
-                        "document_id": record_id,
-                    },
-                ),
-            ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time.time())
+        cast_file_data.metadata.record_locator = {
+            "connection_string": self.connection_config.connection_string,
+            "bucket": bucket,
+            "scope": self.connection_config.scope,
+            "collection": self.connection_config.collection,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-        bucket_name: str = file_data.additional_metadata["bucket"]
-        ids: list[str] = file_data.additional_metadata["ids"]
+        couchbase_file_data = CouchbaseBatchFileData.cast(file_data=file_data)
+        bucket_name: str = couchbase_file_data.additional_metadata.bucket
+        ids: list[str] = [item.identifier for item in couchbase_file_data.batch_items]
 
         with self.connection_config.get_client() as client:
             bucket = client.bucket(bucket_name)
@@ -292,13 +289,25 @@ class CouchbaseDownloader(Downloader):
             download_resp = self.process_all_doc_ids(ids, collection, bucket_name, file_data)
             return list(download_resp)
 
-    def process_doc_id(self, doc_id, collection, bucket_name, file_data):
+    def process_doc_id(
+        self,
+        doc_id: str,
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         result = collection.get(doc_id)
         return self.generate_download_response(
             result=result.content_as[dict], bucket=bucket_name, file_data=file_data
         )
 
-    def process_all_doc_ids(self, ids, collection, bucket_name, file_data):
+    def process_all_doc_ids(
+        self,
+        ids: list[str],
+        collection: "Collection",
+        bucket_name: str,
+        file_data: CouchbaseBatchFileData,
+    ):
         for doc_id in ids:
             yield self.process_doc_id(doc_id, collection, bucket_name, file_data)
 
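The couchbase indexer above also replaces hand-rolled list slicing with batch_generator from unstructured_ingest.utils.data_prep. The removed slicing and the new call are assumed to be equivalent to the following hypothetical helper (not the library's actual implementation):

    from typing import Iterable, Iterator


    def batch_generator_sketch(items: Iterable[str], batch_size: int) -> Iterator[list[str]]:
        """Hypothetical equivalent of the removed slicing: yield successive
        fixed-size batches, with a smaller final batch for the remainder."""
        batch: list[str] = []
        for item in items:
            batch.append(item)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch


    ids = [f"doc-{i}" for i in range(7)]
    assert [len(b) for b in batch_generator_sketch(ids, 3)] == [3, 3, 1]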
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -1,6 +1,5 @@
 import collections
 import hashlib
-import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -15,11 +14,17 @@ from unstructured_ingest.error import (
     SourceConnectionNetworkError,
     WriteError,
 )
-from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
+from unstructured_ingest.utils.data_prep import (
+    batch_generator,
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -48,6 +53,14 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"
 
 
+class ElastisearchAdditionalMetadata(BaseModel):
+    index_name: str
+
+
+class ElasticsearchBatchFileData(BatchFileData):
+    additional_metadata: ElastisearchAdditionalMetadata
+
+
 class ElasticsearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(
         default=None, description="password when using basic auth or connecting to a cloud instance"
@@ -174,36 +187,21 @@ class ElasticsearchIndexer(Indexer):
 
         return {hit["_id"] for hit in hits}
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
-        id_batches: list[frozenset[str]] = [
-            frozenset(
-                ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
-                    * self.index_config.batch_size
-                ]
-            )
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            identified = str(hash(batch) + sys.maxsize + 1)
-            yield FileData(
-                identifier=identified,
+            yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
-                additional_metadata={
-                    "ids": list(batch),
-                    "index_name": self.index_config.index_name,
-                },
+                additional_metadata=ElastisearchAdditionalMetadata(
+                    index_name=self.index_config.index_name,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
 
 
@@ -237,7 +235,7 @@ class ElasticsearchDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, index_name: str, file_data: FileData
+        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
    ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
@@ -257,22 +255,19 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-        return DownloadResponse(
-            file_data=FileData(
-                identifier=filename_id,
-                connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
-                metadata=FileDataSourceMetadata(
-                    version=str(result["_version"]) if "_version" in result else None,
-                    date_processed=str(time()),
-                    record_locator={
-                        "hosts": self.connection_config.hosts,
-                        "index_name": index_name,
-                        "document_id": record_id,
-                    },
-                ),
-            ),
-            path=download_path,
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
+        cast_file_data.metadata.record_locator = {
+            "hosts": self.connection_config.hosts,
+            "index_name": index_name,
+            "document_id": record_id,
+        }
+        cast_file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -285,11 +280,12 @@ class ElasticsearchDownloader(Downloader):
 
         return AsyncElasticsearch, async_scan
 
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
+        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
         AsyncClient, async_scan = self.load_async()
 
-        index_name: str = file_data.additional_metadata["index_name"]
-        ids: list[str] = file_data.additional_metadata["ids"]
+        index_name: str = elasticsearch_filedata.additional_metadata.index_name
+        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]
 
         scan_query = {
             "_source": self.download_config.fields,
@@ -307,7 +303,7 @@ class ElasticsearchDownloader(Downloader):
            ):
                 download_responses.append(
                     self.generate_download_response(
-                        result=result, index_name=index_name, file_data=file_data
+                        result=result, index_name=index_name, file_data=elasticsearch_filedata
                     )
                 )
         return download_responses
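As in the other connectors, the downloaders above stop assembling a fresh FileData by hand and instead call FileData.cast(file_data=...) on the incoming batch file data, then update identifier, metadata, and source identifiers in place. A simplified sketch of that cast-and-mutate pattern follows; the models and the cast behavior shown here are assumptions for illustration only, the real definitions live in unstructured_ingest/v2/interfaces/file_data.py.

    # Hypothetical, simplified models illustrating the cast-and-mutate pattern;
    # the real FileData.cast may differ in detail.
    from typing import Any, Optional

    from pydantic import BaseModel, Field


    class FileDataSourceMetadata(BaseModel):
        date_processed: Optional[str] = None
        version: Optional[str] = None
        record_locator: Optional[dict[str, Any]] = None


    class FileData(BaseModel):
        identifier: str
        connector_type: str
        metadata: FileDataSourceMetadata = Field(default_factory=FileDataSourceMetadata)

        @classmethod
        def cast(cls, file_data: "FileData") -> "FileData":
            # Assumed behavior: re-validate a subclass instance as this class so
            # batch-only fields are dropped and a plain FileData comes back.
            return cls.model_validate(file_data.model_dump(include=set(cls.model_fields)))


    class ElasticsearchBatchFileData(FileData):
        additional_metadata: dict[str, Any] = Field(default_factory=dict)


    batch_fd = ElasticsearchBatchFileData(
        identifier="batch-1",
        connector_type="elasticsearch",
        additional_metadata={"index_name": "movies"},
    )
    cast_fd = FileData.cast(file_data=batch_fd)
    cast_fd.identifier = "movies-42.csv"
    cast_fd.metadata.record_locator = {"index_name": "movies", "document_id": "42"}
    assert type(cast_fd) is FileData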
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -25,6 +25,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
 
+if TYPE_CHECKING:
+    from adlfs import AzureBlobFileSystem
+
 CONNECTOR_TYPE = "azure"
 
 
@@ -89,6 +92,12 @@ class AzureConnectionConfig(FsspecConnectionConfig):
         }
         return access_configs
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class AzureIndexer(FsspecIndexer):
@@ -96,17 +105,9 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
@@ -149,14 +150,6 @@ class AzureDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class AzureUploaderConfig(FsspecUploaderConfig):
     pass
@@ -168,22 +161,6 @@ class AzureUploader(FsspecUploader):
     connection_config: AzureConnectionConfig
     upload_config: AzureUploaderConfig = field(default=None)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 azure_source_entry = SourceRegistryEntry(
     indexer=AzureIndexer,
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -1,16 +1,16 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Annotated, Any, Generator, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
 
 from dateutil import parser
 from pydantic import Field, Secret
 from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -28,6 +28,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
 
+if TYPE_CHECKING:
+    from boxfs import BoxFileSystem
+
 CONNECTOR_TYPE = "box"
 
 
@@ -72,6 +75,12 @@ class BoxConnectionConfig(FsspecConnectionConfig):
 
     return access_kwargs_with_oauth
 
+    @requires_dependencies(["boxfs"], extras="box")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class BoxIndexer(FsspecIndexer):
@@ -79,14 +88,6 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = None
@@ -126,14 +127,6 @@ class BoxDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class BoxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -145,22 +138,6 @@ class BoxUploader(FsspecUploader):
     connection_config: BoxConnectionConfig
     upload_config: BoxUploaderConfig = field(default=None)
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 box_source_entry = SourceRegistryEntry(
     indexer=BoxIndexer,
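Across the fsspec connectors (azure and box above, plus the other fsspec modules in the file list), the per-method requires_dependencies decorators on run, run_async, precheck, and __post_init__ are removed in favor of a single decorated get_client context manager on the connection config, so the optional-dependency check happens once, at the point where the filesystem client is created. A minimal sketch of that pattern, with a hypothetical stand-in for requires_dependencies (not the library's implementation):

    import importlib.util
    from contextlib import contextmanager
    from functools import wraps
    from typing import Callable, Iterator


    def requires_dependencies_sketch(deps: list[str], extras: str) -> Callable:
        """Hypothetical stand-in: fail fast with an install hint when an
        optional package is missing."""
        def decorator(fn: Callable) -> Callable:
            @wraps(fn)
            def wrapper(*args, **kwargs):
                for dep in deps:
                    if importlib.util.find_spec(dep) is None:
                        raise ImportError(f"{dep} is required; install the '{extras}' extra")
                return fn(*args, **kwargs)
            return wrapper
        return decorator


    class FsspecConnectionConfigSketch:
        @contextmanager
        def get_client(self, protocol: str) -> Iterator[object]:
            import fsspec  # resolved only when a client is actually requested

            yield fsspec.filesystem(protocol)


    class BoxConnectionConfigSketch(FsspecConnectionConfigSketch):
        # The dependency check now wraps the single client entry point instead
        # of being repeated on every run/run_async/precheck method.
        @requires_dependencies_sketch(["boxfs"], extras="box")
        @contextmanager
        def get_client(self, protocol: str) -> Iterator[object]:
            with super().get_client(protocol=protocol) as client:
                yield client


    # Usage (assuming boxfs and fsspec are installed):
    # with BoxConnectionConfigSketch().get_client(protocol="box") as fs:
    #     fs.ls("/")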