unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic; see the registry page for details.

Files changed (52):
  1. test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
  2. test/integration/connectors/sql/test_postgres.py +9 -5
  3. test/integration/connectors/sql/test_singlestore.py +9 -5
  4. test/integration/connectors/sql/test_snowflake.py +6 -2
  5. test/integration/connectors/sql/test_sqlite.py +9 -5
  6. test/integration/connectors/test_astradb.py +40 -0
  7. test/integration/connectors/test_kafka.py +2 -2
  8. test/integration/connectors/test_mongodb.py +4 -1
  9. test/integration/connectors/utils/validation/source.py +31 -11
  10. unstructured_ingest/__version__.py +1 -1
  11. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  12. unstructured_ingest/v2/interfaces/file_data.py +69 -15
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  15. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  18. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  19. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  21. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  22. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  23. unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
  24. unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
  25. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
  26. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
  27. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
  28. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
  30. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  31. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  32. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  33. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  34. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  35. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  36. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  37. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  38. unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
  39. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  40. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  41. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  42. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  43. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  44. unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
  45. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  46. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
  47. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
  48. /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
  49. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
  50. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
  51. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
  52. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py

@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes"
 
 
-class DatabricksNativeVolumesAccessConfig(AccessConfig):
+class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
         default=None, description="Client Secret of the OAuth app."
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -1,6 +1,5 @@
 import collections
 import hashlib
-import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -15,11 +14,17 @@ from unstructured_ingest.error import (
     SourceConnectionNetworkError,
     WriteError,
 )
-from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
+from unstructured_ingest.utils.data_prep import (
+    batch_generator,
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -48,6 +53,14 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"
 
 
+class ElastisearchAdditionalMetadata(BaseModel):
+    index_name: str
+
+
+class ElasticsearchBatchFileData(BatchFileData):
+    additional_metadata: ElastisearchAdditionalMetadata
+
+
 class ElasticsearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(
         default=None, description="password when using basic auth or connecting to a cloud instance"
@@ -174,36 +187,21 @@ class ElasticsearchIndexer(Indexer):
 
         return {hit["_id"] for hit in hits}
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
-        id_batches: list[frozenset[str]] = [
-            frozenset(
-                ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
-                    * self.index_config.batch_size
-                ]
-            )
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            identified = str(hash(batch) + sys.maxsize + 1)
-            yield FileData(
-                identifier=identified,
+            yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
-                additional_metadata={
-                    "ids": list(batch),
-                    "index_name": self.index_config.index_name,
-                },
+                additional_metadata=ElastisearchAdditionalMetadata(
+                    index_name=self.index_config.index_name,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
 
 
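In the hunk above, the hand-rolled frozenset slicing and the sys.maxsize-based identifier are replaced by batch_generator from unstructured_ingest.utils.data_prep plus the typed batch record. batch_generator's body is not part of this diff; assuming it is a plain chunking helper, a minimal equivalent looks like:

# Minimal stand-in for a chunking helper such as batch_generator (assumption: the real
# implementation lives in unstructured_ingest.utils.data_prep and is not shown here).
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def chunked(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


# list(chunked(["a", "b", "c", "d", "e"], 2)) -> [["a", "b"], ["c", "d"], ["e"]]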
@@ -237,7 +235,7 @@ class ElasticsearchDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, index_name: str, file_data: FileData
+        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
     ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
@@ -257,22 +255,19 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-        return DownloadResponse(
-            file_data=FileData(
-                identifier=filename_id,
-                connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(filename=filename, fullpath=filename),
-                metadata=FileDataSourceMetadata(
-                    version=str(result["_version"]) if "_version" in result else None,
-                    date_processed=str(time()),
-                    record_locator={
-                        "hosts": self.connection_config.hosts,
-                        "index_name": index_name,
-                        "document_id": record_id,
-                    },
-                ),
-            ),
-            path=download_path,
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
+        cast_file_data.metadata.record_locator = {
+            "hosts": self.connection_config.hosts,
+            "index_name": index_name,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -285,11 +280,12 @@ class ElasticsearchDownloader(Downloader):
 
         return AsyncElasticsearch, async_scan
 
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
+        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
         AsyncClient, async_scan = self.load_async()
 
-        index_name: str = file_data.additional_metadata["index_name"]
-        ids: list[str] = file_data.additional_metadata["ids"]
+        index_name: str = elasticsearch_filedata.additional_metadata.index_name
+        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]
 
         scan_query = {
             "_source": self.download_config.fields,
@@ -307,7 +303,7 @@
         ):
             download_responses.append(
                 self.generate_download_response(
-                    result=result, index_name=index_name, file_data=file_data
+                    result=result, index_name=index_name, file_data=elasticsearch_filedata
                 )
             )
         return download_responses
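The downloader now mutates the incoming batch record and re-casts it to a per-document FileData before delegating to the base class, instead of building a DownloadResponse by hand. The cast helper is defined in the changed file_data.py and is not shown in this diff; a plausible reading, assuming it simply re-validates one model's payload as another related model, is sketched below (Base and Special are placeholder names):

# Assumption: a cast() classmethod converts between related pydantic models by
# re-validating the source model's fields. This is a sketch, not the library's code.
from typing import Optional
from pydantic import BaseModel


class Base(BaseModel):
    identifier: str
    note: Optional[str] = None

    @classmethod
    def cast(cls, file_data: BaseModel) -> "Base":
        return cls.model_validate(file_data.model_dump())


class Special(Base):
    extra: str = "x"


plain = Base.cast(file_data=Special(identifier="abc", note="hi"))  # keeps the shared fields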
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -25,6 +25,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
 
+if TYPE_CHECKING:
+    from adlfs import AzureBlobFileSystem
+
 CONNECTOR_TYPE = "azure"
 
 
@@ -89,6 +92,12 @@ class AzureConnectionConfig(FsspecConnectionConfig):
         }
         return access_configs
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class AzureIndexer(FsspecIndexer):
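The override above is intentionally thin: it attaches the adlfs/fsspec dependency check to client creation and narrows the return type, while construction itself happens in the base FsspecConnectionConfig.get_client. A hypothetical call site, given an already-built AzureConnectionConfig, would look like:

# Hypothetical usage; connection_config is assumed to be a configured AzureConnectionConfig.
with connection_config.get_client(protocol="az") as fs:
    entries = fs.ls("my-container/some/prefix", detail=True)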
@@ -96,17 +105,9 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
@@ -149,14 +150,6 @@ class AzureDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class AzureUploaderConfig(FsspecUploaderConfig):
     pass
@@ -168,22 +161,6 @@ class AzureUploader(FsspecUploader):
     connection_config: AzureConnectionConfig
     upload_config: AzureUploaderConfig = field(default=None)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 azure_source_entry = SourceRegistryEntry(
     indexer=AzureIndexer,
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -1,16 +1,16 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Annotated, Any, Generator, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
 
 from dateutil import parser
 from pydantic import Field, Secret
 from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -28,6 +28,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
 
+if TYPE_CHECKING:
+    from boxfs import BoxFileSystem
+
 CONNECTOR_TYPE = "box"
 
 
@@ -72,6 +75,12 @@ class BoxConnectionConfig(FsspecConnectionConfig):
 
         return access_kwargs_with_oauth
 
+    @requires_dependencies(["boxfs"], extras="box")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 
 @dataclass
@@ -79,14 +88,6 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = None
@@ -126,14 +127,6 @@ class BoxDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class BoxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -145,22 +138,6 @@ class BoxUploader(FsspecUploader):
     connection_config: BoxConnectionConfig
     upload_config: BoxUploaderConfig = field(default=None)
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 box_source_entry = SourceRegistryEntry(
     indexer=BoxIndexer,
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -24,11 +24,16 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 
+if TYPE_CHECKING:
+    from dropboxdrivefs import DropboxDriveFileSystem
+
 CONNECTOR_TYPE = "dropbox"
 
 
 class DropboxIndexerConfig(FsspecIndexerConfig):
-    pass
+    def model_post_init(self, __context):
+        if not self.path_without_protocol.startswith("/"):
+            self.path_without_protocol = "/" + self.path_without_protocol
 
 
 class DropboxAccessConfig(FsspecAccessConfig):
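Path normalization moves from the indexer's __post_init__ (removed further down) into the config's model_post_init, pydantic's post-validation hook, so the leading slash is guaranteed as soon as the config is built. A standalone illustration of the same hook:

# Standalone illustration of pydantic's model_post_init hook, mirroring the
# DropboxIndexerConfig change above; PathConfig is a placeholder name.
from pydantic import BaseModel


class PathConfig(BaseModel):
    path_without_protocol: str

    def model_post_init(self, __context) -> None:
        # dropbox expects the path to start with a /
        if not self.path_without_protocol.startswith("/"):
            self.path_without_protocol = "/" + self.path_without_protocol


print(PathConfig(path_without_protocol="team-folder/docs").path_without_protocol)  # /team-folder/docs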
@@ -42,6 +47,12 @@ class DropboxConnectionConfig(FsspecConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["DropboxDriveFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class DropboxIndexer(FsspecIndexer):
@@ -83,20 +94,6 @@ class DropboxIndexer(FsspecIndexer):
             filesize_bytes=file_size,
         )
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def __post_init__(self):
-        # dropbox expects the path to start with a /
-        if not self.index_config.path_without_protocol.startswith("/"):
-            self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
@@ -111,14 +108,6 @@ class DropboxDownloader(FsspecDownloader):
         default_factory=DropboxDownloaderConfig
     )
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class DropboxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -130,22 +119,6 @@ class DropboxUploader(FsspecUploader):
     connection_config: DropboxConnectionConfig
     upload_config: DropboxUploaderConfig = field(default=None)
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 dropbox_source_entry = SourceRegistryEntry(
     indexer=DropboxIndexer,
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -4,6 +4,7 @@ import os
 import random
 import shutil
 import tempfile
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -78,6 +79,15 @@ class FsspecConnectionConfig(ConnectionConfig):
     access_config: Secret[FsspecAccessConfig]
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
+        from fsspec import get_filesystem_class
+
+        client = get_filesystem_class(protocol)(
+            **self.get_access_config(),
+        )
+        yield client
+
 
 FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
 FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
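This new get_client context manager replaces the cached fs properties that the indexer, downloader, and uploader each defined (removed in the hunks that follow); every call now builds the filesystem on entry, and the generator form leaves room for explicit teardown later. A minimal standalone illustration of the pattern:

# Minimal illustration of the @contextmanager pattern now used for client access;
# FakeClient and get_client here are placeholders, not library code.
from contextlib import contextmanager
from typing import Iterator


class FakeClient:
    def ls(self, path: str) -> list[str]:
        return [f"{path}/file.txt"]

    def close(self) -> None:
        pass


@contextmanager
def get_client() -> Iterator[FakeClient]:
    client = FakeClient()
    try:
        yield client
    finally:
        client.close()  # teardown hook the old cached-property approach did not have


with get_client() as client:
    print(client.ls("bucket/prefix"))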
@@ -89,14 +99,6 @@ class FsspecIndexer(Indexer):
     index_config: FsspecIndexerConfigT
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
-    @property
-    def fs(self) -> "AbstractFileSystem":
-        from fsspec import get_filesystem_class
-
-        return get_filesystem_class(self.index_config.protocol)(
-            **self.connection_config.get_access_config(),
-        )
-
     def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
@@ -110,7 +112,8 @@
                 return
             file_to_sample = valid_files[0]
             logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
-            self.fs.head(path=file_to_sample)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                client.head(path=file_to_sample)
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -120,16 +123,18 @@
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
-            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                files = client.ls(self.index_config.path_without_protocol, detail=True)
 
         else:
            # fs.find will recursively walk directories
            # "size" is a common key for all the cloud protocols with fs
-            found = self.fs.find(
-                self.index_config.path_without_protocol,
-                detail=True,
-            )
-            files = found.values()
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                found = client.find(
+                    self.index_config.path_without_protocol,
+                    detail=True,
+                )
+                files = found.values()
         filtered_files = [
             file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
@@ -200,15 +205,8 @@ class FsspecDownloader(Downloader):
         )
 
     def is_async(self) -> bool:
-        return self.fs.async_impl
-
-    @property
-    def fs(self) -> "AbstractFileSystem":
-        from fsspec import get_filesystem_class
-
-        return get_filesystem_class(self.protocol)(
-            **self.connection_config.get_access_config(),
-        )
+        with self.connection_config.get_client(protocol=self.protocol) as client:
+            return client.async_impl
 
     def handle_directory_download(self, lpath: Path) -> None:
         # If the object's name contains certain characters (i.e. '?'), it
@@ -237,7 +235,8 @@
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
-            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            with self.connection_config.get_client(protocol=self.protocol) as client:
+                client.get(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
@@ -249,7 +248,8 @@
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
-            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            with self.connection_config.get_client(protocol=self.protocol) as client:
+                await client.get(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
@@ -268,9 +268,11 @@ FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderCon
 class FsspecUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: FsspecUploaderConfigT = field(default=None)
+    connection_config: FsspecConnectionConfigT
 
     def is_async(self) -> bool:
-        return self.fs.async_impl
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            return client.async_impl
 
     @property
     def fs(self) -> "AbstractFileSystem":
@@ -314,11 +316,13 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            client.upload(lpath=path_str, rpath=upload_path.as_posix())
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            client.upload(lpath=path_str, rpath=upload_path.as_posix())