unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (44)
  1. test/integration/connectors/sql/test_postgres.py +3 -3
  2. test/integration/connectors/sql/test_singlestore.py +3 -3
  3. test/integration/connectors/sql/test_sqlite.py +3 -3
  4. test/integration/connectors/test_astradb.py +40 -0
  5. test/integration/connectors/test_kafka.py +2 -2
  6. test/integration/connectors/test_mongodb.py +4 -1
  7. test/integration/connectors/utils/validation/source.py +31 -11
  8. unstructured_ingest/__version__.py +1 -1
  9. unstructured_ingest/v2/interfaces/__init__.py +3 -1
  10. unstructured_ingest/v2/interfaces/file_data.py +58 -14
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
  12. unstructured_ingest/v2/pipeline/steps/download.py +5 -4
  13. unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
  14. unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
  15. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  16. unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
  17. unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
  18. unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
  19. unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
  20. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  21. unstructured_ingest/v2/processes/connectors/astradb.py +35 -33
  22. unstructured_ingest/v2/processes/connectors/couchbase.py +50 -41
  23. unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
  24. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
  25. unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
  26. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
  27. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
  28. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
  29. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
  30. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
  31. unstructured_ingest/v2/processes/connectors/mongodb.py +95 -100
  32. unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
  33. unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
  34. unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
  35. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
  36. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
  37. unstructured_ingest/v2/processes/connectors/sql/sql.py +31 -26
  38. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
  39. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +14 -13
  40. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +44 -44
  41. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
  42. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -24,11 +24,16 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 
+if TYPE_CHECKING:
+    from dropboxdrivefs import DropboxDriveFileSystem
+
 CONNECTOR_TYPE = "dropbox"
 
 
 class DropboxIndexerConfig(FsspecIndexerConfig):
-    pass
+    def model_post_init(self, __context):
+        if not self.path_without_protocol.startswith("/"):
+            self.path_without_protocol = "/" + self.path_without_protocol
 
 
 class DropboxAccessConfig(FsspecAccessConfig):
@@ -42,6 +47,12 @@ class DropboxConnectionConfig(FsspecConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["DropboxDriveFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class DropboxIndexer(FsspecIndexer):
@@ -83,20 +94,6 @@ class DropboxIndexer(FsspecIndexer):
             filesize_bytes=file_size,
         )
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def __post_init__(self):
-        # dropbox expects the path to start with a /
-        if not self.index_config.path_without_protocol.startswith("/"):
-            self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
@@ -111,14 +108,6 @@ class DropboxDownloader(FsspecDownloader):
         default_factory=DropboxDownloaderConfig
     )
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class DropboxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -130,22 +119,6 @@ class DropboxUploader(FsspecUploader):
     connection_config: DropboxConnectionConfig
     upload_config: DropboxUploaderConfig = field(default=None)
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 dropbox_source_entry = SourceRegistryEntry(
     indexer=DropboxIndexer,
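The notable change above: the leading-slash normalization that Dropbox requires moved out of DropboxIndexer.__post_init__ (where it only ran for the indexer) into the config's Pydantic model_post_init hook, so the invariant holds wherever the config is used. A minimal sketch of the hook on a hypothetical stand-alone model, not the real FsspecIndexerConfig:

from pydantic import BaseModel


class PathConfig(BaseModel):
    # hypothetical stand-in for the inherited path_without_protocol field
    path_without_protocol: str

    def model_post_init(self, __context) -> None:
        # Dropbox expects remote paths to start with "/"; normalize once at
        # construction time instead of in each component's __post_init__
        if not self.path_without_protocol.startswith("/"):
            self.path_without_protocol = "/" + self.path_without_protocol


assert PathConfig(path_without_protocol="shared/docs").path_without_protocol == "/shared/docs"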
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -4,6 +4,7 @@ import os
 import random
 import shutil
 import tempfile
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -78,6 +79,15 @@ class FsspecConnectionConfig(ConnectionConfig):
     access_config: Secret[FsspecAccessConfig]
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
+        from fsspec import get_filesystem_class
+
+        client = get_filesystem_class(protocol)(
+            **self.get_access_config(),
+        )
+        yield client
+
 
 FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
 FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
@@ -89,14 +99,6 @@ class FsspecIndexer(Indexer):
     index_config: FsspecIndexerConfigT
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
-    @property
-    def fs(self) -> "AbstractFileSystem":
-        from fsspec import get_filesystem_class
-
-        return get_filesystem_class(self.index_config.protocol)(
-            **self.connection_config.get_access_config(),
-        )
-
     def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
@@ -110,7 +112,8 @@
                 return
             file_to_sample = valid_files[0]
             logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
-            self.fs.head(path=file_to_sample)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                client.head(path=file_to_sample)
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -120,16 +123,18 @@
             # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
             # because they are seen as 0 byte files
-            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                files = client.ls(self.index_config.path_without_protocol, detail=True)
 
         else:
             # fs.find will recursively walk directories
             # "size" is a common key for all the cloud protocols with fs
-            found = self.fs.find(
-                self.index_config.path_without_protocol,
-                detail=True,
-            )
-            files = found.values()
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                found = client.find(
+                    self.index_config.path_without_protocol,
+                    detail=True,
+                )
+                files = found.values()
         filtered_files = [
             file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
@@ -200,15 +205,8 @@ class FsspecDownloader(Downloader):
         )
 
     def is_async(self) -> bool:
-        return self.fs.async_impl
-
-    @property
-    def fs(self) -> "AbstractFileSystem":
-        from fsspec import get_filesystem_class
-
-        return get_filesystem_class(self.protocol)(
-            **self.connection_config.get_access_config(),
-        )
+        with self.connection_config.get_client(protocol=self.protocol) as client:
+            return client.async_impl
 
     def handle_directory_download(self, lpath: Path) -> None:
         # If the object's name contains certain characters (i.e. '?'), it
@@ -237,7 +235,8 @@
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
-            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            with self.connection_config.get_client(protocol=self.protocol) as client:
+                client.get(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
@@ -249,7 +248,8 @@
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
-            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            with self.connection_config.get_client(protocol=self.protocol) as client:
+                await client.get(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
@@ -268,9 +268,11 @@ FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderCon
 class FsspecUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: FsspecUploaderConfigT = field(default=None)
+    connection_config: FsspecConnectionConfigT
 
     def is_async(self) -> bool:
-        return self.fs.async_impl
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            return client.async_impl
 
     @property
     def fs(self) -> "AbstractFileSystem":
@@ -314,11 +316,13 @@
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            client.upload(lpath=path_str, rpath=upload_path.as_posix())
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            client.upload(lpath=path_str, rpath=upload_path.as_posix())
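Taken together, these hunks replace the per-class fs properties with a single get_client context manager on FsspecConnectionConfig; every head, ls, find, get, and upload call now acquires its filesystem inside a with block, which gives subclasses one hook for setup and teardown. A minimal sketch of the pattern as a free-standing function (assuming only fsspec is installed; the real method lives on the config and pulls credentials from get_access_config):

from contextlib import contextmanager
from typing import Any, Generator

from fsspec import AbstractFileSystem, get_filesystem_class


@contextmanager
def get_client(protocol: str, **access_config: Any) -> Generator[AbstractFileSystem, None, None]:
    # build the filesystem for the requested protocol; an override can add
    # cleanup after the yield (see the sftp variant later in this diff)
    client = get_filesystem_class(protocol)(**access_config)
    yield client


# usage mirroring the indexer's listing call
with get_client("file") as client:
    files = client.ls(".", detail=True)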
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -1,16 +1,17 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional, Union
+from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 
+if TYPE_CHECKING:
+    from gcsfs import GCSFileSystem
+
 CONNECTOR_TYPE = "gcs"
 
 
@@ -93,6 +97,12 @@ class GcsConnectionConfig(FsspecConnectionConfig):
     access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class GcsIndexer(FsspecIndexer):
@@ -100,14 +110,6 @@ class GcsIndexer(FsspecIndexer):
     index_config: GcsIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = None
@@ -147,14 +149,6 @@ class GcsDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)
 
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class GcsUploaderConfig(FsspecUploaderConfig):
     pass
@@ -166,22 +160,6 @@ class GcsUploader(FsspecUploader):
     connection_config: GcsConnectionConfig
     upload_config: GcsUploaderConfig = field(default=None)
 
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 gcs_source_entry = SourceRegistryEntry(
     indexer=GcsIndexer,
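The GCS override (like the S3 one that follows) adds no behavior of its own: it exists to attach the requires_dependencies check, so a missing gcsfs fails fast when get_client is called, and to narrow the yielded type to GCSFileSystem. A rough sketch of how a dependency-check decorator stacks on @contextmanager, using a simplified stand-in for unstructured_ingest's requires_dependencies:

import importlib.util
from contextlib import contextmanager
from functools import wraps


def requires_dependencies(deps: list[str]):
    # simplified stand-in: raise immediately if a module is missing,
    # rather than failing deep inside an fsspec call
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(f"missing optional dependencies: {missing}")
            return func(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["fsspec"])
@contextmanager
def get_client(protocol: str):
    from fsspec import get_filesystem_class

    yield get_filesystem_class(protocol)()


with get_client("memory") as client:  # the check fires when get_client() is invoked
    client.touch("/probe.txt")

The decorator order differs across the release (gcs.py and s3.py put @requires_dependencies outermost, sftp.py puts @contextmanager outermost), but in either arrangement the check runs when get_client() is called, before the with-body executes.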
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -1,15 +1,13 @@
 import contextlib
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
-    DownloadResponse,
-    FileData,
     FileDataSourceMetadata,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -29,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 
 CONNECTOR_TYPE = "s3"
 
+if TYPE_CHECKING:
+    from s3fs import S3FileSystem
+
 
 class S3IndexerConfig(FsspecIndexerConfig):
     pass
@@ -72,6 +73,12 @@ class S3ConnectionConfig(FsspecConnectionConfig):
         )
         return access_configs
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class S3Indexer(FsspecIndexer):
@@ -97,7 +104,8 @@
         version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path=path)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                metadata = client.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
@@ -114,14 +122,6 @@
             filesize_bytes=file_size,
         )
 
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def precheck(self) -> None:
-        super().precheck()
-
 
 class S3DownloaderConfig(FsspecDownloaderConfig):
     pass
@@ -134,14 +134,6 @@ class S3Downloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)
 
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class S3UploaderConfig(FsspecUploaderConfig):
     pass
@@ -153,22 +145,6 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)
 
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 s3_source_entry = SourceRegistryEntry(
     indexer=S3Indexer,
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -1,16 +1,17 @@
 from __future__ import annotations
 
 import os
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import urlparse
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
+from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 
+if TYPE_CHECKING:
+    from fsspec.implementations.sftp import SFTPFileSystem
+
 CONNECTOR_TYPE = "sftp"
 
 
@@ -67,6 +71,19 @@ class SftpConnectionConfig(FsspecConnectionConfig):
         }
         return access_config
 
+    @contextmanager
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
+        # The paramiko.SSHClient() client that's opened by the SFTPFileSystem
+        # never gets closed so explicitly adding that as part of this context manager
+        from fsspec import get_filesystem_class
+
+        client: SFTPFileSystem = get_filesystem_class(protocol)(
+            **self.get_access_config(),
+        )
+        yield client
+        client.client.close()
+
 
 @dataclass
 class SftpIndexer(FsspecIndexer):
@@ -74,13 +91,11 @@
     index_config: SftpIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def __post_init__(self):
         parsed_url = urlparse(self.index_config.remote_url)
         self.connection_config.host = parsed_url.hostname or self.connection_config.host
         self.connection_config.port = parsed_url.port or self.connection_config.port
 
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         for file in super().run(**kwargs):
             new_identifier = (
@@ -92,10 +107,6 @@
             file.identifier = new_identifier
             yield file
 
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
@@ -128,20 +139,11 @@ class SftpDownloader(FsspecDownloader):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
     download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)
 
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def __post_init__(self):
         parsed_url = urlparse(self.download_config.remote_url)
         self.connection_config.host = parsed_url.hostname or self.connection_config.host
         self.connection_config.port = parsed_url.port or self.connection_config.port
 
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class SftpUploaderConfig(FsspecUploaderConfig):
     pass
@@ -153,22 +155,6 @@ class SftpUploader(FsspecUploader):
     connection_config: SftpConnectionConfig
     upload_config: SftpUploaderConfig = field(default=None)
 
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 sftp_source_entry = SourceRegistryEntry(
     indexer=SftpIndexer,
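sftp.py is the one connector where get_client has real teardown: SFTPFileSystem opens a paramiko.SSHClient that fsspec never closes, so the override closes it after the yield. One subtlety worth noting: a bare yield followed by close() is skipped if the with-body raises. A try/finally variant, sketched below as a hedged alternative rather than what this release ships, closes the client on both paths:

from contextlib import contextmanager
from typing import TYPE_CHECKING, Any, Generator

if TYPE_CHECKING:
    from fsspec.implementations.sftp import SFTPFileSystem


@contextmanager
def get_client(protocol: str, **access_config: Any) -> Generator["SFTPFileSystem", None, None]:
    from fsspec import get_filesystem_class

    client = get_filesystem_class(protocol)(**access_config)
    try:
        yield client
    finally:
        # close the underlying paramiko.SSHClient even when the caller's
        # with-body raises; the released code only closes on the happy path
        client.client.close()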