unstructured-ingest 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release.
- test/integration/connectors/{databricks_tests → databricks}/test_volumes_native.py +75 -19
- test/integration/connectors/sql/test_postgres.py +9 -5
- test/integration/connectors/sql/test_singlestore.py +9 -5
- test/integration/connectors/sql/test_snowflake.py +6 -2
- test/integration/connectors/sql/test_sqlite.py +9 -5
- test/integration/connectors/test_astradb.py +40 -0
- test/integration/connectors/test_kafka.py +2 -2
- test/integration/connectors/test_mongodb.py +4 -1
- test/integration/connectors/utils/validation/source.py +31 -11
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +69 -15
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +37 -33
- unstructured_ingest/v2/processes/connectors/couchbase.py +52 -41
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +2 -2
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +2 -2
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +41 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/mongodb.py +94 -100
- unstructured_ingest/v2/processes/connectors/neo4j.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +36 -26
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/METADATA +11 -10
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/RECORD +52 -52
- /test/integration/connectors/{databricks_tests → databricks}/__init__.py +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.9.dist-info → unstructured_ingest-0.3.11.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py

@@ -3,12 +3,12 @@ from typing import Optional
 
 from pydantic import Field, Secret
 
-from unstructured_ingest.v2.interfaces import AccessConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
 )
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
+    DatabricksVolumesAccessConfig,
     DatabricksVolumesConnectionConfig,
     DatabricksVolumesDownloader,
     DatabricksVolumesDownloaderConfig,
@@ -21,7 +21,7 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
 CONNECTOR_TYPE = "databricks_volumes"
 
 
-class DatabricksNativeVolumesAccessConfig(AccessConfig):
+class DatabricksNativeVolumesAccessConfig(DatabricksVolumesAccessConfig):
     client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
     client_secret: Optional[str] = Field(
         default=None, description="Client Secret of the OAuth app."
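The net effect of these two hunks is that the native volumes access config now extends the shared DatabricksVolumesAccessConfig rather than the generic AccessConfig base. A minimal sketch of that inheritance pattern; the token field on the stand-in base class is hypothetical and these are not the library's actual class definitions:

```python
from typing import Optional

from pydantic import BaseModel, Field


class SharedVolumesAccessConfig(BaseModel):
    # stand-in for DatabricksVolumesAccessConfig; the shared field here is assumed
    token: Optional[str] = Field(default=None, description="Personal access token.")


class NativeVolumesAccessConfig(SharedVolumesAccessConfig):
    # OAuth fields layered on top of whatever the shared base already provides
    client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
    client_secret: Optional[str] = Field(default=None, description="Client Secret of the OAuth app.")


# the subclass accepts both the shared and the native-only fields
print(NativeVolumesAccessConfig(client_id="abc", client_secret="xyz").model_dump())
```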
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py

@@ -1,6 +1,5 @@
 import collections
 import hashlib
-import sys
 from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -15,11 +14,17 @@ from unstructured_ingest.error import (
     SourceConnectionNetworkError,
     WriteError,
 )
-from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
+from unstructured_ingest.utils.data_prep import (
+    batch_generator,
+    flatten_dict,
+    generator_batching_wbytes,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
+    BatchFileData,
+    BatchItem,
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
@@ -48,6 +53,14 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "elasticsearch"
 
 
+class ElastisearchAdditionalMetadata(BaseModel):
+    index_name: str
+
+
+class ElasticsearchBatchFileData(BatchFileData):
+    additional_metadata: ElastisearchAdditionalMetadata
+
+
 class ElasticsearchAccessConfig(AccessConfig):
     password: Optional[str] = Field(
         default=None, description="password when using basic auth or connecting to a cloud instance"
@@ -174,36 +187,21 @@ class ElasticsearchIndexer(Indexer):
 
         return {hit["_id"] for hit in hits}
 
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+    def run(self, **kwargs: Any) -> Generator[ElasticsearchBatchFileData, None, None]:
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
-        id_batches = [
-            frozenset(
-                ids[
-                    i
-                    * self.index_config.batch_size : (i + 1)  # noqa
-                    * self.index_config.batch_size
-                ]
-            )
-            for i in range(
-                (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
-            )
-        ]
-        for batch in id_batches:
+        for batch in batch_generator(ids, self.index_config.batch_size):
             # Make sure the hash is always a positive number to create identified
-            identified = str(hash(batch) + sys.maxsize + 1)
-            yield FileData(
-                identifier=identified,
+            yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
-                doc_type="batch",
                 metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
-                additional_metadata={
-                    "index_name": self.index_config.index_name,
-                    "ids": list(batch),
-                },
+                additional_metadata=ElastisearchAdditionalMetadata(
+                    index_name=self.index_config.index_name,
+                ),
+                batch_items=[BatchItem(identifier=b) for b in batch],
             )
 
 
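The indexer change above swaps the hand-rolled frozenset slicing for batch_generator and emits typed ElasticsearchBatchFileData records instead of generic FileData. A standalone sketch of the chunking behavior the indexer now relies on; this is not the library's batch_generator implementation, only the shape of behavior assumed from its usage here:

```python
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def batch_generator(items: Iterable[T], batch_size: int) -> Iterator[List[T]]:
    """Yield consecutive lists of at most batch_size items."""
    batch: List[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


doc_ids = [f"doc-{i}" for i in range(7)]
for batch in batch_generator(doc_ids, batch_size=3):
    print(batch)  # two batches of 3 ids, then a final batch of 1
```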
unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py (continued)

@@ -237,7 +235,7 @@ class ElasticsearchDownloader(Downloader):
         return concatenated_values
 
     def generate_download_response(
-        self, result: dict, index_name: str, file_data: FileData
+        self, result: dict, index_name: str, file_data: ElasticsearchBatchFileData
     ) -> DownloadResponse:
         record_id = result["_id"]
         filename_id = self.get_identifier(index_name=index_name, record_id=record_id)
@@ -257,22 +255,19 @@ class ElasticsearchDownloader(Downloader):
                 exc_info=True,
             )
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
-
-
-
-
-
-
-
-
-
-
-
-
-
-                ),
-            ),
-            path=download_path,
+        file_data.source_identifiers = SourceIdentifiers(filename=filename, fullpath=filename)
+        cast_file_data = FileData.cast(file_data=file_data)
+        cast_file_data.identifier = filename_id
+        cast_file_data.metadata.date_processed = str(time())
+        cast_file_data.metadata.version = str(result["_version"]) if "_version" in result else None
+        cast_file_data.metadata.record_locator = {
+            "hosts": self.connection_config.hosts,
+            "index_name": index_name,
+            "document_id": record_id,
+        }
+        return super().generate_download_response(
+            file_data=cast_file_data,
+            download_path=download_path,
         )
 
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
@@ -285,11 +280,12 @@ class ElasticsearchDownloader(Downloader):
 
         return AsyncElasticsearch, async_scan
 
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    async def run_async(self, file_data: BatchFileData, **kwargs: Any) -> download_responses:
+        elasticsearch_filedata = ElasticsearchBatchFileData.cast(file_data=file_data)
         AsyncClient, async_scan = self.load_async()
 
-        index_name: str = file_data.additional_metadata["index_name"]
-        ids: list[str] = file_data.additional_metadata["ids"]
+        index_name: str = elasticsearch_filedata.additional_metadata.index_name
+        ids: list[str] = [item.identifier for item in elasticsearch_filedata.batch_items]
 
         scan_query = {
             "_source": self.download_config.fields,
@@ -307,7 +303,7 @@ class ElasticsearchDownloader(Downloader):
             ):
                 download_responses.append(
                     self.generate_download_response(
-                        result=result, index_name=index_name, file_data=file_data
+                        result=result, index_name=index_name, file_data=elasticsearch_filedata
                     )
                 )
         return download_responses
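Downstream, the downloader now pulls the index name and document ids off the typed batch record and scrolls them out of Elasticsearch. A rough sketch of what that per-batch fetch looks like with the elasticsearch client's scan helper, assuming a local cluster and a hypothetical my-index; the connector's actual query and fields differ:

```python
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

client = Elasticsearch("http://localhost:9200")  # assumed local cluster
batch_ids = ["doc-1", "doc-2", "doc-3"]

scan_query = {
    "_source": ["title", "body"],              # pull only the fields you need
    "query": {"ids": {"values": batch_ids}},   # restrict the scroll to this batch
}

for hit in scan(client, query=scan_query, index="my-index"):
    print(hit["_id"], hit.get("_source"))
```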
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
+from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -25,6 +25,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
 
+if TYPE_CHECKING:
+    from adlfs import AzureBlobFileSystem
+
 CONNECTOR_TYPE = "azure"
 
 
@@ -89,6 +92,12 @@ class AzureConnectionConfig(FsspecConnectionConfig):
         }
         return access_configs
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AzureBlobFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class AzureIndexer(FsspecIndexer):
@@ -96,17 +105,9 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = (
@@ -149,14 +150,6 @@ class AzureDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class AzureUploaderConfig(FsspecUploaderConfig):
     pass
@@ -168,22 +161,6 @@ class AzureUploader(FsspecUploader):
     connection_config: AzureConnectionConfig
     upload_config: AzureUploaderConfig = field(default=None)
 
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 azure_source_entry = SourceRegistryEntry(
     indexer=AzureIndexer,
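The Azure changes follow the same shape as the fsspec base class further below: the per-method requires_dependencies decorators and the run/run_async pass-throughs disappear, a single get_client override carries the dependency check, and a TYPE_CHECKING import keeps adlfs out of the runtime import path. A hedged sketch of that TYPE_CHECKING pattern with a hypothetical helper; the real connector routes through FsspecConnectionConfig.get_client:

```python
from __future__ import annotations

from contextlib import contextmanager
from typing import TYPE_CHECKING, Generator

if TYPE_CHECKING:  # only type checkers ever import adlfs here
    from adlfs import AzureBlobFileSystem


@contextmanager
def get_azure_client(**storage_options) -> Generator["AzureBlobFileSystem", None, None]:
    # hypothetical helper; adlfs must be installed when this is actually called
    from fsspec import get_filesystem_class

    yield get_filesystem_class("az")(**storage_options)


# usage (requires adlfs and a reachable storage account):
# with get_azure_client(account_name="myaccount", anon=True) as fs:
#     print(fs.ls("my-container"))
```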
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -1,16 +1,16 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import Annotated, Any, Generator, Optional
+from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
 
 from dateutil import parser
 from pydantic import Field, Secret
 from pydantic.functional_validators import BeforeValidator
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -28,6 +28,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
 
+if TYPE_CHECKING:
+    from boxfs import BoxFileSystem
+
 CONNECTOR_TYPE = "box"
 
 
@@ -72,6 +75,12 @@ class BoxConnectionConfig(FsspecConnectionConfig):
 
         return access_kwargs_with_oauth
 
+    @requires_dependencies(["boxfs"], extras="box")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class BoxIndexer(FsspecIndexer):
@@ -79,14 +88,6 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
     def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
         path = file_data["name"]
         date_created = None
@@ -126,14 +127,6 @@ class BoxDownloader(FsspecDownloader):
     connector_type: str = CONNECTOR_TYPE
     download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class BoxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -145,22 +138,6 @@ class BoxUploader(FsspecUploader):
     connection_config: BoxConnectionConfig
     upload_config: BoxUploaderConfig = field(default=None)
 
-    @requires_dependencies(["boxfs"], extras="box")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["boxfs"], extras="box")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["boxfs"], extras="box")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 box_source_entry = SourceRegistryEntry(
     indexer=BoxIndexer,
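As with Azure, the Box connector drops the repeated requires_dependencies wrappers on every indexer, downloader, and uploader method; the check now happens once, on the connection config's get_client. A generic sketch of what such a guard decorator typically does (this is not the library's requires_dependencies implementation):

```python
import functools
import importlib


def requires_deps(deps, extras):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # fail fast with an actionable message if an optional dependency is missing
            for dep in deps:
                try:
                    importlib.import_module(dep)
                except ImportError as e:
                    raise ImportError(f"{dep} is required; install the '{extras}' extra") from e
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@requires_deps(["boxfs"], extras="box")
def make_box_client(**storage_options):
    # hypothetical helper: resolves the fsspec implementation only after the check passes
    from fsspec import get_filesystem_class

    return get_filesystem_class("box")(**storage_options)
```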
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -1,14 +1,14 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from pathlib import Path
 from time import time
-from typing import
+from typing import TYPE_CHECKING, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import
+from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -24,11 +24,16 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 
+if TYPE_CHECKING:
+    from dropboxdrivefs import DropboxDriveFileSystem
+
 CONNECTOR_TYPE = "dropbox"
 
 
 class DropboxIndexerConfig(FsspecIndexerConfig):
-    pass
+    def model_post_init(self, __context):
+        if not self.path_without_protocol.startswith("/"):
+            self.path_without_protocol = "/" + self.path_without_protocol
 
 
 class DropboxAccessConfig(FsspecAccessConfig):
@@ -42,6 +47,12 @@ class DropboxConnectionConfig(FsspecConnectionConfig):
     )
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["DropboxDriveFileSystem", None, None]:
+        with super().get_client(protocol=protocol) as client:
+            yield client
+
 
 @dataclass
 class DropboxIndexer(FsspecIndexer):
@@ -83,20 +94,6 @@ class DropboxIndexer(FsspecIndexer):
             filesize_bytes=file_size,
         )
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def __post_init__(self):
-        # dropbox expects the path to start with a /
-        if not self.index_config.path_without_protocol.startswith("/"):
-            self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        return super().run(**kwargs)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
@@ -111,14 +108,6 @@ class DropboxDownloader(FsspecDownloader):
         default_factory=DropboxDownloaderConfig
     )
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return super().run(file_data=file_data, **kwargs)
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
-        return await super().run_async(file_data=file_data, **kwargs)
-
 
 class DropboxUploaderConfig(FsspecUploaderConfig):
     pass
@@ -130,22 +119,6 @@ class DropboxUploader(FsspecUploader):
     connection_config: DropboxConnectionConfig
     upload_config: DropboxUploaderConfig = field(default=None)
 
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def __post_init__(self):
-        super().__post_init__()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def precheck(self) -> None:
-        super().precheck()
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return super().run(path=path, file_data=file_data, **kwargs)
-
-    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
-    async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        return await super().run_async(path=path, file_data=file_data, **kwargs)
-
 
 dropbox_source_entry = SourceRegistryEntry(
     indexer=DropboxIndexer,
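The Dropbox-specific leading-slash fix moves out of the indexer's __post_init__ and into the config itself via pydantic's model_post_init, so the path is normalized as soon as the config is constructed. A minimal runnable sketch of that hook on a standalone model:

```python
from pydantic import BaseModel


class PathConfig(BaseModel):
    path_without_protocol: str

    def model_post_init(self, __context) -> None:
        # dropbox-style paths are expected to start with "/"
        if not self.path_without_protocol.startswith("/"):
            self.path_without_protocol = "/" + self.path_without_protocol


print(PathConfig(path_without_protocol="team-folder/docs").path_without_protocol)
# -> /team-folder/docs
```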
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -4,6 +4,7 @@ import os
 import random
 import shutil
 import tempfile
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
@@ -78,6 +79,15 @@ class FsspecConnectionConfig(ConnectionConfig):
     access_config: Secret[FsspecAccessConfig]
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
+    @contextmanager
+    def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
+        from fsspec import get_filesystem_class
+
+        client = get_filesystem_class(protocol)(
+            **self.get_access_config(),
+        )
+        yield client
+
 
 FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
 FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
@@ -89,14 +99,6 @@ class FsspecIndexer(Indexer):
     index_config: FsspecIndexerConfigT
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
-    @property
-    def fs(self) -> "AbstractFileSystem":
-        from fsspec import get_filesystem_class
-
-        return get_filesystem_class(self.index_config.protocol)(
-            **self.connection_config.get_access_config(),
-        )
-
     def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
@@ -110,7 +112,8 @@
                 return
             file_to_sample = valid_files[0]
             logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
-            self.fs.head(path=file_to_sample)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                client.head(path=file_to_sample)
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
@@ -120,16 +123,18 @@
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
             # because they are seen as 0 byte files
-            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                files = client.ls(self.index_config.path_without_protocol, detail=True)
 
         else:
             # fs.find will recursively walk directories
             # "size" is a common key for all the cloud protocols with fs
-            found = self.fs.find(
-                self.index_config.path_without_protocol,
-                detail=True,
-            )
-            files = found.values()
+            with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
+                found = client.find(
+                    self.index_config.path_without_protocol,
+                    detail=True,
+                )
+                files = found.values()
         filtered_files = [
             file for file in files if file.get("size") > 0 and file.get("type") == "file"
         ]
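The new FsspecConnectionConfig.get_client is the piece every other change hangs off: it builds the filesystem from the protocol plus the access config, and indexing now calls ls/find through it instead of a cached fs property. A runnable sketch of the same pattern against fsspec's in-memory filesystem, so no cloud credentials are needed; the helper name here is illustrative, not the connector's API:

```python
from contextlib import contextmanager
from typing import Generator

from fsspec import get_filesystem_class
from fsspec.spec import AbstractFileSystem


@contextmanager
def get_client(protocol: str, **access_config) -> Generator[AbstractFileSystem, None, None]:
    # mirror of the pattern above: resolve the filesystem class from the protocol string
    yield get_filesystem_class(protocol)(**access_config)


with get_client("memory") as client:
    client.pipe_file("/bucket/a.txt", b"hello")
    client.pipe_file("/bucket/sub/b.txt", b"world")

with get_client("memory") as client:
    # ls lists a single level; find walks the tree and returns {path: info}
    print([f["name"] for f in client.ls("/bucket", detail=True)])
    found = client.find("/bucket", detail=True)
    files = [f for f in found.values() if f.get("size", 0) > 0 and f.get("type") == "file"]
    print([f["name"] for f in files])
```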
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py (continued)

@@ -200,15 +205,8 @@ class FsspecDownloader(Downloader):
     )
 
     def is_async(self) -> bool:
-        return self.fs.async_impl
-
-    @property
-    def fs(self) -> "AbstractFileSystem":
-        from fsspec import get_filesystem_class
-
-        return get_filesystem_class(self.protocol)(
-            **self.connection_config.get_access_config(),
-        )
+        with self.connection_config.get_client(protocol=self.protocol) as client:
+            return client.async_impl
 
     def handle_directory_download(self, lpath: Path) -> None:
         # If the object's name contains certain characters (i.e. '?'), it
@@ -237,7 +235,8 @@
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
-            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
+            with self.connection_config.get_client(protocol=self.protocol) as client:
+                client.get(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
@@ -249,7 +248,8 @@
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
             rpath = file_data.additional_metadata["original_file_path"]
-
+            with self.connection_config.get_client(protocol=self.protocol) as client:
+                await client.get(rpath=rpath, lpath=download_path.as_posix())
             self.handle_directory_download(lpath=download_path)
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
@@ -268,9 +268,11 @@ FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig)
 class FsspecUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: FsspecUploaderConfigT = field(default=None)
+    connection_config: FsspecConnectionConfigT
 
     def is_async(self) -> bool:
-        return self.fs.async_impl
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            return client.async_impl
 
     @property
     def fs(self) -> "AbstractFileSystem":
@@ -314,11 +316,13 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            client.upload(lpath=path_str, rpath=upload_path.as_posix())
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         upload_path = self.get_upload_path(file_data=file_data)
         path_str = str(path.resolve())
         # Odd that fsspec doesn't run exists() as async even when client support async
         logger.debug(f"writing local file {path_str} to {upload_path}")
-        self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
+        with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
+            client.upload(lpath=path_str, rpath=upload_path.as_posix())