unstructured-ingest 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/chunkers/test_chunkers.py +0 -11
- test/integration/connectors/conftest.py +11 -1
- test/integration/connectors/databricks_tests/test_volumes_native.py +4 -3
- test/integration/connectors/duckdb/conftest.py +14 -0
- test/integration/connectors/duckdb/test_duckdb.py +51 -44
- test/integration/connectors/duckdb/test_motherduck.py +37 -48
- test/integration/connectors/elasticsearch/test_elasticsearch.py +26 -4
- test/integration/connectors/elasticsearch/test_opensearch.py +26 -3
- test/integration/connectors/sql/test_postgres.py +103 -92
- test/integration/connectors/sql/test_singlestore.py +112 -100
- test/integration/connectors/sql/test_snowflake.py +142 -117
- test/integration/connectors/sql/test_sqlite.py +87 -76
- test/integration/connectors/test_astradb.py +62 -1
- test/integration/connectors/test_azure_ai_search.py +25 -3
- test/integration/connectors/test_chroma.py +120 -0
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/test_delta_table.py +1 -0
- test/integration/connectors/test_kafka.py +6 -6
- test/integration/connectors/test_milvus.py +21 -0
- test/integration/connectors/test_mongodb.py +7 -4
- test/integration/connectors/test_neo4j.py +236 -0
- test/integration/connectors/test_pinecone.py +25 -1
- test/integration/connectors/test_qdrant.py +25 -2
- test/integration/connectors/test_s3.py +9 -6
- test/integration/connectors/utils/docker.py +6 -0
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +88 -0
- test/integration/connectors/utils/validation/equality.py +75 -0
- test/integration/connectors/utils/{validation.py → validation/source.py} +42 -98
- test/integration/connectors/utils/validation/utils.py +36 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/chunking.py +11 -0
- unstructured_ingest/utils/data_prep.py +36 -0
- unstructured_ingest/v2/interfaces/__init__.py +3 -1
- unstructured_ingest/v2/interfaces/file_data.py +58 -14
- unstructured_ingest/v2/interfaces/upload_stager.py +70 -6
- unstructured_ingest/v2/interfaces/uploader.py +11 -2
- unstructured_ingest/v2/pipeline/steps/chunk.py +2 -1
- unstructured_ingest/v2/pipeline/steps/download.py +5 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +2 -1
- unstructured_ingest/v2/pipeline/steps/filter.py +2 -2
- unstructured_ingest/v2/pipeline/steps/index.py +4 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +5 -3
- unstructured_ingest/v2/pipeline/steps/uncompress.py +2 -2
- unstructured_ingest/v2/pipeline/steps/upload.py +3 -3
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/astradb.py +43 -63
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +16 -40
- unstructured_ingest/v2/processes/connectors/chroma.py +36 -59
- unstructured_ingest/v2/processes/connectors/couchbase.py +92 -93
- unstructured_ingest/v2/processes/connectors/delta_table.py +11 -33
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +26 -26
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +29 -20
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +37 -44
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +46 -75
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +12 -35
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +15 -42
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +33 -29
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +12 -34
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -37
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -33
- unstructured_ingest/v2/processes/connectors/gitlab.py +32 -31
- unstructured_ingest/v2/processes/connectors/google_drive.py +32 -29
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +2 -4
- unstructured_ingest/v2/processes/connectors/kdbai.py +44 -70
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -10
- unstructured_ingest/v2/processes/connectors/local.py +13 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +16 -57
- unstructured_ingest/v2/processes/connectors/mongodb.py +99 -108
- unstructured_ingest/v2/processes/connectors/neo4j.py +383 -0
- unstructured_ingest/v2/processes/connectors/onedrive.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -33
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +32 -41
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/sql.py +72 -66
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -5
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +9 -31
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/METADATA +20 -15
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/RECORD +87 -79
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.8.dist-info → unstructured_ingest-0.3.10.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from dataclasses import dataclass, field
|
|
4
|
-
from pathlib import Path
|
|
5
5
|
from time import time
|
|
6
|
-
from typing import Annotated, Any, Generator, Optional
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, Any, Generator, Optional
|
|
7
7
|
|
|
8
8
|
from dateutil import parser
|
|
9
9
|
from pydantic import Field, Secret
|
|
10
10
|
from pydantic.functional_validators import BeforeValidator
|
|
11
11
|
|
|
12
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
-
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
|
|
13
|
+
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
14
14
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
15
15
|
DestinationRegistryEntry,
|
|
16
16
|
SourceRegistryEntry,
|
|
@@ -28,6 +28,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
28
28
|
)
|
|
29
29
|
from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
|
|
30
30
|
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from boxfs import BoxFileSystem
|
|
33
|
+
|
|
31
34
|
CONNECTOR_TYPE = "box"
|
|
32
35
|
|
|
33
36
|
|
|
@@ -72,6 +75,12 @@ class BoxConnectionConfig(FsspecConnectionConfig):
|
|
|
72
75
|
|
|
73
76
|
return access_kwargs_with_oauth
|
|
74
77
|
|
|
78
|
+
@requires_dependencies(["boxfs"], extras="box")
|
|
79
|
+
@contextmanager
|
|
80
|
+
def get_client(self, protocol: str) -> Generator["BoxFileSystem", None, None]:
|
|
81
|
+
with super().get_client(protocol=protocol) as client:
|
|
82
|
+
yield client
|
|
83
|
+
|
|
75
84
|
|
|
76
85
|
@dataclass
|
|
77
86
|
class BoxIndexer(FsspecIndexer):
|
|
@@ -79,14 +88,6 @@ class BoxIndexer(FsspecIndexer):
|
|
|
79
88
|
index_config: BoxIndexerConfig
|
|
80
89
|
connector_type: str = CONNECTOR_TYPE
|
|
81
90
|
|
|
82
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
83
|
-
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
84
|
-
return super().run(**kwargs)
|
|
85
|
-
|
|
86
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
87
|
-
def precheck(self) -> None:
|
|
88
|
-
super().precheck()
|
|
89
|
-
|
|
90
91
|
def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
|
|
91
92
|
path = file_data["name"]
|
|
92
93
|
date_created = None
|
|
@@ -126,14 +127,6 @@ class BoxDownloader(FsspecDownloader):
|
|
|
126
127
|
connector_type: str = CONNECTOR_TYPE
|
|
127
128
|
download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig)
|
|
128
129
|
|
|
129
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
130
|
-
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
131
|
-
return super().run(file_data=file_data, **kwargs)
|
|
132
|
-
|
|
133
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
134
|
-
async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
135
|
-
return await super().run_async(file_data=file_data, **kwargs)
|
|
136
|
-
|
|
137
130
|
|
|
138
131
|
class BoxUploaderConfig(FsspecUploaderConfig):
|
|
139
132
|
pass
|
|
@@ -145,22 +138,6 @@ class BoxUploader(FsspecUploader):
|
|
|
145
138
|
connection_config: BoxConnectionConfig
|
|
146
139
|
upload_config: BoxUploaderConfig = field(default=None)
|
|
147
140
|
|
|
148
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
149
|
-
def __post_init__(self):
|
|
150
|
-
super().__post_init__()
|
|
151
|
-
|
|
152
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
153
|
-
def precheck(self) -> None:
|
|
154
|
-
super().precheck()
|
|
155
|
-
|
|
156
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
157
|
-
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
158
|
-
return super().run(path=path, file_data=file_data, **kwargs)
|
|
159
|
-
|
|
160
|
-
@requires_dependencies(["boxfs"], extras="box")
|
|
161
|
-
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
162
|
-
return await super().run_async(path=path, file_data=file_data, **kwargs)
|
|
163
|
-
|
|
164
141
|
|
|
165
142
|
box_source_entry = SourceRegistryEntry(
|
|
166
143
|
indexer=BoxIndexer,
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from dataclasses import dataclass, field
|
|
4
|
-
from pathlib import Path
|
|
5
5
|
from time import time
|
|
6
|
-
from typing import Any, Generator, Optional
|
|
6
|
+
from typing import TYPE_CHECKING, Generator, Optional
|
|
7
7
|
|
|
8
8
|
from pydantic import Field, Secret
|
|
9
9
|
|
|
10
10
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
11
|
-
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
|
|
11
|
+
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
12
12
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
13
13
|
DestinationRegistryEntry,
|
|
14
14
|
SourceRegistryEntry,
|
|
@@ -24,11 +24,16 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
24
24
|
FsspecUploaderConfig,
|
|
25
25
|
)
|
|
26
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from dropboxdrivefs import DropboxDriveFileSystem
|
|
29
|
+
|
|
27
30
|
CONNECTOR_TYPE = "dropbox"
|
|
28
31
|
|
|
29
32
|
|
|
30
33
|
class DropboxIndexerConfig(FsspecIndexerConfig):
|
|
31
|
-
|
|
34
|
+
def model_post_init(self, __context):
|
|
35
|
+
if not self.path_without_protocol.startswith("/"):
|
|
36
|
+
self.path_without_protocol = "/" + self.path_without_protocol
|
|
32
37
|
|
|
33
38
|
|
|
34
39
|
class DropboxAccessConfig(FsspecAccessConfig):
|
|
@@ -42,6 +47,12 @@ class DropboxConnectionConfig(FsspecConnectionConfig):
|
|
|
42
47
|
)
|
|
43
48
|
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
44
49
|
|
|
50
|
+
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
51
|
+
@contextmanager
|
|
52
|
+
def get_client(self, protocol: str) -> Generator["DropboxDriveFileSystem", None, None]:
|
|
53
|
+
with super().get_client(protocol=protocol) as client:
|
|
54
|
+
yield client
|
|
55
|
+
|
|
45
56
|
|
|
46
57
|
@dataclass
|
|
47
58
|
class DropboxIndexer(FsspecIndexer):
|
|
@@ -83,20 +94,6 @@ class DropboxIndexer(FsspecIndexer):
|
|
|
83
94
|
filesize_bytes=file_size,
|
|
84
95
|
)
|
|
85
96
|
|
|
86
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
87
|
-
def __post_init__(self):
|
|
88
|
-
# dropbox expects the path to start with a /
|
|
89
|
-
if not self.index_config.path_without_protocol.startswith("/"):
|
|
90
|
-
self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
|
|
91
|
-
|
|
92
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
93
|
-
def precheck(self) -> None:
|
|
94
|
-
super().precheck()
|
|
95
|
-
|
|
96
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
97
|
-
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
98
|
-
return super().run(**kwargs)
|
|
99
|
-
|
|
100
97
|
|
|
101
98
|
class DropboxDownloaderConfig(FsspecDownloaderConfig):
|
|
102
99
|
pass
|
|
@@ -111,14 +108,6 @@ class DropboxDownloader(FsspecDownloader):
|
|
|
111
108
|
default_factory=DropboxDownloaderConfig
|
|
112
109
|
)
|
|
113
110
|
|
|
114
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
115
|
-
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
116
|
-
return super().run(file_data=file_data, **kwargs)
|
|
117
|
-
|
|
118
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
119
|
-
async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
120
|
-
return await super().run_async(file_data=file_data, **kwargs)
|
|
121
|
-
|
|
122
111
|
|
|
123
112
|
class DropboxUploaderConfig(FsspecUploaderConfig):
|
|
124
113
|
pass
|
|
@@ -130,22 +119,6 @@ class DropboxUploader(FsspecUploader):
|
|
|
130
119
|
connection_config: DropboxConnectionConfig
|
|
131
120
|
upload_config: DropboxUploaderConfig = field(default=None)
|
|
132
121
|
|
|
133
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
134
|
-
def __post_init__(self):
|
|
135
|
-
super().__post_init__()
|
|
136
|
-
|
|
137
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
138
|
-
def precheck(self) -> None:
|
|
139
|
-
super().precheck()
|
|
140
|
-
|
|
141
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
142
|
-
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
143
|
-
return super().run(path=path, file_data=file_data, **kwargs)
|
|
144
|
-
|
|
145
|
-
@requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
|
|
146
|
-
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
147
|
-
return await super().run_async(path=path, file_data=file_data, **kwargs)
|
|
148
|
-
|
|
149
122
|
|
|
150
123
|
dropbox_source_entry = SourceRegistryEntry(
|
|
151
124
|
indexer=DropboxIndexer,
|
|
@@ -4,6 +4,7 @@ import os
|
|
|
4
4
|
import random
|
|
5
5
|
import shutil
|
|
6
6
|
import tempfile
|
|
7
|
+
from contextlib import contextmanager
|
|
7
8
|
from dataclasses import dataclass, field
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
|
|
@@ -78,6 +79,15 @@ class FsspecConnectionConfig(ConnectionConfig):
|
|
|
78
79
|
access_config: Secret[FsspecAccessConfig]
|
|
79
80
|
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
80
81
|
|
|
82
|
+
@contextmanager
|
|
83
|
+
def get_client(self, protocol: str) -> Generator["AbstractFileSystem", None, None]:
|
|
84
|
+
from fsspec import get_filesystem_class
|
|
85
|
+
|
|
86
|
+
client = get_filesystem_class(protocol)(
|
|
87
|
+
**self.get_access_config(),
|
|
88
|
+
)
|
|
89
|
+
yield client
|
|
90
|
+
|
|
81
91
|
|
|
82
92
|
FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig)
|
|
83
93
|
FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig)
|
|
@@ -89,14 +99,6 @@ class FsspecIndexer(Indexer):
|
|
|
89
99
|
index_config: FsspecIndexerConfigT
|
|
90
100
|
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
91
101
|
|
|
92
|
-
@property
|
|
93
|
-
def fs(self) -> "AbstractFileSystem":
|
|
94
|
-
from fsspec import get_filesystem_class
|
|
95
|
-
|
|
96
|
-
return get_filesystem_class(self.index_config.protocol)(
|
|
97
|
-
**self.connection_config.get_access_config(),
|
|
98
|
-
)
|
|
99
|
-
|
|
100
102
|
def precheck(self) -> None:
|
|
101
103
|
from fsspec import get_filesystem_class
|
|
102
104
|
|
|
@@ -110,7 +112,8 @@ class FsspecIndexer(Indexer):
|
|
|
110
112
|
return
|
|
111
113
|
file_to_sample = valid_files[0]
|
|
112
114
|
logger.debug(f"attempting to make HEAD request for file: {file_to_sample}")
|
|
113
|
-
self.fs.head(path=file_to_sample)
|
|
115
|
+
with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
|
|
116
|
+
client.head(path=file_to_sample)
|
|
114
117
|
except Exception as e:
|
|
115
118
|
logger.error(f"failed to validate connection: {e}", exc_info=True)
|
|
116
119
|
raise SourceConnectionError(f"failed to validate connection: {e}")
|
|
@@ -120,16 +123,18 @@ class FsspecIndexer(Indexer):
|
|
|
120
123
|
# fs.ls does not walk directories
|
|
121
124
|
# directories that are listed in cloud storage can cause problems
|
|
122
125
|
# because they are seen as 0 byte files
|
|
123
|
-
|
|
126
|
+
with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
|
|
127
|
+
files = client.ls(self.index_config.path_without_protocol, detail=True)
|
|
124
128
|
|
|
125
129
|
else:
|
|
126
130
|
# fs.find will recursively walk directories
|
|
127
131
|
# "size" is a common key for all the cloud protocols with fs
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
132
|
+
with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
|
|
133
|
+
found = client.find(
|
|
134
|
+
self.index_config.path_without_protocol,
|
|
135
|
+
detail=True,
|
|
136
|
+
)
|
|
137
|
+
files = found.values()
|
|
133
138
|
filtered_files = [
|
|
134
139
|
file for file in files if file.get("size") > 0 and file.get("type") == "file"
|
|
135
140
|
]
|
|
@@ -200,15 +205,8 @@ class FsspecDownloader(Downloader):
|
|
|
200
205
|
)
|
|
201
206
|
|
|
202
207
|
def is_async(self) -> bool:
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
@property
|
|
206
|
-
def fs(self) -> "AbstractFileSystem":
|
|
207
|
-
from fsspec import get_filesystem_class
|
|
208
|
-
|
|
209
|
-
return get_filesystem_class(self.protocol)(
|
|
210
|
-
**self.connection_config.get_access_config(),
|
|
211
|
-
)
|
|
208
|
+
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
209
|
+
return client.async_impl
|
|
212
210
|
|
|
213
211
|
def handle_directory_download(self, lpath: Path) -> None:
|
|
214
212
|
# If the object's name contains certain characters (i.e. '?'), it
|
|
@@ -237,7 +235,8 @@ class FsspecDownloader(Downloader):
|
|
|
237
235
|
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
238
236
|
try:
|
|
239
237
|
rpath = file_data.additional_metadata["original_file_path"]
|
|
240
|
-
self.fs.get(rpath=rpath, lpath=download_path.as_posix())
|
|
238
|
+
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
239
|
+
client.get(rpath=rpath, lpath=download_path.as_posix())
|
|
241
240
|
self.handle_directory_download(lpath=download_path)
|
|
242
241
|
except Exception as e:
|
|
243
242
|
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
|
|
@@ -249,7 +248,8 @@ class FsspecDownloader(Downloader):
|
|
|
249
248
|
download_path.parent.mkdir(parents=True, exist_ok=True)
|
|
250
249
|
try:
|
|
251
250
|
rpath = file_data.additional_metadata["original_file_path"]
|
|
252
|
-
|
|
251
|
+
with self.connection_config.get_client(protocol=self.protocol) as client:
|
|
252
|
+
await client.get(rpath=rpath, lpath=download_path.as_posix())
|
|
253
253
|
self.handle_directory_download(lpath=download_path)
|
|
254
254
|
except Exception as e:
|
|
255
255
|
logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
|
|
@@ -268,9 +268,11 @@ FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderCon
|
|
|
268
268
|
class FsspecUploader(Uploader):
|
|
269
269
|
connector_type: str = CONNECTOR_TYPE
|
|
270
270
|
upload_config: FsspecUploaderConfigT = field(default=None)
|
|
271
|
+
connection_config: FsspecConnectionConfigT
|
|
271
272
|
|
|
272
273
|
def is_async(self) -> bool:
|
|
273
|
-
|
|
274
|
+
with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
|
|
275
|
+
return client.async_impl
|
|
274
276
|
|
|
275
277
|
@property
|
|
276
278
|
def fs(self) -> "AbstractFileSystem":
|
|
@@ -314,11 +316,13 @@ class FsspecUploader(Uploader):
|
|
|
314
316
|
path_str = str(path.resolve())
|
|
315
317
|
upload_path = self.get_upload_path(file_data=file_data)
|
|
316
318
|
logger.debug(f"writing local file {path_str} to {upload_path}")
|
|
317
|
-
self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
|
|
319
|
+
with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
|
|
320
|
+
client.upload(lpath=path_str, rpath=upload_path.as_posix())
|
|
318
321
|
|
|
319
322
|
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
320
323
|
upload_path = self.get_upload_path(file_data=file_data)
|
|
321
324
|
path_str = str(path.resolve())
|
|
322
325
|
# Odd that fsspec doesn't run exists() as async even when client support async
|
|
323
326
|
logger.debug(f"writing local file {path_str} to {upload_path}")
|
|
324
|
-
self.fs.upload(lpath=path_str, rpath=upload_path.as_posix())
|
|
327
|
+
with self.connection_config.get_client(protocol=self.upload_config.protocol) as client:
|
|
328
|
+
client.upload(lpath=path_str, rpath=upload_path.as_posix())
|
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from contextlib import contextmanager
|
|
3
4
|
from dataclasses import dataclass, field
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from time import time
|
|
6
|
-
from typing import Any, Generator, Optional, Union
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional, Union
|
|
7
8
|
|
|
8
9
|
from dateutil import parser
|
|
9
10
|
from pydantic import Field, Secret
|
|
10
11
|
|
|
11
12
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
12
13
|
from unstructured_ingest.utils.string_and_date_utils import json_to_dict
|
|
13
|
-
from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
|
|
14
|
+
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
|
|
14
15
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
15
16
|
DestinationRegistryEntry,
|
|
16
17
|
SourceRegistryEntry,
|
|
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
27
|
FsspecUploaderConfig,
|
|
27
28
|
)
|
|
28
29
|
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from gcsfs import GCSFileSystem
|
|
32
|
+
|
|
29
33
|
CONNECTOR_TYPE = "gcs"
|
|
30
34
|
|
|
31
35
|
|
|
@@ -93,6 +97,12 @@ class GcsConnectionConfig(FsspecConnectionConfig):
|
|
|
93
97
|
access_config: Secret[GcsAccessConfig] = Field(default=GcsAccessConfig(), validate_default=True)
|
|
94
98
|
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
95
99
|
|
|
100
|
+
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
101
|
+
@contextmanager
|
|
102
|
+
def get_client(self, protocol: str) -> Generator["GCSFileSystem", None, None]:
|
|
103
|
+
with super().get_client(protocol=protocol) as client:
|
|
104
|
+
yield client
|
|
105
|
+
|
|
96
106
|
|
|
97
107
|
@dataclass
|
|
98
108
|
class GcsIndexer(FsspecIndexer):
|
|
@@ -100,14 +110,6 @@ class GcsIndexer(FsspecIndexer):
|
|
|
100
110
|
index_config: GcsIndexerConfig
|
|
101
111
|
connector_type: str = CONNECTOR_TYPE
|
|
102
112
|
|
|
103
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
104
|
-
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
105
|
-
return super().run(**kwargs)
|
|
106
|
-
|
|
107
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
108
|
-
def precheck(self) -> None:
|
|
109
|
-
super().precheck()
|
|
110
|
-
|
|
111
113
|
def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
|
|
112
114
|
path = file_data["name"]
|
|
113
115
|
date_created = None
|
|
@@ -147,14 +149,6 @@ class GcsDownloader(FsspecDownloader):
|
|
|
147
149
|
connector_type: str = CONNECTOR_TYPE
|
|
148
150
|
download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig)
|
|
149
151
|
|
|
150
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
151
|
-
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
152
|
-
return super().run(file_data=file_data, **kwargs)
|
|
153
|
-
|
|
154
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
155
|
-
async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
156
|
-
return await super().run_async(file_data=file_data, **kwargs)
|
|
157
|
-
|
|
158
152
|
|
|
159
153
|
class GcsUploaderConfig(FsspecUploaderConfig):
|
|
160
154
|
pass
|
|
@@ -166,22 +160,6 @@ class GcsUploader(FsspecUploader):
|
|
|
166
160
|
connection_config: GcsConnectionConfig
|
|
167
161
|
upload_config: GcsUploaderConfig = field(default=None)
|
|
168
162
|
|
|
169
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
170
|
-
def __post_init__(self):
|
|
171
|
-
super().__post_init__()
|
|
172
|
-
|
|
173
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
174
|
-
def precheck(self) -> None:
|
|
175
|
-
super().precheck()
|
|
176
|
-
|
|
177
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
178
|
-
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
179
|
-
return super().run(path=path, file_data=file_data, **kwargs)
|
|
180
|
-
|
|
181
|
-
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
|
|
182
|
-
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
183
|
-
return await super().run_async(path=path, file_data=file_data, **kwargs)
|
|
184
|
-
|
|
185
163
|
|
|
186
164
|
gcs_source_entry = SourceRegistryEntry(
|
|
187
165
|
indexer=GcsIndexer,
|
|
@@ -1,15 +1,13 @@
|
|
|
1
1
|
import contextlib
|
|
2
|
+
from contextlib import contextmanager
|
|
2
3
|
from dataclasses import dataclass, field
|
|
3
|
-
from pathlib import Path
|
|
4
4
|
from time import time
|
|
5
|
-
from typing import Any, Generator, Optional
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
6
6
|
|
|
7
7
|
from pydantic import Field, Secret
|
|
8
8
|
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
10
|
from unstructured_ingest.v2.interfaces import (
|
|
11
|
-
DownloadResponse,
|
|
12
|
-
FileData,
|
|
13
11
|
FileDataSourceMetadata,
|
|
14
12
|
)
|
|
15
13
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
@@ -29,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
29
27
|
|
|
30
28
|
CONNECTOR_TYPE = "s3"
|
|
31
29
|
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from s3fs import S3FileSystem
|
|
32
|
+
|
|
32
33
|
|
|
33
34
|
class S3IndexerConfig(FsspecIndexerConfig):
|
|
34
35
|
pass
|
|
@@ -72,6 +73,12 @@ class S3ConnectionConfig(FsspecConnectionConfig):
|
|
|
72
73
|
)
|
|
73
74
|
return access_configs
|
|
74
75
|
|
|
76
|
+
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
77
|
+
@contextmanager
|
|
78
|
+
def get_client(self, protocol: str) -> Generator["S3FileSystem", None, None]:
|
|
79
|
+
with super().get_client(protocol=protocol) as client:
|
|
80
|
+
yield client
|
|
81
|
+
|
|
75
82
|
|
|
76
83
|
@dataclass
|
|
77
84
|
class S3Indexer(FsspecIndexer):
|
|
@@ -97,7 +104,8 @@ class S3Indexer(FsspecIndexer):
|
|
|
97
104
|
version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
|
|
98
105
|
metadata: dict[str, str] = {}
|
|
99
106
|
with contextlib.suppress(AttributeError):
|
|
100
|
-
|
|
107
|
+
with self.connection_config.get_client(protocol=self.index_config.protocol) as client:
|
|
108
|
+
metadata = client.metadata(path=path)
|
|
101
109
|
record_locator = {
|
|
102
110
|
"protocol": self.index_config.protocol,
|
|
103
111
|
"remote_file_path": self.index_config.remote_url,
|
|
@@ -114,14 +122,6 @@ class S3Indexer(FsspecIndexer):
|
|
|
114
122
|
filesize_bytes=file_size,
|
|
115
123
|
)
|
|
116
124
|
|
|
117
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
118
|
-
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
119
|
-
return super().run(**kwargs)
|
|
120
|
-
|
|
121
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
122
|
-
def precheck(self) -> None:
|
|
123
|
-
super().precheck()
|
|
124
|
-
|
|
125
125
|
|
|
126
126
|
class S3DownloaderConfig(FsspecDownloaderConfig):
|
|
127
127
|
pass
|
|
@@ -134,14 +134,6 @@ class S3Downloader(FsspecDownloader):
|
|
|
134
134
|
connector_type: str = CONNECTOR_TYPE
|
|
135
135
|
download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig)
|
|
136
136
|
|
|
137
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
138
|
-
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
139
|
-
return super().run(file_data=file_data, **kwargs)
|
|
140
|
-
|
|
141
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
142
|
-
async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
143
|
-
return await super().run_async(file_data=file_data, **kwargs)
|
|
144
|
-
|
|
145
137
|
|
|
146
138
|
class S3UploaderConfig(FsspecUploaderConfig):
|
|
147
139
|
pass
|
|
@@ -153,22 +145,6 @@ class S3Uploader(FsspecUploader):
|
|
|
153
145
|
connection_config: S3ConnectionConfig
|
|
154
146
|
upload_config: S3UploaderConfig = field(default=None)
|
|
155
147
|
|
|
156
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
157
|
-
def precheck(self) -> None:
|
|
158
|
-
super().precheck()
|
|
159
|
-
|
|
160
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
161
|
-
def __post_init__(self):
|
|
162
|
-
super().__post_init__()
|
|
163
|
-
|
|
164
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
165
|
-
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
166
|
-
return super().run(path=path, file_data=file_data, **kwargs)
|
|
167
|
-
|
|
168
|
-
@requires_dependencies(["s3fs", "fsspec"], extras="s3")
|
|
169
|
-
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
170
|
-
return await super().run_async(path=path, file_data=file_data, **kwargs)
|
|
171
|
-
|
|
172
148
|
|
|
173
149
|
s3_source_entry = SourceRegistryEntry(
|
|
174
150
|
indexer=S3Indexer,
|
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import os
|
|
4
|
+
from contextlib import contextmanager
|
|
4
5
|
from dataclasses import dataclass, field
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from time import time
|
|
7
|
-
from typing import Any, Generator, Optional
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Generator, Optional
|
|
8
9
|
from urllib.parse import urlparse
|
|
9
10
|
|
|
10
11
|
from pydantic import Field, Secret
|
|
11
12
|
|
|
12
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
13
|
-
from unstructured_ingest.v2.interfaces import
|
|
14
|
+
from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata
|
|
14
15
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
15
16
|
DestinationRegistryEntry,
|
|
16
17
|
SourceRegistryEntry,
|
|
@@ -26,6 +27,9 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
|
|
|
26
27
|
FsspecUploaderConfig,
|
|
27
28
|
)
|
|
28
29
|
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from fsspec.implementations.sftp import SFTPFileSystem
|
|
32
|
+
|
|
29
33
|
CONNECTOR_TYPE = "sftp"
|
|
30
34
|
|
|
31
35
|
|
|
@@ -67,6 +71,19 @@ class SftpConnectionConfig(FsspecConnectionConfig):
|
|
|
67
71
|
}
|
|
68
72
|
return access_config
|
|
69
73
|
|
|
74
|
+
@contextmanager
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
def get_client(self, protocol: str) -> Generator["SFTPFileSystem", None, None]:
    """Yield a filesystem client for ``protocol`` and close its SSH client on exit.

    The filesystem is built from ``self.get_access_config()``. The
    paramiko.SSHClient() opened by the SFTPFileSystem never gets closed on
    its own, so this context manager closes it explicitly.

    Args:
        protocol: Name passed to ``fsspec.get_filesystem_class`` to look up
            the filesystem implementation.

    Yields:
        The instantiated filesystem client.
    """
    from fsspec import get_filesystem_class

    client: SFTPFileSystem = get_filesystem_class(protocol)(
        **self.get_access_config(),
    )
    try:
        yield client
    finally:
        # Run the close even when the caller's ``with`` body raises; without
        # the ``finally`` the exception would skip it and leak the connection.
        client.client.close()
|
|
86
|
+
|
|
70
87
|
|
|
71
88
|
@dataclass
|
|
72
89
|
class SftpIndexer(FsspecIndexer):
|
|
@@ -74,13 +91,11 @@ class SftpIndexer(FsspecIndexer):
|
|
|
74
91
|
index_config: SftpIndexerConfig
|
|
75
92
|
connector_type: str = CONNECTOR_TYPE
|
|
76
93
|
|
|
77
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
78
94
|
def __post_init__(self):
    """Override the connection host/port with those found in the index remote URL.

    Whichever of hostname or port the URL does not carry keeps its currently
    configured value.
    """
    conn = self.connection_config
    url_parts = urlparse(self.index_config.remote_url)

    # Host is resolved and stored before the port is looked at, one after the other.
    url_host = url_parts.hostname
    conn.host = url_host if url_host else conn.host

    url_port = url_parts.port
    conn.port = url_port if url_port else conn.port
|
|
82
98
|
|
|
83
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
84
99
|
def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
|
|
85
100
|
for file in super().run(**kwargs):
|
|
86
101
|
new_identifier = (
|
|
@@ -92,10 +107,6 @@ class SftpIndexer(FsspecIndexer):
|
|
|
92
107
|
file.identifier = new_identifier
|
|
93
108
|
yield file
|
|
94
109
|
|
|
95
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
96
|
-
def precheck(self) -> None:
|
|
97
|
-
super().precheck()
|
|
98
|
-
|
|
99
110
|
def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
|
|
100
111
|
path = file_data["name"]
|
|
101
112
|
date_created = str(file_data.get("time").timestamp()) if "time" in file_data else None
|
|
@@ -128,20 +139,11 @@ class SftpDownloader(FsspecDownloader):
|
|
|
128
139
|
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
129
140
|
download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig)
|
|
130
141
|
|
|
131
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
132
142
|
def __post_init__(self):
    """Override the connection host/port with those found in the download remote URL.

    Whichever of hostname or port the URL does not carry keeps its currently
    configured value.
    """
    conn = self.connection_config
    url_parts = urlparse(self.download_config.remote_url)

    # Host is resolved and stored before the port is looked at, one after the other.
    url_host = url_parts.hostname
    conn.host = url_host if url_host else conn.host

    url_port = url_parts.port
    conn.port = url_port if url_port else conn.port
|
|
136
146
|
|
|
137
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
138
|
-
def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
139
|
-
return super().run(file_data=file_data, **kwargs)
|
|
140
|
-
|
|
141
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
142
|
-
async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
|
|
143
|
-
return await super().run_async(file_data=file_data, **kwargs)
|
|
144
|
-
|
|
145
147
|
|
|
146
148
|
class SftpUploaderConfig(FsspecUploaderConfig):
|
|
147
149
|
pass
|
|
@@ -153,22 +155,6 @@ class SftpUploader(FsspecUploader):
|
|
|
153
155
|
connection_config: SftpConnectionConfig
|
|
154
156
|
upload_config: SftpUploaderConfig = field(default=None)
|
|
155
157
|
|
|
156
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
157
|
-
def __post_init__(self):
|
|
158
|
-
super().__post_init__()
|
|
159
|
-
|
|
160
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
161
|
-
def precheck(self) -> None:
|
|
162
|
-
super().precheck()
|
|
163
|
-
|
|
164
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
165
|
-
def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
166
|
-
return super().run(path=path, file_data=file_data, **kwargs)
|
|
167
|
-
|
|
168
|
-
@requires_dependencies(["paramiko", "fsspec"], extras="sftp")
|
|
169
|
-
async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
|
|
170
|
-
return await super().run_async(path=path, file_data=file_data, **kwargs)
|
|
171
|
-
|
|
172
158
|
|
|
173
159
|
sftp_source_entry = SourceRegistryEntry(
|
|
174
160
|
indexer=SftpIndexer,
|