unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +30 -28
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +8 -0
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +21 -5
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +8 -0
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +33 -5
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- unstructured_ingest-0.0.3.dist-info/METADATA +175 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/RECORD +43 -40
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:
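
The recurring change across these connector hunks is a new precheck() hook that validates connectivity up front and surfaces any failure as a SourceConnectionError or DestinationConnectionError. A minimal sketch of the pattern, assuming a hypothetical FakeClient and MyUploader (neither is part of unstructured-ingest):

# Illustrative sketch of the precheck() pattern added across connectors.
# FakeClient / MyUploader are hypothetical stand-ins, not the package's API.
import logging

logger = logging.getLogger(__name__)


class DestinationConnectionError(Exception):
    """Stand-in for unstructured_ingest.error.DestinationConnectionError."""


class FakeClient:
    def ping(self) -> bool:
        return True  # pretend the remote service answered


class MyUploader:
    def __init__(self, client: FakeClient):
        self.client = client

    def precheck(self) -> None:
        # Mirror of the pattern in the hunks above: try a cheap call,
        # log the failure, then re-raise as a connection error.
        try:
            assert self.client.ping()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")


if __name__ == "__main__":
    MyUploader(FakeClient()).precheck()  # raises early if the destination is unreachable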

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def
-
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
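
These hunks also swap unstructured's DataSourceMetadata for a FileDataSourceMetadata class from unstructured_ingest.v2.interfaces. A rough sketch of such a container, using only the keyword arguments visible in this diff (url, version, date_created, date_modified, date_processed, record_locator, permissions_data, filesize_bytes); the real class may define more fields, and the URL below is a placeholder:

# Rough sketch of the metadata container the connectors switch to; field names are
# taken only from the keyword arguments that appear in the hunks above.
from dataclasses import dataclass
from time import time
from typing import Any, Optional


@dataclass
class FileDataSourceMetadataSketch:
    url: Optional[str] = None
    version: Optional[str] = None
    date_created: Optional[str] = None
    date_modified: Optional[str] = None
    date_processed: Optional[str] = None
    record_locator: Optional[dict[str, Any]] = None
    permissions_data: Optional[Any] = None
    filesize_bytes: Optional[int] = None


# Mirrors the ElasticsearchIndexer call above (placeholder host and index name).
meta = FileDataSourceMetadataSketch(
    url="http://localhost:9200/my-index",
    date_processed=str(time()),
)
print(meta)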

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -75,6 +75,10 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    def precheck(self) -> None:
+        super().precheck()
+
     def sterilize_info(self, path) -> dict:
         info = self.fs.info(path=path)
         return sterilize_dict(data=info, default=azure_json_serial)
@@ -120,6 +124,10 @@ class AzureUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -70,6 +70,10 @@ class BoxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["boxfs"], extras="box")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class BoxDownloaderConfig(FsspecDownloaderConfig):
@@ -107,6 +111,10 @@ class BoxUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["boxfs"], extras="box")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["boxfs"], extras="box")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -57,6 +57,10 @@ class DropboxIndexer(FsspecIndexer):
         if not self.index_config.path_without_protocol.startswith("/"):
             self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
@@ -106,6 +110,10 @@ class DropboxUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@@ -9,10 +8,12 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -20,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -74,7 +76,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -109,17 +110,7 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -157,10 +148,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -180,6 +171,8 @@ class FsspecIndexer(Indexer):
                 date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -189,15 +182,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -205,8 +202,7 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
@@ -255,13 +251,6 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
@@ -315,6 +304,19 @@ class FsspecUploader(Uploader):
                 f"missing 1 required positional argument: 'upload_config'"
             )
 
+    def precheck(self) -> None:
+        from fsspec import get_filesystem_class
+
+        try:
+            fs = get_filesystem_class(self.upload_config.protocol)(
+                **self.connection_config.get_access_config(),
+            )
+            root_dir = self.upload_config.path_without_protocol.split("/")[0]
+            fs.ls(path=root_dir, detail=False)
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def get_upload_path(self, file_data: FileData) -> Path:
         upload_path = (
             Path(self.upload_config.path_without_protocol)
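
FsspecUploader.precheck() validates the destination by instantiating the fsspec filesystem class for the configured protocol and listing the root directory. A minimal sketch of that check, exercised here against the local "file" protocol so it needs no credentials; the /tmp path is only an example:

# Minimal sketch of the connection check added in FsspecUploader.precheck().
from fsspec import get_filesystem_class


def check_fsspec_connection(protocol: str, root_dir: str, **access_kwargs) -> None:
    fs = get_filesystem_class(protocol)(**access_kwargs)
    # ls() raises if the target bucket/container/directory is unreachable
    fs.ls(path=root_dir, detail=False)


check_fsspec_connection("file", "/tmp")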

unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -80,6 +80,10 @@ class GcsIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class GcsDownloaderConfig(FsspecDownloaderConfig):
@@ -117,6 +121,10 @@ class GcsUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.
-
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,19 +97,24 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class S3DownloaderConfig(FsspecDownloaderConfig):
@@ -136,6 +148,10 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def __post_init__(self):
         super().__post_init__()
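
The indexer hunks also start recording filesize_bytes, fetched via fs.size() with AttributeError suppressed for filesystem implementations that do not provide it. A small sketch of that lookup, using the local filesystem and this script's own path as stand-ins for the S3 case:

# Sketch of the filesize lookup added in the S3/Fsspec indexer hunks.
import contextlib

from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
file_size = None
with contextlib.suppress(AttributeError):
    file_size = fs.size(__file__)  # any existing path works for the demo
print(file_size)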

unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -91,6 +91,10 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class SftpDownloaderConfig(FsspecDownloaderConfig):
@@ -142,6 +146,10 @@ class SftpUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)

unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False

unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-
-
-
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
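
LocalIndexer drops the glob-pattern matching and lists files directly with pathlib, using rglob("*") when recursive is set and glob("*") otherwise. A simplified sketch of that listing logic (the file-input branch is reduced to returning the path itself):

# Sketch of the simplified listing logic in the LocalIndexer hunk above.
from pathlib import Path


def list_files(input_path: Path, recursive: bool) -> list[Path]:
    if input_path.is_file():
        return [input_path]
    # rglob("*") walks the whole tree; glob("*") stays in the top directory
    return list(input_path.rglob("*") if recursive else input_path.glob("*"))


print(list_files(Path("."), recursive=False))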

unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -85,11 +86,15 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
-    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def
-
+    def precheck(self) -> None:
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -123,7 +128,8 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-
+        client = self.create_client()
+        db = client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
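
MongoDBUploader.precheck() opens a client and issues the standard pymongo "ping" admin command, which raises if the server cannot be reached. A minimal sketch with a placeholder connection string; the call is left commented out so the snippet does not require a running server:

# Sketch of the MongoDB connection check pattern in the hunk above.
from pymongo import MongoClient


def mongo_precheck(uri: str) -> None:
    client = MongoClient(uri, serverSelectionTimeoutMS=2000)
    client.admin.command("ping")  # raises on unreachable/unauthorized servers


# mongo_precheck("mongodb://localhost:27017")  # placeholder URI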

unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -5,7 +5,6 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,6 +87,18 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
+    def precheck(self) -> None:
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -136,7 +148,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
            ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,