unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +13 -13
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +37 -34
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
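
Four changes recur across the connector diffs below: every indexer and uploader gains a `precheck()` hook that validates the connection up front; the `DataSourceMetadata` class imported from `unstructured.documents.elements` is replaced by an ingest-owned `FileDataSourceMetadata` (which adds a `filesize_bytes` field); glob filtering (`file_glob`) moves off the indexer configs into a dedicated filter step (see the new `v2/processes/filter.py` and `v2/pipeline/steps/filter.py` above); and per-connector `get_download_path` overrides are dropped in favor of the base `Downloader` implementation. A minimal sketch of the `precheck` contract as the diffs show it; the runner loop and helper names here are hypothetical, not code from this release:

from unstructured_ingest.error import SourceConnectionError


class ExampleIndexer:
    def precheck(self) -> None:
        # Validate credentials/reachability before any work starts, raising a
        # typed connection error instead of failing mid-pipeline.
        try:
            self.connect()  # hypothetical helper standing in for get_client() etc.
        except Exception as e:
            raise SourceConnectionError(f"failed to validate connection: {e}")

    def connect(self) -> None:
        pass  # stand-in; real connectors build an SDK client here


def run_pipeline(steps: list) -> None:
    # A runner can fail fast by prechecking every step first (hypothetical).
    for step in steps:
        if precheck := getattr(step, "precheck", None):
            precheck()


run_pipeline([ExampleIndexer()])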

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
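
Both the Elasticsearch indexer and uploader stop caching a live client on the dataclass (`client: "ElasticsearchClient" = field(init=False)`) and instead call `connection_config.get_client()` at the point of use, with `precheck()` exercising the same call. A plausible motivation, not stated in the diff, is that a live client object does not serialize when pipeline steps are shipped to worker processes, while a plain connection config does. A sketch of the pattern with stand-in types (not the connector's real classes):

from dataclasses import dataclass


@dataclass
class StubConnectionConfig:
    hosts: list  # connection details stay plain data, safe to pickle

    def get_client(self):
        # Stand-in for lazily building an elasticsearch.Elasticsearch client.
        return object()


@dataclass
class StubIndexer:
    connection_config: StubConnectionConfig

    def run(self):
        client = self.connection_config.get_client()  # fresh client per call
        return client


StubIndexer(StubConnectionConfig(hosts=["http://localhost:9200"])).run()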

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
-
-from unstructured.documents.elements import DataSourceMetadata
+from uuid import NAMESPACE_DNS, uuid5
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -19,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -73,7 +72,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -108,17 +106,7 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def does_path_match_glob(self, path: str) -> bool:
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -156,10 +144,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -179,6 +167,8 @@ class FsspecIndexer(Indexer):
                 date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -188,15 +178,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -204,14 +198,16 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        raw_files = self.list_files()
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+
+            additional_metadata = self.sterilize_info(path=file)
+            additional_metadata["original_file_path"] = file
             yield FileData(
-                identifier=file,
+                identifier=str(uuid5(NAMESPACE_DNS, file)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
                     filename=Path(file).name,
@@ -219,7 +215,7 @@ class FsspecIndexer(Indexer):
                     fullpath=file,
                 ),
                 metadata=self.get_metadata(path=file),
-                additional_metadata=self.sterilize_info(path=file),
+                additional_metadata=additional_metadata,
             )
 
 
@@ -251,18 +247,12 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
        )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +262,8 @@ class FsspecDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
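
The fsspec indexer's `FileData.identifier` changes from the raw remote path to a deterministic UUID derived from it, with the original path preserved in `additional_metadata["original_file_path"]` so the downloader can still fetch the file. `uuid5` hashes the name into a stable ID, so re-indexing the same path always yields the same identifier:

from uuid import NAMESPACE_DNS, uuid5

path = "my-bucket/docs/report.pdf"  # illustrative remote path
print(str(uuid5(NAMESPACE_DNS, path)))  # same UUID on every run for this path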

unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
-
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,13 +97,14 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")

unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
             ]
         )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False

unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
-        if not self.index_config.file_glob:
-            return list(glob_fn("*"))
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
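
With `file_glob` removed from both the fsspec and local indexer configs, glob filtering presumably now lives in the new filter step (`v2/processes/filter.py`, +54, and `v2/pipeline/steps/filter.py`, +40, in the file list), which this diff does not show. A minimal sketch of fnmatch-based glob filtering in the spirit of the `does_path_match_glob()` logic deleted from the fsspec indexer above; the `GlobFilterer` name and signature are assumptions, not the released API:

import fnmatch
from dataclasses import dataclass
from typing import Optional


@dataclass
class GlobFilterer:
    # Hypothetical name; the released class in v2/processes/filter.py may differ.
    file_glob: Optional[list[str]] = None

    def is_match(self, path: str) -> bool:
        # No patterns configured means every file passes, as in the old logic.
        if self.file_glob is None:
            return True
        return any(fnmatch.fnmatch(path, pattern) for pattern in self.file_glob)


filterer = GlobFilterer(file_glob=["*.pdf", "*.docx"])
assert filterer.is_match("docs/report.pdf")
assert not filterer.is_match("docs/notes.txt")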

unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -85,11 +86,15 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
-    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -123,7 +128,8 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        db = self.client[self.connection_config.database]
+        client = self.create_client()
+        db = client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
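
The MongoDB uploader likewise drops its cached client field and builds a client per operation; `precheck()` issues the `ping` admin command, pymongo's standard liveness probe. A minimal standalone sketch (the URI is a placeholder):

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
client.admin.command("ping")  # raises if the server is unreachable or auth fails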

unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -5,7 +5,6 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,6 +87,18 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
+    def precheck(self) -> None:
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -136,7 +148,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,

unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -123,9 +123,12 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    …
-    …
-    …
+    def precheck(self):
+        try:
+            self.connection_config.get_index()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):

unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -18,10 +18,9 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -132,6 +132,13 @@ class SalesforceIndexer(Indexer):
             if record_type not in ACCEPTED_CATEGORIES:
                 raise ValueError(f"{record_type} not currently an accepted Salesforce category")
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -172,7 +179,7 @@ class SalesforceIndexer(Indexer):
                 filename=record_with_extension,
                 fullpath=f"{record['attributes']['type']}/{record_with_extension}",
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=record["attributes"]["url"],
                 version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                 date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -207,11 +214,6 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET

unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,10 +6,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -134,6 +133,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -187,7 +194,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -222,7 +229,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -340,10 +347,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
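
With the SharePoint override reduced to a `super()` call and the fsspec, Google Drive, and Salesforce overrides deleted outright, download-path construction is evidently consolidated on the base `Downloader`, consistent with the +9 -3 change to interfaces/downloader.py in the file list. The base implementation is not shown in this diff; a sketch of the behavior the removed overrides shared:

from pathlib import Path


def get_download_path(download_dir: Path, relative_path: str) -> Path:
    # Each removed override stripped a leading slash from the relative path
    # and joined it onto the download directory.
    return download_dir / Path(relative_path.lstrip("/"))


print(get_download_path(Path("/tmp/downloads"), "/sites/a/report.pdf"))
# /tmp/downloads/sites/a/report.pdf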
|