unstructured-ingest 0.0.2__py3-none-any.whl → 0.0.2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +0 -10
- unstructured_ingest/v2/cli/base/src.py +0 -2
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +9 -1
- unstructured_ingest/v2/cli/cmds/local.py +8 -0
- unstructured_ingest/v2/cli/configs/__init__.py +1 -8
- unstructured_ingest/v2/interfaces/__init__.py +1 -2
- unstructured_ingest/v2/interfaces/downloader.py +3 -9
- unstructured_ingest/v2/interfaces/file_data.py +1 -6
- unstructured_ingest/v2/interfaces/process.py +0 -3
- unstructured_ingest/v2/pipeline/interfaces.py +5 -3
- unstructured_ingest/v2/pipeline/pipeline.py +2 -72
- unstructured_ingest/v2/pipeline/steps/download.py +13 -77
- unstructured_ingest/v2/processes/connectors/astra.py +0 -8
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +0 -8
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -8
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -9
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +9 -23
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -12
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +5 -13
- unstructured_ingest/v2/processes/connectors/google_drive.py +9 -13
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +4 -10
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -14
- unstructured_ingest/v2/processes/connectors/pinecone.py +3 -6
- unstructured_ingest/v2/processes/connectors/salesforce.py +8 -10
- unstructured_ingest/v2/processes/connectors/sharepoint.py +8 -14
- unstructured_ingest/v2/processes/connectors/sql.py +9 -24
- unstructured_ingest/v2/processes/connectors/weaviate.py +5 -13
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/METADATA +15 -15
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/RECORD +34 -37
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/pipeline/steps/filter.py +0 -40
- unstructured_ingest/v2/processes/filter.py +0 -54
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/elasticsearch.py

```diff
@@ -7,12 +7,10 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
+from unstructured.documents.elements import DataSourceMetadata
+
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import (
-    DestinationConnectionError,
-    SourceConnectionError,
-    SourceConnectionNetworkError,
-)
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -22,7 +20,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -124,14 +121,11 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
+    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
+    def __post_init__(self):
+        self.client = self.connection_config.get_client()
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -144,9 +138,8 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
-        client = self.connection_config.get_client()
         hits = scan(
-            client,
+            self.client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -175,7 +168,7 @@ class ElasticsearchIndexer(Indexer):
         yield FileData(
            identifier=identified,
            connector_type=CONNECTOR_TYPE,
-           metadata=FileDataSourceMetadata(
+           metadata=DataSourceMetadata(
                url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                date_processed=str(time()),
            ),
@@ -241,7 +234,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=FileDataSourceMetadata(
+                metadata=DataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -346,13 +339,6 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
-
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
```
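The recurring change in this file swaps an explicit `precheck()` validation for a client that is built once in `__post_init__` and cached on a dataclass field excluded from the generated constructor. A minimal sketch of that pattern (the names and fake client are illustrative, not the connector's real ones):

```python
from dataclasses import dataclass, field


def get_client(host: str) -> str:
    # stand-in for connection_config.get_client(); returns a fake handle
    return f"client-for-{host}"


@dataclass
class Indexer:
    host: str
    client: str = field(init=False)  # not accepted by the generated __init__

    def __post_init__(self):
        # runs right after the generated __init__, so each instance
        # builds its client exactly once and reuses it afterwards
        self.client = get_client(self.host)


ix = Indexer(host="localhost:9200")
print(ix.client)  # client-for-localhost:9200
```

The trade-off is that connection failures now surface at construction time instead of through a dedicated `precheck()` step.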
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextlib
+import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@@ -8,6 +9,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
+from unstructured.documents.elements import DataSourceMetadata
+
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.v2.interfaces import (
@@ -17,7 +20,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -72,6 +74,7 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
+    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -106,7 +109,17 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def check_connection(self):
+    def does_path_match_glob(self, path: str) -> bool:
+        if self.index_config.file_glob is None:
+            return True
+        patterns = self.index_config.file_glob
+        for pattern in patterns:
+            if fnmatch.filter([path], pattern):
+                return True
+        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        return False
+
+    def check_connection(self):
         from fsspec import get_filesystem_class
 
         try:
@@ -144,10 +157,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_metadata(self, path: str) -> DataSourceMetadata:
         date_created = None
         date_modified = None
-        file_size = None
+
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -167,8 +180,6 @@ class FsspecIndexer(Indexer):
                 date_modified = str(modified)
         except NotImplementedError:
             pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -178,19 +189,15 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
+        return DataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=str(version),
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
-           filesize_bytes=file_size,
        )
 
     def sterilize_info(self, path) -> dict:
@@ -198,7 +205,8 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.list_files()
+        raw_files = self.list_files()
+        files = [f for f in raw_files if self.does_path_match_glob(f)]
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
@@ -247,6 +255,13 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
+    def get_download_path(self, file_data: FileData) -> Path:
+        return (
+            self.download_dir / Path(file_data.source_identifiers.relative_path)
+            if self.download_config
+            else Path(file_data.source_identifiers.rel_path)
+        )
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
```
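The new `does_path_match_glob` helper keeps a remote path only if it matches at least one configured pattern. A standalone sketch of the same fnmatch-based check (the sample paths and patterns are made up):

```python
import fnmatch
from typing import Optional


def does_path_match_glob(path: str, file_glob: Optional[list[str]]) -> bool:
    # no globs configured means nothing is filtered out
    if file_glob is None:
        return True
    # fnmatch.filter returns the subset of names matching the pattern,
    # so a non-empty result means this path matches
    return any(fnmatch.filter([path], pattern) for pattern in file_glob)


paths = ["docs/readme.md", "data/table.csv", "img/logo.png"]
print([p for p in paths if does_path_match_glob(p, ["*.md", "*.csv"])])
# ['docs/readme.md', 'data/table.csv']
```

Note that `fnmatch` patterns, unlike shell globs, let `*` match across `/`, so `*.md` also matches paths in subdirectories.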
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

```diff
@@ -5,15 +5,11 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.v2.interfaces import (
-    DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
-    UploadContent,
-)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -70,10 +66,9 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_metadata(self, path: str) -> DataSourceMetadata:
         date_created = None
         date_modified = None
-        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -81,8 +76,6 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -97,14 +90,13 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
+        return DataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=version,
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
-           filesize_bytes=file_size,
        )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
```
unstructured_ingest/v2/processes/connectors/google_drive.py

```diff
@@ -1,16 +1,15 @@
 import io
 import os
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
+from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import (
-    SourceConnectionError,
-    SourceConnectionNetworkError,
-)
+from unstructured_ingest.error import SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -19,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -123,13 +121,6 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_files_service()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -164,7 +155,7 @@ class GoogleDriveIndexer(Indexer):
            connector_type=CONNECTOR_TYPE,
            identifier=file_id,
            source_identifiers=source_identifiers,
-           metadata=FileDataSourceMetadata(
+           metadata=DataSourceMetadata(
                url=url,
                version=version,
                date_created=str(date_created_dt.timestamp()),
@@ -281,6 +272,11 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
+    def get_download_path(self, file_data: FileData) -> Path:
+        rel_path = file_data.source_identifiers.relative_path
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False
```
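The new `GoogleDriveDownloader.get_download_path` strips a leading slash before joining, because joining an absolute path onto a `Path` discards the left-hand side entirely. A quick illustration (the paths are made up):

```python
from pathlib import Path

download_dir = Path("/tmp/downloads")
rel_path = "/reports/q1.pdf"  # hypothetical drive-relative path

# an absolute right-hand operand wins the join outright:
print(download_dir / Path(rel_path))  # /reports/q1.pdf

# stripping the leading slash keeps the download rooted in download_dir:
rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
print(download_dir / Path(rel_path))  # /tmp/downloads/reports/q1.pdf
```

The same guard appears in the Salesforce and SharePoint downloaders below.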
unstructured_ingest/v2/processes/connectors/local.py

```diff
@@ -1,9 +1,12 @@
 import glob
+import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator
+from typing import Any, Generator, Optional
+
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -12,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -43,6 +45,7 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
+    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -61,11 +64,16 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        if self.index_config.recursive:
-            return list(input_path.rglob("*"))
-        return list(input_path.glob("*"))
+        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
+        if not self.index_config.file_glob:
+            return list(glob_fn("*"))
+        return list(
+            itertools.chain.from_iterable(
+                glob_fn(pattern) for pattern in self.index_config.file_glob
+            )
+        )
 
-    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -85,20 +93,12 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-
-        try:
-            filesize_bytes = stats.st_size
-        except Exception as e:
-            logger.warning(f"Couldn't detect file size: {e}")
-            filesize_bytes = None
-
-        return FileDataSourceMetadata(
+        return DataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
-           filesize_bytes=filesize_bytes,
        )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
```
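`LocalIndexer.list_files` now picks between `Path.rglob` and `Path.glob` up front and chains one generator per configured pattern. A standalone sketch of that traversal logic (the directory layout in the usage comment is hypothetical):

```python
import itertools
from pathlib import Path
from typing import Optional


def list_files(input_path: Path, recursive: bool, file_glob: Optional[list[str]]) -> list[Path]:
    # rglob descends into subdirectories; glob stays at the top level
    glob_fn = input_path.rglob if recursive else input_path.glob
    if not file_glob:
        return list(glob_fn("*"))
    # one lazy generator per pattern, flattened into a single list
    return list(itertools.chain.from_iterable(glob_fn(p) for p in file_glob))


# e.g. list_files(Path("docs"), recursive=True, file_glob=["*.md", "*.rst"])
```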
unstructured_ingest/v2/processes/connectors/mongodb.py

```diff
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -86,15 +85,11 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
+    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def precheck(self) -> None:
-        try:
-            client = self.create_client()
-            client.admin.command("ping")
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
+    def __post_init__(self):
+        self.client = self.create_client()
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -128,8 +123,7 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
-        db = client[self.connection_config.database]
+        db = self.client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
```
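The removed MongoDB precheck validated connectivity with the standard `ping` admin command. For reference, the same health check in bare pymongo (the URI is a placeholder):

```python
from pymongo import MongoClient

# placeholder URI; any reachable server works
client = MongoClient("mongodb://localhost:27017", serverSelectionTimeoutMS=2000)
# "ping" is a cheap server round-trip; it raises if the server is unreachable
client.admin.command("ping")
```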
unstructured_ingest/v2/processes/connectors/onedrive.py

```diff
@@ -5,6 +5,7 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -16,7 +17,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,18 +87,6 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
-    def precheck(self) -> None:
-        try:
-            token_resp: dict = self.connection_config.get_token()
-            if error := token_resp.get("error"):
-                raise SourceConnectionError(
-                    "{} ({})".format(error, token_resp.get("error_description"))
-                )
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -148,7 +136,7 @@ class OnedriveIndexer(Indexer):
            source_identifiers=SourceIdentifiers(
                fullpath=server_path, filename=drive_item.name, rel_path=rel_path
            ),
-           metadata=FileDataSourceMetadata(
+           metadata=DataSourceMetadata(
                url=drive_item.parent_reference.path + "/" + drive_item.name,
                version=drive_item.etag,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
```
unstructured_ingest/v2/processes/connectors/pinecone.py

```diff
@@ -123,12 +123,9 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_index()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
+    @DestinationConnectionError.wrap
+    def check_connection(self):
+        _ = self.connection_config.get_index()
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):
```
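Here the open-coded try/except is replaced by the `DestinationConnectionError.wrap` decorator. The decorator's implementation is not shown in this diff; the sketch below is one plausible shape for such a classmethod-based wrapper, not the library's actual code:

```python
import functools


class DestinationConnectionError(Exception):
    """Stand-in for unstructured_ingest.error.DestinationConnectionError."""

    @classmethod
    def wrap(cls, fn):
        # re-raise any failure from the wrapped callable as this error type
        # (an assumption about what .wrap does, inferred from how it is used)
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            try:
                return fn(*args, **kwargs)
            except Exception as e:
                raise cls(f"failed to validate connection: {e}") from e

        return wrapper


@DestinationConnectionError.wrap
def check_connection():
    raise RuntimeError("index unreachable")  # simulate a bad connection


# check_connection() now raises DestinationConnectionError instead of RuntimeError
```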
unstructured_ingest/v2/processes/connectors/salesforce.py

```diff
@@ -18,9 +18,10 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type
 
 from dateutil import parser
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -29,7 +30,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -132,13 +132,6 @@ class SalesforceIndexer(Indexer):
         if record_type not in ACCEPTED_CATEGORIES:
             raise ValueError(f"{record_type} not currently an accepted Salesforce category")
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -179,7 +172,7 @@ class SalesforceIndexer(Indexer):
                filename=record_with_extension,
                fullpath=f"{record['attributes']['type']}/{record_with_extension}",
            ),
-           metadata=FileDataSourceMetadata(
+           metadata=DataSourceMetadata(
                url=record["attributes"]["url"],
                version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -214,6 +207,11 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
+    def get_download_path(self, file_data: FileData) -> Path:
+        rel_path = file_data.source_identifiers.relative_path
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET
```
unstructured_ingest/v2/processes/connectors/sharepoint.py

```diff
@@ -6,8 +6,10 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
+from unstructured.documents.elements import DataSourceMetadata
+
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -16,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -133,14 +134,6 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
-    def precheck(self) -> None:
-        try:
-            site_client = self.connection_config.get_client()
-            site_client.site_pages.pages.get().execute_query()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -194,7 +187,7 @@ class SharepointIndexer(Indexer):
                fullpath=file_path,
                rel_path=file_path.replace(self.index_config.path, ""),
            ),
-           metadata=FileDataSourceMetadata(
+           metadata=DataSourceMetadata(
                url=url,
                version=version,
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -229,7 +222,7 @@ class SharepointIndexer(Indexer):
                fullpath=fullpath,
                rel_path=rel_path,
            ),
-           metadata=FileDataSourceMetadata(
+           metadata=DataSourceMetadata(
                url=absolute_url,
                version=f"{file.major_version}.{file.minor_version}",
                date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -347,9 +340,10 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
-        download_path = super().get_download_path(file_data=file_data)
-
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
+        rel_path = file_data.source_identifiers.fullpath
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
```