unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/cli/base/cmd.py +10 -0
  3. unstructured_ingest/v2/cli/base/src.py +2 -0
  4. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  5. unstructured_ingest/v2/cli/cmds/local.py +0 -8
  6. unstructured_ingest/v2/cli/configs/__init__.py +8 -1
  7. unstructured_ingest/v2/cli/configs/filter.py +28 -0
  8. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  9. unstructured_ingest/v2/interfaces/downloader.py +9 -3
  10. unstructured_ingest/v2/interfaces/file_data.py +6 -1
  11. unstructured_ingest/v2/interfaces/process.py +3 -0
  12. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  13. unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  14. unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  15. unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  16. unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  17. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  18. unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  19. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  20. unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  21. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
  22. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
  23. unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  24. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  25. unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  26. unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  27. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
  28. unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  29. unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  30. unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  31. unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  32. unstructured_ingest/v2/processes/filter.py +54 -0
  33. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +13 -13
  34. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +37 -34
  35. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
  36. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
  37. {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
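
Several entries above point at the headline structural change in 0.0.2: glob filtering moves out of the individual indexers (note the file_glob options deleted from fsspec/fsspec.py and local.py in the hunks below) and into a dedicated filter step (processes/filter.py, cli/configs/filter.py, pipeline/steps/filter.py). The new filter module itself is not shown in this diff, so the following sketch is only a guess at its shape, reconstructed from the does_path_match_glob logic removed from FsspecIndexer; the class and field names here are assumptions, not the released API.

import fnmatch
from dataclasses import dataclass
from typing import Optional

@dataclass
class FiltererConfig:
    # Hypothetical config; mirrors the file_glob option removed from the
    # indexer configs in this release.
    file_glob: Optional[list[str]] = None

@dataclass
class Filterer:
    config: FiltererConfig

    def is_processable(self, path: str) -> bool:
        # No patterns configured: keep every file.
        if self.config.file_glob is None:
            return True
        # Keep the file if any glob pattern matches its path.
        return any(fnmatch.fnmatch(path, p) for p in self.config.file_glob)

f = Filterer(FiltererConfig(file_glob=["*.pdf"]))
print(f.is_processable("docs/report.pdf"), f.is_processable("docs/report.txt"))
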
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:
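
This is the first of many precheck() hooks added in this release, all following the same contract: a cheap authenticated call, a logged failure, and a typed connection error. For reference, the same Databricks health check can be run standalone with the Databricks SDK; the environment-based auth below is an assumption for illustration, not how the connector wires in credentials.

from databricks.sdk import WorkspaceClient

# Assumes DATABRICKS_HOST/DATABRICKS_TOKEN are set in the environment;
# the connector itself passes host and access_config explicitly.
client = WorkspaceClient()
# current_user.me() round-trips to the workspace; .active confirms the
# token resolves to a live user, mirroring the assert in precheck().
assert client.current_user.me().active
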
unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=DataSourceMetadata(
+                metadata=FileDataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
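
Note the pattern in these elasticsearch.py hunks: the client field built in __post_init__ is gone, and a client is now constructed inside each operation from the serializable connection config. A plausible motivation is that live client objects do not survive pickling across worker processes, while plain configs do. A minimal standalone sketch of the resulting per-call usage, with placeholder hosts and index name:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def iter_ids(hosts: list[str], index_name: str):
    # Build the client inside the call rather than caching it on the
    # dataclass, as ElasticsearchIndexer.run() now does.
    client = Elasticsearch(hosts=hosts)
    hits = scan(
        client,
        query={"stored_fields": [], "query": {"match_all": {}}},
        scroll="1m",
        index=index_name,
    )
    for hit in hits:
        yield hit["_id"]
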
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,14 +1,12 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
-
-from unstructured.documents.elements import DataSourceMetadata
+from uuid import NAMESPACE_DNS, uuid5
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -19,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -73,7 +72,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -108,17 +106,7 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def does_path_match_glob(self, path: str) -> bool:
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -156,10 +144,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -179,6 +167,8 @@ class FsspecIndexer(Indexer):
                 date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -188,15 +178,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
            date_created=date_created,
            date_modified=date_modified,
            date_processed=str(time()),
            version=str(version),
            url=f"{self.index_config.protocol}://{path}",
            record_locator=record_locator,
+           filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -204,14 +198,16 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        raw_files = self.list_files()
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
             rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+
+            additional_metadata = self.sterilize_info(path=file)
+            additional_metadata["original_file_path"] = file
             yield FileData(
-                identifier=file,
+                identifier=str(uuid5(NAMESPACE_DNS, file)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
                     filename=Path(file).name,
@@ -219,7 +215,7 @@ class FsspecIndexer(Indexer):
                     fullpath=file,
                 ),
                 metadata=self.get_metadata(path=file),
-                additional_metadata=self.sterilize_info(path=file),
+                additional_metadata=additional_metadata,
             )
 
 
@@ -251,18 +247,12 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +262,8 @@ class FsspecDownloader(Downloader):
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
         try:
-            await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+            rpath = file_data.additional_metadata["original_file_path"]
+            await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
         except Exception as e:
             logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
             raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
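
Two linked changes in fsspec.py deserve a closer look: FileData.identifier switches from the raw remote path to a deterministic uuid5 of that path, and the original path is stashed in additional_metadata["original_file_path"] so the downloader can still fetch the file. Because uuid5 is a pure function of its inputs, re-indexing the same source yields the same identifiers:

from uuid import NAMESPACE_DNS, uuid5

path = "my-bucket/docs/report.pdf"  # placeholder remote path
# Deterministic: the same path produces the same identifier on every run,
# so downstream steps can de-duplicate or resume by id.
assert str(uuid5(NAMESPACE_DNS, path)) == str(uuid5(NAMESPACE_DNS, path))
print(uuid5(NAMESPACE_DNS, path))
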
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,13 +97,14 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
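
Both fsspec indexers now probe file size the same way: fs.size() inside contextlib.suppress(AttributeError), feeding the new filesize_bytes metadata field. The snippet below reproduces that probe against fsspec's local filesystem purely as a stand-in; the choice of file is arbitrary.

import contextlib

from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
file_size = None
# An implementation lacking size() would raise AttributeError, leaving
# file_size as None rather than failing the whole indexing pass.
with contextlib.suppress(AttributeError):
    file_size = fs.size(__file__)  # size of this very script, in bytes
print(file_size)
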
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
             ]
         )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False

unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
-        if not self.index_config.file_glob:
-            return list(glob_fn("*"))
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
+            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
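
local.py follows the same two themes: glob selection is stripped from list_files(), leaving a plain directory walk (pattern filtering is evidently deferred to the new filter step), and the indexer starts reporting filesize_bytes from st_size. The listing logic now reduces to the following, shown here with a placeholder directory:

from pathlib import Path

input_path = Path(".")  # placeholder directory
recursive = True
# With file_glob gone, listing is just a walk; what to keep is decided
# later in the pipeline.
files = list(input_path.rglob("*")) if recursive else list(input_path.glob("*"))
print(f"{len(files)} entries found")
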
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -85,11 +86,15 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
-    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -123,7 +128,8 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        db = self.client[self.connection_config.database]
+        client = self.create_client()
+        db = client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
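
The MongoDB uploader drops its cached client for the same serializability reasons as Elasticsearch, and its precheck() uses the "ping" admin command, pymongo's conventional low-cost connectivity test. Standalone, with a placeholder URI:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=5000)
try:
    client.admin.command("ping")  # raises if the server is unreachable
    print("connection ok")
except Exception as e:
    print(f"failed to validate connection: {e}")
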
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -5,7 +5,6 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,6 +87,18 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
+    def precheck(self) -> None:
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -136,7 +148,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
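
The OneDrive precheck is the only one that inspects a token response dict for in-band errors before touching the client. That shape matches MSAL's client-credentials flow, which reports failures as "error"/"error_description" keys instead of raising; whether get_token wraps MSAL here is an assumption. A standalone equivalent with placeholder credentials:

import msal

app = msal.ConfidentialClientApplication(
    client_id="<client-id>",  # placeholder credentials
    client_credential="<client-secret>",
    authority="https://login.microsoftonline.com/<tenant-id>",
)
token_resp = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
# MSAL reports failures in-band rather than raising, hence the dict check.
if error := token_resp.get("error"):
    raise RuntimeError("{} ({})".format(error, token_resp.get("error_description")))
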
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -123,9 +123,12 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    @DestinationConnectionError.wrap
-    def check_connection(self):
-        _ = self.connection_config.get_index()
+    def precheck(self):
+        try:
+            self.connection_config.get_index()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):
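
Pinecone shows the rename most clearly: check_connection behind the @DestinationConnectionError.wrap decorator becomes an explicit precheck() with its own logging. Every connector in this release converges on the same contract, which in isolation looks like the sketch below; the uploader and its failing call are illustrative stand-ins, not the released code.

import logging

logger = logging.getLogger("unstructured_ingest.v2")

class DestinationConnectionError(Exception):
    # Stand-in for unstructured_ingest.error.DestinationConnectionError.
    pass

class ExampleUploader:
    def precheck(self) -> None:
        try:
            self.get_index()  # any cheap call that exercises credentials
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise DestinationConnectionError(f"failed to validate connection: {e}")

    def get_index(self):
        raise ConnectionError("placeholder failure")

try:
    ExampleUploader().precheck()
except DestinationConnectionError as e:
    print(e)
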
unstructured_ingest/v2/processes/connectors/salesforce.py

@@ -18,10 +18,9 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -30,6 +29,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -132,6 +132,13 @@ class SalesforceIndexer(Indexer):
             if record_type not in ACCEPTED_CATEGORIES:
                 raise ValueError(f"{record_type} not currently an accepted Salesforce category")
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -172,7 +179,7 @@ class SalesforceIndexer(Indexer):
                 filename=record_with_extension,
                 fullpath=f"{record['attributes']['type']}/{record_with_extension}",
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=record["attributes"]["url"],
                 version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                 date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -207,11 +214,6 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET
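
google_drive.py, salesforce.py, and fsspec.py all delete nearly identical get_download_path overrides in this release, and sharepoint.py below switches to super().get_download_path(...). Together with the +9/-3 touch to interfaces/downloader.py (not shown), this suggests the path logic was hoisted into the base Downloader. A hedged reconstruction of that shared logic, based only on the deleted overrides:

from pathlib import Path

def get_download_path(download_dir: Path, relative_path: str) -> Path:
    # Drop a single leading slash so the join stays inside download_dir,
    # matching the expression each deleted override used.
    rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
    return download_dir / Path(rel_path)

print(get_download_path(Path("/tmp/downloads"), "/docs/report.pdf"))
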
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -6,10 +6,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -18,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -134,6 +133,14 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
+    def precheck(self) -> None:
+        try:
+            site_client = self.connection_config.get_client()
+            site_client.site_pages.pages.get().execute_query()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -187,7 +194,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -222,7 +229,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -340,10 +347,9 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
+        download_path = super().get_download_path(file_data=file_data)
+
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
-        rel_path = file_data.source_identifiers.fullpath
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        download_path = self.download_dir / Path(rel_path)
        if content_type == SharepointContentType.SITEPAGE.value:
            # Update output extension to html if site page
            download_path = download_path.with_suffix(".html")
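
The final hunk keeps only the SharePoint-specific piece of get_download_path: site pages are rewritten to an .html extension so the downloaded content is partitioned as HTML. Path.with_suffix does the rewrite:

from pathlib import Path

p = Path("downloads/site/SitePages/home.aspx")  # placeholder page path
print(p.with_suffix(".html"))  # downloads/site/SitePages/home.html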