unstructured-ingest 0.0.2__py3-none-any.whl → 0.0.2.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (37)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/cli/base/cmd.py +0 -10
  3. unstructured_ingest/v2/cli/base/src.py +0 -2
  4. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +9 -1
  5. unstructured_ingest/v2/cli/cmds/local.py +8 -0
  6. unstructured_ingest/v2/cli/configs/__init__.py +1 -8
  7. unstructured_ingest/v2/interfaces/__init__.py +1 -2
  8. unstructured_ingest/v2/interfaces/downloader.py +3 -9
  9. unstructured_ingest/v2/interfaces/file_data.py +1 -6
  10. unstructured_ingest/v2/interfaces/process.py +0 -3
  11. unstructured_ingest/v2/pipeline/interfaces.py +5 -3
  12. unstructured_ingest/v2/pipeline/pipeline.py +2 -72
  13. unstructured_ingest/v2/pipeline/steps/download.py +13 -77
  14. unstructured_ingest/v2/processes/connectors/astra.py +0 -8
  15. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +0 -8
  16. unstructured_ingest/v2/processes/connectors/chroma.py +6 -8
  17. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +0 -9
  18. unstructured_ingest/v2/processes/connectors/elasticsearch.py +9 -23
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -12
  20. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +5 -13
  21. unstructured_ingest/v2/processes/connectors/google_drive.py +9 -13
  22. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  23. unstructured_ingest/v2/processes/connectors/mongodb.py +4 -10
  24. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -14
  25. unstructured_ingest/v2/processes/connectors/pinecone.py +3 -6
  26. unstructured_ingest/v2/processes/connectors/salesforce.py +8 -10
  27. unstructured_ingest/v2/processes/connectors/sharepoint.py +8 -14
  28. unstructured_ingest/v2/processes/connectors/sql.py +9 -24
  29. unstructured_ingest/v2/processes/connectors/weaviate.py +5 -13
  30. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/METADATA +15 -15
  31. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/RECORD +34 -37
  32. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  33. unstructured_ingest/v2/pipeline/steps/filter.py +0 -40
  34. unstructured_ingest/v2/processes/filter.py +0 -54
  35. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/WHEEL +0 -0
  36. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/entry_points.txt +0 -0
  37. {unstructured_ingest-0.0.2.dist-info → unstructured_ingest-0.0.2.dev0.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -7,12 +7,10 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
+from unstructured.documents.elements import DataSourceMetadata
+
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import (
-    DestinationConnectionError,
-    SourceConnectionError,
-    SourceConnectionNetworkError,
-)
+from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -22,7 +20,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -124,14 +121,11 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
+    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
+    def __post_init__(self):
+        self.client = self.connection_config.get_client()
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -144,9 +138,8 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
-        client = self.connection_config.get_client()
         hits = scan(
-            client,
+            self.client,
             query=scan_query,
             scroll="1m",
             index=self.index_config.index_name,
@@ -175,7 +168,7 @@ class ElasticsearchIndexer(Indexer):
             yield FileData(
                 identifier=identified,
                 connector_type=CONNECTOR_TYPE,
-                metadata=FileDataSourceMetadata(
+                metadata=DataSourceMetadata(
                     url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                     date_processed=str(time()),
                 ),
@@ -241,7 +234,7 @@ class ElasticsearchDownloader(Downloader):
             file_data=FileData(
                 identifier=filename_id,
                 connector_type=CONNECTOR_TYPE,
-                metadata=FileDataSourceMetadata(
+                metadata=DataSourceMetadata(
                     version=str(result["_version"]) if "_version" in result else None,
                     date_processed=str(time()),
                     record_locator={
@@ -346,13 +339,6 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
-
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
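A recurring change in this release (here, and again in the mongodb diff below) is that the explicit precheck() connection validation is dropped in favor of a client built once when the connector dataclass is constructed and reused by every method. A minimal runnable sketch of that pattern, with FakeClient and ConnectionConfig as hypothetical stand-ins for the real Elasticsearch types:

    from dataclasses import dataclass, field

    class FakeClient:  # stand-in for the real Elasticsearch client class
        pass

    @dataclass
    class ConnectionConfig:  # stand-in for ElasticsearchConnectionConfig
        hosts: list

        def get_client(self) -> FakeClient:
            return FakeClient()  # real code constructs an Elasticsearch client here

    @dataclass
    class Indexer:
        connection_config: ConnectionConfig
        # init=False: callers never pass the client; it is built in __post_init__
        client: FakeClient = field(init=False)

        def __post_init__(self):
            # build once, reuse everywhere (e.g. run() above passes self.client to scan)
            self.client = self.connection_config.get_client()

One consequence visible in the hunks: connection failures now surface when the connector is instantiated rather than through a dedicated precheck step.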
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextlib
+import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@@ -8,6 +9,8 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
+from unstructured.documents.elements import DataSourceMetadata
+
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.v2.interfaces import (
@@ -17,7 +20,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -72,6 +74,7 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
+    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -106,7 +109,17 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def precheck(self) -> None:
+    def does_path_match_glob(self, path: str) -> bool:
+        if self.index_config.file_glob is None:
+            return True
+        patterns = self.index_config.file_glob
+        for pattern in patterns:
+            if fnmatch.filter([path], pattern):
+                return True
+        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        return False
+
+    def check_connection(self):
         from fsspec import get_filesystem_class
 
         try:
@@ -144,10 +157,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_metadata(self, path: str) -> DataSourceMetadata:
         date_created = None
         date_modified = None
-        file_size = None
+
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -167,8 +180,6 @@ class FsspecIndexer(Indexer):
                 date_modified = str(modified)
         except NotImplementedError:
             pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -178,19 +189,15 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
+        return DataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
-            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -198,7 +205,8 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.list_files()
+        raw_files = self.list_files()
+        files = [f for f in raw_files if self.does_path_match_glob(f)]
        for file in files:
            # Note: we remove any remaining leading slashes (Box introduces these)
            # to get a valid relative path
@@ -247,6 +255,13 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
+    def get_download_path(self, file_data: FileData) -> Path:
+        return (
+            self.download_dir / Path(file_data.source_identifiers.relative_path)
+            if self.download_config
+            else Path(file_data.source_identifiers.rel_path)
+        )
+
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
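The new file_glob option filters indexed paths with fnmatch, as shown in does_path_match_glob above. A standalone sketch of the same logic (names are illustrative, not part of the package); note that fnmatch's "*" also crosses directory separators, so "*.pdf" matches paths in subdirectories:

    import fnmatch
    from typing import Optional

    def matches_any_glob(path: str, patterns: Optional[list]) -> bool:
        if patterns is None:
            return True  # no file_glob configured: keep everything
        return any(fnmatch.filter([path], pattern) for pattern in patterns)

    files = ["docs/a.pdf", "docs/b.txt", "img/c.png"]
    print([f for f in files if matches_any_glob(f, ["*.pdf", "*.txt"])])
    # ['docs/a.pdf', 'docs/b.txt']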
unstructured_ingest/v2/processes/connectors/fsspec/s3.py
@@ -5,15 +5,11 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.utils import requires_dependencies
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.v2.interfaces import (
-    DownloadResponse,
-    FileData,
-    FileDataSourceMetadata,
-    UploadContent,
-)
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -70,10 +66,9 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_metadata(self, path: str) -> DataSourceMetadata:
         date_created = None
         date_modified = None
-        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -81,8 +76,6 @@ class S3Indexer(FsspecIndexer):
                 date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -97,14 +90,13 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
+        return DataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
-            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -1,16 +1,15 @@
 import io
 import os
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
+from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import (
-    SourceConnectionError,
-    SourceConnectionNetworkError,
-)
+from unstructured_ingest.error import SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -19,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -123,13 +121,6 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_files_service()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -164,7 +155,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=FileDataSourceMetadata(
+            metadata=DataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -281,6 +272,11 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
+    def get_download_path(self, file_data: FileData) -> Path:
+        rel_path = file_data.source_identifiers.relative_path
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False
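The leading-slash strip in the added get_download_path (repeated in the salesforce and sharepoint diffs below) matters because pathlib silently discards the left operand when the right operand is absolute. A quick demonstration, with illustrative paths:

    from pathlib import Path

    download_dir = Path("/tmp/downloads")
    rel_path = "/folder/file.docx"  # leading slash, as some APIs return

    # Joining an absolute path would lose download_dir entirely:
    print(download_dir / Path(rel_path))  # /folder/file.docx

    # Stripping the slash first keeps the download rooted under download_dir:
    rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
    print(download_dir / Path(rel_path))  # /tmp/downloads/folder/file.docx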
unstructured_ingest/v2/processes/connectors/local.py
@@ -1,9 +1,12 @@
 import glob
+import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator
+from typing import Any, Generator, Optional
+
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -12,7 +15,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -43,6 +45,7 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
+    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -61,11 +64,16 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        if self.index_config.recursive:
-            return list(input_path.rglob("*"))
-        return list(input_path.glob("*"))
+        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
+        if not self.index_config.file_glob:
+            return list(glob_fn("*"))
+        return list(
+            itertools.chain.from_iterable(
+                glob_fn(pattern) for pattern in self.index_config.file_glob
+            )
+        )
 
-    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -85,20 +93,12 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-
-        try:
-            filesize_bytes = stats.st_size
-        except Exception as e:
-            logger.warning(f"Couldn't detect file size: {e}")
-            filesize_bytes = None
-
-        return FileDataSourceMetadata(
+        return DataSourceMetadata(
             date_modified=date_modified,
             date_created=date_created,
             date_processed=str(time()),
             permissions_data=permissions_data,
             record_locator={"path": str(path.resolve())},
-            filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
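The local connector takes a different approach to file_glob than the fsspec one: instead of filtering a full listing, it expands each pattern with pathlib's glob or rglob and chains the results. A self-contained sketch of that listing logic, under the same assumptions as the hunk above (the function name is illustrative):

    import itertools
    from pathlib import Path
    from typing import Optional

    def list_paths(input_path: Path, recursive: bool, file_glob: Optional[list]) -> list:
        # rglob walks subdirectories; glob stays in input_path
        glob_fn = input_path.rglob if recursive else input_path.glob
        if not file_glob:
            return list(glob_fn("*"))
        # expand each pattern separately, then flatten into one list
        return list(itertools.chain.from_iterable(glob_fn(p) for p in file_glob))

    # e.g. list_paths(Path("docs"), recursive=True, file_glob=["*.md", "*.rst"])

One design note: chain.from_iterable does not deduplicate, so a path matching more than one pattern would appear once per matching pattern.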
unstructured_ingest/v2/processes/connectors/mongodb.py
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -86,15 +85,11 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
+    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def precheck(self) -> None:
-        try:
-            client = self.create_client()
-            client.admin.command("ping")
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
+    def __post_init__(self):
+        self.client = self.create_client()
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -128,8 +123,7 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        client = self.create_client()
-        db = client[self.connection_config.database]
+        db = self.client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
unstructured_ingest/v2/processes/connectors/onedrive.py
@@ -5,6 +5,7 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -16,7 +17,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,18 +87,6 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
-    def precheck(self) -> None:
-        try:
-            token_resp: dict = self.connection_config.get_token()
-            if error := token_resp.get("error"):
-                raise SourceConnectionError(
-                    "{} ({})".format(error, token_resp.get("error_description"))
-                )
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -148,7 +136,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
-            metadata=FileDataSourceMetadata(
+            metadata=DataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -123,12 +123,9 @@ class PineconeUploader(Uploader):
     connection_config: PineconeConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def precheck(self):
-        try:
-            self.connection_config.get_index()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise DestinationConnectionError(f"failed to validate connection: {e}")
+    @DestinationConnectionError.wrap
+    def check_connection(self):
+        _ = self.connection_config.get_index()
 
     @requires_dependencies(["pinecone"], extras="pinecone")
     def upsert_batch(self, batch):
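Here the manual try/except boilerplate collapses into the error class's .wrap decorator (also visible as @SourceConnectionNetworkError.wrap in the google_drive diff). The real implementation lives in unstructured_ingest.error and is not shown in this diff; the sketch below is only a plausible reading of what such a decorator does, namely re-raising any exception from the wrapped callable as the error class itself:

    import functools

    class DestinationConnectionError(Exception):  # stand-in for the real class
        @classmethod
        def wrap(cls, fn):
            @functools.wraps(fn)
            def wrapper(*args, **kwargs):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    # assumed behavior: convert any failure into this error type
                    raise cls(str(e)) from e
            return wrapper

    @DestinationConnectionError.wrap
    def check_connection():
        raise RuntimeError("index unreachable")

    try:
        check_connection()
    except DestinationConnectionError as e:
        print(f"failed to validate connection: {e}")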
unstructured_ingest/v2/processes/connectors/salesforce.py
@@ -18,9 +18,10 @@ from textwrap import dedent
 from typing import TYPE_CHECKING, Any, Generator, Type
 
 from dateutil import parser
+from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -29,7 +30,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -132,13 +132,6 @@ class SalesforceIndexer(Indexer):
             if record_type not in ACCEPTED_CATEGORIES:
                 raise ValueError(f"{record_type} not currently an accepted Salesforce category")
 
-    def precheck(self) -> None:
-        try:
-            self.connection_config.get_client()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     def get_file_extension(self, record_type) -> str:
         if record_type == "EmailMessage":
             extension = ".eml"
@@ -179,7 +172,7 @@ class SalesforceIndexer(Indexer):
                 filename=record_with_extension,
                 fullpath=f"{record['attributes']['type']}/{record_with_extension}",
             ),
-            metadata=FileDataSourceMetadata(
+            metadata=DataSourceMetadata(
                 url=record["attributes"]["url"],
                 version=str(parser.parse(record["SystemModstamp"]).timestamp()),
                 date_created=str(parser.parse(record["CreatedDate"]).timestamp()),
@@ -214,6 +207,11 @@ class SalesforceDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
+    def get_download_path(self, file_data: FileData) -> Path:
+        rel_path = file_data.source_identifiers.relative_path
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     def _xml_for_record(self, record: OrderedDict) -> str:
         """Creates partitionable xml file from a record"""
         import xml.etree.ElementTree as ET
unstructured_ingest/v2/processes/connectors/sharepoint.py
@@ -6,8 +6,10 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from urllib.parse import quote
 
+from unstructured.documents.elements import DataSourceMetadata
+
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -16,7 +18,6 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
-    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -133,14 +134,6 @@ class SharepointIndexer(Indexer):
     connection_config: SharepointConnectionConfig
     index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig())
 
-    def precheck(self) -> None:
-        try:
-            site_client = self.connection_config.get_client()
-            site_client.site_pages.pages.get().execute_query()
-        except Exception as e:
-            logger.error(f"failed to validate connection: {e}", exc_info=True)
-            raise SourceConnectionError(f"failed to validate connection: {e}")
-
     def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]:
         if not recursive:
             folder.expand(["Files"]).get().execute_query()
@@ -194,7 +187,7 @@ class SharepointIndexer(Indexer):
                 fullpath=file_path,
                 rel_path=file_path.replace(self.index_config.path, ""),
             ),
-            metadata=FileDataSourceMetadata(
+            metadata=DataSourceMetadata(
                 url=url,
                 version=version,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -229,7 +222,7 @@ class SharepointIndexer(Indexer):
                 fullpath=fullpath,
                 rel_path=rel_path,
             ),
-            metadata=FileDataSourceMetadata(
+            metadata=DataSourceMetadata(
                 url=absolute_url,
                 version=f"{file.major_version}.{file.minor_version}",
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
@@ -347,9 +340,10 @@ class SharepointDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
 
     def get_download_path(self, file_data: FileData) -> Path:
-        download_path = super().get_download_path(file_data=file_data)
-
         content_type = file_data.additional_metadata.get("sharepoint_content_type")
+        rel_path = file_data.source_identifiers.fullpath
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        download_path = self.download_dir / Path(rel_path)
         if content_type == SharepointContentType.SITEPAGE.value:
             # Update output extension to html if site page
             download_path = download_path.with_suffix(".html")
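The site-page branch retained above relies on Path.with_suffix to rename the download target, since SharePoint site pages are saved as HTML. A quick illustration with a made-up path:

    from pathlib import Path

    p = Path("/downloads/site/Home.aspx")
    print(p.with_suffix(".html"))  # /downloads/site/Home.html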