unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (44):
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/v2/cli/base/cmd.py +10 -0
  3. unstructured_ingest/v2/cli/base/src.py +2 -0
  4. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  5. unstructured_ingest/v2/cli/cmds/local.py +0 -8
  6. unstructured_ingest/v2/cli/configs/__init__.py +8 -1
  7. unstructured_ingest/v2/cli/configs/filter.py +28 -0
  8. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  9. unstructured_ingest/v2/interfaces/downloader.py +9 -3
  10. unstructured_ingest/v2/interfaces/file_data.py +6 -1
  11. unstructured_ingest/v2/interfaces/process.py +3 -4
  12. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  13. unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  14. unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  15. unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  16. unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  17. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  18. unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  19. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  20. unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  21. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -0
  22. unstructured_ingest/v2/processes/connectors/fsspec/box.py +8 -0
  23. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +8 -0
  24. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +30 -28
  25. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +8 -0
  26. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +21 -5
  27. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +8 -0
  28. unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  29. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  30. unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  31. unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  32. unstructured_ingest/v2/processes/connectors/opensearch.py +33 -5
  33. unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
  34. unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  35. unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  36. unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  37. unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  38. unstructured_ingest/v2/processes/filter.py +54 -0
  39. unstructured_ingest-0.0.3.dist-info/METADATA +175 -0
  40. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/RECORD +43 -40
  41. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  42. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.3.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     Uploader,
     UploaderConfig,
 )
+from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
 
 if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
             host=self.connection_config.host, **self.connection_config.access_config.to_dict()
         )
 
+    def precheck(self) -> None:
+        try:
+            assert self.client.current_user.me().active
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         for content in contents:
             with open(content.path, "rb") as elements_file:
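This is the first of many connectors in this release that gain a `precheck()` hook: validate connectivity once, up front, and raise a typed connection error instead of failing mid-pipeline (the +72 -2 to pipeline.py in the file list suggests the pipeline now invokes it before scheduling work). A minimal sketch of that contract, where `DummyClient` and `DummyUploader` are stand-ins, not library code:

```python
# Minimal sketch of the precheck contract; DummyClient/DummyUploader are
# illustrative stand-ins, not part of unstructured-ingest.
class DestinationConnectionError(Exception):
    pass

class DummyClient:
    def ping(self) -> bool:
        return True  # pretend the remote service answered

class DummyUploader:
    client = DummyClient()

    def precheck(self) -> None:
        # Fail fast with a typed connection error instead of mid-pipeline.
        try:
            assert self.client.ping()
        except Exception as e:
            raise DestinationConnectionError(f"failed to validate connection: {e}") from e

DummyUploader().precheck()  # raises DestinationConnectionError only on failure
```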
unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -7,10 +7,12 @@ from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
 class ElasticsearchIndexer(Indexer):
     connection_config: ElasticsearchConnectionConfig
     index_config: ElasticsearchIndexerConfig
-    client: "ElasticsearchClient" = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.connection_config.get_client()
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_scan(self):
@@ -138,8 +144,9 @@ class ElasticsearchIndexer(Indexer):
         scan = self.load_scan()
 
         scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+        client = self.connection_config.get_client()
         hits = scan(
-            self.client,
+            client,
             query=scan_query,
             scroll="1m",
            index=self.index_config.index_name,
@@ -168,7 +175,7 @@ class ElasticsearchIndexer(Indexer):
         yield FileData(
             identifier=identified,
             connector_type=CONNECTOR_TYPE,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                 date_processed=str(time()),
             ),
@@ -234,7 +241,7 @@ class ElasticsearchDownloader(Downloader):
         file_data=FileData(
             identifier=filename_id,
             connector_type=CONNECTOR_TYPE,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 version=str(result["_version"]) if "_version" in result else None,
                 date_processed=str(time()),
                 record_locator={
@@ -339,6 +346,13 @@ class ElasticsearchUploader(Uploader):
     upload_config: ElasticsearchUploaderConfig
     connection_config: ElasticsearchConnectionConfig
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["elasticsearch"], extras="elasticsearch")
     def load_parallel_bulk(self):
         from elasticsearch.helpers import parallel_bulk
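Note the indexer no longer pins a client on the dataclass at `__post_init__`; it asks the connection config for a fresh client wherever one is needed, keeping the step free of unpicklable connection state. A sketch of the scan call in isolation, using the real `elasticsearch.helpers.scan` API (host and index name are placeholders):

```python
# Fresh-client-per-use pattern: build the client where it is consumed
# instead of caching it on the dataclass. Host/index are placeholders.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

client = Elasticsearch("http://localhost:9200")
hits = scan(
    client,
    query={"stored_fields": [], "query": {"match_all": {}}},
    scroll="1m",
    index="my-index",
)
for hit in hits:
    print(hit["_id"])  # each hit carries only its id, no stored fields
```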
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -75,6 +75,10 @@ class AzureIndexer(FsspecIndexer):
     index_config: AzureIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    def precheck(self) -> None:
+        super().precheck()
+
     def sterilize_info(self, path) -> dict:
         info = self.fs.info(path=path)
         return sterilize_dict(data=info, default=azure_json_serial)
@@ -120,6 +124,10 @@ class AzureUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["adlfs", "fsspec"], extras="azure")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
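The azure, box, dropbox, gcs, and sftp connectors below all gain this same thin `precheck()` override, decorated so the optional driver package is verified before the base check runs. A hypothetical sketch of what a `requires_dependencies`-style decorator does; this is not the library's actual implementation, just the idea:

```python
# Hypothetical sketch of a requires_dependencies-style decorator: confirm the
# optional packages are importable, else point the user at the pip extra.
import functools
import importlib.util

def requires_dependencies(deps: list[str], extras: str):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            missing = [d for d in deps if importlib.util.find_spec(d) is None]
            if missing:
                raise ImportError(
                    f"Missing optional dependencies {missing}; install with "
                    f'`pip install "unstructured-ingest[{extras}]"`'
                )
            return fn(*args, **kwargs)
        return wrapper
    return decorator
```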
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -70,6 +70,10 @@ class BoxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["boxfs"], extras="box")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class BoxDownloaderConfig(FsspecDownloaderConfig):
@@ -107,6 +111,10 @@ class BoxUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["boxfs"], extras="box")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["boxfs"], extras="box")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -57,6 +57,10 @@ class DropboxIndexer(FsspecIndexer):
         if not self.index_config.path_without_protocol.startswith("/"):
             self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
@@ -106,6 +110,10 @@ class DropboxUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import contextlib
-import fnmatch
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@@ -9,10 +8,12 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
-from unstructured.documents.elements import DataSourceMetadata
-
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    DestinationConnectionError,
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -20,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -74,7 +76,6 @@ class FileConfig(Base):
 @dataclass
 class FsspecIndexerConfig(FileConfig, IndexerConfig):
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
 
 @dataclass
@@ -109,17 +110,7 @@ class FsspecIndexer(Indexer):
             **self.connection_config.get_access_config(),
         )
 
-    def does_path_match_glob(self, path: str) -> bool:
-        if self.index_config.file_glob is None:
-            return True
-        patterns = self.index_config.file_glob
-        for pattern in patterns:
-            if fnmatch.filter([path], pattern):
-                return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-        return False
-
-    def check_connection(self):
+    def precheck(self) -> None:
         from fsspec import get_filesystem_class
 
         try:
@@ -157,10 +148,10 @@ class FsspecIndexer(Indexer):
         else:
             raise TypeError(f"unhandled response type from find: {type(found)}")
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
-
+        file_size = None
         try:
             created: Optional[Any] = self.fs.created(path)
             if created:
@@ -180,6 +171,8 @@ class FsspecIndexer(Indexer):
             date_modified = str(modified)
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = self.fs.checksum(path)
         metadata: dict[str, str] = {}
@@ -189,15 +182,19 @@ class FsspecIndexer(Indexer):
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
         }
+        file_stat = self.fs.stat(path=path)
+        if file_id := file_stat.get("id"):
+            record_locator["file_id"] = file_id
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=str(version),
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     def sterilize_info(self, path) -> dict:
@@ -205,8 +202,7 @@ class FsspecIndexer(Indexer):
         return sterilize_dict(data=info)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        raw_files = self.list_files()
-        files = [f for f in raw_files if self.does_path_match_glob(f)]
+        files = self.list_files()
         for file in files:
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
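The glob matching removed from the indexer does not disappear from the release: the file list adds a dedicated filter step (v2/processes/filter.py, v2/pipeline/steps/filter.py, cli/configs/filter.py). Its exact API is not shown in this diff, so the following is only a hypothetical sketch of a standalone glob filterer rebuilt from the deleted `does_path_match_glob` logic; the `GlobFilterer` name and `is_processable` method are illustrative:

```python
# Hypothetical sketch of a standalone glob filter step; names are invented,
# only the matching logic mirrors the removed does_path_match_glob.
import fnmatch
from dataclasses import dataclass
from typing import Optional

@dataclass
class GlobFilterer:
    file_glob: Optional[list[str]] = None

    def is_processable(self, path: str) -> bool:
        # Keep the file if no globs are configured or any pattern matches.
        if self.file_glob is None:
            return True
        return any(fnmatch.fnmatch(path, pattern) for pattern in self.file_glob)

assert GlobFilterer(["*.pdf", "*.docx"]).is_processable("docs/report.pdf")
assert not GlobFilterer(["*.pdf"]).is_processable("docs/report.txt")
```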
@@ -255,13 +251,6 @@ class FsspecDownloader(Downloader):
             **self.connection_config.get_access_config(),
         )
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        return (
-            self.download_dir / Path(file_data.source_identifiers.relative_path)
-            if self.download_config
-            else Path(file_data.source_identifiers.rel_path)
-        )
-
     def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
@@ -315,6 +304,19 @@ class FsspecUploader(Uploader):
             f"missing 1 required positional argument: 'upload_config'"
         )
 
+    def precheck(self) -> None:
+        from fsspec import get_filesystem_class
+
+        try:
+            fs = get_filesystem_class(self.upload_config.protocol)(
+                **self.connection_config.get_access_config(),
+            )
+            root_dir = self.upload_config.path_without_protocol.split("/")[0]
+            fs.ls(path=root_dir, detail=False)
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def get_upload_path(self, file_data: FileData) -> Path:
         upload_path = (
             Path(self.upload_config.path_without_protocol)
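What the uploader precheck exercises, in isolation: resolve an fsspec implementation by protocol name, instantiate it with the access config, and list the bucket root so a bad credential or unreachable endpoint fails immediately. The bucket name and anonymous access below are placeholders:

```python
# Standalone version of the fsspec connectivity check; bucket and anon access
# are placeholders for real access-config values.
from fsspec import get_filesystem_class

fs_cls = get_filesystem_class("s3")      # resolves to s3fs.S3FileSystem
fs = fs_cls(anon=True)                   # real credentials would go here
print(fs.ls("my-bucket", detail=False))  # raises if the bucket is unreachable
```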
unstructured_ingest/v2/processes/connectors/fsspec/gcs.py

@@ -80,6 +80,10 @@ class GcsIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class GcsDownloaderConfig(FsspecDownloaderConfig):
@@ -117,6 +121,10 @@ class GcsUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["gcsfs", "fsspec"], extras="gcs")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
+from unstructured.utils import requires_dependencies
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+from unstructured_ingest.v2.interfaces import (
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    UploadContent,
+)
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> DataSourceMetadata:
+    def get_metadata(self, path: str) -> FileDataSourceMetadata:
         date_created = None
         date_modified = None
+        file_size = None
         try:
             modified: Optional[datetime] = self.fs.modified(path)
             if modified:
@@ -76,6 +81,8 @@ class S3Indexer(FsspecIndexer):
             date_modified = str(modified.timestamp())
         except NotImplementedError:
             pass
+        with contextlib.suppress(AttributeError):
+            file_size = self.fs.size(path)
 
         version = None
         info: dict[str, Any] = self.fs.info(path)
@@ -90,19 +97,24 @@ class S3Indexer(FsspecIndexer):
         }
         if metadata:
             record_locator["metadata"] = metadata
-        return DataSourceMetadata(
+        return FileDataSourceMetadata(
             date_created=date_created,
             date_modified=date_modified,
             date_processed=str(time()),
             version=version,
             url=f"{self.index_config.protocol}://{path}",
             record_locator=record_locator,
+            filesize_bytes=file_size,
         )
 
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class S3DownloaderConfig(FsspecDownloaderConfig):
@@ -136,6 +148,10 @@ class S3Uploader(FsspecUploader):
     connection_config: S3ConnectionConfig
     upload_config: S3UploaderConfig = field(default=None)
 
+    @requires_dependencies(["s3fs", "fsspec"], extras="s3")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["s3fs", "fsspec"], extras="s3")
     def __post_init__(self):
         super().__post_init__()
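With `filesize_bytes` added, the metadata an S3 file yields now looks roughly like the sketch below. Every value is illustrative; only the field names come from the diff (`FileDataSourceMetadata` replaces the `DataSourceMetadata` previously imported from unstructured):

```python
# Illustrative shape of per-file metadata after this release; all values
# are made up, the field names are taken from the diff above.
from unstructured_ingest.v2.interfaces import FileDataSourceMetadata

metadata = FileDataSourceMetadata(
    date_created=None,                     # S3 often cannot report creation time
    date_modified="1714070400.0",          # epoch seconds, as a string
    date_processed="1714070460.123",
    version="abc123etag",
    url="s3://my-bucket/docs/report.pdf",
    record_locator={"protocol": "s3", "remote_file_path": "s3://my-bucket/docs/"},
    filesize_bytes=204800,                 # new field, from fs.size(path)
)
```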
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py

@@ -91,6 +91,10 @@ class SftpIndexer(FsspecIndexer):
             file.identifier = new_identifier
             yield file
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
 
 @dataclass
 class SftpDownloaderConfig(FsspecDownloaderConfig):
@@ -142,6 +146,10 @@ class SftpUploader(FsspecUploader):
     def __post_init__(self):
         super().__post_init__()
 
+    @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
+    def precheck(self) -> None:
+        super().precheck()
+
     @requires_dependencies(["paramiko", "fsspec"], extras="sftp")
     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
         return super().run(contents=contents, **kwargs)
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,15 +1,16 @@
 import io
 import os
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional, Union
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
-from unstructured_ingest.error import SourceConnectionNetworkError
+from unstructured_ingest.error import (
+    SourceConnectionError,
+    SourceConnectionNetworkError,
+)
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
 from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
     Downloader,
     DownloaderConfig,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
         ]
     )
 
+    def precheck(self) -> None:
+        try:
+            self.connection_config.get_files_service()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     @staticmethod
     def is_dir(record: dict) -> bool:
         return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@ class GoogleDriveIndexer(Indexer):
             connector_type=CONNECTOR_TYPE,
             identifier=file_id,
             source_identifiers=source_identifiers,
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=url,
                 version=version,
                 date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
     )
     connector_type: str = CONNECTOR_TYPE
 
-    def get_download_path(self, file_data: FileData) -> Path:
-        rel_path = file_data.source_identifiers.relative_path
-        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-        return self.download_dir / Path(rel_path)
-
     @SourceConnectionNetworkError.wrap
     def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
         downloaded = False
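Both the fsspec and Google Drive connectors delete their local `get_download_path` overrides in this release; the +9 -3 to interfaces/downloader.py in the file list suggests the logic was consolidated into the shared Downloader base. A hypothetical sketch of such a shared default, modeled on the deleted code (the free function is illustrative, not the library's actual method):

```python
# Hypothetical consolidation of the deleted get_download_path overrides:
# join the file's relative path under the download directory, normalizing
# any leading slash so the result stays inside download_dir.
from pathlib import Path

def get_download_path(download_dir: Path, relative_path: str) -> Path:
    rel_path = relative_path.lstrip("/")
    return download_dir / Path(rel_path)

assert get_download_path(Path("/tmp/dl"), "/a/b.txt") == Path("/tmp/dl/a/b.txt")
```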
unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,9 @@
 import glob
-import itertools
 import shutil
 from dataclasses import dataclass, field
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Optional
-
-from unstructured.documents.elements import DataSourceMetadata
+from typing import Any, Generator
 
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
 class LocalIndexerConfig(IndexerConfig):
     input_path: str
     recursive: bool = False
-    file_glob: Optional[list[str]] = None
 
     @property
     def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-        glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
-        if not self.index_config.file_glob:
-            return list(glob_fn("*"))
-        return list(
-            itertools.chain.from_iterable(
-                glob_fn(pattern) for pattern in self.index_config.file_glob
-            )
-        )
+        if self.index_config.recursive:
+            return list(input_path.rglob("*"))
+        return list(input_path.glob("*"))
 
-    def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+    def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
         try:
             date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@ class LocalIndexer(Indexer):
         except Exception as e:
             logger.warning(f"Couldn't detect file mode: {e}")
             permissions_data = None
-        return DataSourceMetadata(
+
+        try:
+            filesize_bytes = stats.st_size
+        except Exception as e:
+            logger.warning(f"Couldn't detect file size: {e}")
+            filesize_bytes = None
+
+        return FileDataSourceMetadata(
            date_modified=date_modified,
            date_created=date_created,
            date_processed=str(time()),
            permissions_data=permissions_data,
            record_locator={"path": str(path.resolve())},
+           filesize_bytes=filesize_bytes,
         )
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
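The simplified local listing, in isolation: pathlib's `glob("*")` lists one directory level while `rglob("*")` walks the whole tree, and per-pattern glob selection now belongs to the new filter step rather than the indexer. The directory below is a placeholder:

```python
# Standalone version of the simplified listing logic; the input directory
# is a placeholder.
from pathlib import Path

input_path = Path("/tmp/ingest-input")
recursive = True

files = list(input_path.rglob("*")) if recursive else list(input_path.glob("*"))
print([p.name for p in files if p.is_file()])
```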
unstructured_ingest/v2/processes/connectors/mongodb.py

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, Optional
 from unstructured.__version__ import __version__ as unstructured_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -85,11 +86,15 @@ class MongoDBUploaderConfig(UploaderConfig):
 class MongoDBUploader(Uploader):
     upload_config: MongoDBUploaderConfig
     connection_config: MongoDBConnectionConfig
-    client: Optional["MongoClient"] = field(init=False)
     connector_type: str = CONNECTOR_TYPE
 
-    def __post_init__(self):
-        self.client = self.create_client()
+    def precheck(self) -> None:
+        try:
+            client = self.create_client()
+            client.admin.command("ping")
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pymongo"], extras="mongodb")
     def create_client(self) -> "MongoClient":
@@ -123,7 +128,8 @@ class MongoDBUploader(Uploader):
             f"collection {self.connection_config.collection} "
             f"at {self.connection_config.host}",
         )
-        db = self.client[self.connection_config.database]
+        client = self.create_client()
+        db = client[self.connection_config.database]
         collection = db[self.connection_config.collection]
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             collection.insert_many(chunk)
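The MongoDB precheck's ping, in isolation: pymongo's admin `ping` command is the standard cheap round-trip to confirm the server is reachable and the credentials work. The URI is a placeholder:

```python
# Standalone version of the connection check used by the new precheck;
# the URI and timeout are placeholders.
from pymongo import MongoClient
from pymongo.errors import PyMongoError

client = MongoClient("mongodb://localhost:27017", serverSelectionTimeoutMS=5000)
try:
    client.admin.command("ping")  # raises if the server cannot be reached
    print("connection ok")
except PyMongoError as e:
    print(f"failed to validate connection: {e}")
```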
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -5,7 +5,6 @@ from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from dateutil import parser
-from unstructured.documents.elements import DataSourceMetadata
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -17,6 +16,7 @@ from unstructured_ingest.v2.interfaces import (
     DownloaderConfig,
     DownloadResponse,
     FileData,
+    FileDataSourceMetadata,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
@@ -87,6 +87,18 @@ class OnedriveIndexer(Indexer):
     connection_config: OnedriveConnectionConfig
     index_config: OnedriveIndexerConfig
 
+    def precheck(self) -> None:
+        try:
+            token_resp: dict = self.connection_config.get_token()
+            if error := token_resp.get("error"):
+                raise SourceConnectionError(
+                    "{} ({})".format(error, token_resp.get("error_description"))
+                )
+            self.connection_config.get_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise SourceConnectionError(f"failed to validate connection: {e}")
+
     def list_objects(self, folder, recursive) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
@@ -136,7 +148,7 @@ class OnedriveIndexer(Indexer):
             source_identifiers=SourceIdentifiers(
                 fullpath=server_path, filename=drive_item.name, rel_path=rel_path
             ),
-            metadata=DataSourceMetadata(
+            metadata=FileDataSourceMetadata(
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
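Note the OneDrive precheck inspects the token response dict before trusting it: MSAL-style token acquisition reports failure as a dict carrying "error" and "error_description" keys rather than raising. A minimal standalone version of that check (the `validate_token` helper is illustrative, not library code):

```python
# Standalone version of the token-dict check in the OneDrive precheck;
# validate_token is an illustrative helper, not part of the library.
class SourceConnectionError(Exception):
    pass

def validate_token(token_resp: dict) -> None:
    # MSAL-style responses signal failure via an "error" key, not an exception.
    if error := token_resp.get("error"):
        raise SourceConnectionError(
            "{} ({})".format(error, token_resp.get("error_description"))
        )

validate_token({"access_token": "..."})  # ok, no error key present
# validate_token({"error": "invalid_client", "error_description": "bad secret"})
# -> SourceConnectionError: invalid_client (bad secret)
```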