unstructured-ingest 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl

Files changed (44)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/connector/notion/helpers.py +1 -1
  3. unstructured_ingest/logger.py +2 -2
  4. unstructured_ingest/v2/cli/base/cmd.py +10 -0
  5. unstructured_ingest/v2/cli/base/src.py +2 -0
  6. unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
  7. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
  8. unstructured_ingest/v2/cli/cmds/local.py +0 -8
  9. unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
  10. unstructured_ingest/v2/cli/configs/__init__.py +8 -1
  11. unstructured_ingest/v2/cli/configs/filter.py +28 -0
  12. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  13. unstructured_ingest/v2/interfaces/downloader.py +9 -3
  14. unstructured_ingest/v2/interfaces/file_data.py +6 -1
  15. unstructured_ingest/v2/interfaces/process.py +3 -0
  16. unstructured_ingest/v2/logger.py +1 -1
  17. unstructured_ingest/v2/pipeline/interfaces.py +3 -1
  18. unstructured_ingest/v2/pipeline/pipeline.py +72 -2
  19. unstructured_ingest/v2/pipeline/steps/download.py +77 -13
  20. unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
  21. unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
  22. unstructured_ingest/v2/processes/connectors/astra.py +8 -0
  23. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
  24. unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
  25. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
  26. unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
  27. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
  28. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
  29. unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
  30. unstructured_ingest/v2/processes/connectors/local.py +15 -15
  31. unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
  32. unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
  33. unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
  34. unstructured_ingest/v2/processes/connectors/pinecone.py +10 -7
  35. unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
  36. unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
  37. unstructured_ingest/v2/processes/connectors/sql.py +24 -9
  38. unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
  39. unstructured_ingest/v2/processes/filter.py +54 -0
  40. {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +16 -14
  41. {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +44 -39
  42. {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
  43. {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
  44. {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/filter.py

@@ -0,0 +1,40 @@
+ import asyncio
+ from dataclasses import dataclass
+ from typing import Callable, Optional
+
+ from unstructured_ingest.v2.interfaces.file_data import FileData
+ from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+ from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+ from unstructured_ingest.v2.processes.filter import Filterer
+
+ STEP_ID = "filter"
+
+
+ @dataclass
+ class FilterStep(PipelineStep):
+     process: Filterer
+     identifier: str = STEP_ID
+
+     def __post_init__(self):
+         config = (
+             sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+             if self.process.config
+             else None
+         )
+         logger.info(f"Created {self.identifier} with configs: {config}")
+
+     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+         file_data = FileData.from_file(path=file_data_path)
+         fn_kwargs = {"file_data": file_data}
+         if not asyncio.iscoroutinefunction(fn):
+             resp = fn(**fn_kwargs)
+         elif semaphore := self.context.semaphore:
+             async with semaphore:
+                 resp = await fn(**fn_kwargs)
+         else:
+             resp = await fn(**fn_kwargs)
+
+         if resp:
+             return {"file_data_path": file_data_path}
+         return None
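The step only drops a file when the wrapped Filterer returns a falsey value for its FileData; the Filterer itself lives in unstructured_ingest/v2/processes/filter.py (+54 lines, not shown in this diff). Since file_glob is removed from the fsspec and local indexer configs later in this diff, glob matching presumably moves into that filter process. A minimal sketch of what such a glob-based filterer could look like; the names GlobFilterConfig and GlobFilterer are illustrative, not the package's API:

    import fnmatch
    from dataclasses import dataclass, field
    from typing import Optional

    from unstructured_ingest.v2.interfaces.file_data import FileData


    @dataclass
    class GlobFilterConfig:
        # Glob patterns matched against the file's full path, e.g. ["**/*.pdf"].
        file_glob: Optional[list[str]] = None


    @dataclass
    class GlobFilterer:
        config: GlobFilterConfig = field(default_factory=GlobFilterConfig)

        def run(self, file_data: FileData, **kwargs) -> bool:
            if not self.config.file_glob:
                return True  # no patterns configured: keep everything
            path = file_data.source_identifiers.fullpath
            # FilterStep treats a falsey return as "drop this file from the pipeline".
            return any(fnmatch.fnmatch(path, pattern) for pattern in self.config.file_glob)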
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -1,7 +1,6 @@
  from __future__ import annotations

- import unstructured.ingest.v2.processes.connectors.fsspec # noqa: F401
-
+ import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
  from unstructured_ingest.v2.processes.connector_registry import (
      add_destination_entry,
      add_source_entry,
@@ -19,6 +18,8 @@ from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
  from .google_drive import google_drive_source_entry
  from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
  from .local import local_destination_entry, local_source_entry
+ from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
+ from .milvus import milvus_destination_entry
  from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
  from .mongodb import mongodb_destination_entry
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
@@ -75,3 +76,4 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_
  add_destination_entry(
      destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
  )
+ add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
unstructured_ingest/v2/processes/connectors/astra.py

@@ -7,6 +7,7 @@ from unstructured import __name__ as integration_name
  from unstructured.__version__ import __version__ as integration_version

  from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.utils.data_prep import batch_generator
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
@@ -94,6 +95,13 @@ class AstraUploader(Uploader):
      upload_config: AstraUploaderConfig
      connector_type: str = CONNECTOR_TYPE

+     def precheck(self) -> None:
+         try:
+             self.get_collection()
+         except Exception as e:
+             logger.error(f"Failed to validate connection {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
      @requires_dependencies(["astrapy"], extras="astra")
      def get_collection(self) -> "AstraDBCollection":
          from astrapy.db import AstraDB
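The same precheck() pattern is added to most of the v2 connectors below (Azure Cognitive Search, Chroma, Databricks Volumes, Elasticsearch, Google Drive, fsspec), and in several of them it replaces eager client creation in __post_init__. The calling side is presumably part of the pipeline.py changes (+72/-2, not shown here); a hedged sketch, assuming the pipeline simply calls the hook on each configured connector before moving any documents:

    def check_connections(*connectors) -> None:
        # Each connector may expose precheck(), which raises SourceConnectionError or
        # DestinationConnectionError when credentials or endpoints are unusable, so a
        # misconfigured run fails up front instead of partway through processing.
        for connector in connectors:
            precheck = getattr(connector, "precheck", None)
            if precheck is not None:
                precheck()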
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py

@@ -175,6 +175,14 @@ class AzureCognitiveSearchUploader(Uploader):
          ),
      )

+     def precheck(self) -> None:
+         try:
+             client = self.connection_config.generate_client()
+             client.get_document_count()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
      def write_dict_wrapper(self, elements_dict):
          return self.write_dict(elements_dict=elements_dict)
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -111,10 +111,13 @@ class ChromaUploader(Uploader):
      connector_type: str = CONNECTOR_TYPE
      upload_config: ChromaUploaderConfig
      connection_config: ChromaConnectionConfig
-     client: Optional["Client"] = field(init=False)

-     def __post_init__(self):
-         self.client = self.create_client()
+     def precheck(self) -> None:
+         try:
+             self.create_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")

      @requires_dependencies(["chromadb"], extras="chroma")
      def create_client(self) -> "Client":
@@ -187,10 +190,9 @@
              f"collection {self.connection_config.collection_name} "
              f"at {self.connection_config.host}",
          )
+         client = self.create_client()

-         collection = self.client.get_or_create_collection(
-             name=self.connection_config.collection_name
-         )
+         collection = client.get_or_create_collection(name=self.connection_config.collection_name)
          for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
              self.upsert_batch(collection, self.prepare_chroma_list(chunk))
unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -3,6 +3,7 @@ from dataclasses import dataclass, field
  from typing import TYPE_CHECKING, Any, Optional

  from unstructured_ingest.enhanced_dataclass import enhanced_field
+ from unstructured_ingest.error import DestinationConnectionError
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
@@ -11,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
      Uploader,
      UploaderConfig,
  )
+ from unstructured_ingest.v2.logger import logger
  from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry

  if TYPE_CHECKING:
@@ -78,6 +80,13 @@ class DatabricksVolumesUploader(Uploader):
              host=self.connection_config.host, **self.connection_config.access_config.to_dict()
          )

+     def precheck(self) -> None:
+         try:
+             assert self.client.current_user.me().active
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
      def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
          for content in contents:
              with open(content.path, "rb") as elements_file:
unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -7,10 +7,12 @@ from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, Any, Generator, Optional

- from unstructured.documents.elements import DataSourceMetadata
-
  from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
- from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
+ from unstructured_ingest.error import (
+     DestinationConnectionError,
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
  from unstructured_ingest.utils.data_prep import flatten_dict, generator_batching_wbytes
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces import (
@@ -20,6 +22,7 @@ from unstructured_ingest.v2.interfaces import (
      DownloaderConfig,
      DownloadResponse,
      FileData,
+     FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
      UploadContent,
@@ -121,11 +124,14 @@ class ElasticsearchIndexerConfig(IndexerConfig):
  class ElasticsearchIndexer(Indexer):
      connection_config: ElasticsearchConnectionConfig
      index_config: ElasticsearchIndexerConfig
-     client: "ElasticsearchClient" = field(init=False)
      connector_type: str = CONNECTOR_TYPE

-     def __post_init__(self):
-         self.client = self.connection_config.get_client()
+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")

      @requires_dependencies(["elasticsearch"], extras="elasticsearch")
      def load_scan(self):
@@ -138,8 +144,9 @@
          scan = self.load_scan()

          scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}}
+         client = self.connection_config.get_client()
          hits = scan(
-             self.client,
+             client,
              query=scan_query,
              scroll="1m",
              index=self.index_config.index_name,
@@ -168,7 +175,7 @@
              yield FileData(
                  identifier=identified,
                  connector_type=CONNECTOR_TYPE,
-                 metadata=DataSourceMetadata(
+                 metadata=FileDataSourceMetadata(
                      url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
                      date_processed=str(time()),
                  ),
@@ -234,7 +241,7 @@
              file_data=FileData(
                  identifier=filename_id,
                  connector_type=CONNECTOR_TYPE,
-                 metadata=DataSourceMetadata(
+                 metadata=FileDataSourceMetadata(
                      version=str(result["_version"]) if "_version" in result else None,
                      date_processed=str(time()),
                      record_locator={
@@ -339,6 +346,13 @@
      upload_config: ElasticsearchUploaderConfig
      connection_config: ElasticsearchConnectionConfig

+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_client()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise DestinationConnectionError(f"failed to validate connection: {e}")
+
      @requires_dependencies(["elasticsearch"], extras="elasticsearch")
      def load_parallel_bulk(self):
          from elasticsearch.helpers import parallel_bulk
unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,14 +1,12 @@
  from __future__ import annotations

  import contextlib
- import fnmatch
  from dataclasses import dataclass, field
  from datetime import datetime
  from pathlib import Path
  from time import time
  from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
-
- from unstructured.documents.elements import DataSourceMetadata
+ from uuid import NAMESPACE_DNS, uuid5

  from unstructured_ingest.enhanced_dataclass import enhanced_field
  from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
@@ -19,6 +17,7 @@ from unstructured_ingest.v2.interfaces import (
      DownloaderConfig,
      DownloadResponse,
      FileData,
+     FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
      SourceIdentifiers,
@@ -73,7 +72,6 @@ class FileConfig(Base):
  @dataclass
  class FsspecIndexerConfig(FileConfig, IndexerConfig):
      recursive: bool = False
-     file_glob: Optional[list[str]] = None


  @dataclass
@@ -108,17 +106,7 @@ class FsspecIndexer(Indexer):
              **self.connection_config.get_access_config(),
          )

-     def does_path_match_glob(self, path: str) -> bool:
-         if self.index_config.file_glob is None:
-             return True
-         patterns = self.index_config.file_glob
-         for pattern in patterns:
-             if fnmatch.filter([path], pattern):
-                 return True
-         logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
-         return False
-
-     def check_connection(self):
+     def precheck(self) -> None:
          from fsspec import get_filesystem_class

          try:
@@ -156,10 +144,10 @@
          else:
              raise TypeError(f"unhandled response type from find: {type(found)}")

-     def get_metadata(self, path: str) -> DataSourceMetadata:
+     def get_metadata(self, path: str) -> FileDataSourceMetadata:
          date_created = None
          date_modified = None
-
+         file_size = None
          try:
              created: Optional[Any] = self.fs.created(path)
              if created:
@@ -179,6 +167,8 @@
                  date_modified = str(modified)
          except NotImplementedError:
              pass
+         with contextlib.suppress(AttributeError):
+             file_size = self.fs.size(path)

          version = self.fs.checksum(path)
          metadata: dict[str, str] = {}
@@ -188,15 +178,19 @@
              "protocol": self.index_config.protocol,
              "remote_file_path": self.index_config.remote_url,
          }
+         file_stat = self.fs.stat(path=path)
+         if file_id := file_stat.get("id"):
+             record_locator["file_id"] = file_id
          if metadata:
              record_locator["metadata"] = metadata
-         return DataSourceMetadata(
+         return FileDataSourceMetadata(
              date_created=date_created,
              date_modified=date_modified,
              date_processed=str(time()),
              version=str(version),
              url=f"{self.index_config.protocol}://{path}",
              record_locator=record_locator,
+             filesize_bytes=file_size,
          )

      def sterilize_info(self, path) -> dict:
@@ -204,14 +198,16 @@
          return sterilize_dict(data=info)

      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-         raw_files = self.list_files()
-         files = [f for f in raw_files if self.does_path_match_glob(f)]
+         files = self.list_files()
          for file in files:
              # Note: we remove any remaining leading slashes (Box introduces these)
              # to get a valid relative path
              rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+
+             additional_metadata = self.sterilize_info(path=file)
+             additional_metadata["original_file_path"] = file
              yield FileData(
-                 identifier=file,
+                 identifier=str(uuid5(NAMESPACE_DNS, file)),
                  connector_type=self.connector_type,
                  source_identifiers=SourceIdentifiers(
                      filename=Path(file).name,
@@ -219,7 +215,7 @@
                      fullpath=file,
                  ),
                  metadata=self.get_metadata(path=file),
-                 additional_metadata=self.sterilize_info(path=file),
+                 additional_metadata=additional_metadata,
              )


@@ -251,18 +247,12 @@
              **self.connection_config.get_access_config(),
          )

-     def get_download_path(self, file_data: FileData) -> Path:
-         return (
-             self.download_dir / Path(file_data.source_identifiers.relative_path)
-             if self.download_config
-             else Path(file_data.source_identifiers.rel_path)
-         )
-
      def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
          download_path = self.get_download_path(file_data=file_data)
          download_path.parent.mkdir(parents=True, exist_ok=True)
          try:
-             self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+             rpath = file_data.additional_metadata["original_file_path"]
+             self.fs.get(rpath=rpath, lpath=download_path.as_posix())
          except Exception as e:
              logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
              raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
@@ -272,7 +262,8 @@
          download_path = self.get_download_path(file_data=file_data)
          download_path.parent.mkdir(parents=True, exist_ok=True)
          try:
-             await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix())
+             rpath = file_data.additional_metadata["original_file_path"]
+             await self.fs.get(rpath=rpath, lpath=download_path.as_posix())
          except Exception as e:
              logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True)
              raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}")
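Two behavioral changes stand out in the fsspec base connector: the indexer's FileData identifier is now a deterministic UUIDv5 hash of the remote path rather than the path itself, and the raw path travels alongside in additional_metadata["original_file_path"] so the downloader can still locate the object. A small standalone illustration; the bucket path below is made up:

    from uuid import NAMESPACE_DNS, uuid5

    remote_path = "my-bucket/docs/report.pdf"  # illustrative remote path

    # The same input always yields the same identifier, so re-indexing produces
    # stable FileData ids that are also safe to use as local filenames.
    identifier = str(uuid5(NAMESPACE_DNS, remote_path))
    assert identifier == str(uuid5(NAMESPACE_DNS, remote_path))

    # The downloader no longer treats the identifier as a path; it reads the
    # original path back out of additional_metadata instead.
    additional_metadata = {"original_file_path": remote_path}
    rpath = additional_metadata["original_file_path"]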
unstructured_ingest/v2/processes/connectors/fsspec/s3.py

@@ -5,11 +5,15 @@ from pathlib import Path
  from time import time
  from typing import Any, Generator, Optional

- from unstructured.documents.elements import DataSourceMetadata
+ from unstructured.utils import requires_dependencies

  from unstructured_ingest.enhanced_dataclass import enhanced_field
- from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, UploadContent
+ from unstructured_ingest.v2.interfaces import (
+     DownloadResponse,
+     FileData,
+     FileDataSourceMetadata,
+     UploadContent,
+ )
  from unstructured_ingest.v2.processes.connector_registry import (
      DestinationRegistryEntry,
      SourceRegistryEntry,
@@ -66,9 +70,10 @@ class S3Indexer(FsspecIndexer):
      index_config: S3IndexerConfig
      connector_type: str = CONNECTOR_TYPE

-     def get_metadata(self, path: str) -> DataSourceMetadata:
+     def get_metadata(self, path: str) -> FileDataSourceMetadata:
          date_created = None
          date_modified = None
+         file_size = None
          try:
              modified: Optional[datetime] = self.fs.modified(path)
              if modified:
@@ -76,6 +81,8 @@
                  date_modified = str(modified.timestamp())
          except NotImplementedError:
              pass
+         with contextlib.suppress(AttributeError):
+             file_size = self.fs.size(path)

          version = None
          info: dict[str, Any] = self.fs.info(path)
@@ -90,13 +97,14 @@
          }
          if metadata:
              record_locator["metadata"] = metadata
-         return DataSourceMetadata(
+         return FileDataSourceMetadata(
              date_created=date_created,
              date_modified=date_modified,
              date_processed=str(time()),
              version=version,
              url=f"{self.index_config.protocol}://{path}",
              record_locator=record_locator,
+             filesize_bytes=file_size,
          )

      @requires_dependencies(["s3fs", "fsspec"], extras="s3")
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -1,15 +1,16 @@
  import io
  import os
  from dataclasses import dataclass, field
- from pathlib import Path
  from typing import TYPE_CHECKING, Any, Generator, Optional, Union

  from dateutil import parser
- from unstructured.documents.elements import DataSourceMetadata
  from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES

  from unstructured_ingest.enhanced_dataclass import enhanced_field
- from unstructured_ingest.error import SourceConnectionNetworkError
+ from unstructured_ingest.error import (
+     SourceConnectionError,
+     SourceConnectionNetworkError,
+ )
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.utils.string_and_date_utils import json_to_dict
  from unstructured_ingest.v2.interfaces import (
@@ -18,6 +19,7 @@ from unstructured_ingest.v2.interfaces import (
      Downloader,
      DownloaderConfig,
      FileData,
+     FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
      SourceIdentifiers,
@@ -121,6 +123,13 @@ class GoogleDriveIndexer(Indexer):
          ]
      )

+     def precheck(self) -> None:
+         try:
+             self.connection_config.get_files_service()
+         except Exception as e:
+             logger.error(f"failed to validate connection: {e}", exc_info=True)
+             raise SourceConnectionError(f"failed to validate connection: {e}")
+
      @staticmethod
      def is_dir(record: dict) -> bool:
          return record.get("mimeType") == "application/vnd.google-apps.folder"
@@ -155,7 +164,7 @@
              connector_type=CONNECTOR_TYPE,
              identifier=file_id,
              source_identifiers=source_identifiers,
-             metadata=DataSourceMetadata(
+             metadata=FileDataSourceMetadata(
                  url=url,
                  version=version,
                  date_created=str(date_created_dt.timestamp()),
@@ -272,11 +281,6 @@ class GoogleDriveDownloader(Downloader):
      )
      connector_type: str = CONNECTOR_TYPE

-     def get_download_path(self, file_data: FileData) -> Path:
-         rel_path = file_data.source_identifiers.relative_path
-         rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
-         return self.download_dir / Path(rel_path)
-
      @SourceConnectionNetworkError.wrap
      def _get_content(self, downloader: "MediaIoBaseDownload") -> bool:
          downloaded = False
unstructured_ingest/v2/processes/connectors/local.py

@@ -1,12 +1,9 @@
  import glob
- import itertools
  import shutil
  from dataclasses import dataclass, field
  from pathlib import Path
  from time import time
- from typing import Any, Generator, Optional
-
- from unstructured.documents.elements import DataSourceMetadata
+ from typing import Any, Generator

  from unstructured_ingest.v2.interfaces import (
      AccessConfig,
@@ -15,6 +12,7 @@ from unstructured_ingest.v2.interfaces import (
      DownloaderConfig,
      DownloadResponse,
      FileData,
+     FileDataSourceMetadata,
      Indexer,
      IndexerConfig,
      SourceIdentifiers,
@@ -45,7 +43,6 @@ class LocalConnectionConfig(ConnectionConfig):
  class LocalIndexerConfig(IndexerConfig):
      input_path: str
      recursive: bool = False
-     file_glob: Optional[list[str]] = None

      @property
      def path(self) -> Path:
@@ -64,16 +61,11 @@ class LocalIndexer(Indexer):
          input_path = self.index_config.path
          if input_path.is_file():
              return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
-         glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob
-         if not self.index_config.file_glob:
-             return list(glob_fn("*"))
-         return list(
-             itertools.chain.from_iterable(
-                 glob_fn(pattern) for pattern in self.index_config.file_glob
-             )
-         )
+         if self.index_config.recursive:
+             return list(input_path.rglob("*"))
+         return list(input_path.glob("*"))

-     def get_file_metadata(self, path: Path) -> DataSourceMetadata:
+     def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
          stats = path.stat()
          try:
              date_modified = str(stats.st_mtime)
@@ -93,12 +85,20 @@
          except Exception as e:
              logger.warning(f"Couldn't detect file mode: {e}")
              permissions_data = None
-         return DataSourceMetadata(
+
+         try:
+             filesize_bytes = stats.st_size
+         except Exception as e:
+             logger.warning(f"Couldn't detect file size: {e}")
+             filesize_bytes = None
+
+         return FileDataSourceMetadata(
              date_modified=date_modified,
              date_created=date_created,
              date_processed=str(time()),
              permissions_data=permissions_data,
              record_locator={"path": str(path.resolve())},
+             filesize_bytes=filesize_bytes,
          )

      def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
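A change that runs through nearly every connector above is the swap from unstructured.documents.elements.DataSourceMetadata to the ingest-local FileDataSourceMetadata, which carries the new filesize_bytes field that the indexers now populate. A minimal sketch of constructing it the way the local indexer does; the example file is created here only so stat() succeeds, and st_ctime stands in for the guarded date_created lookup in the real code:

    from pathlib import Path
    from time import time

    from unstructured_ingest.v2.interfaces import FileDataSourceMetadata

    path = Path("example.txt")
    path.write_text("hello")  # create a small file so stat() has something to read

    stats = path.stat()
    metadata = FileDataSourceMetadata(
        date_modified=str(stats.st_mtime),
        date_created=str(stats.st_ctime),  # the real indexer wraps these in try/except
        date_processed=str(time()),
        record_locator={"path": str(path.resolve())},
        filesize_bytes=stats.st_size,
    )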