unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (41)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/embed/bedrock.py +56 -19
  3. unstructured_ingest/embed/huggingface.py +22 -22
  4. unstructured_ingest/embed/interfaces.py +11 -4
  5. unstructured_ingest/embed/mixedbreadai.py +17 -17
  6. unstructured_ingest/embed/octoai.py +7 -7
  7. unstructured_ingest/embed/openai.py +15 -20
  8. unstructured_ingest/embed/vertexai.py +25 -17
  9. unstructured_ingest/embed/voyageai.py +22 -17
  10. unstructured_ingest/v2/cli/base/cmd.py +1 -1
  11. unstructured_ingest/v2/interfaces/connector.py +1 -1
  12. unstructured_ingest/v2/pipeline/pipeline.py +3 -1
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +6 -2
  15. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -2
  18. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  19. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  21. unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
  22. unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
  23. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +1 -1
  24. unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
  25. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
  26. unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
  27. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
  28. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
  29. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
  30. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
  31. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
  32. unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
  33. unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
  34. unstructured_ingest/v2/utils.py +1 -1
  35. unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
  36. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +40 -40
  37. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
  38. unstructured_ingest-0.0.21.dist-info/METADATA +0 -639
  39. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
  40. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
  41. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
@@ -187,7 +187,9 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
+        logger.info(
+            f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+        )
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
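
Nearly all of the changes in this release swap Pydantic v1 serialization calls for their v2 equivalents. A minimal sketch of the mapping, using a hypothetical config model rather than the package's own classes:

from pydantic import BaseModel

class ExampleConfig(BaseModel):  # hypothetical stand-in for an ingest config
    work_dir: str = "/tmp/ingest"
    verbose: bool = False

config = ExampleConfig()
# Pydantic v1 spelling (deprecated in v2): config.json(), config.dict()
# Pydantic v2 spelling used throughout this diff:
print(config.model_dump_json())  # '{"work_dir":"/tmp/ingest","verbose":false}'
print(config.model_dump())       # {'work_dir': '/tmp/ingest', 'verbose': False}

The same substitution (json → model_dump_json, dict → model_dump) repeats in the pipeline step classes and connectors below.
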
@@ -28,7 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
@@ -31,9 +31,13 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.download_config.json() if self.process.download_config else None
+        config = (
+            self.process.download_config.model_dump_json() if self.process.download_config else None
+        )
         connection_config = (
-            self.process.connection_config.json() if self.process.connection_config else None
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -28,7 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -16,7 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
@@ -23,9 +23,11 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.index_config.json() if self.process.index_config else None
+        config = self.process.index_config.model_dump_json() if self.process.index_config else None
         connection_config = (
-            self.process.connection_config.json() if self.process.connection_config else None
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"created {self.identifier} with configs: {config}, "
@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = self.process.config.json()
+        config = self.process.config.model_dump_json()
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -28,7 +28,9 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
+            self.process.upload_stager_config.model_dump_json()
+            if self.process.upload_stager_config
+            else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"created {self.identifier} with configs: {config}")
@@ -22,7 +22,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(
@@ -25,9 +25,13 @@ class UploadStep(BatchPipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.upload_config.json() if self.process.upload_config else None
+        config = (
+            self.process.upload_config.model_dump_json() if self.process.upload_config else None
+        )
         connection_config = (
-            self.process.connection_config.json() if self.process.connection_config else None
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -181,7 +181,7 @@ class AirtableIndexer(Indexer):
             yield FileData(
                 identifier=table_meta.get_id(),
                 connector_type=CONNECTOR_TYPE,
-                additional_metadata=table_meta.dict(),
+                additional_metadata=table_meta.model_dump(),
                 source_identifiers=SourceIdentifiers(
                     filename=str(Path(fullpath).name),
                     fullpath=fullpath,
@@ -130,7 +130,7 @@ class DatabricksVolumesUploader(Uploader):
 
         return WorkspaceClient(
             host=self.connection_config.host,
-            **self.connection_config.access_config.get_secret_value().dict(),
+            **self.connection_config.access_config.get_secret_value().model_dump(),
         )
 
     def precheck(self) -> None:
@@ -104,8 +104,8 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
-        client_kwargs = client_input.dict()
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
         )
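
One detail worth noting in the Elasticsearch hunk: model_dump() keeps pydantic secret values wrapped, which is why basic_auth is unwrapped explicitly afterwards. A rough sketch, assuming a model shaped like ElasticsearchClientInput:

from typing import Optional
from pydantic import BaseModel, SecretStr

class ClientInput(BaseModel):  # assumed mirror of ElasticsearchClientInput
    hosts: list[str] = []
    basic_auth: Optional[SecretStr] = None

client_input = ClientInput(hosts=["http://localhost:9200"], basic_auth=SecretStr("user:pass"))
client_kwargs = client_input.model_dump()
print(client_kwargs["basic_auth"])  # ********** -- still masked after model_dump()
# unwrap before handing credentials to the Elasticsearch client:
client_kwargs["basic_auth"] = (
    client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
)
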
@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -84,7 +85,7 @@ class AzureConnectionConfig(FsspecConnectionConfig):
     def get_access_config(self) -> dict[str, Any]:
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs: dict[str, Any] = {
-            k: v for k, v in self.access_config.get_secret_value().dict().items() if v
+            k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v
         }
         return access_configs
 
@@ -99,14 +100,39 @@ class AzureIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
-    def sterilize_info(self, path) -> dict:
-        info = self.fs.info(path=path)
-        return sterilize_dict(data=info, default=azure_json_serial)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data, default=azure_json_serial)
 
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = (
+            file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+        )
+        date_modified = (
+            file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+        )
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class AzureDownloaderConfig(FsspecDownloaderConfig):
     pass
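
Each connector now derives metadata from the raw info dict returned by the listing call instead of issuing extra per-file requests. A toy illustration of the Azure mapping, assuming (as the code above does) that adlfs returns datetime objects for creation_time and last_modified; the dict shape here is illustrative, not a verified adlfs contract:

from datetime import datetime, timezone

info = {  # assumed shape of one adlfs listing entry
    "name": "container/docs/report.pdf",
    "size": 2048,
    "creation_time": datetime(2024, 5, 1, tzinfo=timezone.utc),
    "last_modified": datetime(2024, 6, 1, tzinfo=timezone.utc),
    "etag": "0x8DC69E5B0C7",
}
date_created = info.get("creation_time").timestamp() if "creation_time" in info else None
date_modified = info.get("last_modified").timestamp() if "last_modified" in info else None
print(date_created, date_modified, info.get("etag"))
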
@@ -2,12 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -52,7 +54,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
                 ac.box_app_config,
             ),
         }
-        access_config: dict[str, Any] = ac.dict()
+        access_config: dict[str, Any] = ac.model_dump()
         access_config.pop("box_app_config", None)
         access_kwargs_with_oauth.update(access_config)
 
@@ -73,6 +75,33 @@ class BoxIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("modified_at"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("created_at"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("id")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class BoxDownloaderConfig(FsspecDownloaderConfig):
     pass
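
Box (and GCS below) report times as ISO-8601 strings rather than datetime objects, so those connectors parse with dateutil before converting to POSIX timestamps. For example:

from dateutil import parser

modified_at = "2024-06-01T12:30:00-07:00"  # illustrative "modified_at" value
dt = parser.parse(modified_at)   # timezone-aware datetime
print(dt.timestamp())            # POSIX timestamp as a float
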
@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -22,7 +23,6 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
-from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
 
 CONNECTOR_TYPE = "dropbox"
 
@@ -49,6 +49,40 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"].lstrip("/")
+        date_created = None
+        date_modified = None
+        server_modified = file_data.get("server_modified")
+        client_modified = file_data.get("client_modified")
+        if server_modified and client_modified and server_modified > client_modified:
+            date_created = str(client_modified.timestamp())
+            date_modified = str(server_modified.timestamp())
+        elif server_modified and client_modified and server_modified < client_modified:
+            date_created = str(server_modified.timestamp())
+            date_modified = str(client_modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("content_hash")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def __post_init__(self):
         # dropbox expects the path to start with a /
@@ -63,12 +97,6 @@ class DropboxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
-    def sterilize_info(self, path) -> dict:
-        # the fs.info method defined in the dropboxdrivefs library expects a "url"
-        # kwarg rather than "path"; though both refer to the same thing
-        info = self.fs.info(url=path)
-        return sterilize_dict(data=info)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass
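
Dropbox entries carry both a server_modified and a client_modified datetime; the new get_metadata treats the earlier of the two as the creation time and the later as the modification time. Worked through with illustrative values:

from datetime import datetime, timezone

server_modified = datetime(2024, 6, 2, tzinfo=timezone.utc)  # illustrative
client_modified = datetime(2024, 6, 1, tzinfo=timezone.utc)  # illustrative

date_created = date_modified = None
if server_modified and client_modified and server_modified > client_modified:
    date_created = str(client_modified.timestamp())
    date_modified = str(server_modified.timestamp())
elif server_modified and client_modified and server_modified < client_modified:
    date_created = str(server_modified.timestamp())
    date_modified = str(client_modified.timestamp())
print(date_created, date_modified)
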
@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
-from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
@@ -113,18 +110,13 @@ class FsspecIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def list_files(self) -> list[str]:
+    def get_file_data(self) -> list[dict[str, Any]]:
         if not self.index_config.recursive:
             # fs.ls does not walk directories
             # directories that are listed in cloud storage can cause problems
             # because they are seen as 0 byte files
-            found = self.fs.ls(self.index_config.path_without_protocol, detail=True)
-            if isinstance(found, list):
-                return [
-                    x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file"
-                ]
-            else:
-                raise TypeError(f"unhandled response type from ls: {type(found)}")
+            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+
         else:
             # fs.find will recursively walk directories
             # "size" is a common key for all the cloud protocols with fs
@@ -132,84 +124,40 @@ class FsspecIndexer(Indexer):
                 self.index_config.path_without_protocol,
                 detail=True,
             )
-            if isinstance(found, dict):
-                return [
-                    k for k, v in found.items() if v.get("size") > 0 and v.get("type") == "file"
-                ]
-            else:
-                raise TypeError(f"unhandled response type from find: {type(found)}")
-
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
-        date_created = None
-        date_modified = None
-        file_size = None
-        try:
-            created: Optional[Any] = self.fs.created(path)
-            if created:
-                if isinstance(created, datetime):
-                    date_created = str(created.timestamp())
-                else:
-                    date_created = str(created)
-        except NotImplementedError:
-            pass
+            files = found.values()
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        return filtered_files
 
-        try:
-            modified: Optional[Any] = self.fs.modified(path)
-            if modified:
-                if isinstance(modified, datetime):
-                    date_modified = str(modified.timestamp())
-                else:
-                    date_modified = str(modified)
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
-
-        version = self.fs.checksum(path)
-        metadata: dict[str, str] = {}
-        with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
-        record_locator = {
-            "protocol": self.index_config.protocol,
-            "remote_file_path": self.index_config.remote_url,
-        }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
-        if metadata:
-            record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
-            date_created=date_created,
-            date_modified=date_modified,
-            date_processed=str(time()),
-            version=str(version),
-            url=f"{self.index_config.protocol}://{path}",
-            record_locator=record_locator,
-            filesize_bytes=file_size,
-        )
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        raise NotImplementedError()
+
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
 
-    def sterilize_info(self, path) -> dict:
-        info = self.fs.info(path=path)
-        return sterilize_dict(data=info)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data)
 
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.list_files()
-        for file in files:
+        files = self.get_file_data()
+        for file_data in files:
+            file_path = self.get_path(file_data=file_data)
             # Note: we remove any remaining leading slashes (Box introduces these)
             # to get a valid relative path
-            rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
 
-            additional_metadata = self.sterilize_info(path=file)
-            additional_metadata["original_file_path"] = file
+            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata["original_file_path"] = file_path
             yield FileData(
-                identifier=str(uuid5(NAMESPACE_DNS, file)),
+                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                 connector_type=self.connector_type,
                 source_identifiers=SourceIdentifiers(
-                    filename=Path(file).name,
+                    filename=Path(file_path).name,
                     rel_path=rel_path or None,
-                    fullpath=file,
+                    fullpath=file_path,
                 ),
-                metadata=self.get_metadata(path=file),
+                metadata=self.get_metadata(file_data=file_data),
                 additional_metadata=additional_metadata,
             )
@@ -2,13 +2,15 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional, Union
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.string_and_date_utils import json_to_dict
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -106,6 +108,33 @@ class GcsIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("updated"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("timeCreated"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class GcsDownloaderConfig(FsspecDownloaderConfig):
     pass
@@ -1,6 +1,5 @@
 import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
 from time import time
 from typing import Any, Generator, Optional
@@ -69,7 +68,7 @@ class S3ConnectionConfig(FsspecConnectionConfig):
 
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs.update(
-            {k: v for k, v in self.access_config.get_secret_value().dict().items() if v}
+            {k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v}
         )
         return access_configs
 
@@ -80,27 +79,25 @@ class S3Indexer(FsspecIndexer):
     index_config: S3IndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
+    def get_path(self, file_data: dict) -> str:
+        return file_data["Key"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["Key"]
         date_created = None
         date_modified = None
-        file_size = None
-        try:
-            modified: Optional[datetime] = self.fs.modified(path)
-            if modified:
-                date_created = str(modified.timestamp())
-                date_modified = str(modified.timestamp())
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
+        modified = file_data.get("LastModified")
+        if modified:
+            date_created = str(modified.timestamp())
+            date_modified = str(modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_size or file_data.get("Size")
 
-        version = None
-        info: dict[str, Any] = self.fs.info(path)
-        if etag := info.get("ETag"):
-            version = str(etag).rstrip('"').lstrip('"')
+        version = file_data.get("ETag").rstrip('"').lstrip('"') if "ETag" in file_data else None
         metadata: dict[str, str] = {}
         with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
+            metadata = self.fs.metadata(path=path)
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
@@ -3,13 +3,14 @@ from __future__ import annotations
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 from urllib.parse import urlparse
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -96,6 +97,26 @@ class SftpIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = file_data.get("time").timestamp() if "time" in file_data else None
+        date_modified = file_data.get("mtime").timestamp() if "mtime" in file_data else None
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class SftpDownloaderConfig(FsspecDownloaderConfig):
     remote_url: str = Field(description="Remote fsspec URL formatted as `protocol://dir/path`")