unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (45)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +5 -5
  3. unstructured_ingest/embed/__init__.py +0 -17
  4. unstructured_ingest/embed/bedrock.py +56 -19
  5. unstructured_ingest/embed/huggingface.py +22 -22
  6. unstructured_ingest/embed/interfaces.py +11 -4
  7. unstructured_ingest/embed/mixedbreadai.py +17 -17
  8. unstructured_ingest/embed/octoai.py +7 -7
  9. unstructured_ingest/embed/openai.py +15 -20
  10. unstructured_ingest/embed/vertexai.py +26 -18
  11. unstructured_ingest/embed/voyageai.py +25 -20
  12. unstructured_ingest/interfaces.py +5 -5
  13. unstructured_ingest/v2/cli/base/cmd.py +1 -1
  14. unstructured_ingest/v2/interfaces/connector.py +1 -1
  15. unstructured_ingest/v2/pipeline/pipeline.py +3 -1
  16. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  17. unstructured_ingest/v2/pipeline/steps/download.py +6 -2
  18. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  19. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  20. unstructured_ingest/v2/pipeline/steps/index.py +4 -2
  21. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  22. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  23. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  24. unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
  25. unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
  26. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +1 -1
  27. unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
  28. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
  29. unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
  30. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
  31. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
  32. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
  33. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
  34. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
  35. unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
  36. unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
  37. unstructured_ingest/v2/processes/embedder.py +10 -10
  38. unstructured_ingest/v2/utils.py +1 -1
  39. unstructured_ingest-0.0.23.dist-info/METADATA +186 -0
  40. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.23.dist-info}/RECORD +44 -44
  41. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.23.dist-info}/WHEEL +1 -1
  42. unstructured_ingest-0.0.21.dist-info/METADATA +0 -639
  43. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.23.dist-info}/LICENSE.md +0 -0
  44. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.23.dist-info}/entry_points.txt +0 -0
  45. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.23.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/voyageai.py

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 import numpy as np
 from pydantic import Field, SecretStr
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
 from unstructured_ingest.utils.dep_check import requires_dependencies
 
 if TYPE_CHECKING:
-    from langchain_voyageai import VoyageAIEmbeddings
+    from voyageai import Client as VoyageAIClient
 
 
 class VoyageAIEmbeddingConfig(EmbeddingConfig):
@@ -16,28 +16,30 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
     embedder_model_name: str = Field(alias="model_name")
     batch_size: Optional[int] = Field(default=None)
     truncation: Optional[bool] = Field(default=None)
+    max_retries: int = 0
+    timeout_in_seconds: Optional[int] = None
 
     @requires_dependencies(
-        ["langchain", "langchain_voyageai"],
+        ["voyageai"],
         extras="embed-voyageai",
     )
-    def get_client(self) -> "VoyageAIEmbeddings":
-        """Creates a Langchain VoyageAI python client to embed elements."""
-        from langchain_voyageai import VoyageAIEmbeddings
-
-        return VoyageAIEmbeddings(
-            voyage_api_key=self.api_key,
-            model=self.embedder_model_name,
-            batch_size=self.batch_size,
-            truncation=self.truncation,
+    def get_client(self) -> "VoyageAIClient":
+        """Creates a VoyageAI python client to embed elements."""
+        from voyageai import Client as VoyageAIClient
+
+        client = VoyageAIClient(
+            api_key=self.api_key.get_secret_value(),
+            max_retries=self.max_retries,
+            timeout=self.timeout_in_seconds,
         )
+        return client
 
 
 @dataclass
 class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VoyageAIEmbeddingConfig
 
-    def get_exemplary_embedding(self) -> List[float]:
+    def get_exemplary_embedding(self) -> list[float]:
         return self.embed_query(query="A sample query.")
 
     @property
@@ -50,17 +52,20 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
 
-    def embed_documents(self, elements: List[dict]) -> List[dict]:
-        client = self.config.get_client()
-        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        client: VoyageAIClient = self.config.get_client()
+        response = client.embed(texts=elements, model=self.config.embedder_model_name)
+        return response.embeddings
+
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
         return self._add_embeddings_to_elements(elements, embeddings)
 
-    def embed_query(self, query: str) -> List[float]:
-        client = self.config.get_client()
-        return client.embed_query(query)
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(elements=[query])[0]
 
     @staticmethod
-    def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+    def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
        assert len(elements) == len(embeddings)
        elements_w_embedding = []
        for i, element in enumerate(elements):

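For orientation, a minimal sketch of how the reworked encoder might be driven after this change. The import path and field names come from the hunks above; the model name and API key are placeholders, not values from the diff.

from unstructured_ingest.embed.voyageai import (
    VoyageAIEmbeddingConfig,
    VoyageAIEmbeddingEncoder,
)

# "model_name" populates embedder_model_name via the pydantic alias shown above;
# max_retries and timeout_in_seconds are the two fields added in 0.0.23.
config = VoyageAIEmbeddingConfig(
    api_key="<voyageai-api-key>",   # placeholder, stored as a secret on the config
    model_name="voyage-2",           # placeholder model name
    max_retries=3,
    timeout_in_seconds=30,
)
encoder = VoyageAIEmbeddingEncoder(config=config)

# embed_documents() takes element dicts and returns them with embeddings attached;
# embed_query() now routes through the same client.embed() call via _embed_documents().
elements = encoder.embed_documents([{"text": "hello"}, {"text": "world"}])
query_vector = encoder.embed_query("a sample query")
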
unstructured_ingest/interfaces.py

@@ -204,14 +204,14 @@ class EmbeddingConfig(BaseConfig):
         if self.model_name:
             kwargs["model_name"] = self.model_name
         # TODO make this more dynamic to map to encoder configs
-        if self.provider == "langchain-openai":
+        if self.provider == "openai":
             from unstructured_ingest.embed.openai import (
                 OpenAIEmbeddingConfig,
                 OpenAIEmbeddingEncoder,
             )
 
             return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
-        elif self.provider == "langchain-huggingface":
+        elif self.provider == "huggingface":
             from unstructured_ingest.embed.huggingface import (
                 HuggingFaceEmbeddingConfig,
                 HuggingFaceEmbeddingEncoder,
@@ -225,7 +225,7 @@ class EmbeddingConfig(BaseConfig):
             )
 
             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
-        elif self.provider == "langchain-aws-bedrock":
+        elif self.provider == "aws-bedrock":
             from unstructured_ingest.embed.bedrock import (
                 BedrockEmbeddingConfig,
                 BedrockEmbeddingEncoder,
@@ -238,14 +238,14 @@ class EmbeddingConfig(BaseConfig):
                     region_name=self.aws_region,
                 )
             )
-        elif self.provider == "langchain-vertexai":
+        elif self.provider == "vertexai":
             from unstructured_ingest.embed.vertexai import (
                 VertexAIEmbeddingConfig,
                 VertexAIEmbeddingEncoder,
             )
 
             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
-        elif self.provider == "langchain-voyageai":
+        elif self.provider == "voyageai":
            from unstructured_ingest.embed.voyageai import (
                VoyageAIEmbeddingConfig,
                VoyageAIEmbeddingEncoder,

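The net effect of these renames is that embedding provider identifiers lose their "langchain-" prefix. A minimal before/after sketch, assuming EmbeddingConfig accepts these values as keyword arguments (the factory method that consumes them is not shown in this hunk, and the model name is a placeholder):

from unstructured_ingest.interfaces import EmbeddingConfig

# 0.0.21 accepted: "langchain-openai", "langchain-huggingface", "langchain-aws-bedrock",
#                  "langchain-vertexai", "langchain-voyageai"
# 0.0.23 expects the unprefixed names instead:
config = EmbeddingConfig(provider="openai", model_name="text-embedding-3-small")
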
unstructured_ingest/v2/cli/base/cmd.py

@@ -155,7 +155,7 @@ class BaseCmd(ABC):
     @staticmethod
     def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
         filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
-        if not filterer_configs.dict():
+        if not filterer_configs.model_dump():
             return None
         return Filterer(config=filterer_configs)
 
unstructured_ingest/v2/interfaces/connector.py

@@ -19,7 +19,7 @@ class ConnectionConfig(BaseModel):
     def get_access_config(self) -> dict[str, Any]:
         if not self.access_config:
             return {}
-        return self.access_config.get_secret_value().dict()
+        return self.access_config.get_secret_value().model_dump()
 
 
 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)

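The .dict() to .model_dump() and .json() to .model_dump_json() swaps here and throughout the rest of this release follow the Pydantic v2 method renames; the serialized output is otherwise the same. A throwaway model (not code from the package) illustrates the mapping:

from pydantic import BaseModel

class ExampleConfig(BaseModel):
    host: str = "localhost"
    port: int = 9200

cfg = ExampleConfig()
cfg.model_dump()       # replaces the deprecated cfg.dict()
cfg.model_dump_json()  # replaces the deprecated cfg.json()
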
unstructured_ingest/v2/pipeline/pipeline.py

@@ -187,7 +187,9 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
+        logger.info(
+            f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+        )
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -28,7 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/download.py

@@ -31,9 +31,13 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.download_config.json() if self.process.download_config else None
+        config = (
+            self.process.download_config.model_dump_json() if self.process.download_config else None
+        )
         connection_config = (
-            self.process.connection_config.json() if self.process.connection_config else None
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -28,7 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/filter.py

@@ -16,7 +16,7 @@ class FilterStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:

unstructured_ingest/v2/pipeline/steps/index.py

@@ -23,9 +23,11 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.index_config.json() if self.process.index_config else None
+        config = self.process.index_config.model_dump_json() if self.process.index_config else None
         connection_config = (
-            self.process.connection_config.json() if self.process.connection_config else None
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = self.process.config.json()
+        config = self.process.config.model_dump_json()
         logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/stage.py

@@ -28,7 +28,9 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
+            self.process.upload_stager_config.model_dump_json()
+            if self.process.upload_stager_config
+            else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"created {self.identifier} with configs: {config}")

unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -22,7 +22,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = self.process.config.json() if self.process.config else None
+        config = self.process.config.model_dump_json() if self.process.config else None
         logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(

unstructured_ingest/v2/pipeline/steps/upload.py

@@ -25,9 +25,13 @@ class UploadStep(BatchPipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = self.process.upload_config.json() if self.process.upload_config else None
+        config = (
+            self.process.upload_config.model_dump_json() if self.process.upload_config else None
+        )
         connection_config = (
-            self.process.connection_config.json() if self.process.connection_config else None
+            self.process.connection_config.model_dump_json()
+            if self.process.connection_config
+            else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/processes/connectors/airtable.py

@@ -181,7 +181,7 @@ class AirtableIndexer(Indexer):
            yield FileData(
                identifier=table_meta.get_id(),
                connector_type=CONNECTOR_TYPE,
-                additional_metadata=table_meta.dict(),
+                additional_metadata=table_meta.model_dump(),
                source_identifiers=SourceIdentifiers(
                    filename=str(Path(fullpath).name),
                    fullpath=fullpath,

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -130,7 +130,7 @@ class DatabricksVolumesUploader(Uploader):
 
        return WorkspaceClient(
            host=self.connection_config.host,
-            **self.connection_config.access_config.get_secret_value().dict(),
+            **self.connection_config.access_config.get_secret_value().model_dump(),
        )
 
    def precheck(self) -> None:

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -104,8 +104,8 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
        elif access_config.es_api_key:
            client_input_kwargs["api_key"] = access_config.es_api_key
        client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
-        client_kwargs = client_input.dict()
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")
+        client_kwargs = client_input.model_dump()
        client_kwargs["basic_auth"] = (
            client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
        )

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -84,7 +85,7 @@ class AzureConnectionConfig(FsspecConnectionConfig):
     def get_access_config(self) -> dict[str, Any]:
         # Avoid injecting None by filtering out k,v pairs where the value is None
         access_configs: dict[str, Any] = {
-            k: v for k, v in self.access_config.get_secret_value().dict().items() if v
+            k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v
         }
         return access_configs
 
@@ -99,14 +100,39 @@ class AzureIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
-    def sterilize_info(self, path) -> dict:
-        info = self.fs.info(path=path)
-        return sterilize_dict(data=info, default=azure_json_serial)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data, default=azure_json_serial)
 
     @requires_dependencies(["adlfs", "fsspec"], extras="azure")
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = (
+            file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+        )
+        date_modified = (
+            file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+        )
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("etag")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class AzureDownloaderConfig(FsspecDownloaderConfig):
     pass

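For context, the new AzureIndexer.get_metadata() consumes the raw listing info dict rather than re-querying the filesystem per path. A rough, illustrative shape of such a dict (all values made up; the indexer expects creation_time and last_modified as datetime objects, hence the .timestamp() calls above):

from datetime import datetime, timezone

sample_info = {
    "name": "container/docs/report.pdf",
    "size": 52133,
    "type": "file",
    "creation_time": datetime(2024, 7, 1, tzinfo=timezone.utc),
    "last_modified": datetime(2024, 7, 2, tzinfo=timezone.utc),
    "etag": "0x8DC9A2B0",
}
# metadata = azure_indexer.get_metadata(file_data=sample_info)
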
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -2,12 +2,14 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
+from dateutil import parser
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -52,7 +54,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
                ac.box_app_config,
            ),
        }
-        access_config: dict[str, Any] = ac.dict()
+        access_config: dict[str, Any] = ac.model_dump()
        access_config.pop("box_app_config", None)
        access_kwargs_with_oauth.update(access_config)
 
@@ -73,6 +75,33 @@ class BoxIndexer(FsspecIndexer):
     def precheck(self) -> None:
         super().precheck()
 
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"]
+        date_created = None
+        date_modified = None
+        if modified_at_str := file_data.get("modified_at"):
+            date_modified = parser.parse(modified_at_str).timestamp()
+        if created_at_str := file_data.get("created_at"):
+            date_created = parser.parse(created_at_str).timestamp()
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("id")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
 
 class BoxDownloaderConfig(FsspecDownloaderConfig):
     pass

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -2,12 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from pathlib import Path
+from time import time
 from typing import Any, Generator, Optional
 
 from pydantic import Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
     SourceRegistryEntry,
@@ -22,7 +23,6 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
-from unstructured_ingest.v2.processes.connectors.fsspec.utils import sterilize_dict
 
 CONNECTOR_TYPE = "dropbox"
 
@@ -49,6 +49,40 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
+
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        path = file_data["name"].lstrip("/")
+        date_created = None
+        date_modified = None
+        server_modified = file_data.get("server_modified")
+        client_modified = file_data.get("client_modified")
+        if server_modified and client_modified and server_modified > client_modified:
+            date_created = str(client_modified.timestamp())
+            date_modified = str(server_modified.timestamp())
+        elif server_modified and client_modified and server_modified < client_modified:
+            date_created = str(server_modified.timestamp())
+            date_modified = str(client_modified.timestamp())
+
+        file_size = file_data.get("size") if "size" in file_data else None
+
+        version = file_data.get("content_hash")
+        record_locator = {
+            "protocol": self.index_config.protocol,
+            "remote_file_path": self.index_config.remote_url,
+            "file_id": file_data.get("id"),
+        }
+        return FileDataSourceMetadata(
+            date_created=date_created,
+            date_modified=date_modified,
+            date_processed=str(time()),
+            version=version,
+            url=f"{self.index_config.protocol}://{path}",
+            record_locator=record_locator,
+            filesize_bytes=file_size,
+        )
+
     @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox")
     def __post_init__(self):
         # dropbox expects the path to start with a /
@@ -63,12 +97,6 @@ class DropboxIndexer(FsspecIndexer):
     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         return super().run(**kwargs)
 
-    def sterilize_info(self, path) -> dict:
-        # the fs.info method defined in the dropboxdrivefs library expects a "url"
-        # kwarg rather than "path"; though both refer to the same thing
-        info = self.fs.info(url=path)
-        return sterilize_dict(data=info)
-
 
 class DropboxDownloaderConfig(FsspecDownloaderConfig):
     pass

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-import contextlib
 from dataclasses import dataclass, field
-from datetime import datetime
 from pathlib import Path
-from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar
 from uuid import NAMESPACE_DNS, uuid5
 
@@ -113,18 +110,13 @@ class FsspecIndexer(Indexer):
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def list_files(self) -> list[str]:
+    def get_file_data(self) -> list[dict[str, Any]]:
        if not self.index_config.recursive:
            # fs.ls does not walk directories
            # directories that are listed in cloud storage can cause problems
            # because they are seen as 0 byte files
-            found = self.fs.ls(self.index_config.path_without_protocol, detail=True)
-            if isinstance(found, list):
-                return [
-                    x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file"
-                ]
-            else:
-                raise TypeError(f"unhandled response type from ls: {type(found)}")
+            files = self.fs.ls(self.index_config.path_without_protocol, detail=True)
+
        else:
            # fs.find will recursively walk directories
            # "size" is a common key for all the cloud protocols with fs
@@ -132,84 +124,40 @@
                self.index_config.path_without_protocol,
                detail=True,
            )
-            if isinstance(found, dict):
-                return [
-                    k for k, v in found.items() if v.get("size") > 0 and v.get("type") == "file"
-                ]
-            else:
-                raise TypeError(f"unhandled response type from find: {type(found)}")
-
-    def get_metadata(self, path: str) -> FileDataSourceMetadata:
-        date_created = None
-        date_modified = None
-        file_size = None
-        try:
-            created: Optional[Any] = self.fs.created(path)
-            if created:
-                if isinstance(created, datetime):
-                    date_created = str(created.timestamp())
-                else:
-                    date_created = str(created)
-        except NotImplementedError:
-            pass
+            files = found.values()
+        filtered_files = [
+            file for file in files if file.get("size") > 0 and file.get("type") == "file"
+        ]
+        return filtered_files
 
-        try:
-            modified: Optional[Any] = self.fs.modified(path)
-            if modified:
-                if isinstance(modified, datetime):
-                    date_modified = str(modified.timestamp())
-                else:
-                    date_modified = str(modified)
-        except NotImplementedError:
-            pass
-        with contextlib.suppress(AttributeError):
-            file_size = self.fs.size(path)
-
-        version = self.fs.checksum(path)
-        metadata: dict[str, str] = {}
-        with contextlib.suppress(AttributeError):
-            metadata = self.fs.metadata(path)
-        record_locator = {
-            "protocol": self.index_config.protocol,
-            "remote_file_path": self.index_config.remote_url,
-        }
-        file_stat = self.fs.stat(path=path)
-        if file_id := file_stat.get("id"):
-            record_locator["file_id"] = file_id
-        if metadata:
-            record_locator["metadata"] = metadata
-        return FileDataSourceMetadata(
-            date_created=date_created,
-            date_modified=date_modified,
-            date_processed=str(time()),
-            version=str(version),
-            url=f"{self.index_config.protocol}://{path}",
-            record_locator=record_locator,
-            filesize_bytes=file_size,
-        )
+    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+        raise NotImplementedError()
+
+    def get_path(self, file_data: dict) -> str:
+        return file_data["name"]
 
-    def sterilize_info(self, path) -> dict:
-        info = self.fs.info(path=path)
-        return sterilize_dict(data=info)
+    def sterilize_info(self, file_data: dict) -> dict:
+        return sterilize_dict(data=file_data)
 
    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        files = self.list_files()
-        for file in files:
+        files = self.get_file_data()
+        for file_data in files:
+            file_path = self.get_path(file_data=file_data)
            # Note: we remove any remaining leading slashes (Box introduces these)
            # to get a valid relative path
-            rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/")
+            rel_path = file_path.replace(self.index_config.path_without_protocol, "").lstrip("/")
 
-            additional_metadata = self.sterilize_info(path=file)
-            additional_metadata["original_file_path"] = file
+            additional_metadata = self.sterilize_info(file_data=file_data)
+            additional_metadata["original_file_path"] = file_path
            yield FileData(
-                identifier=str(uuid5(NAMESPACE_DNS, file)),
+                identifier=str(uuid5(NAMESPACE_DNS, file_path)),
                connector_type=self.connector_type,
                source_identifiers=SourceIdentifiers(
-                    filename=Path(file).name,
+                    filename=Path(file_path).name,
                    rel_path=rel_path or None,
-                    fullpath=file,
+                    fullpath=file_path,
                ),
-                metadata=self.get_metadata(path=file),
+                metadata=self.get_metadata(file_data=file_data),
                additional_metadata=additional_metadata,
            )

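Taken together, the FsspecIndexer refactor replaces the per-path fs.created()/fs.modified()/fs.stat()/fs.checksum() calls with a single detailed listing: get_file_data() returns the raw info dicts, and each connector subclass maps a dict to metadata. A sketch of the resulting contract for a hypothetical connector (not a class in the package; only fields shown in this diff are used):

from dataclasses import dataclass
from time import time

from unstructured_ingest.v2.interfaces import FileDataSourceMetadata
from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import FsspecIndexer


@dataclass
class MyFsspecIndexer(FsspecIndexer):
    # run() calls get_path() and get_metadata() on each info dict produced by
    # get_file_data(); the base get_metadata() raises NotImplementedError, so a
    # subclass supplies the mapping from its backend's info keys.
    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
        return FileDataSourceMetadata(
            date_processed=str(time()),
            url=f"{self.index_config.protocol}://{file_data['name']}",
            filesize_bytes=file_data.get("size"),
        )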