unstructured-ingest 0.0.19__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (47)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cmds/astradb.py +2 -2
  3. unstructured_ingest/connector/astradb.py +54 -24
  4. unstructured_ingest/embed/bedrock.py +56 -19
  5. unstructured_ingest/embed/huggingface.py +22 -22
  6. unstructured_ingest/embed/interfaces.py +11 -4
  7. unstructured_ingest/embed/mixedbreadai.py +17 -17
  8. unstructured_ingest/embed/octoai.py +7 -7
  9. unstructured_ingest/embed/openai.py +15 -20
  10. unstructured_ingest/embed/vertexai.py +25 -17
  11. unstructured_ingest/embed/voyageai.py +22 -17
  12. unstructured_ingest/v2/cli/base/cmd.py +1 -1
  13. unstructured_ingest/v2/interfaces/connector.py +1 -1
  14. unstructured_ingest/v2/pipeline/pipeline.py +3 -1
  15. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  16. unstructured_ingest/v2/pipeline/steps/download.py +6 -2
  17. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  18. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  19. unstructured_ingest/v2/pipeline/steps/index.py +4 -2
  20. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  21. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  22. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  23. unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
  24. unstructured_ingest/v2/processes/chunker.py +8 -29
  25. unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
  26. unstructured_ingest/v2/processes/connectors/astradb.py +26 -19
  27. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +11 -8
  28. unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
  29. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
  30. unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
  31. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
  32. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
  33. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
  34. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
  35. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
  36. unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
  37. unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
  38. unstructured_ingest/v2/processes/partitioner.py +9 -55
  39. unstructured_ingest/v2/unstructured_api.py +87 -0
  40. unstructured_ingest/v2/utils.py +1 -1
  41. unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
  42. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +46 -45
  43. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
  44. unstructured_ingest-0.0.19.dist-info/METADATA +0 -639
  45. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
  46. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
  47. {unstructured_ingest-0.0.19.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0

unstructured_ingest/embed/vertexai.py

@@ -3,7 +3,7 @@ import json
  import os
  from dataclasses import dataclass
  from pathlib import Path
- from typing import TYPE_CHECKING, Annotated, Any, List, Optional
+ from typing import TYPE_CHECKING, Annotated, Any, Optional

  import numpy as np
  from pydantic import Field, Secret, ValidationError
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
  from unstructured_ingest.utils.dep_check import requires_dependencies

  if TYPE_CHECKING:
- from langchain_google_vertexai import VertexAIEmbeddings
+ from vertexai.language_models import TextEmbeddingModel


  def conform_string_to_dict(value: Any) -> dict:
@@ -41,45 +41,53 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(application_credentials_path)

  @requires_dependencies(
- ["langchain", "langchain_google_vertexai"],
+ ["vertexai"],
  extras="embed-vertexai",
  )
- def get_client(self) -> "VertexAIEmbeddings":
+ def get_client(self) -> "TextEmbeddingModel":
  """Creates a Langchain VertexAI python client to embed elements."""
- from langchain_google_vertexai import VertexAIEmbeddings
+ from vertexai.language_models import TextEmbeddingModel

  self.register_application_credentials()
- vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
- return vertexai_client
+ return TextEmbeddingModel.from_pretrained(self.embedder_model_name)


  @dataclass
  class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
  config: VertexAIEmbeddingConfig

- def get_exemplary_embedding(self) -> List[float]:
+ def get_exemplary_embedding(self) -> list[float]:
  return self.embed_query(query="A sample query.")

- def num_of_dimensions(self):
+ def num_of_dimensions(self) -> tuple[int, ...]:
  exemplary_embedding = self.get_exemplary_embedding()
  return np.shape(exemplary_embedding)

- def is_unit_vector(self):
+ def is_unit_vector(self) -> bool:
  exemplary_embedding = self.get_exemplary_embedding()
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

  def embed_query(self, query):
- client = self.config.get_client()
- result = client.embed_query(str(query))
- return result
+ return self._embed_documents(elements=[query])[0]

- def embed_documents(self, elements: List[dict]) -> List[dict]:
- client = self.config.get_client()
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
  return elements_with_embeddings

- def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+ @requires_dependencies(
+ ["vertexai"],
+ extras="embed-vertexai",
+ )
+ def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+ from vertexai.language_models import TextEmbeddingInput
+
+ client = self.config.get_client()
+ inputs = [TextEmbeddingInput(text=element) for element in elements]
+ embeddings = client.get_embeddings(inputs)
+ return [e.values for e in embeddings]
+
+ def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
  assert len(elements) == len(embeddings)
  elements_w_embedding = []
  for i, element in enumerate(elements):
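
This file drops the langchain_google_vertexai wrapper and calls the vertexai SDK directly. A minimal sketch of the underlying call, assuming application credentials (and vertexai.init, if needed) are already configured; the model name is illustrative, not pinned by this release:

    from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

    # Example model name; the connector reads it from VertexAIEmbeddingConfig.embedder_model_name.
    model = TextEmbeddingModel.from_pretrained("text-embedding-004")
    inputs = [TextEmbeddingInput(text=t) for t in ["first chunk", "second chunk"]]
    embeddings = model.get_embeddings(inputs)
    vectors = [e.values for e in embeddings]  # one list[float] per input text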

unstructured_ingest/embed/voyageai.py

@@ -1,5 +1,5 @@
  from dataclasses import dataclass
- from typing import TYPE_CHECKING, List, Optional
+ from typing import TYPE_CHECKING, Optional

  import numpy as np
  from pydantic import Field, SecretStr
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
  from unstructured_ingest.utils.dep_check import requires_dependencies

  if TYPE_CHECKING:
- from langchain_voyageai import VoyageAIEmbeddings
+ from voyageai import Client as VoyageAIClient


  class VoyageAIEmbeddingConfig(EmbeddingConfig):
@@ -16,28 +16,30 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
  embedder_model_name: str = Field(alias="model_name")
  batch_size: Optional[int] = Field(default=None)
  truncation: Optional[bool] = Field(default=None)
+ max_retries: int = 0
+ timeout_in_seconds: Optional[int] = None

  @requires_dependencies(
  ["langchain", "langchain_voyageai"],
  extras="embed-voyageai",
  )
- def get_client(self) -> "VoyageAIEmbeddings":
+ def get_client(self) -> "VoyageAIClient":
  """Creates a Langchain VoyageAI python client to embed elements."""
- from langchain_voyageai import VoyageAIEmbeddings
+ from voyageai import Client as VoyageAIClient

- return VoyageAIEmbeddings(
- voyage_api_key=self.api_key,
- model=self.embedder_model_name,
- batch_size=self.batch_size,
- truncation=self.truncation,
+ client = VoyageAIClient(
+ api_key=self.api_key.get_secret_value(),
+ max_retries=self.max_retries,
+ timeout=self.timeout_in_seconds,
  )
+ return client


  @dataclass
  class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
  config: VoyageAIEmbeddingConfig

- def get_exemplary_embedding(self) -> List[float]:
+ def get_exemplary_embedding(self) -> list[float]:
  return self.embed_query(query="A sample query.")

  @property
@@ -50,17 +52,20 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
  exemplary_embedding = self.get_exemplary_embedding()
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)

- def embed_documents(self, elements: List[dict]) -> List[dict]:
- client = self.config.get_client()
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
+ def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+ client: VoyageAIClient = self.config.get_client()
+ response = client.embed(texts=elements, model=self.config.embedder_model_name)
+ return response.embeddings
+
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
  return self._add_embeddings_to_elements(elements, embeddings)

- def embed_query(self, query: str) -> List[float]:
- client = self.config.get_client()
- return client.embed_query(query)
+ def embed_query(self, query: str) -> list[float]:
+ return self._embed_documents(elements=[query])[0]

  @staticmethod
- def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+ def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
  assert len(elements) == len(embeddings)
  elements_w_embedding = []
  for i, element in enumerate(elements):
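
The VoyageAI encoder likewise swaps the langchain wrapper for the voyageai SDK. A minimal sketch of the direct call, assuming a valid API key; the model name is an example, the connector reads it from VoyageAIEmbeddingConfig.embedder_model_name:

    from voyageai import Client

    client = Client(api_key="VOYAGE_API_KEY", max_retries=0, timeout=None)
    response = client.embed(texts=["first chunk", "second chunk"], model="voyage-2")
    vectors = response.embeddings  # list[list[float]], aligned with the input texts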

unstructured_ingest/v2/cli/base/cmd.py

@@ -155,7 +155,7 @@ class BaseCmd(ABC):
  @staticmethod
  def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
  filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
- if not filterer_configs.dict():
+ if not filterer_configs.model_dump():
  return None
  return Filterer(config=filterer_configs)


unstructured_ingest/v2/interfaces/connector.py

@@ -19,7 +19,7 @@ class ConnectionConfig(BaseModel):
  def get_access_config(self) -> dict[str, Any]:
  if not self.access_config:
  return {}
- return self.access_config.get_secret_value().dict()
+ return self.access_config.get_secret_value().model_dump()


  ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
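
This change, and most of the one-line edits in the files below, are the pydantic v1 to v2 method renames: .dict() becomes .model_dump() and .json() becomes .model_dump_json(). A minimal illustration with a hypothetical model (not one from this package):

    from pydantic import BaseModel

    class ExampleConfig(BaseModel):
        collection_name: str
        batch_size: int = 50

    cfg = ExampleConfig(collection_name="docs")
    cfg.model_dump()       # pydantic v2 spelling of the v1 cfg.dict()
    cfg.model_dump_json()  # pydantic v2 spelling of the v1 cfg.json()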

unstructured_ingest/v2/pipeline/pipeline.py

@@ -187,7 +187,9 @@ class Pipeline:
  return filtered_records

  def _run(self):
- logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
+ logger.info(
+ f"running local pipeline: {self} with configs: " f"{self.context.model_dump_json()}"
+ )
  if self.context.mp_supported:
  manager = mp.Manager()
  self.context.status = manager.dict()

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -28,7 +28,7 @@ class ChunkStep(PipelineStep):
  return f"{self.identifier} ({self.process.config.chunking_strategy})"

  def __post_init__(self):
- config = self.process.config.json() if self.process.config else None
+ config = self.process.config.model_dump_json() if self.process.config else None
  logger.info(f"created {self.identifier} with configs: {config}")

  def should_chunk(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/download.py

@@ -31,9 +31,13 @@ class DownloadStep(PipelineStep):
  return f"{self.identifier} ({self.process.__class__.__name__})"

  def __post_init__(self):
- config = self.process.download_config.json() if self.process.download_config else None
+ config = (
+ self.process.download_config.model_dump_json() if self.process.download_config else None
+ )
  connection_config = (
- self.process.connection_config.json() if self.process.connection_config else None
+ self.process.connection_config.model_dump_json()
+ if self.process.connection_config
+ else None
  )
  logger.info(
  f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -28,7 +28,7 @@ class EmbedStep(PipelineStep):
  return f"{self.identifier} ({self.process.config.embedding_provider})"

  def __post_init__(self):
- config = self.process.config.json() if self.process.config else None
+ config = self.process.config.model_dump_json() if self.process.config else None
  logger.info(f"created {self.identifier} with configs: {config}")

  def should_embed(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/filter.py

@@ -16,7 +16,7 @@ class FilterStep(PipelineStep):
  identifier: str = STEP_ID

  def __post_init__(self):
- config = self.process.config.json() if self.process.config else None
+ config = self.process.config.model_dump_json() if self.process.config else None
  logger.info(f"created {self.identifier} with configs: {config}")

  async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:

unstructured_ingest/v2/pipeline/steps/index.py

@@ -23,9 +23,11 @@ class IndexStep(PipelineStep):
  return f"{self.identifier} ({self.process.__class__.__name__})"

  def __post_init__(self):
- config = self.process.index_config.json() if self.process.index_config else None
+ config = self.process.index_config.model_dump_json() if self.process.index_config else None
  connection_config = (
- self.process.connection_config.json() if self.process.connection_config else None
+ self.process.connection_config.model_dump_json()
+ if self.process.connection_config
+ else None
  )
  logger.info(
  f"created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
  return f"{self.identifier} ({self.process.config.strategy})"

  def __post_init__(self):
- config = self.process.config.json()
+ config = self.process.config.model_dump_json()
  logger.info(f"created {self.identifier} with configs: {config}")

  def should_partition(self, filepath: Path, file_data: FileData) -> bool:

unstructured_ingest/v2/pipeline/steps/stage.py

@@ -28,7 +28,9 @@ class UploadStageStep(PipelineStep):

  def __post_init__(self):
  config = (
- self.process.upload_stager_config.json() if self.process.upload_stager_config else None
+ self.process.upload_stager_config.model_dump_json()
+ if self.process.upload_stager_config
+ else None
  )
  self.cache_dir.mkdir(parents=True, exist_ok=True)
  logger.info(f"created {self.identifier} with configs: {config}")

unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -22,7 +22,7 @@ class UncompressStep(PipelineStep):
  identifier: str = STEP_ID

  def __post_init__(self):
- config = self.process.config.json() if self.process.config else None
+ config = self.process.config.model_dump_json() if self.process.config else None
  logger.info(f"created {self.identifier} with configs: {config}")

  async def _run_async(

unstructured_ingest/v2/pipeline/steps/upload.py

@@ -25,9 +25,13 @@ class UploadStep(BatchPipelineStep):
  return f"{self.identifier} ({self.process.__class__.__name__})"

  def __post_init__(self):
- config = self.process.upload_config.json() if self.process.upload_config else None
+ config = (
+ self.process.upload_config.model_dump_json() if self.process.upload_config else None
+ )
  connection_config = (
- self.process.connection_config.json() if self.process.connection_config else None
+ self.process.connection_config.model_dump_json()
+ if self.process.connection_config
+ else None
  )
  logger.info(
  f"Created {self.identifier} with configs: {config}, "

unstructured_ingest/v2/processes/chunker.py

@@ -1,5 +1,5 @@
  from abc import ABC
- from dataclasses import dataclass, fields
+ from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, Optional

@@ -9,6 +9,7 @@ from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
  from unstructured_ingest.utils.dep_check import requires_dependencies
  from unstructured_ingest.v2.interfaces.process import BaseProcess
  from unstructured_ingest.v2.logger import logger
+ from unstructured_ingest.v2.unstructured_api import call_api

  CHUNK_MAX_CHARS_DEFAULT: int = 500
  CHUNK_MULTI_PAGE_DEFAULT: bool = True
@@ -111,35 +112,13 @@ class Chunker(BaseProcess, ABC):

  @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
  async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
- from unstructured_client import UnstructuredClient
- from unstructured_client.models.operations import PartitionRequest
- from unstructured_client.models.shared import Files, PartitionParameters
-
- client = UnstructuredClient(
- api_key_auth=self.config.chunk_api_key.get_secret_value(),
+ elements = await call_api(
  server_url=self.config.chunking_endpoint,
+ api_key=self.config.chunk_api_key.get_secret_value(),
+ filename=elements_filepath,
+ api_parameters=self.config.to_chunking_kwargs(),
  )
- partition_request = self.config.to_chunking_kwargs()
- possible_fields = [f.name for f in fields(PartitionParameters)]
- filtered_partition_request = {
- k: v for k, v in partition_request.items() if k in possible_fields
- }
- if len(filtered_partition_request) != len(partition_request):
- logger.debug(
- "Following fields were omitted due to not being "
- "supported by the currently used unstructured client: {}".format(
- ", ".join([v for v in partition_request if v not in filtered_partition_request])
- )
- )
- with open(elements_filepath, "rb") as f:
- files = Files(
- content=f.read(),
- file_name=str(elements_filepath.resolve()),
- )
- filtered_partition_request["files"] = files
- partition_params = PartitionParameters(**filtered_partition_request)
- partition_request_obj = PartitionRequest(partition_params)
- resp = client.general.partition(partition_request_obj)
- elements = resp.elements or []
+
  elements = assign_and_map_hash_ids(elements=elements)
+
  return elements
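
The chunker no longer builds the unstructured_client request inline; that logic moves behind a call_api helper in the new unstructured_ingest/v2/unstructured_api.py (+87 lines, not shown in this diff). A hypothetical sketch of such a helper, modeled on the removed code above; the real signature and behavior may differ:

    from pathlib import Path

    from unstructured_client import UnstructuredClient
    from unstructured_client.models.operations import PartitionRequest
    from unstructured_client.models.shared import Files, PartitionParameters

    async def call_api(server_url, api_key, filename: Path, api_parameters: dict) -> list[dict]:
        client = UnstructuredClient(api_key_auth=api_key, server_url=server_url)
        with open(filename, "rb") as f:
            files = Files(content=f.read(), file_name=str(filename.resolve()))
        # Unlike the removed code, this sketch does not filter api_parameters down to
        # the fields PartitionParameters actually supports.
        params = PartitionParameters(files=files, **api_parameters)
        resp = client.general.partition(PartitionRequest(params))
        return resp.elements or []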

unstructured_ingest/v2/processes/connectors/airtable.py

@@ -181,7 +181,7 @@ class AirtableIndexer(Indexer):
  yield FileData(
  identifier=table_meta.get_id(),
  connector_type=CONNECTOR_TYPE,
- additional_metadata=table_meta.dict(),
+ additional_metadata=table_meta.model_dump(),
  source_identifiers=SourceIdentifiers(
  filename=str(Path(fullpath).name),
  fullpath=fullpath,

unstructured_ingest/v2/processes/connectors/astradb.py

@@ -25,7 +25,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
  )

  if TYPE_CHECKING:
- from astrapy.db import AstraDBCollection
+ from astrapy import Collection as AstraDBCollection
+

  CONNECTOR_TYPE = "astradb"

@@ -85,7 +86,12 @@ class AstraDBUploaderConfig(UploaderConfig):
  embedding_dimension: int = Field(
  default=384, description="The dimensionality of the embeddings"
  )
- namespace: Optional[str] = Field(default=None, description="The Astra DB connection namespace.")
+ keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
+ namespace: Optional[str] = Field(
+ default=None,
+ description="The Astra DB connection namespace.",
+ deprecated="Please use 'keyspace' instead.",
+ )
  requested_indexing_policy: Optional[dict[str, Any]] = Field(
  default=None,
  description="The indexing policy to use for the collection.",
@@ -109,33 +115,34 @@ class AstraDBUploader(Uploader):

  @requires_dependencies(["astrapy"], extras="astradb")
  def get_collection(self) -> "AstraDBCollection":
- from astrapy.db import AstraDB
+ from astrapy import DataAPIClient as AstraDBClient

- # Get the collection_name and embedding dimension
- collection_name = self.upload_config.collection_name
- embedding_dimension = self.upload_config.embedding_dimension
- requested_indexing_policy = self.upload_config.requested_indexing_policy
+ # Choose keyspace or deprecated namespace
+ keyspace_param = self.upload_config.keyspace or self.upload_config.namespace

- # If the user has requested an indexing policy, pass it to the Astra DB
- options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None
+ # Get the collection_name
+ collection_name = self.upload_config.collection_name

  # Build the Astra DB object.
- # caller_name/version for AstraDB tracking
  access_configs = self.connection_config.access_config.get_secret_value()
- astra_db = AstraDB(
- api_endpoint=access_configs.api_endpoint,
- token=access_configs.token,
- namespace=self.upload_config.namespace,
+
+ # Create a client object to interact with the Astra DB
+ # caller_name/version for Astra DB tracking
+ my_client = AstraDBClient(
  caller_name=integration_name,
  caller_version=integration_version,
  )

- # Create and connect to the newly created collection
- astra_db_collection = astra_db.create_collection(
- collection_name=collection_name,
- dimension=embedding_dimension,
- options=options,
+ # Get the database object
+ astra_db = my_client.get_database(
+ api_endpoint=access_configs.api_endpoint,
+ token=access_configs.token,
+ keyspace=keyspace_param,
  )
+
+ # Connect to the newly created collection
+ astra_db_collection = astra_db.get_collection(name=collection_name)
+
  return astra_db_collection

  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
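
The uploader moves from the legacy astrapy.db.AstraDB interface to the astrapy Data API client, and it now connects to an existing collection instead of creating one. A minimal sketch of the new access pattern, with placeholder endpoint, token, keyspace, and collection values:

    from astrapy import DataAPIClient

    client = DataAPIClient()
    database = client.get_database(
        api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",
        token="AstraCS:...",
        keyspace="default_keyspace",
    )
    collection = database.get_collection(name="my_collection")
    collection.insert_one({"_id": "example-doc", "content": "hello"})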

unstructured_ingest/v2/processes/connectors/databricks_volumes.py

@@ -42,8 +42,10 @@ class DatabricksVolumesAccessConfig(AccessConfig):
  description="The Databricks password part of basic authentication. "
  "Only possible when Host is *.cloud.databricks.com (AWS).",
  )
- client_id: Optional[str] = Field(default=None)
- client_secret: Optional[str] = Field(default=None)
+ client_id: Optional[str] = Field(default=None, description="Client ID of the OAuth app.")
+ client_secret: Optional[str] = Field(
+ default=None, description="Client Secret of the OAuth app."
+ )
  token: Optional[str] = Field(
  default=None,
  description="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or "
@@ -128,7 +130,7 @@ class DatabricksVolumesUploader(Uploader):

  return WorkspaceClient(
  host=self.connection_config.host,
- **self.connection_config.access_config.get_secret_value().dict(),
+ **self.connection_config.access_config.get_secret_value().model_dump(),
  )

  def precheck(self) -> None:
@@ -140,11 +142,12 @@ class DatabricksVolumesUploader(Uploader):

  def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
  output_path = os.path.join(self.upload_config.path, path.name)
- self.get_client().files.upload(
- file_path=output_path,
- contents=path,
- overwrite=self.upload_config.overwrite,
- )
+ with open(path, "rb") as elements_file:
+ self.get_client().files.upload(
+ file_path=output_path,
+ contents=elements_file,
+ overwrite=self.upload_config.overwrite,
+ )


  databricks_volumes_destination_entry = DestinationRegistryEntry(
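
The upload fix passes an open binary file handle instead of a Path, since the Databricks SDK expects bytes or a file-like object for contents. A minimal sketch of the corrected call, with placeholder host, token, and volume path:

    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient(host="https://<workspace>.cloud.databricks.com", token="<PAT>")
    with open("elements.json", "rb") as f:
        w.files.upload(
            file_path="/Volumes/main/default/my_volume/elements.json",
            contents=f,
            overwrite=True,
        )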

unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -104,8 +104,8 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
  elif access_config.es_api_key:
  client_input_kwargs["api_key"] = access_config.es_api_key
  client_input = ElasticsearchClientInput(**client_input_kwargs)
- logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
- client_kwargs = client_input.dict()
+ logger.debug(f"elasticsearch client inputs mapped to: {client_input.model_dump()}")
+ client_kwargs = client_input.model_dump()
  client_kwargs["basic_auth"] = (
  client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
  )

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -2,12 +2,13 @@ from __future__ import annotations

  from dataclasses import dataclass, field
  from pathlib import Path
+ from time import time
  from typing import Any, Generator, Optional

  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
  from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
  SourceRegistryEntry,
@@ -84,7 +85,7 @@ class AzureConnectionConfig(FsspecConnectionConfig):
  def get_access_config(self) -> dict[str, Any]:
  # Avoid injecting None by filtering out k,v pairs where the value is None
  access_configs: dict[str, Any] = {
- k: v for k, v in self.access_config.get_secret_value().dict().items() if v
+ k: v for k, v in self.access_config.get_secret_value().model_dump().items() if v
  }
  return access_configs

@@ -99,14 +100,39 @@ class AzureIndexer(FsspecIndexer):
  def precheck(self) -> None:
  super().precheck()

- def sterilize_info(self, path) -> dict:
- info = self.fs.info(path=path)
- return sterilize_dict(data=info, default=azure_json_serial)
+ def sterilize_info(self, file_data: dict) -> dict:
+ return sterilize_dict(data=file_data, default=azure_json_serial)

  @requires_dependencies(["adlfs", "fsspec"], extras="azure")
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
  return super().run(**kwargs)

+ def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+ path = file_data["name"]
+ date_created = (
+ file_data.get("creation_time").timestamp() if "creation_time" in file_data else None
+ )
+ date_modified = (
+ file_data.get("last_modified").timestamp() if "last_modified" in file_data else None
+ )
+
+ file_size = file_data.get("size") if "size" in file_data else None
+
+ version = file_data.get("etag")
+ record_locator = {
+ "protocol": self.index_config.protocol,
+ "remote_file_path": self.index_config.remote_url,
+ }
+ return FileDataSourceMetadata(
+ date_created=date_created,
+ date_modified=date_modified,
+ date_processed=str(time()),
+ version=version,
+ url=f"{self.index_config.protocol}://{path}",
+ record_locator=record_locator,
+ filesize_bytes=file_size,
+ )
+

  class AzureDownloaderConfig(FsspecDownloaderConfig):
  pass
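
The new get_metadata hooks build FileDataSourceMetadata straight from the fsspec info dict for each file. An illustrative sketch of the kind of adlfs info dict this consumes (values are made up; key names follow the code above, and adlfs returns the timestamps as datetime objects, hence .timestamp()):

    from datetime import datetime, timezone

    example_info = {
        "name": "my-container/reports/q2.pdf",
        "size": 12_345,
        "etag": "0x8DCEXAMPLE",
        "creation_time": datetime(2024, 5, 1, tzinfo=timezone.utc),
        "last_modified": datetime(2024, 6, 1, tzinfo=timezone.utc),
    }

    date_modified = example_info["last_modified"].timestamp()  # stored as a float epoch value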

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -2,12 +2,14 @@ from __future__ import annotations

  from dataclasses import dataclass, field
  from pathlib import Path
+ from time import time
  from typing import Any, Generator, Optional

+ from dateutil import parser
  from pydantic import Field, Secret

  from unstructured_ingest.utils.dep_check import requires_dependencies
- from unstructured_ingest.v2.interfaces import DownloadResponse, FileData
+ from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, FileDataSourceMetadata
  from unstructured_ingest.v2.processes.connector_registry import (
  DestinationRegistryEntry,
  SourceRegistryEntry,
@@ -52,7 +54,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
  ac.box_app_config,
  ),
  }
- access_config: dict[str, Any] = ac.dict()
+ access_config: dict[str, Any] = ac.model_dump()
  access_config.pop("box_app_config", None)
  access_kwargs_with_oauth.update(access_config)

@@ -73,6 +75,33 @@ class BoxIndexer(FsspecIndexer):
  def precheck(self) -> None:
  super().precheck()

+ def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
+ path = file_data["name"]
+ date_created = None
+ date_modified = None
+ if modified_at_str := file_data.get("modified_at"):
+ date_modified = parser.parse(modified_at_str).timestamp()
+ if created_at_str := file_data.get("created_at"):
+ date_created = parser.parse(created_at_str).timestamp()
+
+ file_size = file_data.get("size") if "size" in file_data else None
+
+ version = file_data.get("id")
+ record_locator = {
+ "protocol": self.index_config.protocol,
+ "remote_file_path": self.index_config.remote_url,
+ "file_id": file_data.get("id"),
+ }
+ return FileDataSourceMetadata(
+ date_created=date_created,
+ date_modified=date_modified,
+ date_processed=str(time()),
+ version=version,
+ url=f"{self.index_config.protocol}://{path}",
+ record_locator=record_locator,
+ filesize_bytes=file_size,
+ )
+

  class BoxDownloaderConfig(FsspecDownloaderConfig):
  pass