unstructured-ingest 0.0.21__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. More details are available on the original registry page.

Files changed (41)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/embed/bedrock.py +56 -19
  3. unstructured_ingest/embed/huggingface.py +22 -22
  4. unstructured_ingest/embed/interfaces.py +11 -4
  5. unstructured_ingest/embed/mixedbreadai.py +17 -17
  6. unstructured_ingest/embed/octoai.py +7 -7
  7. unstructured_ingest/embed/openai.py +15 -20
  8. unstructured_ingest/embed/vertexai.py +25 -17
  9. unstructured_ingest/embed/voyageai.py +22 -17
  10. unstructured_ingest/v2/cli/base/cmd.py +1 -1
  11. unstructured_ingest/v2/interfaces/connector.py +1 -1
  12. unstructured_ingest/v2/pipeline/pipeline.py +3 -1
  13. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  14. unstructured_ingest/v2/pipeline/steps/download.py +6 -2
  15. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  16. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  17. unstructured_ingest/v2/pipeline/steps/index.py +4 -2
  18. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  19. unstructured_ingest/v2/pipeline/steps/stage.py +3 -1
  20. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  21. unstructured_ingest/v2/pipeline/steps/upload.py +6 -2
  22. unstructured_ingest/v2/processes/connectors/airtable.py +1 -1
  23. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +1 -1
  24. unstructured_ingest/v2/processes/connectors/elasticsearch.py +2 -2
  25. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +31 -5
  26. unstructured_ingest/v2/processes/connectors/fsspec/box.py +31 -2
  27. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +36 -8
  28. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +25 -77
  29. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +30 -1
  30. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +15 -18
  31. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +22 -1
  32. unstructured_ingest/v2/processes/connectors/milvus.py +2 -2
  33. unstructured_ingest/v2/processes/connectors/opensearch.py +2 -2
  34. unstructured_ingest/v2/utils.py +1 -1
  35. unstructured_ingest-0.0.22.dist-info/METADATA +186 -0
  36. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/RECORD +40 -40
  37. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/WHEEL +1 -1
  38. unstructured_ingest-0.0.21.dist-info/METADATA +0 -639
  39. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/LICENSE.md +0 -0
  40. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/entry_points.txt +0 -0
  41. {unstructured_ingest-0.0.21.dist-info → unstructured_ingest-0.0.22.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
1
- __version__ = "0.0.21" # pragma: no cover
1
+ __version__ = "0.0.22" # pragma: no cover
@@ -1,38 +1,43 @@
1
+ import json
2
+ import os
1
3
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, List
4
+ from typing import TYPE_CHECKING
3
5
 
4
6
  import numpy as np
5
- from pydantic import SecretStr
7
+ from pydantic import Field, SecretStr
6
8
 
7
9
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
8
10
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
11
 
10
12
  if TYPE_CHECKING:
11
- from langchain_community.embeddings import BedrockEmbeddings
13
+ from botocore.client import BaseClient
14
+
15
+ class BedrockClient(BaseClient):
16
+ def invoke_model(self, body: str, modelId: str, trace: str) -> dict:
17
+ pass
12
18
 
13
19
 
14
20
  class BedrockEmbeddingConfig(EmbeddingConfig):
15
21
  aws_access_key_id: SecretStr
16
22
  aws_secret_access_key: SecretStr
17
23
  region_name: str = "us-west-2"
24
+ embed_model_name: str = Field(default="amazon.titan-embed-text-v1", alias="model_name")
18
25
 
19
26
  @requires_dependencies(
20
- ["boto3", "numpy", "langchain_community"],
27
+ ["boto3", "numpy", "botocore"],
21
28
  extras="bedrock",
22
29
  )
23
- def get_client(self) -> "BedrockEmbeddings":
30
+ def get_client(self) -> "BedrockClient":
24
31
  # delay import only when needed
25
32
  import boto3
26
- from langchain_community.embeddings import BedrockEmbeddings
27
33
 
28
- bedrock_runtime = boto3.client(
34
+ bedrock_client = boto3.client(
29
35
  service_name="bedrock-runtime",
30
36
  aws_access_key_id=self.aws_access_key_id.get_secret_value(),
31
37
  aws_secret_access_key=self.aws_secret_access_key.get_secret_value(),
32
38
  region_name=self.region_name,
33
39
  )
34
40
 
35
- bedrock_client = BedrockEmbeddings(client=bedrock_runtime)
36
41
  return bedrock_client
37
42
 
38
43
 
@@ -40,28 +45,60 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
40
45
  class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
41
46
  config: BedrockEmbeddingConfig
42
47
 
43
- def get_exemplary_embedding(self) -> List[float]:
48
+ def get_exemplary_embedding(self) -> list[float]:
44
49
  return self.embed_query(query="Q")
45
50
 
46
- def num_of_dimensions(self):
51
+ def num_of_dimensions(self) -> tuple[int, ...]:
47
52
  exemplary_embedding = self.get_exemplary_embedding()
48
53
  return np.shape(exemplary_embedding)
49
54
 
50
- def is_unit_vector(self):
55
+ def is_unit_vector(self) -> bool:
51
56
  exemplary_embedding = self.get_exemplary_embedding()
52
57
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
53
58
 
54
- def embed_query(self, query):
55
- bedrock_client = self.config.get_client()
56
- return np.array(bedrock_client.embed_query(query))
57
-
58
- def embed_documents(self, elements: List[dict]) -> List[dict]:
59
- bedrock_client = self.config.get_client()
60
- embeddings = bedrock_client.embed_documents([e.get("text", "") for e in elements])
59
+ def embed_query(self, query: str) -> list[float]:
60
+ """Call out to Bedrock embedding endpoint."""
61
+ # replace newlines, which can negatively affect performance.
62
+ text = query.replace(os.linesep, " ")
63
+
64
+ # format input body for provider
65
+ provider = self.config.embed_model_name.split(".")[0]
66
+ input_body = {}
67
+ if provider == "cohere":
68
+ if "input_type" not in input_body:
69
+ input_body["input_type"] = "search_document"
70
+ input_body["texts"] = [text]
71
+ else:
72
+ # includes common provider == "amazon"
73
+ input_body["inputText"] = text
74
+ body = json.dumps(input_body)
75
+
76
+ try:
77
+ bedrock_client = self.config.get_client()
78
+ # invoke bedrock API
79
+ response = bedrock_client.invoke_model(
80
+ body=body,
81
+ modelId=self.config.embed_model_name,
82
+ accept="application/json",
83
+ contentType="application/json",
84
+ )
85
+
86
+ # format output based on provider
87
+ response_body = json.loads(response.get("body").read())
88
+ if provider == "cohere":
89
+ return response_body.get("embeddings")[0]
90
+ else:
91
+ # includes common provider == "amazon"
92
+ return response_body.get("embedding")
93
+ except Exception as e:
94
+ raise ValueError(f"Error raised by inference endpoint: {e}")
95
+
96
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
97
+ embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
61
98
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
62
99
  return elements_with_embeddings
63
100
 
64
- def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
101
+ def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
65
102
  assert len(elements) == len(embeddings)
66
103
  elements_w_embedding = []
67
104
  for i, element in enumerate(elements):
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, List, Optional
2
+ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  import numpy as np
5
5
  from pydantic import Field
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
8
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
9
 
10
10
  if TYPE_CHECKING:
11
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
11
+ from sentence_transformers import SentenceTransformer
12
12
 
13
13
 
14
14
  class HuggingFaceEmbeddingConfig(EmbeddingConfig):
@@ -19,51 +19,51 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
19
19
  default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
20
20
  )
21
21
  encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
22
- cache_folder: Optional[dict] = Field(default=None)
22
+ cache_folder: Optional[str] = Field(default=None)
23
23
 
24
24
  @requires_dependencies(
25
- ["langchain_huggingface"],
25
+ ["sentence_transformers"],
26
26
  extras="embed-huggingface",
27
27
  )
28
- def get_client(self) -> "HuggingFaceEmbeddings":
29
- """Creates a langchain Huggingface python client to embed elements."""
30
- from langchain_huggingface.embeddings import HuggingFaceEmbeddings
31
-
32
- client = HuggingFaceEmbeddings(
33
- model_name=self.embedder_model_name,
34
- model_kwargs=self.embedder_model_kwargs,
35
- encode_kwargs=self.encode_kwargs,
28
+ def get_client(self) -> "SentenceTransformer":
29
+ from sentence_transformers import SentenceTransformer
30
+
31
+ return SentenceTransformer(
32
+ model_name_or_path=self.embedder_model_name,
36
33
  cache_folder=self.cache_folder,
34
+ **self.embedder_model_kwargs,
37
35
  )
38
- return client
39
36
 
40
37
 
41
38
  @dataclass
42
39
  class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
43
40
  config: HuggingFaceEmbeddingConfig
44
41
 
45
- def get_exemplary_embedding(self) -> List[float]:
42
+ def get_exemplary_embedding(self) -> list[float]:
46
43
  return self.embed_query(query="Q")
47
44
 
48
- def num_of_dimensions(self):
45
+ def num_of_dimensions(self) -> tuple[int, ...]:
49
46
  exemplary_embedding = self.get_exemplary_embedding()
50
47
  return np.shape(exemplary_embedding)
51
48
 
52
- def is_unit_vector(self):
49
+ def is_unit_vector(self) -> bool:
53
50
  exemplary_embedding = self.get_exemplary_embedding()
54
51
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
55
52
 
56
- def embed_query(self, query):
57
- client = self.config.get_client()
58
- return client.embed_query(str(query))
53
+ def embed_query(self, query: str) -> list[float]:
54
+ return self._embed_documents(texts=[query])[0]
59
55
 
60
- def embed_documents(self, elements: List[dict]) -> List[dict]:
56
+ def _embed_documents(self, texts: list[str]) -> list[list[float]]:
61
57
  client = self.config.get_client()
62
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
58
+ embeddings = client.encode(texts, **self.config.encode_kwargs)
59
+ return embeddings.tolist()
60
+
61
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
62
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
63
63
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
64
64
  return elements_with_embeddings
65
65
 
66
- def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> List[dict]:
66
+ def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
67
67
  assert len(elements) == len(embeddings)
68
68
  elements_w_embedding = []
69
69
 
@@ -1,6 +1,5 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from dataclasses import dataclass
3
- from typing import List, Tuple
4
3
 
5
4
  from pydantic import BaseModel
6
5
 
@@ -19,7 +18,7 @@ class BaseEmbeddingEncoder(ABC):
19
18
 
20
19
  @property
21
20
  @abstractmethod
22
- def num_of_dimensions(self) -> Tuple[int]:
21
+ def num_of_dimensions(self) -> tuple[int, ...]:
23
22
  """Number of dimensions for the embedding vector."""
24
23
 
25
24
  @property
@@ -28,9 +27,17 @@ class BaseEmbeddingEncoder(ABC):
28
27
  """Denotes if the embedding vector is a unit vector."""
29
28
 
30
29
  @abstractmethod
31
- def embed_documents(self, elements: List[dict]) -> List[dict]:
30
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
32
31
  pass
33
32
 
34
33
  @abstractmethod
35
- def embed_query(self, query: str) -> List[float]:
34
+ def embed_query(self, query: str) -> list[float]:
36
35
  pass
36
+
37
+ def _embed_documents(self, elements: list[str]) -> list[list[float]]:
38
+ results = []
39
+ for text in elements:
40
+ response = self.embed_query(query=text)
41
+ results.append(response)
42
+
43
+ return results
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  from dataclasses import dataclass, field
3
- from typing import TYPE_CHECKING, List, Optional
3
+ from typing import TYPE_CHECKING, Optional
4
4
 
5
5
  import numpy as np
6
6
  from pydantic import Field, SecretStr
@@ -67,10 +67,10 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
67
67
 
68
68
  config: MixedbreadAIEmbeddingConfig
69
69
 
70
- _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
70
+ _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
71
71
  _request_options: Optional["RequestOptions"] = field(init=False, default=None)
72
72
 
73
- def get_exemplary_embedding(self) -> List[float]:
73
+ def get_exemplary_embedding(self) -> list[float]:
74
74
  """Get an exemplary embedding to determine dimensions and unit vector status."""
75
75
  return self._embed(["Q"])[0]
76
76
 
@@ -91,7 +91,7 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
91
91
  )
92
92
 
93
93
  @property
94
- def num_of_dimensions(self):
94
+ def num_of_dimensions(self) -> tuple[int, ...]:
95
95
  """Get the number of dimensions for the embeddings."""
96
96
  exemplary_embedding = self.get_exemplary_embedding()
97
97
  return np.shape(exemplary_embedding)
@@ -102,15 +102,15 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
102
102
  exemplary_embedding = self.get_exemplary_embedding()
103
103
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
104
104
 
105
- def _embed(self, texts: List[str]) -> List[List[float]]:
105
+ def _embed(self, texts: list[str]) -> list[list[float]]:
106
106
  """
107
107
  Embed a list of texts using the Mixedbread AI API.
108
108
 
109
109
  Args:
110
- texts (List[str]): List of texts to embed.
110
+ texts (list[str]): List of texts to embed.
111
111
 
112
112
  Returns:
113
- List[List[float]]: List of embeddings.
113
+ list[list[float]]: List of embeddings.
114
114
  """
115
115
  batch_size = BATCH_SIZE
116
116
  batch_itr = range(0, len(texts), batch_size)
@@ -132,17 +132,17 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
132
132
 
133
133
  @staticmethod
134
134
  def _add_embeddings_to_elements(
135
- elements: List[dict], embeddings: List[List[float]]
136
- ) -> List[dict]:
135
+ elements: list[dict], embeddings: list[list[float]]
136
+ ) -> list[dict]:
137
137
  """
138
138
  Add embeddings to elements.
139
139
 
140
140
  Args:
141
- elements (List[Element]): List of elements.
142
- embeddings (List[List[float]]): List of embeddings.
141
+ elements (list[Element]): List of elements.
142
+ embeddings (list[list[float]]): List of embeddings.
143
143
 
144
144
  Returns:
145
- List[Element]: Elements with embeddings added.
145
+ list[Element]: Elements with embeddings added.
146
146
  """
147
147
  assert len(elements) == len(embeddings)
148
148
  elements_w_embedding = []
@@ -151,20 +151,20 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
151
151
  elements_w_embedding.append(element)
152
152
  return elements
153
153
 
154
- def embed_documents(self, elements: List[dict]) -> List[dict]:
154
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
155
155
  """
156
156
  Embed a list of document elements.
157
157
 
158
158
  Args:
159
- elements (List[Element]): List of document elements.
159
+ elements (list[Element]): List of document elements.
160
160
 
161
161
  Returns:
162
- List[Element]: Elements with embeddings.
162
+ list[Element]: Elements with embeddings.
163
163
  """
164
164
  embeddings = self._embed([e.get("text", "") for e in elements])
165
165
  return self._add_embeddings_to_elements(elements, embeddings)
166
166
 
167
- def embed_query(self, query: str) -> List[float]:
167
+ def embed_query(self, query: str) -> list[float]:
168
168
  """
169
169
  Embed a query string.
170
170
 
@@ -172,6 +172,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
172
172
  query (str): Query string to embed.
173
173
 
174
174
  Returns:
175
- List[float]: Embedding of the query.
175
+ list[float]: Embedding of the query.
176
176
  """
177
177
  return self._embed([query])[0]
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass, field
2
- from typing import TYPE_CHECKING, List, Optional
2
+ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  import numpy as np
5
5
  from pydantic import Field, SecretStr
@@ -31,16 +31,16 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
31
31
  class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
32
32
  config: OctoAiEmbeddingConfig
33
33
  # Uses the OpenAI SDK
34
- _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
34
+ _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
35
35
 
36
- def get_exemplary_embedding(self) -> List[float]:
36
+ def get_exemplary_embedding(self) -> list[float]:
37
37
  return self.embed_query("Q")
38
38
 
39
- def num_of_dimensions(self):
39
+ def num_of_dimensions(self) -> tuple[int, ...]:
40
40
  exemplary_embedding = self.get_exemplary_embedding()
41
41
  return np.shape(exemplary_embedding)
42
42
 
43
- def is_unit_vector(self):
43
+ def is_unit_vector(self) -> bool:
44
44
  exemplary_embedding = self.get_exemplary_embedding()
45
45
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
46
46
 
@@ -49,12 +49,12 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
49
49
  response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
50
50
  return response.data[0].embedding
51
51
 
52
- def embed_documents(self, elements: List[dict]) -> List[dict]:
52
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
53
53
  embeddings = [self.embed_query(e.get("text", "")) for e in elements]
54
54
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
55
55
  return elements_with_embeddings
56
56
 
57
- def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
57
+ def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
58
58
  assert len(elements) == len(embeddings)
59
59
  elements_w_embedding = []
60
60
  for i, element in enumerate(elements):
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, List
2
+ from typing import TYPE_CHECKING
3
3
 
4
4
  import numpy as np
5
5
  from pydantic import Field, SecretStr
@@ -8,51 +8,46 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
8
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
9
 
10
10
  if TYPE_CHECKING:
11
- from langchain_openai.embeddings import OpenAIEmbeddings
11
+ from openai import OpenAI
12
12
 
13
13
 
14
14
  class OpenAIEmbeddingConfig(EmbeddingConfig):
15
15
  api_key: SecretStr
16
16
  embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
17
17
 
18
- @requires_dependencies(["langchain_openai"], extras="openai")
19
- def get_client(self) -> "OpenAIEmbeddings":
20
- """Creates a langchain OpenAI python client to embed elements."""
21
- from langchain_openai import OpenAIEmbeddings
18
+ @requires_dependencies(["openai"], extras="openai")
19
+ def get_client(self) -> "OpenAI":
20
+ from openai import OpenAI
22
21
 
23
- openai_client = OpenAIEmbeddings(
24
- openai_api_key=self.api_key.get_secret_value(),
25
- model=self.embedder_model_name, # type:ignore
26
- )
27
- return openai_client
22
+ return OpenAI(api_key=self.api_key.get_secret_value())
28
23
 
29
24
 
30
25
  @dataclass
31
26
  class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
32
27
  config: OpenAIEmbeddingConfig
33
28
 
34
- def get_exemplary_embedding(self) -> List[float]:
29
+ def get_exemplary_embedding(self) -> list[float]:
35
30
  return self.embed_query(query="Q")
36
31
 
37
- def num_of_dimensions(self):
32
+ def num_of_dimensions(self) -> tuple[int, ...]:
38
33
  exemplary_embedding = self.get_exemplary_embedding()
39
34
  return np.shape(exemplary_embedding)
40
35
 
41
- def is_unit_vector(self):
36
+ def is_unit_vector(self) -> bool:
42
37
  exemplary_embedding = self.get_exemplary_embedding()
43
38
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
44
39
 
45
- def embed_query(self, query):
40
+ def embed_query(self, query: str) -> list[float]:
46
41
  client = self.config.get_client()
47
- return client.embed_query(str(query))
42
+ response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
43
+ return response.data[0].embedding
48
44
 
49
- def embed_documents(self, elements: List[dict]) -> List[dict]:
50
- client = self.config.get_client()
51
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
45
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
46
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
52
47
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
53
48
  return elements_with_embeddings
54
49
 
55
- def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
50
+ def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
56
51
  assert len(elements) == len(embeddings)
57
52
  elements_w_embedding = []
58
53
  for i, element in enumerate(elements):
@@ -3,7 +3,7 @@ import json
3
3
  import os
4
4
  from dataclasses import dataclass
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Annotated, Any, List, Optional
6
+ from typing import TYPE_CHECKING, Annotated, Any, Optional
7
7
 
8
8
  import numpy as np
9
9
  from pydantic import Field, Secret, ValidationError
@@ -13,7 +13,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
13
13
  from unstructured_ingest.utils.dep_check import requires_dependencies
14
14
 
15
15
  if TYPE_CHECKING:
16
- from langchain_google_vertexai import VertexAIEmbeddings
16
+ from vertexai.language_models import TextEmbeddingModel
17
17
 
18
18
 
19
19
  def conform_string_to_dict(value: Any) -> dict:
@@ -41,45 +41,53 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
41
41
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(application_credentials_path)
42
42
 
43
43
  @requires_dependencies(
44
- ["langchain", "langchain_google_vertexai"],
44
+ ["vertexai"],
45
45
  extras="embed-vertexai",
46
46
  )
47
- def get_client(self) -> "VertexAIEmbeddings":
47
+ def get_client(self) -> "TextEmbeddingModel":
48
48
  """Creates a Langchain VertexAI python client to embed elements."""
49
- from langchain_google_vertexai import VertexAIEmbeddings
49
+ from vertexai.language_models import TextEmbeddingModel
50
50
 
51
51
  self.register_application_credentials()
52
- vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
53
- return vertexai_client
52
+ return TextEmbeddingModel.from_pretrained(self.embedder_model_name)
54
53
 
55
54
 
56
55
  @dataclass
57
56
  class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
58
57
  config: VertexAIEmbeddingConfig
59
58
 
60
- def get_exemplary_embedding(self) -> List[float]:
59
+ def get_exemplary_embedding(self) -> list[float]:
61
60
  return self.embed_query(query="A sample query.")
62
61
 
63
- def num_of_dimensions(self):
62
+ def num_of_dimensions(self) -> tuple[int, ...]:
64
63
  exemplary_embedding = self.get_exemplary_embedding()
65
64
  return np.shape(exemplary_embedding)
66
65
 
67
- def is_unit_vector(self):
66
+ def is_unit_vector(self) -> bool:
68
67
  exemplary_embedding = self.get_exemplary_embedding()
69
68
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
70
69
 
71
70
  def embed_query(self, query):
72
- client = self.config.get_client()
73
- result = client.embed_query(str(query))
74
- return result
71
+ return self._embed_documents(elements=[query])[0]
75
72
 
76
- def embed_documents(self, elements: List[dict]) -> List[dict]:
77
- client = self.config.get_client()
78
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
73
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
74
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
79
75
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
80
76
  return elements_with_embeddings
81
77
 
82
- def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
78
+ @requires_dependencies(
79
+ ["vertexai"],
80
+ extras="embed-vertexai",
81
+ )
82
+ def _embed_documents(self, elements: list[str]) -> list[list[float]]:
83
+ from vertexai.language_models import TextEmbeddingInput
84
+
85
+ client = self.config.get_client()
86
+ inputs = [TextEmbeddingInput(text=element) for element in elements]
87
+ embeddings = client.get_embeddings(inputs)
88
+ return [e.values for e in embeddings]
89
+
90
+ def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
83
91
  assert len(elements) == len(embeddings)
84
92
  elements_w_embedding = []
85
93
  for i, element in enumerate(elements):
@@ -1,5 +1,5 @@
1
1
  from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, List, Optional
2
+ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  import numpy as np
5
5
  from pydantic import Field, SecretStr
@@ -8,7 +8,7 @@ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, Embedding
8
8
  from unstructured_ingest.utils.dep_check import requires_dependencies
9
9
 
10
10
  if TYPE_CHECKING:
11
- from langchain_voyageai import VoyageAIEmbeddings
11
+ from voyageai import Client as VoyageAIClient
12
12
 
13
13
 
14
14
  class VoyageAIEmbeddingConfig(EmbeddingConfig):
@@ -16,28 +16,30 @@ class VoyageAIEmbeddingConfig(EmbeddingConfig):
16
16
  embedder_model_name: str = Field(alias="model_name")
17
17
  batch_size: Optional[int] = Field(default=None)
18
18
  truncation: Optional[bool] = Field(default=None)
19
+ max_retries: int = 0
20
+ timeout_in_seconds: Optional[int] = None
19
21
 
20
22
  @requires_dependencies(
21
23
  ["langchain", "langchain_voyageai"],
22
24
  extras="embed-voyageai",
23
25
  )
24
- def get_client(self) -> "VoyageAIEmbeddings":
26
+ def get_client(self) -> "VoyageAIClient":
25
27
  """Creates a Langchain VoyageAI python client to embed elements."""
26
- from langchain_voyageai import VoyageAIEmbeddings
28
+ from voyageai import Client as VoyageAIClient
27
29
 
28
- return VoyageAIEmbeddings(
29
- voyage_api_key=self.api_key,
30
- model=self.embedder_model_name,
31
- batch_size=self.batch_size,
32
- truncation=self.truncation,
30
+ client = VoyageAIClient(
31
+ api_key=self.api_key.get_secret_value(),
32
+ max_retries=self.max_retries,
33
+ timeout=self.timeout_in_seconds,
33
34
  )
35
+ return client
34
36
 
35
37
 
36
38
  @dataclass
37
39
  class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
38
40
  config: VoyageAIEmbeddingConfig
39
41
 
40
- def get_exemplary_embedding(self) -> List[float]:
42
+ def get_exemplary_embedding(self) -> list[float]:
41
43
  return self.embed_query(query="A sample query.")
42
44
 
43
45
  @property
@@ -50,17 +52,20 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
50
52
  exemplary_embedding = self.get_exemplary_embedding()
51
53
  return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
52
54
 
53
- def embed_documents(self, elements: List[dict]) -> List[dict]:
54
- client = self.config.get_client()
55
- embeddings = client.embed_documents([e.get("text", "") for e in elements])
55
+ def _embed_documents(self, elements: list[str]) -> list[list[float]]:
56
+ client: VoyageAIClient = self.config.get_client()
57
+ response = client.embed(texts=elements, model=self.config.embedder_model_name)
58
+ return response.embeddings
59
+
60
+ def embed_documents(self, elements: list[dict]) -> list[dict]:
61
+ embeddings = self._embed_documents([e.get("text", "") for e in elements])
56
62
  return self._add_embeddings_to_elements(elements, embeddings)
57
63
 
58
- def embed_query(self, query: str) -> List[float]:
59
- client = self.config.get_client()
60
- return client.embed_query(query)
64
+ def embed_query(self, query: str) -> list[float]:
65
+ return self._embed_documents(elements=[query])[0]
61
66
 
62
67
  @staticmethod
63
- def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
68
+ def _add_embeddings_to_elements(elements, embeddings) -> list[dict]:
64
69
  assert len(elements) == len(embeddings)
65
70
  elements_w_embedding = []
66
71
  for i, element in enumerate(elements):
@@ -155,7 +155,7 @@ class BaseCmd(ABC):
155
155
  @staticmethod
156
156
  def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
157
157
  filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
158
- if not filterer_configs.dict():
158
+ if not filterer_configs.model_dump():
159
159
  return None
160
160
  return Filterer(config=filterer_configs)
161
161
 
@@ -19,7 +19,7 @@ class ConnectionConfig(BaseModel):
19
19
  def get_access_config(self) -> dict[str, Any]:
20
20
  if not self.access_config:
21
21
  return {}
22
- return self.access_config.get_secret_value().dict()
22
+ return self.access_config.get_secret_value().model_dump()
23
23
 
24
24
 
25
25
  ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)