unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.14__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (28)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/embed/__init__.py +17 -0
  3. unstructured_ingest/embed/bedrock.py +70 -0
  4. unstructured_ingest/embed/huggingface.py +73 -0
  5. unstructured_ingest/embed/interfaces.py +36 -0
  6. unstructured_ingest/embed/mixedbreadai.py +177 -0
  7. unstructured_ingest/embed/octoai.py +63 -0
  8. unstructured_ingest/embed/openai.py +61 -0
  9. unstructured_ingest/embed/vertexai.py +88 -0
  10. unstructured_ingest/embed/voyageai.py +69 -0
  11. unstructured_ingest/interfaces.py +17 -7
  12. unstructured_ingest/pipeline/reformat/embedding.py +3 -5
  13. unstructured_ingest/utils/data_prep.py +20 -12
  14. unstructured_ingest/v2/cli/base/src.py +2 -1
  15. unstructured_ingest/v2/pipeline/interfaces.py +3 -1
  16. unstructured_ingest/v2/pipeline/pipeline.py +25 -23
  17. unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
  18. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -2
  19. unstructured_ingest/v2/processes/connectors/onedrive.py +6 -4
  20. unstructured_ingest/v2/processes/connectors/pinecone.py +37 -15
  21. unstructured_ingest/v2/processes/connectors/sharepoint.py +1 -1
  22. unstructured_ingest/v2/processes/embedder.py +41 -24
  23. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/METADATA +214 -211
  24. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/RECORD +28 -19
  25. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/LICENSE.md +0 -0
  26. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/WHEEL +0 -0
  27. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/entry_points.txt +0 -0
  28. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.14.dist-info}/top_level.txt +0 -0
unstructured_ingest/interfaces.py

@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
-    from unstructured.embed.interfaces import BaseEmbeddingEncoder
+
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
 
 A = TypeVar("A", bound="DataClassJsonMixin")
 
@@ -204,22 +205,31 @@ class EmbeddingConfig(BaseConfig):
         kwargs["model_name"] = self.model_name
         # TODO make this more dynamic to map to encoder configs
         if self.provider == "langchain-openai":
-            from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+            from unstructured_ingest.embed.openai import (
+                OpenAIEmbeddingConfig,
+                OpenAIEmbeddingEncoder,
+            )
 
             return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-huggingface":
-            from unstructured.embed.huggingface import (
+            from unstructured_ingest.embed.huggingface import (
                 HuggingFaceEmbeddingConfig,
                 HuggingFaceEmbeddingEncoder,
             )
 
             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
         elif self.provider == "octoai":
-            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+            from unstructured_ingest.embed.octoai import (
+                OctoAiEmbeddingConfig,
+                OctoAIEmbeddingEncoder,
+            )
 
             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-aws-bedrock":
-            from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+            from unstructured_ingest.embed.bedrock import (
+                BedrockEmbeddingConfig,
+                BedrockEmbeddingEncoder,
+            )
 
             return BedrockEmbeddingEncoder(
                 config=BedrockEmbeddingConfig(
@@ -229,14 +239,14 @@ class EmbeddingConfig(BaseConfig):
                 )
             )
         elif self.provider == "langchain-vertexai":
-            from unstructured.embed.vertexai import (
+            from unstructured_ingest.embed.vertexai import (
                 VertexAIEmbeddingConfig,
                 VertexAIEmbeddingEncoder,
             )
 
             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-voyageai":
-            from unstructured.embed.voyageai import (
+            from unstructured_ingest.embed.voyageai import (
                 VoyageAIEmbeddingConfig,
                 VoyageAIEmbeddingEncoder,
            )
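
The recurring change in this file, and across the release, is that the embedding encoders are now vendored under unstructured_ingest.embed instead of being imported from the unstructured package. A minimal, hedged sketch of what that means for a caller; the API key and model name below are placeholders, not values taken from this diff:

# Sketch only: encoders now live in unstructured_ingest.embed.* rather than unstructured.embed.*.
from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

encoder = OpenAIEmbeddingEncoder(
    config=OpenAIEmbeddingConfig(api_key="sk-...", model_name="text-embedding-3-small")
)
# Per the other hunks in this release, embed_documents() takes and returns plain element dicts.
element_dicts = encoder.embed_documents(elements=[{"text": "hello world"}])

Nothing else about the call sites changes; the EmbeddingConfig dispatch above simply resolves the same provider strings to the vendored modules.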
unstructured_ingest/pipeline/reformat/embedding.py

@@ -27,8 +27,6 @@ class Embedder(ReformatNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
 
     def run(self, elements_json: str) -> Optional[str]:
-        from unstructured.staging.base import elements_from_json
-
         try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
@@ -48,10 +46,10 @@ class Embedder(ReformatNode):
             ):
                 logger.debug(f"File exists: {json_path}, skipping embedding")
                 return str(json_path)
-            elements = elements_from_json(filename=elements_json)
+            with open(elements_json) as f:
+                elements = json.load(f)
             embedder = self.embedder_config.get_embedder()
-            embedded_elements = embedder.embed_documents(elements=elements)
-            element_dicts = [e.to_dict() for e in embedded_elements]
+            element_dicts = embedder.embed_documents(elements=elements)
             with open(json_path, "w", encoding="utf8") as output_f:
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
unstructured_ingest/utils/data_prep.py

@@ -1,12 +1,15 @@
 import itertools
 import json
 from datetime import datetime
-from typing import Any, Optional, Sequence, cast
+from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
+T = TypeVar("T")
+IterableT = Iterable[T]
 
-def batch_generator(iterable, batch_size=100):
+
+def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
     """A helper function to break an iterable into batches of size batch_size."""
     it = iter(iterable)
     chunk = tuple(itertools.islice(it, batch_size))
@@ -16,23 +19,28 @@ def batch_generator(iterable, batch_size=100):
 
 
 def generator_batching_wbytes(
-    iterable, batch_size_limit_bytes=15_000_000, max_batch_size: int = 1000
-):
+    iterable: IterableT,
+    batch_size_limit_bytes: Optional[int] = None,
+    max_batch_size: Optional[int] = None,
+) -> IterableT:
+    if not batch_size_limit_bytes and not max_batch_size:
+        return iterable
     """A helper function to break an iterable into chunks of specified bytes."""
     current_batch, current_batch_size = [], 0
 
     for item in iterable:
        item_size_bytes = len(json.dumps(item).encode("utf-8"))
-
-        if (
-            current_batch_size + item_size_bytes <= batch_size_limit_bytes
-            or len(current_batch) == 0  # prevent inifite yielding of empty batch
-        ) and len(current_batch) < max_batch_size:
-            current_batch.append(item)
-            current_batch_size += item_size_bytes
-        else:
+        if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+            yield current_batch
+            current_batch, current_batch_size = [item], item_size_bytes
+            continue
+        if max_batch_size and len(current_batch) + 1 > max_batch_size:
            yield current_batch
            current_batch, current_batch_size = [item], item_size_bytes
+            continue
+
+        current_batch.append(item)
+        current_batch_size += item_size_bytes
 
     if current_batch:
         yield current_batch
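
The reworked generator_batching_wbytes treats both limits as optional and applies them independently: a batch is flushed as soon as adding the next item would exceed the byte budget, or as soon as it would exceed the item-count cap. A small usage sketch with illustrative limit values only:

# Illustrative only: batch a list of element dicts by serialized size and by count.
from unstructured_ingest.utils.data_prep import batch_generator, generator_batching_wbytes

items = [{"text": "x" * (i % 500)} for i in range(1_000)]

# Flush whenever the serialized batch would exceed ~5 KB or 50 items, whichever comes first.
for batch in generator_batching_wbytes(iterable=items, batch_size_limit_bytes=5_000, max_batch_size=50):
    print(len(batch))

# The count-only helper is unchanged in behavior, just newly annotated.
for batch in batch_generator(items, batch_size=100):
    print(len(batch))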
unstructured_ingest/v2/cli/base/src.py

@@ -1,5 +1,6 @@
 import logging
 from dataclasses import dataclass, field
+from typing import Any
 
 import click
 from pydantic import BaseModel
@@ -47,7 +48,7 @@ class SrcCmd(BaseCmd):
         options = self.consolidate_options(options=options)
         return options
 
-    def cmd(self, ctx: click.Context, **options) -> None:
+    def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
         if ctx.invoked_subcommand:
             return
 
unstructured_ingest/v2/pipeline/interfaces.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import asyncio
 import logging
 import multiprocessing as mp
@@ -132,7 +134,7 @@ class PipelineStep(ABC):
     async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
         raise NotImplementedError
 
-    def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
+    def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
        kwargs = kwargs.copy()
        otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
        tracer = otel_handler.get_tracer()
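
Several modules in this release (this one, pipeline.py, and the OneDrive connector) add `from __future__ import annotations` before switching signatures to PEP 604 unions such as `Callable[..., Any] | None`. A short, package-independent illustration of why that pairing works on interpreters older than 3.10:

# With postponed evaluation of annotations (PEP 563), annotations are stored as strings
# and never evaluated at import time, so `X | None` in an annotation does not need the
# runtime union operator that only arrived in Python 3.10.
from __future__ import annotations

from typing import Callable, Optional


def run(_fn: Callable[..., object] | None = None) -> Optional[object]:
    return None if _fn is None else _fn()


print(run.__annotations__["_fn"])  # "Callable[..., object] | None" (a plain string)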
unstructured_ingest/v2/pipeline/pipeline.py

@@ -1,7 +1,9 @@
+from __future__ import annotations
+
 import logging
 import multiprocessing as mp
 from dataclasses import InitVar, dataclass, field
-from typing import Any, Optional, Union
+from typing import Any
 
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
@@ -48,33 +50,33 @@ class Pipeline:
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
 
-    chunker: InitVar[Optional[Chunker]] = None
-    chunker_step: ChunkStep = field(init=False, default=None)
+    chunker: InitVar[Chunker | None] = None
+    chunker_step: ChunkStep | None = field(init=False, default=None)
 
-    embedder: InitVar[Optional[Embedder]] = None
-    embedder_step: EmbedStep = field(init=False, default=None)
+    embedder: InitVar[Embedder | None] = None
+    embedder_step: EmbedStep | None = field(init=False, default=None)
 
-    stager: InitVar[Optional[UploadStager]] = None
-    stager_step: UploadStageStep = field(init=False, default=None)
+    stager: InitVar[UploadStager | None] = None
+    stager_step: UploadStageStep | None = field(init=False, default=None)
 
     uploader: InitVar[Uploader] = field(default=LocalUploader())
-    uploader_step: UploadStep = field(init=False, default=None)
+    uploader_step: UploadStep | None = field(init=False, default=None)
 
-    uncompress_step: UncompressStep = field(init=False, default=None)
+    uncompress_step: UncompressStep | None = field(init=False, default=None)
 
-    filterer: InitVar[Optional[Filterer]] = None
-    filter_step: FilterStep = field(init=False, default=None)
+    filterer: InitVar[Filterer | None] = None
+    filter_step: FilterStep | None = field(init=False, default=None)
 
     def __post_init__(
         self,
         indexer: IndexerT,
         downloader: DownloaderT,
         partitioner: Partitioner,
-        chunker: Chunker = None,
-        embedder: Embedder = None,
-        stager: UploadStager = None,
-        uploader: Uploader = None,
-        filterer: Filterer = None,
+        chunker: Chunker | None = None,
+        embedder: Embedder | None = None,
+        stager: UploadStager | None = None,
+        uploader: Uploader | None = None,
+        filterer: Filterer | None = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
@@ -136,7 +138,7 @@ class Pipeline:
         if self.context.status:
             raise PipelineError("Pipeline did not run successfully")
 
-    def clean_results(self, results: Optional[list[Union[Any, list[Any]]]]) -> Optional[list[Any]]:
+    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
         if not results:
             return None
         results = [r for r in results if r]
@@ -274,12 +276,12 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
-        filterer_config: FiltererConfig = None,
-        chunker_config: Optional[ChunkerConfig] = None,
-        embedder_config: Optional[EmbedderConfig] = None,
-        destination_connection_config: Optional[ConnectionConfig] = None,
-        stager_config: Optional[UploadStagerConfigT] = None,
-        uploader_config: Optional[UploaderConfigT] = None,
+        filterer_config: FiltererConfig | None = None,
+        chunker_config: ChunkerConfig | None = None,
+        embedder_config: EmbedderConfig | None = None,
+        destination_connection_config: ConnectionConfig | None = None,
+        stager_config: UploadStagerConfigT | None = None,
+        uploader_config: UploaderConfigT | None = None,
     ) -> "Pipeline":
         # Get registry key based on indexer config
         source_entry = {
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -41,9 +41,14 @@ class ChromaAccessConfig(AccessConfig):
     )
 
 
+SecretChromaAccessConfig = Secret[ChromaAccessConfig]
+
+
 class ChromaConnectionConfig(ConnectionConfig):
     collection_name: str = Field(description="The name of the Chroma collection to write into.")
-    access_config: Secret[ChromaAccessConfig]
+    access_config: SecretChromaAccessConfig = Field(
+        default=SecretChromaAccessConfig(secret_value=ChromaAccessConfig())
+    )
     path: Optional[str] = Field(
         default=None, description="Location where Chroma is persisted, if not connecting via http."
     )
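
With the aliased SecretChromaAccessConfig and its empty default, the access config no longer has to be passed explicitly. A hedged sketch of what construction can now look like for a locally persisted Chroma instance; the field names come from this hunk, the values are placeholders:

# Sketch only: access_config now defaults to an empty Secret-wrapped ChromaAccessConfig,
# so a local (path-based) destination needs nothing beyond the collection name and path.
from unstructured_ingest.v2.processes.connectors.chroma import ChromaConnectionConfig

connection_config = ChromaConnectionConfig(
    collection_name="my-elements",
    path="/tmp/chroma-db",
)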
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -28,8 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
-
-from .utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
 
 CONNECTOR_TYPE = "google_drive"
 
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass
 from pathlib import Path
@@ -103,7 +105,7 @@ class OnedriveIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")
 
-    def list_objects(self, folder, recursive) -> list["DriveItem"]:
+    def list_objects(self, folder: DriveItem, recursive: bool) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
         if not recursive:
@@ -139,12 +141,12 @@ class OnedriveIndexer(Indexer):
         server_path = file_path + "/" + filename
         rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
         date_modified_dt = (
-            parser.parse(drive_item.last_modified_datetime)
+            parser.parse(str(drive_item.last_modified_datetime))
             if drive_item.last_modified_datetime
             else None
         )
         date_created_at = (
-            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
+            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
         )
         return FileData(
             identifier=drive_item.id,
@@ -156,7 +158,7 @@ class OnedriveIndexer(Indexer):
             url=drive_item.parent_reference.path + "/" + drive_item.name,
             version=drive_item.etag,
             date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-            date_created=str(date_created_at.timestamp()) if date_modified_dt else None,
+            date_created=str(date_created_at.timestamp()) if date_created_at else None,
             date_processed=str(time()),
             record_locator={
                 "user_pname": self.connection_config.user_pname,
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -27,6 +27,7 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
+MAX_POOL_THREADS = 100
 
 
 class PineconeAccessConfig(AccessConfig):
@@ -45,7 +46,7 @@ class PineconeConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self) -> "PineconeIndex":
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
         from pinecone import Pinecone
 
         from unstructured_ingest import __version__ as unstructured_version
@@ -55,7 +56,7 @@ class PineconeConnectionConfig(ConnectionConfig):
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )
 
-        index = pc.Index(self.index_name)
+        index = pc.Index(name=self.index_name, **index_kwargs)
         logger.debug(f"Connected to index: {pc.describe_index(self.index_name)}")
         return index
 
@@ -65,7 +66,13 @@ class PineconeUploadStagerConfig(UploadStagerConfig):
 
 
 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = Field(default=100, description="Number of records per batch")
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )
 
 
 ALLOWED_FIELDS = (
@@ -149,29 +156,44 @@ class PineconeUploader(Uploader):
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["pinecone"], extras="pinecone")
-    def upsert_batch(self, batch):
+    def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException
 
-        try:
-            index = self.connection_config.get_index()
-            response = index.upsert(batch)
-        except PineconeApiException as api_error:
-            raise DestinationConnectionError(f"http error: {api_error}") from api_error
-        logger.debug(f"results: {response}")
+        chunks = list(
+            generator_batching_wbytes(
+                iterable=elements_dict,
+                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
+                max_batch_size=self.upload_config.batch_size,
+            )
+        )
+        logger.info(f"Split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+
+        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
+        if self.upload_config.pool_threads:
+            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
+        else:
+            pool_threads = max_pool_threads
+        index = self.connection_config.get_index(pool_threads=pool_threads)
+        with index:
+            async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
+            # Wait for and retrieve responses (this raises in case of error)
+            try:
+                results = [async_result.get() for async_result in async_results]
+            except PineconeApiException as api_error:
+                raise DestinationConnectionError(f"http error: {api_error}") from api_error
+            logger.debug(f"results: {results}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
-            f"writing document batches to destination"
+            f"writing a total of {len(elements_dict)} elements via"
+            f" document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )
 
-        for batch in generator_batching_wbytes(
-            elements_dict, MAX_PAYLOAD_SIZE - 100, self.upload_config.batch_size
-        ):
-            self.upsert_batch(batch=batch)
+        self.upsert_batches_async(elements_dict=elements_dict)
 
 
 pinecone_destination_entry = DestinationRegistryEntry(
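
The uploader now splits the element dicts into payload-size-limited chunks up front and upserts them concurrently through the Pinecone client's thread pool, instead of one synchronous upsert per batch. A hedged sketch of the same pattern outside the connector; the API key, index name, and vectors are placeholders, while the client calls mirror the ones in this hunk:

# Sketch of the pooled, async_req-based upsert pattern used by upsert_batches_async().
from pinecone import Pinecone

from unstructured_ingest.utils.data_prep import generator_batching_wbytes

MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # mirror of the connector's 2MB request cap

vectors = [{"id": f"v{i}", "values": [0.1, 0.2, 0.3], "metadata": {"n": i}} for i in range(1_000)]
chunks = list(generator_batching_wbytes(iterable=vectors, batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100))

pc = Pinecone(api_key="...")
# One thread per chunk, capped the same way MAX_POOL_THREADS caps it in the connector.
index = pc.Index(name="my-index", pool_threads=min(len(chunks), 100))
with index:
    # async_req=True returns handles immediately; .get() blocks and re-raises API errors.
    async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
    results = [r.get() for r in async_results]
print(f"upserted {len(results)} batches")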
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -139,7 +139,7 @@ class SharepointConnectionConfig(ConnectionConfig):
 
 class SharepointIndexerConfig(IndexerConfig):
     path: Optional[str] = Field(
-        defaul=None,
+        default=None,
         description="Path from which to start parsing files. If the connector is to \
             process all sites within the tenant this filter will be applied to \
             all sites document libraries.",
unstructured_ingest/v2/processes/embedder.py

@@ -1,3 +1,4 @@
+import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
@@ -5,11 +6,10 @@ from typing import TYPE_CHECKING, Any, Literal, Optional
 
 from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
 if TYPE_CHECKING:
-    from unstructured.embed.interfaces import BaseEmbeddingEncoder
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
 
 
 class EmbedderConfig(BaseModel):
@@ -21,6 +21,7 @@ class EmbedderConfig(BaseModel):
             "langchain-vertexai",
             "langchain-voyageai",
             "octoai",
+            "mixedbread-ai",
         ]
     ] = Field(default=None, description="Type of the embedding class to be used.")
     embedding_api_key: Optional[SecretStr] = Field(
@@ -42,30 +43,31 @@ class EmbedderConfig(BaseModel):
         default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
     )
 
-    @requires_dependencies(dependencies=["unstructured"], extras="embed-huggingface")
     def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from unstructured.embed.huggingface import (
+        from unstructured_ingest.embed.huggingface import (
             HuggingFaceEmbeddingConfig,
             HuggingFaceEmbeddingEncoder,
         )
 
-        return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**embedding_kwargs))
+        return HuggingFaceEmbeddingEncoder(
+            config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs)
+        )
 
-    @requires_dependencies(dependencies=["unstructured"], extras="openai")
     def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
 
-        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**embedding_kwargs))
+        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig.model_validate(embedding_kwargs))
 
-    @requires_dependencies(dependencies=["unstructured"], extras="embed-octoai")
     def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
 
-        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**embedding_kwargs))
+        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))
 
-    @requires_dependencies(dependencies=["unstructured"], extras="bedrock")
     def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
-        from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+        from unstructured_ingest.embed.bedrock import (
+            BedrockEmbeddingConfig,
+            BedrockEmbeddingEncoder,
+        )
 
         return BedrockEmbeddingEncoder(
             config=BedrockEmbeddingConfig(
@@ -75,20 +77,35 @@ class EmbedderConfig(BaseModel):
             )
         )
 
-    @requires_dependencies(dependencies=["unstructured"], extras="embed-vertexai")
     def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from unstructured.embed.vertexai import (
+        from unstructured_ingest.embed.vertexai import (
             VertexAIEmbeddingConfig,
             VertexAIEmbeddingEncoder,
         )
 
-        return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**embedding_kwargs))
+        return VertexAIEmbeddingEncoder(
+            config=VertexAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )
 
-    @requires_dependencies(dependencies=["unstructured"], extras="embed-voyageai")
     def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
-        from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+        from unstructured_ingest.embed.voyageai import (
+            VoyageAIEmbeddingConfig,
+            VoyageAIEmbeddingEncoder,
+        )
+
+        return VoyageAIEmbeddingEncoder(
+            config=VoyageAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )
 
-        return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**embedding_kwargs))
+    def get_mixedbread_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured_ingest.embed.mixedbreadai import (
+            MixedbreadAIEmbeddingConfig,
+            MixedbreadAIEmbeddingEncoder,
+        )
+
+        return MixedbreadAIEmbeddingEncoder(
+            config=MixedbreadAIEmbeddingConfig.model_validate(embedding_kwargs)
+        )
 
     def get_embedder(self) -> "BaseEmbeddingEncoder":
         kwargs: dict[str, Any] = {}
@@ -114,6 +131,8 @@ class EmbedderConfig(BaseModel):
 
         if self.embedding_provider == "langchain-voyageai":
             return self.get_voyageai_embedder(embedding_kwargs=kwargs)
+        if self.embedding_provider == "mixedbread-ai":
+            return self.get_mixedbread_embedder(embedding_kwargs=kwargs)
 
         raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
@@ -122,14 +141,12 @@ class EmbedderConfig(BaseModel):
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig
 
-    @requires_dependencies(dependencies=["unstructured"])
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
-        from unstructured.staging.base import elements_from_json
-
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
-        elements = elements_from_json(filename=str(elements_filepath))
+        with elements_filepath.open("r") as elements_file:
+            elements = json.load(elements_file)
         if not elements:
            return [e.to_dict() for e in elements]
        embedded_elements = embedder.embed_documents(elements=elements)
-        return [e.to_dict() for e in embedded_elements]
+        return embedded_elements
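
The v2 embedder drops its runtime dependency on the unstructured package: encoders are imported from unstructured_ingest.embed, configs are built with pydantic's model_validate, elements are read with plain json.load, and embed_documents() returns dicts as-is. Registering "mixedbread-ai" as a provider means it can be selected like the others; a hedged sketch, using only fields visible in this diff, with a placeholder API key and input path:

# Sketch only: pick the new provider through the v2 config and embed already-partitioned
# element dicts; embed_documents() now returns plain dicts rather than Element objects.
import json
from pathlib import Path

from unstructured_ingest.v2.processes.embedder import EmbedderConfig

config = EmbedderConfig(
    embedding_provider="mixedbread-ai",
    embedding_api_key="mxb-...",  # placeholder
)
encoder = config.get_embedder()

elements = json.loads(Path("structured-output/example.json").read_text())  # placeholder path
embedded = encoder.embed_documents(elements=elements)
print(len(embedded))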