unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff shows the changes between package versions as they were published to their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.

Files changed (82)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +1 -1
  3. unstructured_ingest/cli/utils.py +1 -1
  4. unstructured_ingest/connector/astradb.py +1 -1
  5. unstructured_ingest/connector/biomed.py +4 -4
  6. unstructured_ingest/connector/chroma.py +1 -1
  7. unstructured_ingest/connector/databricks_volumes.py +2 -2
  8. unstructured_ingest/connector/fsspec/box.py +1 -1
  9. unstructured_ingest/connector/fsspec/fsspec.py +5 -5
  10. unstructured_ingest/connector/git.py +1 -1
  11. unstructured_ingest/connector/google_drive.py +4 -4
  12. unstructured_ingest/connector/hubspot.py +1 -1
  13. unstructured_ingest/connector/kafka.py +8 -8
  14. unstructured_ingest/connector/local.py +1 -1
  15. unstructured_ingest/connector/notion/helpers.py +4 -4
  16. unstructured_ingest/connector/onedrive.py +3 -3
  17. unstructured_ingest/connector/outlook.py +2 -2
  18. unstructured_ingest/connector/pinecone.py +1 -1
  19. unstructured_ingest/connector/sharepoint.py +8 -8
  20. unstructured_ingest/connector/vectara.py +6 -6
  21. unstructured_ingest/embed/__init__.py +17 -0
  22. unstructured_ingest/embed/bedrock.py +70 -0
  23. unstructured_ingest/embed/huggingface.py +73 -0
  24. unstructured_ingest/embed/interfaces.py +36 -0
  25. unstructured_ingest/embed/mixedbreadai.py +177 -0
  26. unstructured_ingest/embed/octoai.py +63 -0
  27. unstructured_ingest/embed/openai.py +61 -0
  28. unstructured_ingest/embed/vertexai.py +88 -0
  29. unstructured_ingest/embed/voyageai.py +69 -0
  30. unstructured_ingest/interfaces.py +21 -11
  31. unstructured_ingest/logger.py +1 -1
  32. unstructured_ingest/pipeline/copy.py +1 -1
  33. unstructured_ingest/pipeline/interfaces.py +2 -2
  34. unstructured_ingest/pipeline/partition.py +1 -1
  35. unstructured_ingest/pipeline/pipeline.py +1 -1
  36. unstructured_ingest/pipeline/reformat/chunking.py +2 -2
  37. unstructured_ingest/pipeline/reformat/embedding.py +4 -6
  38. unstructured_ingest/pipeline/source.py +2 -2
  39. unstructured_ingest/utils/compression.py +3 -3
  40. unstructured_ingest/utils/data_prep.py +20 -12
  41. unstructured_ingest/utils/string_and_date_utils.py +2 -2
  42. unstructured_ingest/v2/cli/base/cmd.py +3 -3
  43. unstructured_ingest/v2/cli/base/dest.py +1 -1
  44. unstructured_ingest/v2/cli/base/src.py +3 -2
  45. unstructured_ingest/v2/cli/utils/click.py +1 -1
  46. unstructured_ingest/v2/interfaces/processor.py +48 -13
  47. unstructured_ingest/v2/logger.py +1 -1
  48. unstructured_ingest/v2/otel.py +1 -1
  49. unstructured_ingest/v2/pipeline/interfaces.py +12 -3
  50. unstructured_ingest/v2/pipeline/pipeline.py +42 -29
  51. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
  52. unstructured_ingest/v2/pipeline/steps/download.py +17 -2
  53. unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
  54. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  55. unstructured_ingest/v2/pipeline/steps/index.py +2 -2
  56. unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
  57. unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
  58. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  60. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  61. unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
  66. unstructured_ingest/v2/processes/connectors/local.py +6 -5
  67. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
  69. unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
  70. unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
  71. unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
  72. unstructured_ingest/v2/processes/embedder.py +41 -24
  73. unstructured_ingest/v2/processes/filter.py +1 -1
  74. unstructured_ingest/v2/processes/partitioner.py +3 -3
  75. unstructured_ingest/v2/utils.py +7 -0
  76. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
  77. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
  78. unstructured_ingest/evaluate.py +0 -338
  79. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
  80. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
  81. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
  82. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0

unstructured_ingest/embed/interfaces.py
@@ -0,0 +1,36 @@
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+ from typing import List, Tuple
+
+ from pydantic import BaseModel
+
+
+ class EmbeddingConfig(BaseModel):
+     pass
+
+
+ @dataclass
+ class BaseEmbeddingEncoder(ABC):
+     config: EmbeddingConfig
+
+     def initialize(self):
+         """Initializes the embedding encoder class. Should also validate the instance
+         is properly configured: e.g., embed a single element"""
+
+     @property
+     @abstractmethod
+     def num_of_dimensions(self) -> Tuple[int]:
+         """Number of dimensions for the embedding vector."""
+
+     @property
+     @abstractmethod
+     def is_unit_vector(self) -> bool:
+         """Denotes if the embedding vector is a unit vector."""
+
+     @abstractmethod
+     def embed_documents(self, elements: List[dict]) -> List[dict]:
+         pass
+
+     @abstractmethod
+     def embed_query(self, query: str) -> List[float]:
+         pass
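
The interfaces.py hunk above defines the contract that every encoder added in this release implements. As a minimal sketch (not part of the diff; the class name and its constant vectors are invented for illustration), a conforming encoder only needs the two abstract properties plus the two embed methods:

from dataclasses import dataclass
from typing import List, Tuple

from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig


@dataclass
class ConstantEmbeddingEncoder(BaseEmbeddingEncoder):
    config: EmbeddingConfig

    @property
    def num_of_dimensions(self) -> Tuple[int]:
        return (3,)

    @property
    def is_unit_vector(self) -> bool:
        # (1, 0, 0) has norm 1, so this toy encoder does emit unit vectors
        return True

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        # the real encoders in this release all attach vectors under the "embeddings" key
        for element in elements:
            element["embeddings"] = self.embed_query(element.get("text", ""))
        return elements

    def embed_query(self, query: str) -> List[float]:
        return [1.0, 0.0, 0.0]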

unstructured_ingest/embed/mixedbreadai.py
@@ -0,0 +1,177 @@
+ import os
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, List, Optional
+
+ import numpy as np
+ from pydantic import Field, SecretStr
+
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ USER_AGENT = "@mixedbread-ai/unstructured"
+ BATCH_SIZE = 128
+ TIMEOUT = 60
+ MAX_RETRIES = 3
+ ENCODING_FORMAT = "float"
+ TRUNCATION_STRATEGY = "end"
+
+
+ if TYPE_CHECKING:
+     from mixedbread_ai.client import MixedbreadAI
+     from mixedbread_ai.core import RequestOptions
+
+
+ class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
+     """
+     Configuration class for Mixedbread AI Embedding Encoder.
+
+     Attributes:
+         api_key (str): API key for accessing Mixedbread AI.
+         embedder_model_name (str): Name of the model to use for embeddings.
+     """
+
+     api_key: SecretStr = Field(
+         default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")),
+     )
+
+     embedder_model_name: str = Field(
+         default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
+     )
+
+     @requires_dependencies(
+         ["mixedbread_ai"],
+         extras="embed-mixedbreadai",
+     )
+     def get_client(self) -> "MixedbreadAI":
+         """
+         Create the Mixedbread AI client.
+
+         Returns:
+             MixedbreadAI: Initialized client.
+         """
+         from mixedbread_ai.client import MixedbreadAI
+
+         return MixedbreadAI(
+             api_key=self.api_key.get_secret_value(),
+         )
+
+
+ @dataclass
+ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
+     """
+     Embedding encoder for Mixedbread AI.
+
+     Attributes:
+         config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
+     """
+
+     config: MixedbreadAIEmbeddingConfig
+
+     _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+     _request_options: Optional["RequestOptions"] = field(init=False, default=None)
+
+     def get_exemplary_embedding(self) -> List[float]:
+         """Get an exemplary embedding to determine dimensions and unit vector status."""
+         return self._embed(["Q"])[0]
+
+     def initialize(self):
+         if self.config.api_key is None:
+             raise ValueError(
+                 "The Mixedbread AI API key must be specified. "
+                 "You can either pass it in the constructor using 'api_key' "
+                 "or via the 'MXBAI_API_KEY' environment variable."
+             )
+
+         from mixedbread_ai.core import RequestOptions
+
+         self._request_options = RequestOptions(
+             max_retries=MAX_RETRIES,
+             timeout_in_seconds=TIMEOUT,
+             additional_headers={"User-Agent": USER_AGENT},
+         )
+
+     @property
+     def num_of_dimensions(self):
+         """Get the number of dimensions for the embeddings."""
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.shape(exemplary_embedding)
+
+     @property
+     def is_unit_vector(self) -> bool:
+         """Check if the embedding is a unit vector."""
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+     def _embed(self, texts: List[str]) -> List[List[float]]:
+         """
+         Embed a list of texts using the Mixedbread AI API.
+
+         Args:
+             texts (List[str]): List of texts to embed.
+
+         Returns:
+             List[List[float]]: List of embeddings.
+         """
+         batch_size = BATCH_SIZE
+         batch_itr = range(0, len(texts), batch_size)
+
+         responses = []
+         client = self.config.get_client()
+         for i in batch_itr:
+             batch = texts[i : i + batch_size]
+             response = client.embeddings(
+                 model=self.config.embedder_model_name,
+                 normalized=True,
+                 encoding_format=ENCODING_FORMAT,
+                 truncation_strategy=TRUNCATION_STRATEGY,
+                 request_options=self._request_options,
+                 input=batch,
+             )
+             responses.append(response)
+         return [item.embedding for response in responses for item in response.data]
+
+     @staticmethod
+     def _add_embeddings_to_elements(
+         elements: List[dict], embeddings: List[List[float]]
+     ) -> List[dict]:
+         """
+         Add embeddings to elements.
+
+         Args:
+             elements (List[dict]): List of elements.
+             embeddings (List[List[float]]): List of embeddings.
+
+         Returns:
+             List[dict]: Elements with embeddings added.
+         """
+         assert len(elements) == len(embeddings)
+         elements_w_embedding = []
+         for i, element in enumerate(elements):
+             element["embeddings"] = embeddings[i]
+             elements_w_embedding.append(element)
+         return elements
+
+     def embed_documents(self, elements: List[dict]) -> List[dict]:
+         """
+         Embed a list of document elements.
+
+         Args:
+             elements (List[dict]): List of document elements.
+
+         Returns:
+             List[dict]: Elements with embeddings.
+         """
+         embeddings = self._embed([e.get("text", "") for e in elements])
+         return self._add_embeddings_to_elements(elements, embeddings)
+
+     def embed_query(self, query: str) -> List[float]:
+         """
+         Embed a query string.
+
+         Args:
+             query (str): Query string to embed.
+
+         Returns:
+             List[float]: Embedding of the query.
+         """
+         return self._embed([query])[0]
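
Assuming the embed-mixedbreadai extra is installed and MXBAI_API_KEY is set, usage of this new encoder would look roughly like the following sketch (the element dicts are invented; real ones come out of partitioning):

import os

from unstructured_ingest.embed.mixedbreadai import (
    MixedbreadAIEmbeddingConfig,
    MixedbreadAIEmbeddingEncoder,
)

elements = [{"text": "Hello world"}, {"text": "Goodbye world"}]  # hypothetical elements

encoder = MixedbreadAIEmbeddingEncoder(
    config=MixedbreadAIEmbeddingConfig(api_key=os.environ["MXBAI_API_KEY"])
)
encoder.initialize()  # validates the key and builds the shared RequestOptions
embedded = encoder.embed_documents(elements)  # _embed() batches 128 texts per request
print(embedded[0]["embeddings"][:5])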

unstructured_ingest/embed/octoai.py
@@ -0,0 +1,63 @@
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, List, Optional
+
+ import numpy as np
+ from pydantic import Field, SecretStr
+
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from openai import OpenAI
+
+
+ class OctoAiEmbeddingConfig(EmbeddingConfig):
+     api_key: SecretStr
+     embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
+     base_url: str = Field(default="https://text.octoai.run/v1")
+
+     @requires_dependencies(
+         ["openai", "tiktoken"],
+         extras="embed-octoai",
+     )
+     def get_client(self) -> "OpenAI":
+         """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
+         from openai import OpenAI
+
+         return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
+
+
+ @dataclass
+ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
+     config: OctoAiEmbeddingConfig
+     # Uses the OpenAI SDK
+     _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+
+     def get_exemplary_embedding(self) -> List[float]:
+         return self.embed_query("Q")
+
+     def num_of_dimensions(self):
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.shape(exemplary_embedding)
+
+     def is_unit_vector(self):
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+     def embed_query(self, query: str):
+         client = self.config.get_client()
+         response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+         return response.data[0].embedding
+
+     def embed_documents(self, elements: List[dict]) -> List[dict]:
+         embeddings = [self.embed_query(e.get("text", "")) for e in elements]
+         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+         return elements_with_embeddings
+
+     def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+         assert len(elements) == len(embeddings)
+         elements_w_embedding = []
+         for i, element in enumerate(elements):
+             element["embeddings"] = embeddings[i]
+             elements_w_embedding.append(element)
+         return elements
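
Worth noting in octoai.py: OctoAI's embeddings endpoint is OpenAI-compatible, so the encoder reuses the OpenAI SDK and only swaps base_url. A hedged usage sketch (the key is a placeholder):

from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="octoai-..."))
vector = encoder.embed_query("Q")  # served from https://text.octoai.run/v1 via the OpenAI client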

unstructured_ingest/embed/openai.py
@@ -0,0 +1,61 @@
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, List
+
+ import numpy as np
+ from pydantic import Field, SecretStr
+
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from langchain_openai.embeddings import OpenAIEmbeddings
+
+
+ class OpenAIEmbeddingConfig(EmbeddingConfig):
+     api_key: SecretStr
+     embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
+
+     @requires_dependencies(["langchain_openai"], extras="openai")
+     def get_client(self) -> "OpenAIEmbeddings":
+         """Creates a langchain OpenAI python client to embed elements."""
+         from langchain_openai import OpenAIEmbeddings
+
+         openai_client = OpenAIEmbeddings(
+             openai_api_key=self.api_key.get_secret_value(),
+             model=self.embedder_model_name,  # type:ignore
+         )
+         return openai_client
+
+
+ @dataclass
+ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
+     config: OpenAIEmbeddingConfig
+
+     def get_exemplary_embedding(self) -> List[float]:
+         return self.embed_query(query="Q")
+
+     def num_of_dimensions(self):
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.shape(exemplary_embedding)
+
+     def is_unit_vector(self):
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+     def embed_query(self, query):
+         client = self.config.get_client()
+         return client.embed_query(str(query))
+
+     def embed_documents(self, elements: List[dict]) -> List[dict]:
+         client = self.config.get_client()
+         embeddings = client.embed_documents([e.get("text", "") for e in elements])
+         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+         return elements_with_embeddings
+
+     def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+         assert len(elements) == len(embeddings)
+         elements_w_embedding = []
+         for i, element in enumerate(elements):
+             element["embeddings"] = embeddings[i]
+             elements_w_embedding.append(element)
+         return elements
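
The OpenAI encoder delegates to langchain_openai rather than calling the API directly, and its config field is populated through the model_name alias. A brief sketch with a placeholder key:

from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

encoder = OpenAIEmbeddingEncoder(
    config=OpenAIEmbeddingConfig(api_key="sk-...", model_name="text-embedding-ada-002")
)
query_vector = encoder.embed_query("What changed in 0.0.15?")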

unstructured_ingest/embed/vertexai.py
@@ -0,0 +1,88 @@
+ # type: ignore
+ import json
+ import os
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Annotated, Any, List, Optional
+
+ import numpy as np
+ from pydantic import Field, Secret, ValidationError
+ from pydantic.functional_validators import BeforeValidator
+ from unstructured.utils import FileHandler
+
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from langchain_google_vertexai import VertexAIEmbeddings
+
+
+ def conform_string_to_dict(value: Any) -> dict:
+     if isinstance(value, dict):
+         return value
+     if isinstance(value, str):
+         return json.loads(value)
+     raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
+
+
+ ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
+
+
+ class VertexAIEmbeddingConfig(EmbeddingConfig):
+     api_key: ApiKeyType
+     embedder_model_name: Optional[str] = Field(
+         default="textembedding-gecko@001", alias="model_name"
+     )
+
+     def register_application_credentials(self):
+         # TODO look into passing credentials in directly, rather than via env var and tmp file
+         application_credentials_path = os.path.join("/tmp", "google-vertex-app-credentials.json")
+         credentials_file = FileHandler(application_credentials_path)
+         credentials_file.write_file(json.dumps(self.api_key.get_secret_value()))
+         os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = application_credentials_path
+
+     @requires_dependencies(
+         ["langchain", "langchain_google_vertexai"],
+         extras="embed-vertexai",
+     )
+     def get_client(self) -> "VertexAIEmbeddings":
+         """Creates a Langchain VertexAI python client to embed elements."""
+         from langchain_google_vertexai import VertexAIEmbeddings
+
+         self.register_application_credentials()
+         vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
+         return vertexai_client
+
+
+ @dataclass
+ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
+     config: VertexAIEmbeddingConfig
+
+     def get_exemplary_embedding(self) -> List[float]:
+         return self.embed_query(query="A sample query.")
+
+     def num_of_dimensions(self):
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.shape(exemplary_embedding)
+
+     def is_unit_vector(self):
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+     def embed_query(self, query):
+         client = self.config.get_client()
+         result = client.embed_query(str(query))
+         return result
+
+     def embed_documents(self, elements: List[dict]) -> List[dict]:
+         client = self.config.get_client()
+         embeddings = client.embed_documents([e.get("text", "") for e in elements])
+         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+         return elements_with_embeddings
+
+     def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+         assert len(elements) == len(embeddings)
+         elements_w_embedding = []
+         for i, element in enumerate(elements):
+             element["embeddings"] = embeddings[i]
+             elements_w_embedding.append(element)
+         return elements
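
The VertexAI config accepts the service-account key either as a dict or as a JSON string (conform_string_to_dict handles the coercion), and get_client() materializes it to /tmp/google-vertex-app-credentials.json and points GOOGLE_APPLICATION_CREDENTIALS at it before building the langchain client. A sketch with a fabricated key:

import json

from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder

# fabricated, truncated service-account payload for illustration only
service_account_json = json.dumps({"type": "service_account", "project_id": "my-project"})

encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key=service_account_json))
vector = encoder.embed_query("A sample query.")  # writes the temp credentials file on first client build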

unstructured_ingest/embed/voyageai.py
@@ -0,0 +1,69 @@
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, List, Optional
+
+ import numpy as np
+ from pydantic import Field, SecretStr
+
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+ from unstructured_ingest.utils.dep_check import requires_dependencies
+
+ if TYPE_CHECKING:
+     from langchain_voyageai import VoyageAIEmbeddings
+
+
+ class VoyageAIEmbeddingConfig(EmbeddingConfig):
+     api_key: SecretStr
+     embedder_model_name: str = Field(alias="model_name")
+     batch_size: Optional[int] = Field(default=None)
+     truncation: Optional[bool] = Field(default=None)
+
+     @requires_dependencies(
+         ["langchain", "langchain_voyageai"],
+         extras="embed-voyageai",
+     )
+     def get_client(self) -> "VoyageAIEmbeddings":
+         """Creates a Langchain VoyageAI python client to embed elements."""
+         from langchain_voyageai import VoyageAIEmbeddings
+
+         return VoyageAIEmbeddings(
+             voyage_api_key=self.api_key,
+             model=self.embedder_model_name,
+             batch_size=self.batch_size,
+             truncation=self.truncation,
+         )
+
+
+ @dataclass
+ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
+     config: VoyageAIEmbeddingConfig
+
+     def get_exemplary_embedding(self) -> List[float]:
+         return self.embed_query(query="A sample query.")
+
+     @property
+     def num_of_dimensions(self) -> tuple[int, ...]:
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.shape(exemplary_embedding)
+
+     @property
+     def is_unit_vector(self) -> bool:
+         exemplary_embedding = self.get_exemplary_embedding()
+         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+     def embed_documents(self, elements: List[dict]) -> List[dict]:
+         client = self.config.get_client()
+         embeddings = client.embed_documents([e.get("text", "") for e in elements])
+         return self._add_embeddings_to_elements(elements, embeddings)
+
+     def embed_query(self, query: str) -> List[float]:
+         client = self.config.get_client()
+         return client.embed_query(query)
+
+     @staticmethod
+     def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+         assert len(elements) == len(embeddings)
+         elements_w_embedding = []
+         for i, element in enumerate(elements):
+             element["embeddings"] = embeddings[i]
+             elements_w_embedding.append(element)
+         return elements

unstructured_ingest/interfaces.py
@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
-    from unstructured.embed.interfaces import BaseEmbeddingEncoder
+
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
 
 A = TypeVar("A", bound="DataClassJsonMixin")
 
@@ -204,22 +205,31 @@ class EmbeddingConfig(BaseConfig):
         kwargs["model_name"] = self.model_name
         # TODO make this more dynamic to map to encoder configs
         if self.provider == "langchain-openai":
-            from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+            from unstructured_ingest.embed.openai import (
+                OpenAIEmbeddingConfig,
+                OpenAIEmbeddingEncoder,
+            )
 
             return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-huggingface":
-            from unstructured.embed.huggingface import (
+            from unstructured_ingest.embed.huggingface import (
                 HuggingFaceEmbeddingConfig,
                 HuggingFaceEmbeddingEncoder,
             )
 
             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
         elif self.provider == "octoai":
-            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+            from unstructured_ingest.embed.octoai import (
+                OctoAiEmbeddingConfig,
+                OctoAIEmbeddingEncoder,
+            )
 
             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-aws-bedrock":
-            from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+            from unstructured_ingest.embed.bedrock import (
+                BedrockEmbeddingConfig,
+                BedrockEmbeddingEncoder,
+            )
 
             return BedrockEmbeddingEncoder(
                 config=BedrockEmbeddingConfig(
@@ -229,14 +239,14 @@ class EmbeddingConfig(BaseConfig):
                 )
             )
         elif self.provider == "langchain-vertexai":
-            from unstructured.embed.vertexai import (
+            from unstructured_ingest.embed.vertexai import (
                 VertexAIEmbeddingConfig,
                 VertexAIEmbeddingEncoder,
             )
 
             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-voyageai":
-            from unstructured.embed.voyageai import (
+            from unstructured_ingest.embed.voyageai import (
                 VoyageAIEmbeddingConfig,
                 VoyageAIEmbeddingEncoder,
             )
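
The net effect of the interfaces.py hunks above is that each provider string now resolves to the vendored unstructured_ingest.embed module instead of unstructured.embed. A hedged sketch of exercising that dispatch (the get_embedder() method name is an assumption; the hunks only show the method body):

from unstructured_ingest.interfaces import EmbeddingConfig

config = EmbeddingConfig(provider="langchain-openai", api_key="sk-...")  # placeholder key
encoder = config.get_embedder()  # returns OpenAIEmbeddingEncoder from unstructured_ingest.embed.openai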
@@ -519,7 +529,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
                 and self.filename.is_file()
                 and self.filename.stat().st_size
             ):
-                logger.debug(f"File exists: {self.filename}, skipping {func.__name__}")
+                logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
                 return None
             return func(self, *args, **kwargs)
 
@@ -576,7 +586,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 
         endpoint = partition_config.partition_endpoint
 
-        logger.debug(f"Using remote partition ({endpoint})")
+        logger.debug(f"using remote partition ({endpoint})")
 
         elements = partition_via_api(
             filename=str(self.filename),
@@ -596,7 +606,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
         self._date_processed = datetime.utcnow().isoformat()
         if self.read_config.download_only:
             return None
-        logger.info(f"Processing {self.filename}")
+        logger.info(f"processing {self.filename}")
 
         elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
         element_dicts = [e.to_dict() for e in elements]
@@ -814,7 +824,7 @@ class IngestDocCleanupMixin:
             and self.filename.is_file()
             and not self.read_config.download_only
         ):
-            logger.debug(f"Cleaning up {self}")
+            logger.debug(f"cleaning up {self}")
             os.unlink(self.filename)
 
 

unstructured_ingest/logger.py
@@ -95,7 +95,7 @@ class SensitiveFormatter(logging.Formatter):
 
 
 def remove_root_handlers(logger: logging.Logger) -> None:
-    # NOTE(robinson) - in some environments such as Google Colab, there is a root handler
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():

unstructured_ingest/pipeline/copy.py
@@ -15,5 +15,5 @@ class Copier(CopyNode):
         ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
         desired_output = ingest_doc._output_filename
         Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Copying {json_path} -> {desired_output}")
+        logger.info(f"copying {json_path} -> {desired_output}")
         shutil.copy(json_path, desired_output)

unstructured_ingest/pipeline/interfaces.py
@@ -57,7 +57,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
         iterable = iterable if iterable else []
         if iterable:
             logger.info(
-                f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
             )
 
         self.initialize()
@@ -92,7 +92,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
 
     def initialize(self):
         if path := self.get_path():
-            logger.info(f"Creating {path}")
+            logger.info(f"creating {path}")
             path.mkdir(parents=True, exist_ok=True)
         ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
 

unstructured_ingest/pipeline/partition.py
@@ -30,7 +30,7 @@ class Partitioner(PartitionNode):
             and json_path.is_file()
             and json_path.stat().st_size
         ):
-            logger.info(f"File exists: {json_path}, skipping partition")
+            logger.info(f"file exists: {json_path}, skipping partition")
             return str(json_path)
         partition_kwargs: t.Dict[str, t.Any] = {
             "strategy": self.partition_config.strategy,

unstructured_ingest/pipeline/pipeline.py
@@ -96,7 +96,7 @@ class Pipeline(DataClassJsonMixin):
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
             if not reformatted_jsons:
-                logger.info(f"No files to process after {reformat_node.__class__.__name__}")
+                logger.info(f"no files to process after {reformat_node.__class__.__name__}")
                 return
             partitioned_jsons = reformatted_jsons
 

unstructured_ingest/pipeline/reformat/chunking.py
@@ -58,7 +58,7 @@ class Chunker(ReformatNode):
             and json_path.is_file()
             and json_path.stat().st_size
         ):
-            logger.debug(f"File exists: {json_path}, skipping chunking")
+            logger.debug(f"file exists: {json_path}, skipping chunking")
             return str(json_path)
 
         chunked_elements = self.chunk(elements_json)
@@ -112,7 +112,7 @@ class Chunker(ReformatNode):
 
         return partition_via_api(
             filename=elements_json_file,
-            # -- (jennings) If api_key or api_url are None, partition_via_api will raise an
+            # -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
             # -- error, which will be caught and logged by Chunker.run()
             api_key=self.partition_config.api_key,  # type: ignore
             api_url=self.partition_config.partition_endpoint,  # type: ignore