unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/embed/interfaces.py
ADDED
@@ -0,0 +1,36 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from pydantic import BaseModel
+
+
+class EmbeddingConfig(BaseModel):
+    pass
+
+
+@dataclass
+class BaseEmbeddingEncoder(ABC):
+    config: EmbeddingConfig
+
+    def initialize(self):
+        """Initializes the embedding encoder class. Should also validate the instance
+        is properly configured: e.g., embed a single a element"""
+
+    @property
+    @abstractmethod
+    def num_of_dimensions(self) -> Tuple[int]:
+        """Number of dimensions for the embedding vector."""
+
+    @property
+    @abstractmethod
+    def is_unit_vector(self) -> bool:
+        """Denotes if the embedding vector is a unit vector."""
+
+    @abstractmethod
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        pass
+
+    @abstractmethod
+    def embed_query(self, query: str) -> List[float]:
+        pass
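The new `unstructured_ingest.embed.interfaces` module defines the contract every embedder in this release implements: a pydantic `EmbeddingConfig`, two introspection properties, and `embed_documents`/`embed_query`. The following is a hypothetical toy subclass, not part of the package, sketched only to illustrate how the pieces fit together:

```python
from dataclasses import dataclass
from typing import List, Tuple

from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig


class ToyEmbeddingConfig(EmbeddingConfig):
    dimensions: int = 2  # hypothetical knob, just for this sketch


@dataclass
class ToyEmbeddingEncoder(BaseEmbeddingEncoder):
    config: ToyEmbeddingConfig

    def initialize(self):
        """Nothing to validate for this toy encoder."""

    @property
    def num_of_dimensions(self) -> Tuple[int]:
        return (self.config.dimensions,)

    @property
    def is_unit_vector(self) -> bool:
        return False  # the placeholder vectors below are not normalized

    def embed_documents(self, elements: List[dict]) -> List[dict]:
        # Real encoders batch elements through a model API; here we reuse embed_query.
        for element in elements:
            element["embeddings"] = self.embed_query(element.get("text", ""))
        return elements

    def embed_query(self, query: str) -> List[float]:
        # Placeholder "embedding" derived from the text length.
        return [float(len(query))] * self.config.dimensions


encoder = ToyEmbeddingEncoder(config=ToyEmbeddingConfig())
print(encoder.embed_documents([{"text": "hello"}]))
```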
unstructured_ingest/embed/mixedbreadai.py
ADDED
@@ -0,0 +1,177 @@
+import os
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+USER_AGENT = "@mixedbread-ai/unstructured"
+BATCH_SIZE = 128
+TIMEOUT = 60
+MAX_RETRIES = 3
+ENCODING_FORMAT = "float"
+TRUNCATION_STRATEGY = "end"
+
+
+if TYPE_CHECKING:
+    from mixedbread_ai.client import MixedbreadAI
+    from mixedbread_ai.core import RequestOptions
+
+
+class MixedbreadAIEmbeddingConfig(EmbeddingConfig):
+    """
+    Configuration class for Mixedbread AI Embedding Encoder.
+
+    Attributes:
+        api_key (str): API key for accessing Mixedbread AI..
+        embedder_model_name (str): Name of the model to use for embeddings.
+    """
+
+    api_key: SecretStr = Field(
+        default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")),
+    )
+
+    embedder_model_name: str = Field(
+        default="mixedbread-ai/mxbai-embed-large-v1", alias="model_name"
+    )
+
+    @requires_dependencies(
+        ["mixedbread_ai"],
+        extras="embed-mixedbreadai",
+    )
+    def get_client(self) -> "MixedbreadAI":
+        """
+        Create the Mixedbread AI client.
+
+        Returns:
+            MixedbreadAI: Initialized client.
+        """
+        from mixedbread_ai.client import MixedbreadAI
+
+        return MixedbreadAI(
+            api_key=self.api_key.get_secret_value(),
+        )
+
+
+@dataclass
+class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    """
+    Embedding encoder for Mixedbread AI.
+
+    Attributes:
+        config (MixedbreadAIEmbeddingConfig): Configuration for the embedding encoder.
+    """
+
+    config: MixedbreadAIEmbeddingConfig
+
+    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+    _request_options: Optional["RequestOptions"] = field(init=False, default=None)
+
+    def get_exemplary_embedding(self) -> List[float]:
+        """Get an exemplary embedding to determine dimensions and unit vector status."""
+        return self._embed(["Q"])[0]
+
+    def initialize(self):
+        if self.config.api_key is None:
+            raise ValueError(
+                "The Mixedbread AI API key must be specified."
+                + "You either pass it in the constructor using 'api_key'"
+                + "or via the 'MXBAI_API_KEY' environment variable."
+            )
+
+        from mixedbread_ai.core import RequestOptions
+
+        self._request_options = RequestOptions(
+            max_retries=MAX_RETRIES,
+            timeout_in_seconds=TIMEOUT,
+            additional_headers={"User-Agent": USER_AGENT},
+        )
+
+    @property
+    def num_of_dimensions(self):
+        """Get the number of dimensions for the embeddings."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    @property
+    def is_unit_vector(self) -> bool:
+        """Check if the embedding is a unit vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def _embed(self, texts: List[str]) -> List[List[float]]:
+        """
+        Embed a list of texts using the Mixedbread AI API.
+
+        Args:
+            texts (List[str]): List of texts to embed.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        batch_size = BATCH_SIZE
+        batch_itr = range(0, len(texts), batch_size)
+
+        responses = []
+        client = self.config.get_client()
+        for i in batch_itr:
+            batch = texts[i : i + batch_size]
+            response = client.embeddings(
+                model=self.config.embedder_model_name,
+                normalized=True,
+                encoding_format=ENCODING_FORMAT,
+                truncation_strategy=TRUNCATION_STRATEGY,
+                request_options=self._request_options,
+                input=batch,
+            )
+            responses.append(response)
+        return [item.embedding for response in responses for item in response.data]
+
+    @staticmethod
+    def _add_embeddings_to_elements(
+        elements: List[dict], embeddings: List[List[float]]
+    ) -> List[dict]:
+        """
+        Add embeddings to elements.
+
+        Args:
+            elements (List[Element]): List of elements.
+            embeddings (List[List[float]]): List of embeddings.
+
+        Returns:
+            List[Element]: Elements with embeddings added.
+        """
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        """
+        Embed a list of document elements.
+
+        Args:
+            elements (List[Element]): List of document elements.
+
+        Returns:
+            List[Element]: Elements with embeddings.
+        """
+        embeddings = self._embed([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def embed_query(self, query: str) -> List[float]:
+        """
+        Embed a query string.
+
+        Args:
+            query (str): Query string to embed.
+
+        Returns:
+            List[float]: Embedding of the query.
+        """
+        return self._embed([query])[0]
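A minimal usage sketch for the Mixedbread AI encoder might look like the following (assuming the `embed-mixedbreadai` extra is installed and `MXBAI_API_KEY` is set; the element dicts only need a `text` key):

```python
import os

from unstructured_ingest.embed.mixedbreadai import (
    MixedbreadAIEmbeddingConfig,
    MixedbreadAIEmbeddingEncoder,
)

# api_key falls back to MXBAI_API_KEY when omitted; model_name is the
# pydantic alias for the embedder_model_name field.
encoder = MixedbreadAIEmbeddingEncoder(
    config=MixedbreadAIEmbeddingConfig(
        api_key=os.environ["MXBAI_API_KEY"],
        model_name="mixedbread-ai/mxbai-embed-large-v1",
    )
)
encoder.initialize()  # validates the key and builds the shared RequestOptions

elements = [{"text": "Hello world"}, {"text": "Another element"}]
embedded = encoder.embed_documents(elements)  # each dict gains an "embeddings" key
print(len(embedded[0]["embeddings"]))
```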
unstructured_ingest/embed/octoai.py
ADDED
@@ -0,0 +1,63 @@
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from openai import OpenAI
+
+
+class OctoAiEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(default="thenlper/gte-large", alias="model_name")
+    base_url: str = Field(default="https://text.octoai.run/v1")
+
+    @requires_dependencies(
+        ["openai", "tiktoken"],
+        extras="embed-octoai",
+    )
+    def get_client(self) -> "OpenAI":
+        """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK."""
+        from openai import OpenAI
+
+        return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url)
+
+
+@dataclass
+class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OctoAiEmbeddingConfig
+    # Uses the OpenAI SDK
+    _exemplary_embedding: Optional[List[float]] = field(init=False, default=None)
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query("Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query: str):
+        client = self.config.get_client()
+        response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
+        return response.data[0].embedding
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        embeddings = [self.embed_query(e.get("text", "")) for e in elements]
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
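OctoAI is driven through the stock OpenAI SDK pointed at `https://text.octoai.run/v1` (the default `base_url`). A brief sketch, with the environment-variable name below assumed for illustration only:

```python
import os

from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

encoder = OctoAIEmbeddingEncoder(
    config=OctoAiEmbeddingConfig(
        api_key=os.environ["OCTOAI_API_KEY"],  # assumed variable name, not read by the config itself
        model_name="thenlper/gte-large",
    )
)
vector = encoder.embed_query("What does this package do?")
```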
unstructured_ingest/embed/openai.py
ADDED
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_openai.embeddings import OpenAIEmbeddings
+
+
+class OpenAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(default="text-embedding-ada-002", alias="model_name")
+
+    @requires_dependencies(["langchain_openai"], extras="openai")
+    def get_client(self) -> "OpenAIEmbeddings":
+        """Creates a langchain OpenAI python client to embed elements."""
+        from langchain_openai import OpenAIEmbeddings
+
+        openai_client = OpenAIEmbeddings(
+            openai_api_key=self.api_key.get_secret_value(),
+            model=self.embedder_model_name,  # type:ignore
+        )
+        return openai_client
+
+
+@dataclass
+class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: OpenAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="Q")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        return client.embed_query(str(query))
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
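The OpenAI encoder wraps `langchain_openai.OpenAIEmbeddings`, so both the per-query and per-document paths delegate batching to that client. A minimal sketch, assuming the `openai` extra and an `OPENAI_API_KEY` in the environment:

```python
import os

from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

encoder = OpenAIEmbeddingEncoder(
    config=OpenAIEmbeddingConfig(
        api_key=os.environ["OPENAI_API_KEY"],
        model_name="text-embedding-ada-002",  # alias for embedder_model_name
    )
)
query_vector = encoder.embed_query("a single query")
embedded = encoder.embed_documents([{"text": "first"}, {"text": "second"}])
```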
unstructured_ingest/embed/vertexai.py
ADDED
@@ -0,0 +1,88 @@
+# type: ignore
+import json
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Annotated, Any, List, Optional
+
+import numpy as np
+from pydantic import Field, Secret, ValidationError
+from pydantic.functional_validators import BeforeValidator
+from unstructured.utils import FileHandler
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_google_vertexai import VertexAIEmbeddings
+
+
+def conform_string_to_dict(value: Any) -> dict:
+    if isinstance(value, dict):
+        return value
+    if isinstance(value, str):
+        return json.loads(value)
+    raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
+
+
+ApiKeyType = Secret[Annotated[dict, BeforeValidator(conform_string_to_dict)]]
+
+
+class VertexAIEmbeddingConfig(EmbeddingConfig):
+    api_key: ApiKeyType
+    embedder_model_name: Optional[str] = Field(
+        default="textembedding-gecko@001", alias="model_name"
+    )
+
+    def register_application_credentials(self):
+        # TODO look into passing credentials in directly, rather than via env var and tmp file
+        application_credentials_path = os.path.join("/tmp", "google-vertex-app-credentials.json")
+        credentials_file = FileHandler(application_credentials_path)
+        credentials_file.write_file(json.dumps(self.api_key.get_secret_value()))
+        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = application_credentials_path
+
+    @requires_dependencies(
+        ["langchain", "langchain_google_vertexai"],
+        extras="embed-vertexai",
+    )
+    def get_client(self) -> "VertexAIEmbeddings":
+        """Creates a Langchain VertexAI python client to embed elements."""
+        from langchain_google_vertexai import VertexAIEmbeddings
+
+        self.register_application_credentials()
+        vertexai_client = VertexAIEmbeddings(model_name=self.embedder_model_name)
+        return vertexai_client
+
+
+@dataclass
+class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: VertexAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="A sample query.")
+
+    def num_of_dimensions(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    def is_unit_vector(self):
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_query(self, query):
+        client = self.config.get_client()
+        result = client.embed_query(str(query))
+        return result
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
+        return elements_with_embeddings
+
+    def _add_embeddings_to_elements(self, elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
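The Vertex AI config accepts its service-account credentials as either a dict or a JSON string; `conform_string_to_dict` parses the string form before it is wrapped as a pydantic `Secret`. `get_client()` then writes the credentials to `/tmp/google-vertex-app-credentials.json` and exports `GOOGLE_APPLICATION_CREDENTIALS`. A sketch (the local credentials path below is hypothetical):

```python
from pathlib import Path

from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder

# Either form works for api_key: a dict, or the raw JSON text of a service-account file.
credentials_json = Path("service-account.json").read_text()  # hypothetical path

encoder = VertexAIEmbeddingEncoder(
    config=VertexAIEmbeddingConfig(api_key=credentials_json)
)
embedded = encoder.embed_documents([{"text": "vertex example"}])
```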
unstructured_ingest/embed/voyageai.py
ADDED
@@ -0,0 +1,69 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import numpy as np
+from pydantic import Field, SecretStr
+
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from langchain_voyageai import VoyageAIEmbeddings
+
+
+class VoyageAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(alias="model_name")
+    batch_size: Optional[int] = Field(default=None)
+    truncation: Optional[bool] = Field(default=None)
+
+    @requires_dependencies(
+        ["langchain", "langchain_voyageai"],
+        extras="embed-voyageai",
+    )
+    def get_client(self) -> "VoyageAIEmbeddings":
+        """Creates a Langchain VoyageAI python client to embed elements."""
+        from langchain_voyageai import VoyageAIEmbeddings
+
+        return VoyageAIEmbeddings(
+            voyage_api_key=self.api_key,
+            model=self.embedder_model_name,
+            batch_size=self.batch_size,
+            truncation=self.truncation,
+        )
+
+
+@dataclass
+class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: VoyageAIEmbeddingConfig
+
+    def get_exemplary_embedding(self) -> List[float]:
+        return self.embed_query(query="A sample query.")
+
+    @property
+    def num_of_dimensions(self) -> tuple[int, ...]:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+
+    @property
+    def is_unit_vector(self) -> bool:
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
+
+    def embed_documents(self, elements: List[dict]) -> List[dict]:
+        client = self.config.get_client()
+        embeddings = client.embed_documents([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+
+    def embed_query(self, query: str) -> List[float]:
+        client = self.config.get_client()
+        return client.embed_query(query)
+
+    @staticmethod
+    def _add_embeddings_to_elements(elements, embeddings) -> List[dict]:
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements
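Unlike the other configs, `VoyageAIEmbeddingConfig` gives `model_name` no default, so it must be supplied, and `batch_size`/`truncation` are passed straight through to `langchain_voyageai.VoyageAIEmbeddings`. A sketch (the environment-variable name and model id below are assumptions for illustration):

```python
import os

from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder

encoder = VoyageAIEmbeddingEncoder(
    config=VoyageAIEmbeddingConfig(
        api_key=os.environ["VOYAGE_API_KEY"],  # assumed variable name
        model_name="voyage-2",                 # assumed model id
        batch_size=32,
        truncation=True,
    )
)
print(encoder.num_of_dimensions)  # exposed as a property on this encoder
```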
unstructured_ingest/interfaces.py
CHANGED
@@ -24,7 +24,8 @@ from unstructured_ingest.utils.data_prep import flatten_dict
 
 if TYPE_CHECKING:
     from unstructured.documents.elements import Element
-
+
+    from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder
 
 A = TypeVar("A", bound="DataClassJsonMixin")
 
@@ -204,22 +205,31 @@ class EmbeddingConfig(BaseConfig):
         kwargs["model_name"] = self.model_name
         # TODO make this more dynamic to map to encoder configs
         if self.provider == "langchain-openai":
-            from
+            from unstructured_ingest.embed.openai import (
+                OpenAIEmbeddingConfig,
+                OpenAIEmbeddingEncoder,
+            )
 
             return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-huggingface":
-            from
+            from unstructured_ingest.embed.huggingface import (
                 HuggingFaceEmbeddingConfig,
                 HuggingFaceEmbeddingEncoder,
             )
 
             return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
         elif self.provider == "octoai":
-            from
+            from unstructured_ingest.embed.octoai import (
+                OctoAiEmbeddingConfig,
+                OctoAIEmbeddingEncoder,
+            )
 
             return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-aws-bedrock":
-            from
+            from unstructured_ingest.embed.bedrock import (
+                BedrockEmbeddingConfig,
+                BedrockEmbeddingEncoder,
+            )
 
             return BedrockEmbeddingEncoder(
                 config=BedrockEmbeddingConfig(
@@ -229,14 +239,14 @@ class EmbeddingConfig(BaseConfig):
                 )
             )
         elif self.provider == "langchain-vertexai":
-            from
+            from unstructured_ingest.embed.vertexai import (
                 VertexAIEmbeddingConfig,
                 VertexAIEmbeddingEncoder,
             )
 
             return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
         elif self.provider == "langchain-voyageai":
-            from
+            from unstructured_ingest.embed.voyageai import (
                 VoyageAIEmbeddingConfig,
                 VoyageAIEmbeddingEncoder,
             )
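The two hunks above rewire the legacy v1 `EmbeddingConfig` so that each `provider` string now resolves to the corresponding encoder in the new `unstructured_ingest.embed` package (the removed `from ...` lines are truncated in this view). A condensed, hypothetical sketch of that dispatch pattern:

```python
def encoder_for(provider: str, kwargs: dict):
    # Condensed illustration of the branch structure shown above; the real method
    # lives on EmbeddingConfig and covers more providers.
    if provider == "langchain-openai":
        from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder

        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
    elif provider == "octoai":
        from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder

        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
    raise ValueError(f"unsupported embedding provider: {provider}")
```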
@@ -519,7 +529,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
                 and self.filename.is_file()
                 and self.filename.stat().st_size
             ):
-                logger.debug(f"
+                logger.debug(f"file exists: {self.filename}, skipping {func.__name__}")
                 return None
             return func(self, *args, **kwargs)
 
@@ -576,7 +586,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
 
         endpoint = partition_config.partition_endpoint
 
-        logger.debug(f"
+        logger.debug(f"using remote partition ({endpoint})")
 
         elements = partition_via_api(
             filename=str(self.filename),
@@ -596,7 +606,7 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
         self._date_processed = datetime.utcnow().isoformat()
         if self.read_config.download_only:
             return None
-        logger.info(f"
+        logger.info(f"processing {self.filename}")
 
         elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
         element_dicts = [e.to_dict() for e in elements]
@@ -814,7 +824,7 @@ class IngestDocCleanupMixin:
             and self.filename.is_file()
            and not self.read_config.download_only
         ):
-            logger.debug(f"
+            logger.debug(f"cleaning up {self}")
             os.unlink(self.filename)
 
 
unstructured_ingest/logger.py
CHANGED
@@ -95,7 +95,7 @@ class SensitiveFormatter(logging.Formatter):
 
 
 def remove_root_handlers(logger: logging.Logger) -> None:
-    # NOTE(robinson)
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():
unstructured_ingest/pipeline/copy.py
CHANGED
@@ -15,5 +15,5 @@ class Copier(CopyNode):
         ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict)
         desired_output = ingest_doc._output_filename
         Path(desired_output).parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"copying {json_path} -> {desired_output}")
         shutil.copy(json_path, desired_output)
unstructured_ingest/pipeline/interfaces.py
CHANGED
@@ -57,7 +57,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
         iterable = iterable if iterable else []
         if iterable:
             logger.info(
-                f"
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
             )
 
         self.initialize()
@@ -92,7 +92,7 @@ class PipelineNode(DataClassJsonMixin, ABC):
 
     def initialize(self):
         if path := self.get_path():
-            logger.info(f"
+            logger.info(f"creating {path}")
             path.mkdir(parents=True, exist_ok=True)
         ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO)
 
unstructured_ingest/pipeline/partition.py
CHANGED
@@ -30,7 +30,7 @@ class Partitioner(PartitionNode):
             and json_path.is_file()
             and json_path.stat().st_size
         ):
-            logger.info(f"
+            logger.info(f"file exists: {json_path}, skipping partition")
             return str(json_path)
         partition_kwargs: t.Dict[str, t.Any] = {
             "strategy": self.partition_config.strategy,
unstructured_ingest/pipeline/pipeline.py
CHANGED
@@ -96,7 +96,7 @@ class Pipeline(DataClassJsonMixin):
         for reformat_node in self.reformat_nodes:
             reformatted_jsons = reformat_node(iterable=partitioned_jsons)
             if not reformatted_jsons:
-                logger.info(f"
+                logger.info(f"no files to process after {reformat_node.__class__.__name__}")
                 return
             partitioned_jsons = reformatted_jsons
 
unstructured_ingest/pipeline/reformat/chunking.py
CHANGED
@@ -58,7 +58,7 @@ class Chunker(ReformatNode):
             and json_path.is_file()
             and json_path.stat().st_size
         ):
-            logger.debug(f"
+            logger.debug(f"file exists: {json_path}, skipping chunking")
             return str(json_path)
 
         chunked_elements = self.chunk(elements_json)
@@ -112,7 +112,7 @@ class Chunker(ReformatNode):
 
         return partition_via_api(
             filename=elements_json_file,
-            # -- (jennings) If api_key or api_url are None, partition_via_api will raise an
+            # -- NOTE(jennings): If api_key or api_url are None, partition_via_api will raise an
             # -- error, which will be caught and logged by Chunker.run()
             api_key=self.partition_config.api_key, # type: ignore
             api_url=self.partition_config.partition_endpoint, # type: ignore