unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +42 -0
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +15 -0
- test/integration/connectors/databricks_tests/__init__.py +0 -0
- test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
- test/integration/connectors/test_postgres.py +100 -0
- test/integration/connectors/test_s3.py +152 -0
- test/integration/connectors/test_sqlite.py +91 -0
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +7 -0
- test/integration/connectors/utils/docker_compose.py +44 -0
- test/integration/connectors/utils/validation.py +198 -0
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +13 -0
- test/integration/embedders/test_bedrock.py +49 -0
- test/integration/embedders/test_huggingface.py +26 -0
- test/integration/embedders/test_mixedbread.py +47 -0
- test/integration/embedders/test_octoai.py +41 -0
- test/integration/embedders/test_openai.py +41 -0
- test/integration/embedders/test_vertexai.py +41 -0
- test/integration/embedders/test_voyageai.py +41 -0
- test/integration/embedders/togetherai.py +43 -0
- test/integration/embedders/utils.py +44 -0
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +75 -0
- test/integration/utils.py +15 -0
- test/unit/__init__.py +0 -0
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +41 -0
- test/unit/embed/test_octoai.py +20 -0
- test/unit/embed/test_openai.py +20 -0
- test/unit/embed/test_vertexai.py +25 -0
- test/unit/embed/test_voyageai.py +24 -0
- test/unit/test_chunking_utils.py +36 -0
- test/unit/test_error.py +27 -0
- test/unit/test_interfaces.py +280 -0
- test/unit/test_interfaces_v2.py +26 -0
- test/unit/test_logger.py +78 -0
- test/unit/test_utils.py +164 -0
- test/unit/test_utils_v2.py +82 -0
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +2 -2
- unstructured_ingest/connector/notion/types/block.py +1 -0
- unstructured_ingest/connector/notion/types/database.py +1 -0
- unstructured_ingest/connector/notion/types/page.py +1 -0
- unstructured_ingest/embed/bedrock.py +0 -20
- unstructured_ingest/embed/huggingface.py +0 -21
- unstructured_ingest/embed/interfaces.py +29 -3
- unstructured_ingest/embed/mixedbreadai.py +0 -36
- unstructured_ingest/embed/octoai.py +2 -24
- unstructured_ingest/embed/openai.py +0 -20
- unstructured_ingest/embed/togetherai.py +40 -0
- unstructured_ingest/embed/vertexai.py +0 -20
- unstructured_ingest/embed/voyageai.py +1 -24
- unstructured_ingest/interfaces.py +1 -1
- unstructured_ingest/utils/dep_check.py +12 -0
- unstructured_ingest/v2/cli/utils/click.py +21 -2
- unstructured_ingest/v2/interfaces/connector.py +22 -2
- unstructured_ingest/v2/interfaces/downloader.py +1 -0
- unstructured_ingest/v2/processes/chunker.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
- unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
- unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
- unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
- unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
- unstructured_ingest/v2/processes/embedder.py +13 -0
- unstructured_ingest/v2/processes/partitioner.py +2 -1
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
- unstructured_ingest/v2/processes/connectors/sql.py +0 -275
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
test/unit/test_utils.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import typing as t
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
import pytz
|
|
8
|
+
|
|
9
|
+
from unstructured_ingest.cli.utils import extract_config
|
|
10
|
+
from unstructured_ingest.interfaces import BaseConfig
|
|
11
|
+
from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
|
|
15
|
+
class A(BaseConfig):
|
|
16
|
+
a: str
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class B(BaseConfig):
|
|
21
|
+
a: A
|
|
22
|
+
b: int
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
flat_data = {"a": "test", "b": 4, "c": True}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_extract_config_concrete():
|
|
29
|
+
@dataclass
|
|
30
|
+
class C(BaseConfig):
|
|
31
|
+
b: B
|
|
32
|
+
c: bool
|
|
33
|
+
|
|
34
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
35
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
|
|
36
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_extract_config_optional():
|
|
40
|
+
@dataclass
|
|
41
|
+
class C(BaseConfig):
|
|
42
|
+
c: bool
|
|
43
|
+
b: t.Optional[B] = None
|
|
44
|
+
|
|
45
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
46
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
|
|
47
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_extract_config_union():
|
|
51
|
+
@dataclass
|
|
52
|
+
class C(BaseConfig):
|
|
53
|
+
c: bool
|
|
54
|
+
b: t.Optional[t.Union[B, int]] = None
|
|
55
|
+
|
|
56
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
57
|
+
expected_result = {"b": 4, "c": True}
|
|
58
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_extract_config_list():
|
|
62
|
+
@dataclass
|
|
63
|
+
class C(BaseConfig):
|
|
64
|
+
c: t.List[int]
|
|
65
|
+
b: B
|
|
66
|
+
|
|
67
|
+
flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
|
|
68
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
69
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
|
|
70
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_extract_config_optional_list():
|
|
74
|
+
@dataclass
|
|
75
|
+
class C(BaseConfig):
|
|
76
|
+
b: B
|
|
77
|
+
c: t.Optional[t.List[int]] = None
|
|
78
|
+
|
|
79
|
+
flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
|
|
80
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
81
|
+
expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
|
|
82
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_extract_config_dataclass_list():
|
|
86
|
+
@dataclass
|
|
87
|
+
class C(BaseConfig):
|
|
88
|
+
c: bool
|
|
89
|
+
b: t.List[B] = field(default_factory=list)
|
|
90
|
+
|
|
91
|
+
flat_data = {"a": "test", "c": True}
|
|
92
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
93
|
+
expected_result = {"b": [], "c": True}
|
|
94
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_extract_config_dict():
|
|
98
|
+
@dataclass
|
|
99
|
+
class C(BaseConfig):
|
|
100
|
+
c: bool
|
|
101
|
+
b: t.Dict[str, B] = field(default_factory=dict)
|
|
102
|
+
|
|
103
|
+
flat_data = {"c": True}
|
|
104
|
+
c = extract_config(flat_data=flat_data, config=C)
|
|
105
|
+
expected_result = {"c": True, "b": {}}
|
|
106
|
+
assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_json_to_dict_valid_json():
|
|
110
|
+
json_string = '{"key": "value"}'
|
|
111
|
+
expected_result = {"key": "value"}
|
|
112
|
+
assert json_to_dict(json_string) == expected_result
|
|
113
|
+
assert isinstance(json_to_dict(json_string), dict)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_json_to_dict_malformed_json():
|
|
117
|
+
json_string = '{"key": "value"'
|
|
118
|
+
expected_result = '{"key": "value"'
|
|
119
|
+
assert json_to_dict(json_string) == expected_result
|
|
120
|
+
assert isinstance(json_to_dict(json_string), str)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_json_to_dict_single_quotes():
|
|
124
|
+
json_string = "{'key': 'value'}"
|
|
125
|
+
expected_result = {"key": "value"}
|
|
126
|
+
assert json_to_dict(json_string) == expected_result
|
|
127
|
+
assert isinstance(json_to_dict(json_string), dict)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def test_json_to_dict_path():
|
|
131
|
+
json_string = "/path/to/file.json"
|
|
132
|
+
expected_result = "/path/to/file.json"
|
|
133
|
+
assert json_to_dict(json_string) == expected_result
|
|
134
|
+
assert isinstance(json_to_dict(json_string), str)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def test_ensure_isoformat_datetime_for_datetime():
|
|
138
|
+
dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0))
|
|
139
|
+
assert dt == "2021-01-01T12:00:00"
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_ensure_isoformat_datetime_for_datetime_with_tz():
|
|
143
|
+
dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC))
|
|
144
|
+
assert dt == "2021-01-01T12:00:00+00:00"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def test_ensure_isoformat_datetime_for_string():
|
|
148
|
+
dt = ensure_isoformat_datetime("2021-01-01T12:00:00")
|
|
149
|
+
assert dt == "2021-01-01T12:00:00"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def test_ensure_isoformat_datetime_for_string2():
|
|
153
|
+
dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00")
|
|
154
|
+
assert dt == "2021-01-01T12:00:00+00:00"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_ensure_isoformat_datetime_fails_on_string():
|
|
158
|
+
with pytest.raises(ValueError):
|
|
159
|
+
ensure_isoformat_datetime("bad timestamp")
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def test_ensure_isoformat_datetime_fails_on_int():
|
|
163
|
+
with pytest.raises(TypeError):
|
|
164
|
+
ensure_isoformat_datetime(1111)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, Secret, SecretStr
|
|
5
|
+
from pydantic.types import _SecretBase
|
|
6
|
+
|
|
7
|
+
from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MockChildBaseModel(BaseModel):
|
|
11
|
+
child_secret_str: SecretStr
|
|
12
|
+
child_secret_float: Secret[float]
|
|
13
|
+
child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MockBaseModel(BaseModel):
|
|
17
|
+
secret_str: SecretStr
|
|
18
|
+
not_secret_bool: bool
|
|
19
|
+
secret_child_base: Secret[MockChildBaseModel]
|
|
20
|
+
not_secret_list: list[int] = Field(default_factory=list)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
model = MockBaseModel(
|
|
24
|
+
secret_str="secret string",
|
|
25
|
+
not_secret_bool=False,
|
|
26
|
+
secret_child_base=MockChildBaseModel(
|
|
27
|
+
child_secret_str="child secret string",
|
|
28
|
+
child_secret_float=3.14,
|
|
29
|
+
child_not_secret_dict={"key": "value"},
|
|
30
|
+
),
|
|
31
|
+
not_secret_list=[1, 2, 3],
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_serialize_base_model():
|
|
36
|
+
|
|
37
|
+
serialized_dict = model.model_dump()
|
|
38
|
+
assert isinstance(serialized_dict["secret_str"], _SecretBase)
|
|
39
|
+
assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
|
|
40
|
+
|
|
41
|
+
serialized_dict_w_secrets = serialize_base_model(model=model)
|
|
42
|
+
assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
|
|
43
|
+
assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
|
|
44
|
+
|
|
45
|
+
expected_dict = {
|
|
46
|
+
"secret_str": "secret string",
|
|
47
|
+
"not_secret_bool": False,
|
|
48
|
+
"secret_child_base": {
|
|
49
|
+
"child_secret_str": "child secret string",
|
|
50
|
+
"child_secret_float": 3.14,
|
|
51
|
+
"child_not_secret_dict": {"key": "value"},
|
|
52
|
+
},
|
|
53
|
+
"not_secret_list": [1, 2, 3],
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
assert serialized_dict_w_secrets == expected_dict
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_serialize_base_model_json():
|
|
60
|
+
serialized_json = model.model_dump_json()
|
|
61
|
+
serialized_dict = json.loads(serialized_json)
|
|
62
|
+
expected_dict = {
|
|
63
|
+
"secret_str": "**********",
|
|
64
|
+
"not_secret_bool": False,
|
|
65
|
+
"secret_child_base": "**********",
|
|
66
|
+
"not_secret_list": [1, 2, 3],
|
|
67
|
+
}
|
|
68
|
+
assert expected_dict == serialized_dict
|
|
69
|
+
|
|
70
|
+
serialized_json_w_secrets = serialize_base_model_json(model=model)
|
|
71
|
+
serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
|
|
72
|
+
expected_dict_w_secrets = {
|
|
73
|
+
"secret_str": "secret string",
|
|
74
|
+
"not_secret_bool": False,
|
|
75
|
+
"secret_child_base": {
|
|
76
|
+
"child_secret_str": "child secret string",
|
|
77
|
+
"child_secret_float": 3.14,
|
|
78
|
+
"child_not_secret_dict": {"key": "value"},
|
|
79
|
+
},
|
|
80
|
+
"not_secret_list": [1, 2, 3],
|
|
81
|
+
}
|
|
82
|
+
assert expected_dict_w_secrets == serialized_dict_w_secrets
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.24" # pragma: no cover
|
|
1
|
+
__version__ = "0.1.0" # pragma: no cover
|
|
@@ -341,9 +341,9 @@ class CliPartitionConfig(PartitionConfig, CliMixin):
|
|
|
341
341
|
),
|
|
342
342
|
click.Option(
|
|
343
343
|
["--partition-endpoint"],
|
|
344
|
-
default="https://api.unstructured.io/general/v0/general",
|
|
344
|
+
default="https://api.unstructuredapp.io/general/v0/general",
|
|
345
345
|
help="If partitioning via api, use the following host. "
|
|
346
|
-
"Default: https://api.unstructured.io/general/v0/general",
|
|
346
|
+
"Default: https://api.unstructuredapp.io/general/v0/general",
|
|
347
347
|
),
|
|
348
348
|
click.Option(
|
|
349
349
|
["--api-key"],
|
|
@@ -3,7 +3,6 @@ import os
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from typing import TYPE_CHECKING
|
|
5
5
|
|
|
6
|
-
import numpy as np
|
|
7
6
|
from pydantic import Field, SecretStr
|
|
8
7
|
|
|
9
8
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
@@ -45,17 +44,6 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
|
|
|
45
44
|
class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
46
45
|
config: BedrockEmbeddingConfig
|
|
47
46
|
|
|
48
|
-
def get_exemplary_embedding(self) -> list[float]:
|
|
49
|
-
return self.embed_query(query="Q")
|
|
50
|
-
|
|
51
|
-
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
52
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
53
|
-
return np.shape(exemplary_embedding)
|
|
54
|
-
|
|
55
|
-
def is_unit_vector(self) -> bool:
|
|
56
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
57
|
-
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
58
|
-
|
|
59
47
|
def embed_query(self, query: str) -> list[float]:
|
|
60
48
|
"""Call out to Bedrock embedding endpoint."""
|
|
61
49
|
# replace newlines, which can negatively affect performance.
|
|
@@ -97,11 +85,3 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
97
85
|
embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
|
|
98
86
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
99
87
|
return elements_with_embeddings
|
|
100
|
-
|
|
101
|
-
def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
|
|
102
|
-
assert len(elements) == len(embeddings)
|
|
103
|
-
elements_w_embedding = []
|
|
104
|
-
for i, element in enumerate(elements):
|
|
105
|
-
element["embeddings"] = embeddings[i]
|
|
106
|
-
elements_w_embedding.append(element)
|
|
107
|
-
return elements
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
2
|
from typing import TYPE_CHECKING, Optional
|
|
3
3
|
|
|
4
|
-
import numpy as np
|
|
5
4
|
from pydantic import Field
|
|
6
5
|
|
|
7
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
@@ -39,17 +38,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
|
|
|
39
38
|
class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
40
39
|
config: HuggingFaceEmbeddingConfig
|
|
41
40
|
|
|
42
|
-
def get_exemplary_embedding(self) -> list[float]:
|
|
43
|
-
return self.embed_query(query="Q")
|
|
44
|
-
|
|
45
|
-
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
46
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
47
|
-
return np.shape(exemplary_embedding)
|
|
48
|
-
|
|
49
|
-
def is_unit_vector(self) -> bool:
|
|
50
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
51
|
-
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
52
|
-
|
|
53
41
|
def embed_query(self, query: str) -> list[float]:
|
|
54
42
|
return self._embed_documents(texts=[query])[0]
|
|
55
43
|
|
|
@@ -62,12 +50,3 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
62
50
|
embeddings = self._embed_documents([e.get("text", "") for e in elements])
|
|
63
51
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
64
52
|
return elements_with_embeddings
|
|
65
|
-
|
|
66
|
-
def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
|
|
67
|
-
assert len(elements) == len(embeddings)
|
|
68
|
-
elements_w_embedding = []
|
|
69
|
-
|
|
70
|
-
for i, element in enumerate(elements):
|
|
71
|
-
element["embeddings"] = embeddings[i]
|
|
72
|
-
elements_w_embedding.append(element)
|
|
73
|
-
return elements
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
|
|
4
|
+
import numpy as np
|
|
4
5
|
from pydantic import BaseModel
|
|
5
6
|
|
|
6
7
|
|
|
@@ -17,14 +18,18 @@ class BaseEmbeddingEncoder(ABC):
|
|
|
17
18
|
is properly configured: e.g., embed a single a element"""
|
|
18
19
|
|
|
19
20
|
@property
|
|
20
|
-
@abstractmethod
|
|
21
21
|
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
22
|
-
|
|
22
|
+
exemplary_embedding = self.get_exemplary_embedding()
|
|
23
|
+
return np.shape(exemplary_embedding)
|
|
24
|
+
|
|
25
|
+
def get_exemplary_embedding(self) -> list[float]:
|
|
26
|
+
return self.embed_query(query="Q")
|
|
23
27
|
|
|
24
28
|
@property
|
|
25
|
-
@abstractmethod
|
|
26
29
|
def is_unit_vector(self) -> bool:
|
|
27
30
|
"""Denotes if the embedding vector is a unit vector."""
|
|
31
|
+
exemplary_embedding = self.get_exemplary_embedding()
|
|
32
|
+
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
28
33
|
|
|
29
34
|
@abstractmethod
|
|
30
35
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
@@ -41,3 +46,24 @@ class BaseEmbeddingEncoder(ABC):
|
|
|
41
46
|
results.append(response)
|
|
42
47
|
|
|
43
48
|
return results
|
|
49
|
+
|
|
50
|
+
@staticmethod
|
|
51
|
+
def _add_embeddings_to_elements(
|
|
52
|
+
elements: list[dict], embeddings: list[list[float]]
|
|
53
|
+
) -> list[dict]:
|
|
54
|
+
"""
|
|
55
|
+
Add embeddings to elements.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
elements (list[Element]): List of elements.
|
|
59
|
+
embeddings (list[list[float]]): List of embeddings.
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
list[Element]: Elements with embeddings added.
|
|
63
|
+
"""
|
|
64
|
+
assert len(elements) == len(embeddings)
|
|
65
|
+
elements_w_embedding = []
|
|
66
|
+
for i, element in enumerate(elements):
|
|
67
|
+
element["embeddings"] = embeddings[i]
|
|
68
|
+
elements_w_embedding.append(element)
|
|
69
|
+
return elements
|
|
@@ -2,7 +2,6 @@ import os
|
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
3
|
from typing import TYPE_CHECKING, Optional
|
|
4
4
|
|
|
5
|
-
import numpy as np
|
|
6
5
|
from pydantic import Field, SecretStr
|
|
7
6
|
|
|
8
7
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
@@ -66,8 +65,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
66
65
|
"""
|
|
67
66
|
|
|
68
67
|
config: MixedbreadAIEmbeddingConfig
|
|
69
|
-
|
|
70
|
-
_exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
|
|
71
68
|
_request_options: Optional["RequestOptions"] = field(init=False, default=None)
|
|
72
69
|
|
|
73
70
|
def get_exemplary_embedding(self) -> list[float]:
|
|
@@ -90,18 +87,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
90
87
|
additional_headers={"User-Agent": USER_AGENT},
|
|
91
88
|
)
|
|
92
89
|
|
|
93
|
-
@property
|
|
94
|
-
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
95
|
-
"""Get the number of dimensions for the embeddings."""
|
|
96
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
97
|
-
return np.shape(exemplary_embedding)
|
|
98
|
-
|
|
99
|
-
@property
|
|
100
|
-
def is_unit_vector(self) -> bool:
|
|
101
|
-
"""Check if the embedding is a unit vector."""
|
|
102
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
103
|
-
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
104
|
-
|
|
105
90
|
def _embed(self, texts: list[str]) -> list[list[float]]:
|
|
106
91
|
"""
|
|
107
92
|
Embed a list of texts using the Mixedbread AI API.
|
|
@@ -130,27 +115,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
130
115
|
responses.append(response)
|
|
131
116
|
return [item.embedding for response in responses for item in response.data]
|
|
132
117
|
|
|
133
|
-
@staticmethod
|
|
134
|
-
def _add_embeddings_to_elements(
|
|
135
|
-
elements: list[dict], embeddings: list[list[float]]
|
|
136
|
-
) -> list[dict]:
|
|
137
|
-
"""
|
|
138
|
-
Add embeddings to elements.
|
|
139
|
-
|
|
140
|
-
Args:
|
|
141
|
-
elements (list[Element]): List of elements.
|
|
142
|
-
embeddings (list[list[float]]): List of embeddings.
|
|
143
|
-
|
|
144
|
-
Returns:
|
|
145
|
-
list[Element]: Elements with embeddings added.
|
|
146
|
-
"""
|
|
147
|
-
assert len(elements) == len(embeddings)
|
|
148
|
-
elements_w_embedding = []
|
|
149
|
-
for i, element in enumerate(elements):
|
|
150
|
-
element["embeddings"] = embeddings[i]
|
|
151
|
-
elements_w_embedding.append(element)
|
|
152
|
-
return elements
|
|
153
|
-
|
|
154
118
|
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
155
119
|
"""
|
|
156
120
|
Embed a list of document elements.
|
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
from dataclasses import dataclass, field
|
|
2
|
-
from typing import TYPE_CHECKING, Optional
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
3
|
|
|
4
|
-
import numpy as np
|
|
5
4
|
from pydantic import Field, SecretStr
|
|
6
5
|
|
|
7
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
@@ -30,19 +29,6 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
|
|
|
30
29
|
@dataclass
|
|
31
30
|
class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
32
31
|
config: OctoAiEmbeddingConfig
|
|
33
|
-
# Uses the OpenAI SDK
|
|
34
|
-
_exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
|
|
35
|
-
|
|
36
|
-
def get_exemplary_embedding(self) -> list[float]:
|
|
37
|
-
return self.embed_query("Q")
|
|
38
|
-
|
|
39
|
-
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
40
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
41
|
-
return np.shape(exemplary_embedding)
|
|
42
|
-
|
|
43
|
-
def is_unit_vector(self) -> bool:
|
|
44
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
45
|
-
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
46
32
|
|
|
47
33
|
def embed_query(self, query: str):
|
|
48
34
|
client = self.config.get_client()
|
|
@@ -53,11 +39,3 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
53
39
|
embeddings = [self.embed_query(e.get("text", "")) for e in elements]
|
|
54
40
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
55
41
|
return elements_with_embeddings
|
|
56
|
-
|
|
57
|
-
def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
|
|
58
|
-
assert len(elements) == len(embeddings)
|
|
59
|
-
elements_w_embedding = []
|
|
60
|
-
for i, element in enumerate(elements):
|
|
61
|
-
element["embeddings"] = embeddings[i]
|
|
62
|
-
elements_w_embedding.append(element)
|
|
63
|
-
return elements
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
2
|
from typing import TYPE_CHECKING
|
|
3
3
|
|
|
4
|
-
import numpy as np
|
|
5
4
|
from pydantic import Field, SecretStr
|
|
6
5
|
|
|
7
6
|
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
@@ -26,17 +25,6 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
|
|
|
26
25
|
class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
27
26
|
config: OpenAIEmbeddingConfig
|
|
28
27
|
|
|
29
|
-
def get_exemplary_embedding(self) -> list[float]:
|
|
30
|
-
return self.embed_query(query="Q")
|
|
31
|
-
|
|
32
|
-
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
33
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
34
|
-
return np.shape(exemplary_embedding)
|
|
35
|
-
|
|
36
|
-
def is_unit_vector(self) -> bool:
|
|
37
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
38
|
-
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
39
|
-
|
|
40
28
|
def embed_query(self, query: str) -> list[float]:
|
|
41
29
|
client = self.config.get_client()
|
|
42
30
|
response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
|
|
@@ -46,11 +34,3 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
46
34
|
embeddings = self._embed_documents([e.get("text", "") for e in elements])
|
|
47
35
|
elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
|
|
48
36
|
return elements_with_embeddings
|
|
49
|
-
|
|
50
|
-
def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
|
|
51
|
-
assert len(elements) == len(embeddings)
|
|
52
|
-
elements_w_embedding = []
|
|
53
|
-
for i, element in enumerate(elements):
|
|
54
|
-
element["embeddings"] = embeddings[i]
|
|
55
|
-
elements_w_embedding.append(element)
|
|
56
|
-
return elements
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
|
|
4
|
+
from pydantic import Field, SecretStr
|
|
5
|
+
|
|
6
|
+
from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
|
|
7
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from together import Together
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TogetherAIEmbeddingConfig(EmbeddingConfig):
|
|
14
|
+
api_key: SecretStr
|
|
15
|
+
embedder_model_name: str = Field(
|
|
16
|
+
default="togethercomputer/m2-bert-80M-8k-retrieval", alias="model_name"
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
@requires_dependencies(["together"], extras="togetherai")
|
|
20
|
+
def get_client(self) -> "Together":
|
|
21
|
+
from together import Together
|
|
22
|
+
|
|
23
|
+
return Together(api_key=self.api_key.get_secret_value())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
|
|
27
|
+
class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
28
|
+
config: TogetherAIEmbeddingConfig
|
|
29
|
+
|
|
30
|
+
def embed_query(self, query: str) -> list[float]:
|
|
31
|
+
return self._embed_documents(elements=[query])[0]
|
|
32
|
+
|
|
33
|
+
def embed_documents(self, elements: list[dict]) -> list[dict]:
|
|
34
|
+
embeddings = self._embed_documents([e.get("text", "") for e in elements])
|
|
35
|
+
return self._add_embeddings_to_elements(elements, embeddings)
|
|
36
|
+
|
|
37
|
+
def _embed_documents(self, elements: list[str]) -> list[list[float]]:
|
|
38
|
+
client = self.config.get_client()
|
|
39
|
+
outputs = client.embeddings.create(model=self.config.embedder_model_name, input=elements)
|
|
40
|
+
return [outputs.data[i].embedding for i in range(len(elements))]
|
|
@@ -5,7 +5,6 @@ from dataclasses import dataclass
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import TYPE_CHECKING, Annotated, Any, Optional
|
|
7
7
|
|
|
8
|
-
import numpy as np
|
|
9
8
|
from pydantic import Field, Secret, ValidationError
|
|
10
9
|
from pydantic.functional_validators import BeforeValidator
|
|
11
10
|
|
|
@@ -56,17 +55,6 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
|
|
|
56
55
|
class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
57
56
|
config: VertexAIEmbeddingConfig
|
|
58
57
|
|
|
59
|
-
def get_exemplary_embedding(self) -> list[float]:
|
|
60
|
-
return self.embed_query(query="A sample query.")
|
|
61
|
-
|
|
62
|
-
def num_of_dimensions(self) -> tuple[int, ...]:
|
|
63
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
64
|
-
return np.shape(exemplary_embedding)
|
|
65
|
-
|
|
66
|
-
def is_unit_vector(self) -> bool:
|
|
67
|
-
exemplary_embedding = self.get_exemplary_embedding()
|
|
68
|
-
return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
|
|
69
|
-
|
|
70
58
|
def embed_query(self, query):
|
|
71
59
|
return self._embed_documents(elements=[query])[0]
|
|
72
60
|
|
|
@@ -86,11 +74,3 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
|
|
|
86
74
|
inputs = [TextEmbeddingInput(text=element) for element in elements]
|
|
87
75
|
embeddings = client.get_embeddings(inputs)
|
|
88
76
|
return [e.values for e in embeddings]
|
|
89
|
-
|
|
90
|
-
def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
|
|
91
|
-
assert len(elements) == len(embeddings)
|
|
92
|
-
elements_w_embedding = []
|
|
93
|
-
for i, element in enumerate(elements):
|
|
94
|
-
element["embeddings"] = embeddings[i]
|
|
95
|
-
elements_w_embedding.append(element)
|
|
96
|
-
return elements
|