unstructured-ingest 0.0.25__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (83) hide show
  1. test/__init__.py +0 -0
  2. test/integration/__init__.py +0 -0
  3. test/integration/chunkers/__init__.py +0 -0
  4. test/integration/chunkers/test_chunkers.py +42 -0
  5. test/integration/connectors/__init__.py +0 -0
  6. test/integration/connectors/conftest.py +15 -0
  7. test/integration/connectors/databricks_tests/__init__.py +0 -0
  8. test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
  9. test/integration/connectors/test_postgres.py +100 -0
  10. test/integration/connectors/test_s3.py +152 -0
  11. test/integration/connectors/test_sqlite.py +91 -0
  12. test/integration/connectors/utils/__init__.py +0 -0
  13. test/integration/connectors/utils/constants.py +7 -0
  14. test/integration/connectors/utils/docker_compose.py +44 -0
  15. test/integration/connectors/utils/validation.py +198 -0
  16. test/integration/embedders/__init__.py +0 -0
  17. test/integration/embedders/conftest.py +13 -0
  18. test/integration/embedders/test_bedrock.py +49 -0
  19. test/integration/embedders/test_huggingface.py +26 -0
  20. test/integration/embedders/test_mixedbread.py +47 -0
  21. test/integration/embedders/test_octoai.py +41 -0
  22. test/integration/embedders/test_openai.py +41 -0
  23. test/integration/embedders/test_vertexai.py +41 -0
  24. test/integration/embedders/test_voyageai.py +41 -0
  25. test/integration/embedders/togetherai.py +43 -0
  26. test/integration/embedders/utils.py +44 -0
  27. test/integration/partitioners/__init__.py +0 -0
  28. test/integration/partitioners/test_partitioner.py +75 -0
  29. test/integration/utils.py +15 -0
  30. test/unit/__init__.py +0 -0
  31. test/unit/embed/__init__.py +0 -0
  32. test/unit/embed/test_mixedbreadai.py +41 -0
  33. test/unit/embed/test_octoai.py +20 -0
  34. test/unit/embed/test_openai.py +20 -0
  35. test/unit/embed/test_vertexai.py +25 -0
  36. test/unit/embed/test_voyageai.py +24 -0
  37. test/unit/test_chunking_utils.py +36 -0
  38. test/unit/test_error.py +27 -0
  39. test/unit/test_interfaces.py +280 -0
  40. test/unit/test_interfaces_v2.py +26 -0
  41. test/unit/test_logger.py +78 -0
  42. test/unit/test_utils.py +164 -0
  43. test/unit/test_utils_v2.py +82 -0
  44. unstructured_ingest/__version__.py +1 -1
  45. unstructured_ingest/cli/interfaces.py +2 -2
  46. unstructured_ingest/connector/notion/types/block.py +1 -0
  47. unstructured_ingest/connector/notion/types/database.py +1 -0
  48. unstructured_ingest/connector/notion/types/page.py +1 -0
  49. unstructured_ingest/embed/bedrock.py +0 -20
  50. unstructured_ingest/embed/huggingface.py +0 -21
  51. unstructured_ingest/embed/interfaces.py +29 -3
  52. unstructured_ingest/embed/mixedbreadai.py +0 -36
  53. unstructured_ingest/embed/octoai.py +2 -24
  54. unstructured_ingest/embed/openai.py +0 -20
  55. unstructured_ingest/embed/togetherai.py +40 -0
  56. unstructured_ingest/embed/vertexai.py +0 -20
  57. unstructured_ingest/embed/voyageai.py +1 -24
  58. unstructured_ingest/interfaces.py +1 -1
  59. unstructured_ingest/v2/cli/utils/click.py +21 -2
  60. unstructured_ingest/v2/interfaces/connector.py +22 -2
  61. unstructured_ingest/v2/interfaces/downloader.py +1 -0
  62. unstructured_ingest/v2/processes/chunker.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/__init__.py +5 -18
  64. unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
  65. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
  66. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
  67. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
  68. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
  69. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
  71. unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
  72. unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
  73. unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
  74. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
  75. unstructured_ingest/v2/processes/embedder.py +13 -0
  76. unstructured_ingest/v2/processes/partitioner.py +2 -1
  77. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +14 -12
  78. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +82 -29
  79. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
  80. unstructured_ingest/v2/processes/connectors/sql.py +0 -275
  81. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
  82. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
  83. {unstructured_ingest-0.0.25.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,164 @@
1
+ import json
2
+ import typing as t
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+
6
+ import pytest
7
+ import pytz
8
+
9
+ from unstructured_ingest.cli.utils import extract_config
10
+ from unstructured_ingest.interfaces import BaseConfig
11
+ from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
12
+
13
+
14
@dataclass
class A(BaseConfig):
    """Innermost config used by the extract_config tests; holds one string field."""

    a: str
17
+
18
+
19
@dataclass
class B(BaseConfig):
    """Middle config used by the extract_config tests: a nested ``A`` plus an int."""

    a: A
    b: int
23
+
24
+
25
# Flat key/value input shared by the tests below that do not define their own.
flat_data = dict(a="test", b=4, c=True)
26
+
27
+
28
def test_extract_config_concrete():
    """Required dataclass fields are populated as nested configs from the flat data."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: bool

    expected_json = json.dumps({"b": {"a": {"a": "test"}, "b": 4}, "c": True}, sort_keys=True)
    config = extract_config(flat_data=flat_data, config=C)
    assert config.to_json(sort_keys=True) == expected_json
37
+
38
+
39
def test_extract_config_optional():
    """An ``Optional[B]`` field is expected to be filled with a nested ``B``."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[B] = None

    expected_json = json.dumps({"b": {"a": {"a": "test"}, "b": 4}, "c": True}, sort_keys=True)
    config = extract_config(flat_data=flat_data, config=C)
    assert config.to_json(sort_keys=True) == expected_json
48
+
49
+
50
def test_extract_config_union():
    """With ``b`` typed ``Optional[Union[B, int]]`` the flat integer value is expected as-is."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Optional[t.Union[B, int]] = None

    expected_json = json.dumps({"b": 4, "c": True}, sort_keys=True)
    config = extract_config(flat_data=flat_data, config=C)
    assert config.to_json(sort_keys=True) == expected_json
59
+
60
+
61
def test_extract_config_list():
    """A ``List[int]`` field is expected to pass through next to a nested ``B``."""

    @dataclass
    class C(BaseConfig):
        c: t.List[int]
        b: B

    # Local input: same as the shared data except that "c" is a list of ints.
    flat_input = {"a": "test", "b": 4, "c": [1, 2, 3]}
    nested = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}

    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == json.dumps(nested, sort_keys=True)
71
+
72
+
73
def test_extract_config_optional_list():
    """An ``Optional[List[int]]`` field is expected to keep the supplied list."""

    @dataclass
    class C(BaseConfig):
        b: B
        c: t.Optional[t.List[int]] = None

    # Local input: same as the shared data except that "c" is a list of ints.
    flat_input = {"a": "test", "b": 4, "c": [1, 2, 3]}
    nested = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}

    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == json.dumps(nested, sort_keys=True)
83
+
84
+
85
def test_extract_config_dataclass_list():
    """A ``List[B]`` field with a list default factory is expected to stay empty."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.List[B] = field(default_factory=list)

    flat_input = {"a": "test", "c": True}
    expected_json = json.dumps({"b": [], "c": True}, sort_keys=True)

    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == expected_json
95
+
96
+
97
def test_extract_config_dict():
    """A ``Dict[str, B]`` field with a dict default factory is expected to stay empty."""

    @dataclass
    class C(BaseConfig):
        c: bool
        b: t.Dict[str, B] = field(default_factory=dict)

    flat_input = {"c": True}
    expected_json = json.dumps({"c": True, "b": {}}, sort_keys=True)

    config = extract_config(flat_data=flat_input, config=C)
    assert config.to_json(sort_keys=True) == expected_json
107
+
108
+
109
def test_json_to_dict_valid_json():
    """A well-formed JSON object string is expected to come back as the parsed dict."""
    raw = '{"key": "value"}'
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
114
+
115
+
116
def test_json_to_dict_malformed_json():
    """A JSON string missing its closing brace is expected back unchanged, as a str."""
    raw = '{"key": "value"'
    assert json_to_dict(raw) == '{"key": "value"'
    assert isinstance(json_to_dict(raw), str)
121
+
122
+
123
def test_json_to_dict_single_quotes():
    """A single-quoted object string is still expected to be parsed into a dict."""
    raw = "{'key': 'value'}"
    assert json_to_dict(raw) == {"key": "value"}
    assert isinstance(json_to_dict(raw), dict)
128
+
129
+
130
def test_json_to_dict_path():
    """A filesystem-path-like string is expected back unchanged, as a str."""
    raw = "/path/to/file.json"
    assert json_to_dict(raw) == "/path/to/file.json"
    assert isinstance(json_to_dict(raw), str)
135
+
136
+
137
def test_ensure_isoformat_datetime_for_datetime():
    """A naive datetime is expected to be rendered as an ISO string without an offset."""
    naive = datetime(2021, 1, 1, 12, 0, 0)
    assert ensure_isoformat_datetime(naive) == "2021-01-01T12:00:00"
140
+
141
+
142
def test_ensure_isoformat_datetime_for_datetime_with_tz():
    """A UTC-aware datetime is expected to be rendered with a ``+00:00`` offset."""
    aware = datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC)
    assert ensure_isoformat_datetime(aware) == "2021-01-01T12:00:00+00:00"
145
+
146
+
147
def test_ensure_isoformat_datetime_for_string():
    """An ISO string without an offset is expected to be returned unchanged."""
    iso_text = "2021-01-01T12:00:00"
    assert ensure_isoformat_datetime(iso_text) == "2021-01-01T12:00:00"
150
+
151
+
152
def test_ensure_isoformat_datetime_for_string2():
    """An ISO string carrying a ``+00:00`` offset is expected to be returned unchanged."""
    iso_text = "2021-01-01T12:00:00+00:00"
    assert ensure_isoformat_datetime(iso_text) == "2021-01-01T12:00:00+00:00"
155
+
156
+
157
def test_ensure_isoformat_datetime_fails_on_string():
    """A string that is not a timestamp is expected to raise ``ValueError``."""
    not_a_timestamp = "bad timestamp"
    with pytest.raises(ValueError):
        ensure_isoformat_datetime(not_a_timestamp)
160
+
161
+
162
def test_ensure_isoformat_datetime_fails_on_int():
    """An integer argument is expected to raise ``TypeError``."""
    wrong_type = 1111
    with pytest.raises(TypeError):
        ensure_isoformat_datetime(wrong_type)
@@ -0,0 +1,82 @@
1
+ import json
2
+ from typing import Any
3
+
4
+ from pydantic import BaseModel, Field, Secret, SecretStr
5
+ from pydantic.types import _SecretBase
6
+
7
+ from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
8
+
9
+
10
class MockChildBaseModel(BaseModel):
    # Nested fixture model: two secret-typed fields plus one plain dict field.
    child_secret_str: SecretStr
    child_secret_float: Secret[float]
    # Plain (non-secret) field; defaults to a fresh empty dict per instance.
    child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
14
+
15
+
16
class MockBaseModel(BaseModel):
    # Top-level fixture model mixing secret and non-secret fields.
    secret_str: SecretStr
    not_secret_bool: bool
    # A whole child model wrapped as a secret.
    secret_child_base: Secret[MockChildBaseModel]
    # Plain (non-secret) field; defaults to a fresh empty list per instance.
    not_secret_list: list[int] = Field(default_factory=list)
21
+
22
+
23
# Shared fixture instance used by both serialization tests below.
model = MockBaseModel(
    not_secret_bool=False,
    not_secret_list=[1, 2, 3],
    secret_str="secret string",
    secret_child_base=MockChildBaseModel(
        child_not_secret_dict={"key": "value"},
        child_secret_float=3.14,
        child_secret_str="child secret string",
    ),
)
33
+
34
+
35
def test_serialize_base_model():
    """serialize_base_model is expected to unwrap secrets that model_dump leaves wrapped."""
    secret_keys = ("secret_str", "secret_child_base")

    # The stock pydantic dump keeps secret fields as secret wrapper objects.
    plain_dump = model.model_dump()
    for key in secret_keys:
        assert isinstance(plain_dump[key], _SecretBase)

    # The project helper must not leave any wrapper in those positions.
    revealed = serialize_base_model(model=model)
    for key in secret_keys:
        assert not isinstance(revealed[key], _SecretBase)

    assert revealed == {
        "secret_str": "secret string",
        "not_secret_bool": False,
        "secret_child_base": {
            "child_secret_str": "child secret string",
            "child_secret_float": 3.14,
            "child_not_secret_dict": {"key": "value"},
        },
        "not_secret_list": [1, 2, 3],
    }
57
+
58
+
59
def test_serialize_base_model_json():
    """serialize_base_model_json is expected to emit secrets that model_dump_json masks."""
    # The stock pydantic JSON dump replaces every secret with a mask string.
    masked = json.loads(model.model_dump_json())
    assert {
        "secret_str": "**********",
        "not_secret_bool": False,
        "secret_child_base": "**********",
        "not_secret_list": [1, 2, 3],
    } == masked

    # The project helper is expected to write the real values, nested child included.
    revealed = json.loads(serialize_base_model_json(model=model))
    assert {
        "secret_str": "secret string",
        "not_secret_bool": False,
        "secret_child_base": {
            "child_secret_str": "child secret string",
            "child_secret_float": 3.14,
            "child_not_secret_dict": {"key": "value"},
        },
        "not_secret_list": [1, 2, 3],
    } == revealed
@@ -1 +1 @@
1
- __version__ = "0.0.25" # pragma: no cover
1
+ __version__ = "0.1.0" # pragma: no cover
@@ -341,9 +341,9 @@ class CliPartitionConfig(PartitionConfig, CliMixin):
341
341
  ),
342
342
  click.Option(
343
343
  ["--partition-endpoint"],
344
- default="https://api.unstructured.io/general/v0/general",
344
+ default="https://api.unstructuredapp.io/general/v0/general",
345
345
  help="If partitioning via api, use the following host. "
346
- "Default: https://api.unstructured.io/general/v0/general",
346
+ "Default: https://api.unstructuredapp.io/general/v0/general",
347
347
  ),
348
348
  click.Option(
349
349
  ["--api-key"],
@@ -58,6 +58,7 @@ class Block(FromJSONMixin, GetHTMLMixin):
58
58
  last_edited_time: str
59
59
  last_edited_by: PartialUser
60
60
  archived: bool
61
+ in_trash: bool
61
62
  has_children: bool
62
63
  parent: Parent
63
64
  block: BlockBase
@@ -26,6 +26,7 @@ class Database(FromJSONMixin, GetHTMLMixin):
26
26
  last_edited_time: str
27
27
  last_edited_by: PartialUser
28
28
  archived: bool
29
+ in_trash: bool
29
30
  parent: Parent
30
31
  url: str
31
32
  is_inline: bool
@@ -16,6 +16,7 @@ class Page(FromJSONMixin):
16
16
  last_edited_time: str
17
17
  last_edited_by: PartialUser
18
18
  archived: bool
19
+ in_trash: bool
19
20
  properties: dict
20
21
  parent: Parent
21
22
  url: str
@@ -3,7 +3,6 @@ import os
3
3
  from dataclasses import dataclass
4
4
  from typing import TYPE_CHECKING
5
5
 
6
- import numpy as np
7
6
  from pydantic import Field, SecretStr
8
7
 
9
8
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -45,17 +44,6 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
45
44
  class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
46
45
  config: BedrockEmbeddingConfig
47
46
 
48
- def get_exemplary_embedding(self) -> list[float]:
49
- return self.embed_query(query="Q")
50
-
51
- def num_of_dimensions(self) -> tuple[int, ...]:
52
- exemplary_embedding = self.get_exemplary_embedding()
53
- return np.shape(exemplary_embedding)
54
-
55
- def is_unit_vector(self) -> bool:
56
- exemplary_embedding = self.get_exemplary_embedding()
57
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
58
-
59
47
  def embed_query(self, query: str) -> list[float]:
60
48
  """Call out to Bedrock embedding endpoint."""
61
49
  # replace newlines, which can negatively affect performance.
@@ -97,11 +85,3 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
97
85
  embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
98
86
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
99
87
  return elements_with_embeddings
100
-
101
- def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
102
- assert len(elements) == len(embeddings)
103
- elements_w_embedding = []
104
- for i, element in enumerate(elements):
105
- element["embeddings"] = embeddings[i]
106
- elements_w_embedding.append(element)
107
- return elements
@@ -1,7 +1,6 @@
1
1
  from dataclasses import dataclass
2
2
  from typing import TYPE_CHECKING, Optional
3
3
 
4
- import numpy as np
5
4
  from pydantic import Field
6
5
 
7
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -39,17 +38,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
39
38
  class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
40
39
  config: HuggingFaceEmbeddingConfig
41
40
 
42
- def get_exemplary_embedding(self) -> list[float]:
43
- return self.embed_query(query="Q")
44
-
45
- def num_of_dimensions(self) -> tuple[int, ...]:
46
- exemplary_embedding = self.get_exemplary_embedding()
47
- return np.shape(exemplary_embedding)
48
-
49
- def is_unit_vector(self) -> bool:
50
- exemplary_embedding = self.get_exemplary_embedding()
51
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
52
-
53
41
  def embed_query(self, query: str) -> list[float]:
54
42
  return self._embed_documents(texts=[query])[0]
55
43
 
@@ -62,12 +50,3 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
62
50
  embeddings = self._embed_documents([e.get("text", "") for e in elements])
63
51
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
64
52
  return elements_with_embeddings
65
-
66
- def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
67
- assert len(elements) == len(embeddings)
68
- elements_w_embedding = []
69
-
70
- for i, element in enumerate(elements):
71
- element["embeddings"] = embeddings[i]
72
- elements_w_embedding.append(element)
73
- return elements
@@ -1,6 +1,7 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from dataclasses import dataclass
3
3
 
4
+ import numpy as np
4
5
  from pydantic import BaseModel
5
6
 
6
7
 
@@ -17,14 +18,18 @@ class BaseEmbeddingEncoder(ABC):
17
18
  is properly configured: e.g., embed a single a element"""
18
19
 
19
20
  @property
20
- @abstractmethod
21
21
  def num_of_dimensions(self) -> tuple[int, ...]:
22
- """Number of dimensions for the embedding vector."""
22
+ exemplary_embedding = self.get_exemplary_embedding()
23
+ return np.shape(exemplary_embedding)
24
+
25
+ def get_exemplary_embedding(self) -> list[float]:
26
+ return self.embed_query(query="Q")
23
27
 
24
28
  @property
25
- @abstractmethod
26
29
  def is_unit_vector(self) -> bool:
27
30
  """Denotes if the embedding vector is a unit vector."""
31
+ exemplary_embedding = self.get_exemplary_embedding()
32
+ return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
28
33
 
29
34
  @abstractmethod
30
35
  def embed_documents(self, elements: list[dict]) -> list[dict]:
@@ -41,3 +46,24 @@ class BaseEmbeddingEncoder(ABC):
41
46
  results.append(response)
42
47
 
43
48
  return results
49
+
50
+ @staticmethod
51
+ def _add_embeddings_to_elements(
52
+ elements: list[dict], embeddings: list[list[float]]
53
+ ) -> list[dict]:
54
+ """
55
+ Add embeddings to elements.
56
+
57
+ Args:
58
+ elements (list[Element]): List of elements.
59
+ embeddings (list[list[float]]): List of embeddings.
60
+
61
+ Returns:
62
+ list[Element]: Elements with embeddings added.
63
+ """
64
+ assert len(elements) == len(embeddings)
65
+ elements_w_embedding = []
66
+ for i, element in enumerate(elements):
67
+ element["embeddings"] = embeddings[i]
68
+ elements_w_embedding.append(element)
69
+ return elements
@@ -2,7 +2,6 @@ import os
2
2
  from dataclasses import dataclass, field
3
3
  from typing import TYPE_CHECKING, Optional
4
4
 
5
- import numpy as np
6
5
  from pydantic import Field, SecretStr
7
6
 
8
7
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -66,8 +65,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
66
65
  """
67
66
 
68
67
  config: MixedbreadAIEmbeddingConfig
69
-
70
- _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
71
68
  _request_options: Optional["RequestOptions"] = field(init=False, default=None)
72
69
 
73
70
  def get_exemplary_embedding(self) -> list[float]:
@@ -90,18 +87,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
90
87
  additional_headers={"User-Agent": USER_AGENT},
91
88
  )
92
89
 
93
- @property
94
- def num_of_dimensions(self) -> tuple[int, ...]:
95
- """Get the number of dimensions for the embeddings."""
96
- exemplary_embedding = self.get_exemplary_embedding()
97
- return np.shape(exemplary_embedding)
98
-
99
- @property
100
- def is_unit_vector(self) -> bool:
101
- """Check if the embedding is a unit vector."""
102
- exemplary_embedding = self.get_exemplary_embedding()
103
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
104
-
105
90
  def _embed(self, texts: list[str]) -> list[list[float]]:
106
91
  """
107
92
  Embed a list of texts using the Mixedbread AI API.
@@ -130,27 +115,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
130
115
  responses.append(response)
131
116
  return [item.embedding for response in responses for item in response.data]
132
117
 
133
- @staticmethod
134
- def _add_embeddings_to_elements(
135
- elements: list[dict], embeddings: list[list[float]]
136
- ) -> list[dict]:
137
- """
138
- Add embeddings to elements.
139
-
140
- Args:
141
- elements (list[Element]): List of elements.
142
- embeddings (list[list[float]]): List of embeddings.
143
-
144
- Returns:
145
- list[Element]: Elements with embeddings added.
146
- """
147
- assert len(elements) == len(embeddings)
148
- elements_w_embedding = []
149
- for i, element in enumerate(elements):
150
- element["embeddings"] = embeddings[i]
151
- elements_w_embedding.append(element)
152
- return elements
153
-
154
118
  def embed_documents(self, elements: list[dict]) -> list[dict]:
155
119
  """
156
120
  Embed a list of document elements.
@@ -1,7 +1,6 @@
1
- from dataclasses import dataclass, field
2
- from typing import TYPE_CHECKING, Optional
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
3
 
4
- import numpy as np
5
4
  from pydantic import Field, SecretStr
6
5
 
7
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -30,19 +29,6 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
30
29
  @dataclass
31
30
  class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
32
31
  config: OctoAiEmbeddingConfig
33
- # Uses the OpenAI SDK
34
- _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
35
-
36
- def get_exemplary_embedding(self) -> list[float]:
37
- return self.embed_query("Q")
38
-
39
- def num_of_dimensions(self) -> tuple[int, ...]:
40
- exemplary_embedding = self.get_exemplary_embedding()
41
- return np.shape(exemplary_embedding)
42
-
43
- def is_unit_vector(self) -> bool:
44
- exemplary_embedding = self.get_exemplary_embedding()
45
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
46
32
 
47
33
  def embed_query(self, query: str):
48
34
  client = self.config.get_client()
@@ -53,11 +39,3 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
53
39
  embeddings = [self.embed_query(e.get("text", "")) for e in elements]
54
40
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
55
41
  return elements_with_embeddings
56
-
57
- def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
58
- assert len(elements) == len(embeddings)
59
- elements_w_embedding = []
60
- for i, element in enumerate(elements):
61
- element["embeddings"] = embeddings[i]
62
- elements_w_embedding.append(element)
63
- return elements
@@ -1,7 +1,6 @@
1
1
  from dataclasses import dataclass
2
2
  from typing import TYPE_CHECKING
3
3
 
4
- import numpy as np
5
4
  from pydantic import Field, SecretStr
6
5
 
7
6
  from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -26,17 +25,6 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
26
25
  class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
27
26
  config: OpenAIEmbeddingConfig
28
27
 
29
- def get_exemplary_embedding(self) -> list[float]:
30
- return self.embed_query(query="Q")
31
-
32
- def num_of_dimensions(self) -> tuple[int, ...]:
33
- exemplary_embedding = self.get_exemplary_embedding()
34
- return np.shape(exemplary_embedding)
35
-
36
- def is_unit_vector(self) -> bool:
37
- exemplary_embedding = self.get_exemplary_embedding()
38
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
39
-
40
28
  def embed_query(self, query: str) -> list[float]:
41
29
  client = self.config.get_client()
42
30
  response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
@@ -46,11 +34,3 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
46
34
  embeddings = self._embed_documents([e.get("text", "") for e in elements])
47
35
  elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
48
36
  return elements_with_embeddings
49
-
50
- def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
51
- assert len(elements) == len(embeddings)
52
- elements_w_embedding = []
53
- for i, element in enumerate(elements):
54
- element["embeddings"] = embeddings[i]
55
- elements_w_embedding.append(element)
56
- return elements
@@ -0,0 +1,40 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING
3
+
4
+ from pydantic import Field, SecretStr
5
+
6
+ from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
7
+ from unstructured_ingest.utils.dep_check import requires_dependencies
8
+
9
+ if TYPE_CHECKING:
10
+ from together import Together
11
+
12
+
13
class TogetherAIEmbeddingConfig(EmbeddingConfig):
    # Settings for the Together AI embedder: an API key and the model to call.
    api_key: SecretStr
    # Also accepted under the alias "model_name".
    embedder_model_name: str = Field(
        alias="model_name",
        default="togethercomputer/m2-bert-80M-8k-retrieval",
    )

    @requires_dependencies(["together"], extras="togetherai")
    def get_client(self) -> "Together":
        """Build a ``Together`` client authenticated with the configured API key."""
        # Imported here so the module can be loaded without the optional package.
        from together import Together

        secret_key = self.api_key.get_secret_value()
        return Together(api_key=secret_key)
24
+
25
+
26
@dataclass
class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
    """Embedding encoder that sends texts to the client built by its config."""

    config: TogetherAIEmbeddingConfig

    def embed_query(self, query: str) -> list[float]:
        """Embed a single query string and return its vector."""
        vectors = self._embed_documents(elements=[query])
        return vectors[0]

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        """Embed each element's "text" (empty string when absent) and attach the vectors."""
        texts = [element.get("text", "") for element in elements]
        vectors = self._embed_documents(texts)
        return self._add_embeddings_to_elements(elements, vectors)

    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
        """Request embeddings for ``elements`` from the configured model in one call."""
        response = self.config.get_client().embeddings.create(
            model=self.config.embedder_model_name, input=elements
        )
        # Read the response by input position: one vector per submitted text.
        return [response.data[position].embedding for position, _ in enumerate(elements)]
@@ -5,7 +5,6 @@ from dataclasses import dataclass
5
5
  from pathlib import Path
6
6
  from typing import TYPE_CHECKING, Annotated, Any, Optional
7
7
 
8
- import numpy as np
9
8
  from pydantic import Field, Secret, ValidationError
10
9
  from pydantic.functional_validators import BeforeValidator
11
10
 
@@ -56,17 +55,6 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
56
55
  class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
57
56
  config: VertexAIEmbeddingConfig
58
57
 
59
- def get_exemplary_embedding(self) -> list[float]:
60
- return self.embed_query(query="A sample query.")
61
-
62
- def num_of_dimensions(self) -> tuple[int, ...]:
63
- exemplary_embedding = self.get_exemplary_embedding()
64
- return np.shape(exemplary_embedding)
65
-
66
- def is_unit_vector(self) -> bool:
67
- exemplary_embedding = self.get_exemplary_embedding()
68
- return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
69
-
70
58
  def embed_query(self, query):
71
59
  return self._embed_documents(elements=[query])[0]
72
60
 
@@ -86,11 +74,3 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
86
74
  inputs = [TextEmbeddingInput(text=element) for element in elements]
87
75
  embeddings = client.get_embeddings(inputs)
88
76
  return [e.values for e in embeddings]
89
-
90
- def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
91
- assert len(elements) == len(embeddings)
92
- elements_w_embedding = []
93
- for i, element in enumerate(elements):
94
- element["embeddings"] = embeddings[i]
95
- elements_w_embedding.append(element)
96
- return elements