PyPI - unstructured-ingest - Versions diffs - 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl - Mend

unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (87) hide show

test/__init__.py +0 -0
test/integration/__init__.py +0 -0
test/integration/chunkers/__init__.py +0 -0
test/integration/chunkers/test_chunkers.py +42 -0
test/integration/connectors/__init__.py +0 -0
test/integration/connectors/conftest.py +15 -0
test/integration/connectors/databricks_tests/__init__.py +0 -0
test/integration/connectors/databricks_tests/test_volumes_native.py +165 -0
test/integration/connectors/test_postgres.py +100 -0
test/integration/connectors/test_s3.py +152 -0
test/integration/connectors/test_sqlite.py +91 -0
test/integration/connectors/utils/__init__.py +0 -0
test/integration/connectors/utils/constants.py +7 -0
test/integration/connectors/utils/docker_compose.py +44 -0
test/integration/connectors/utils/validation.py +198 -0
test/integration/embedders/__init__.py +0 -0
test/integration/embedders/conftest.py +13 -0
test/integration/embedders/test_bedrock.py +49 -0
test/integration/embedders/test_huggingface.py +26 -0
test/integration/embedders/test_mixedbread.py +47 -0
test/integration/embedders/test_octoai.py +41 -0
test/integration/embedders/test_openai.py +41 -0
test/integration/embedders/test_vertexai.py +41 -0
test/integration/embedders/test_voyageai.py +41 -0
test/integration/embedders/togetherai.py +43 -0
test/integration/embedders/utils.py +44 -0
test/integration/partitioners/__init__.py +0 -0
test/integration/partitioners/test_partitioner.py +75 -0
test/integration/utils.py +15 -0
test/unit/__init__.py +0 -0
test/unit/embed/__init__.py +0 -0
test/unit/embed/test_mixedbreadai.py +41 -0
test/unit/embed/test_octoai.py +20 -0
test/unit/embed/test_openai.py +20 -0
test/unit/embed/test_vertexai.py +25 -0
test/unit/embed/test_voyageai.py +24 -0
test/unit/test_chunking_utils.py +36 -0
test/unit/test_error.py +27 -0
test/unit/test_interfaces.py +280 -0
test/unit/test_interfaces_v2.py +26 -0
test/unit/test_logger.py +78 -0
test/unit/test_utils.py +164 -0
test/unit/test_utils_v2.py +82 -0
unstructured_ingest/__version__.py +1 -1
unstructured_ingest/cli/interfaces.py +2 -2
unstructured_ingest/connector/notion/types/block.py +1 -0
unstructured_ingest/connector/notion/types/database.py +1 -0
unstructured_ingest/connector/notion/types/page.py +1 -0
unstructured_ingest/embed/bedrock.py +0 -20
unstructured_ingest/embed/huggingface.py +0 -21
unstructured_ingest/embed/interfaces.py +29 -3
unstructured_ingest/embed/mixedbreadai.py +0 -36
unstructured_ingest/embed/octoai.py +2 -24
unstructured_ingest/embed/openai.py +0 -20
unstructured_ingest/embed/togetherai.py +40 -0
unstructured_ingest/embed/vertexai.py +0 -20
unstructured_ingest/embed/voyageai.py +1 -24
unstructured_ingest/interfaces.py +1 -1
unstructured_ingest/utils/dep_check.py +12 -0
unstructured_ingest/v2/cli/utils/click.py +21 -2
unstructured_ingest/v2/interfaces/connector.py +22 -2
unstructured_ingest/v2/interfaces/downloader.py +1 -0
unstructured_ingest/v2/processes/chunker.py +1 -1
unstructured_ingest/v2/processes/connectors/__init__.py +9 -11
unstructured_ingest/v2/processes/connectors/databricks/__init__.py +52 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes.py +175 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +87 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +102 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +85 -0
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +86 -0
unstructured_ingest/v2/processes/connectors/databricks_volumes.py +125 -32
unstructured_ingest/v2/processes/connectors/mongodb.py +223 -3
unstructured_ingest/v2/processes/connectors/outlook.py +239 -0
unstructured_ingest/v2/processes/connectors/pinecone.py +9 -1
unstructured_ingest/v2/processes/connectors/sql/__init__.py +13 -0
unstructured_ingest/v2/processes/connectors/sql/postgres.py +121 -0
unstructured_ingest/v2/processes/connectors/sql/sql.py +181 -0
unstructured_ingest/v2/processes/connectors/sql/sqlite.py +109 -0
unstructured_ingest/v2/processes/embedder.py +13 -0
unstructured_ingest/v2/processes/partitioner.py +2 -1
{unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/METADATA +12 -10
{unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/RECORD +86 -32
{unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/top_level.txt +1 -0
unstructured_ingest/v2/processes/connectors/sql.py +0 -275
{unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/LICENSE.md +0 -0
{unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/WHEEL +0 -0
{unstructured_ingest-0.0.24.dist-info → unstructured_ingest-0.1.0.dist-info}/entry_points.txt +0 -0

test/unit/test_utils.py ADDED Viewed

@@ -0,0 +1,164 @@
+import json
+import typing as t
+from dataclasses import dataclass, field
+from datetime import datetime
+import pytest
+import pytz
+from unstructured_ingest.cli.utils import extract_config
+from unstructured_ingest.interfaces import BaseConfig
+from unstructured_ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict
+@dataclass
+class A(BaseConfig):
+    a: str
+@dataclass
+class B(BaseConfig):
+    a: A
+    b: int
+flat_data = {"a": "test", "b": 4, "c": True}
+def test_extract_config_concrete():
+    @dataclass
+    class C(BaseConfig):
+        b: B
+        c: bool
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_extract_config_optional():
+    @dataclass
+    class C(BaseConfig):
+        c: bool
+        b: t.Optional[B] = None
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_extract_config_union():
+    @dataclass
+    class C(BaseConfig):
+        c: bool
+        b: t.Optional[t.Union[B, int]] = None
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"b": 4, "c": True}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_extract_config_list():
+    @dataclass
+    class C(BaseConfig):
+        c: t.List[int]
+        b: B
+    flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_extract_config_optional_list():
+    @dataclass
+    class C(BaseConfig):
+        b: B
+        c: t.Optional[t.List[int]] = None
+    flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]}
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_extract_config_dataclass_list():
+    @dataclass
+    class C(BaseConfig):
+        c: bool
+        b: t.List[B] = field(default_factory=list)
+    flat_data = {"a": "test", "c": True}
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"b": [], "c": True}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_extract_config_dict():
+    @dataclass
+    class C(BaseConfig):
+        c: bool
+        b: t.Dict[str, B] = field(default_factory=dict)
+    flat_data = {"c": True}
+    c = extract_config(flat_data=flat_data, config=C)
+    expected_result = {"c": True, "b": {}}
+    assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True)
+def test_json_to_dict_valid_json():
+    json_string = '{"key": "value"}'
+    expected_result = {"key": "value"}
+    assert json_to_dict(json_string) == expected_result
+    assert isinstance(json_to_dict(json_string), dict)
+def test_json_to_dict_malformed_json():
+    json_string = '{"key": "value"'
+    expected_result = '{"key": "value"'
+    assert json_to_dict(json_string) == expected_result
+    assert isinstance(json_to_dict(json_string), str)
+def test_json_to_dict_single_quotes():
+    json_string = "{'key': 'value'}"
+    expected_result = {"key": "value"}
+    assert json_to_dict(json_string) == expected_result
+    assert isinstance(json_to_dict(json_string), dict)
+def test_json_to_dict_path():
+    json_string = "/path/to/file.json"
+    expected_result = "/path/to/file.json"
+    assert json_to_dict(json_string) == expected_result
+    assert isinstance(json_to_dict(json_string), str)
+def test_ensure_isoformat_datetime_for_datetime():
+    dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0))
+    assert dt == "2021-01-01T12:00:00"
+def test_ensure_isoformat_datetime_for_datetime_with_tz():
+    dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC))
+    assert dt == "2021-01-01T12:00:00+00:00"
+def test_ensure_isoformat_datetime_for_string():
+    dt = ensure_isoformat_datetime("2021-01-01T12:00:00")
+    assert dt == "2021-01-01T12:00:00"
+def test_ensure_isoformat_datetime_for_string2():
+    dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00")
+    assert dt == "2021-01-01T12:00:00+00:00"
+def test_ensure_isoformat_datetime_fails_on_string():
+    with pytest.raises(ValueError):
+        ensure_isoformat_datetime("bad timestamp")
+def test_ensure_isoformat_datetime_fails_on_int():
+    with pytest.raises(TypeError):
+        ensure_isoformat_datetime(1111)

test/unit/test_utils_v2.py ADDED Viewed

@@ -0,0 +1,82 @@
+import json
+from typing import Any
+from pydantic import BaseModel, Field, Secret, SecretStr
+from pydantic.types import _SecretBase
+from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json
+class MockChildBaseModel(BaseModel):
+    child_secret_str: SecretStr
+    child_secret_float: Secret[float]
+    child_not_secret_dict: dict[str, Any] = Field(default_factory=dict)
+class MockBaseModel(BaseModel):
+    secret_str: SecretStr
+    not_secret_bool: bool
+    secret_child_base: Secret[MockChildBaseModel]
+    not_secret_list: list[int] = Field(default_factory=list)
+model = MockBaseModel(
+    secret_str="secret string",
+    not_secret_bool=False,
+    secret_child_base=MockChildBaseModel(
+        child_secret_str="child secret string",
+        child_secret_float=3.14,
+        child_not_secret_dict={"key": "value"},
+    ),
+    not_secret_list=[1, 2, 3],
+)
+def test_serialize_base_model():
+    serialized_dict = model.model_dump()
+    assert isinstance(serialized_dict["secret_str"], _SecretBase)
+    assert isinstance(serialized_dict["secret_child_base"], _SecretBase)
+    serialized_dict_w_secrets = serialize_base_model(model=model)
+    assert not isinstance(serialized_dict_w_secrets["secret_str"], _SecretBase)
+    assert not isinstance(serialized_dict_w_secrets["secret_child_base"], _SecretBase)
+    expected_dict = {
+        "secret_str": "secret string",
+        "not_secret_bool": False,
+        "secret_child_base": {
+            "child_secret_str": "child secret string",
+            "child_secret_float": 3.14,
+            "child_not_secret_dict": {"key": "value"},
+        },
+        "not_secret_list": [1, 2, 3],
+    }
+    assert serialized_dict_w_secrets == expected_dict
+def test_serialize_base_model_json():
+    serialized_json = model.model_dump_json()
+    serialized_dict = json.loads(serialized_json)
+    expected_dict = {
+        "secret_str": "**********",
+        "not_secret_bool": False,
+        "secret_child_base": "**********",
+        "not_secret_list": [1, 2, 3],
+    }
+    assert expected_dict == serialized_dict
+    serialized_json_w_secrets = serialize_base_model_json(model=model)
+    serialized_dict_w_secrets = json.loads(serialized_json_w_secrets)
+    expected_dict_w_secrets = {
+        "secret_str": "secret string",
+        "not_secret_bool": False,
+        "secret_child_base": {
+            "child_secret_str": "child secret string",
+            "child_secret_float": 3.14,
+            "child_not_secret_dict": {"key": "value"},
+        },
+        "not_secret_list": [1, 2, 3],
+    }
+    assert expected_dict_w_secrets == serialized_dict_w_secrets

unstructured_ingest/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.0.24" # pragma: no cover
1	+ __version__ = "0.1.0" # pragma: no cover

unstructured_ingest/cli/interfaces.py CHANGED Viewed

@@ -341,9 +341,9 @@ class CliPartitionConfig(PartitionConfig, CliMixin):
             ),
             click.Option(
                 ["--partition-endpoint"],
-                default="https://api.unstructured.io/general/v0/general",
+                default="https://api.unstructuredapp.io/general/v0/general",
                 help="If partitioning via api, use the following host. "
-                "Default: https://api.unstructured.io/general/v0/general",
+                "Default: https://api.unstructuredapp.io/general/v0/general",
             ),
             click.Option(
                 ["--api-key"],

unstructured_ingest/connector/notion/types/block.py CHANGED Viewed

@@ -58,6 +58,7 @@ class Block(FromJSONMixin, GetHTMLMixin):
     last_edited_time: str
     last_edited_by: PartialUser
     archived: bool
+    in_trash: bool
     has_children: bool
     parent: Parent
     block: BlockBase

unstructured_ingest/connector/notion/types/database.py CHANGED Viewed

@@ -26,6 +26,7 @@ class Database(FromJSONMixin, GetHTMLMixin):
     last_edited_time: str
     last_edited_by: PartialUser
     archived: bool
+    in_trash: bool
     parent: Parent
     url: str
     is_inline: bool

unstructured_ingest/connector/notion/types/page.py CHANGED Viewed

@@ -16,6 +16,7 @@ class Page(FromJSONMixin):
     last_edited_time: str
     last_edited_by: PartialUser
     archived: bool
+    in_trash: bool
     properties: dict
     parent: Parent
     url: str

unstructured_ingest/embed/bedrock.py CHANGED Viewed

@@ -3,7 +3,6 @@ import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
-import numpy as np
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -45,17 +44,6 @@ class BedrockEmbeddingConfig(EmbeddingConfig):
 class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
     config: BedrockEmbeddingConfig
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="Q")
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def embed_query(self, query: str) -> list[float]:
         """Call out to Bedrock embedding endpoint."""
         # replace newlines, which can negatively affect performance.
@@ -97,11 +85,3 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = [self.embed_query(query=e.get("text", "")) for e in elements]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements

unstructured_ingest/embed/huggingface.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
-import numpy as np
 from pydantic import Field
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -39,17 +38,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
     config: HuggingFaceEmbeddingConfig
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="Q")
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
@@ -62,12 +50,3 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements

unstructured_ingest/embed/interfaces.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+import numpy as np
 from pydantic import BaseModel
@@ -17,14 +18,18 @@ class BaseEmbeddingEncoder(ABC):
         is properly configured: e.g., embed a single a element"""
     @property
-    @abstractmethod
     def num_of_dimensions(self) -> tuple[int, ...]:
-        """Number of dimensions for the embedding vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.shape(exemplary_embedding)
+    def get_exemplary_embedding(self) -> list[float]:
+        return self.embed_query(query="Q")
     @property
-    @abstractmethod
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        exemplary_embedding = self.get_exemplary_embedding()
+        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     @abstractmethod
     def embed_documents(self, elements: list[dict]) -> list[dict]:
@@ -41,3 +46,24 @@ class BaseEmbeddingEncoder(ABC):
             results.append(response)
         return results
+    @staticmethod
+    def _add_embeddings_to_elements(
+        elements: list[dict], embeddings: list[list[float]]
+    ) -> list[dict]:
+        """
+        Add embeddings to elements.
+        Args:
+            elements (list[Element]): List of elements.
+            embeddings (list[list[float]]): List of embeddings.
+        Returns:
+            list[Element]: Elements with embeddings added.
+        """
+        assert len(elements) == len(embeddings)
+        elements_w_embedding = []
+        for i, element in enumerate(elements):
+            element["embeddings"] = embeddings[i]
+            elements_w_embedding.append(element)
+        return elements

unstructured_ingest/embed/mixedbreadai.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional
-import numpy as np
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -66,8 +65,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
     """
     config: MixedbreadAIEmbeddingConfig
-    _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
     _request_options: Optional["RequestOptions"] = field(init=False, default=None)
     def get_exemplary_embedding(self) -> list[float]:
@@ -90,18 +87,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             additional_headers={"User-Agent": USER_AGENT},
         )
-    @property
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        """Get the number of dimensions for the embeddings."""
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-    @property
-    def is_unit_vector(self) -> bool:
-        """Check if the embedding is a unit vector."""
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def _embed(self, texts: list[str]) -> list[list[float]]:
         """
         Embed a list of texts using the Mixedbread AI API.
@@ -130,27 +115,6 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder):
             responses.append(response)
         return [item.embedding for response in responses for item in response.data]
-    @staticmethod
-    def _add_embeddings_to_elements(
-        elements: list[dict], embeddings: list[list[float]]
-    ) -> list[dict]:
-        """
-        Add embeddings to elements.
-        Args:
-            elements (list[Element]): List of elements.
-            embeddings (list[list[float]]): List of embeddings.
-        Returns:
-            list[Element]: Elements with embeddings added.
-        """
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements
     def embed_documents(self, elements: list[dict]) -> list[dict]:
         """
         Embed a list of document elements.

unstructured_ingest/embed/octoai.py CHANGED Viewed

@@ -1,7 +1,6 @@
-from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
-import numpy as np
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -30,19 +29,6 @@ class OctoAiEmbeddingConfig(EmbeddingConfig):
 @dataclass
 class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OctoAiEmbeddingConfig
-    # Uses the OpenAI SDK
-    _exemplary_embedding: Optional[list[float]] = field(init=False, default=None)
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query("Q")
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def embed_query(self, query: str):
         client = self.config.get_client()
@@ -53,11 +39,3 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = [self.embed_query(e.get("text", "")) for e in elements]
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements

unstructured_ingest/embed/openai.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
-import numpy as np
 from pydantic import Field, SecretStr
 from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
@@ -26,17 +25,6 @@ class OpenAIEmbeddingConfig(EmbeddingConfig):
 class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: OpenAIEmbeddingConfig
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="Q")
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def embed_query(self, query: str) -> list[float]:
         client = self.config.get_client()
         response = client.embeddings.create(input=query, model=self.config.embedder_model_name)
@@ -46,11 +34,3 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder):
         embeddings = self._embed_documents([e.get("text", "") for e in elements])
         elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings)
         return elements_with_embeddings
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements

unstructured_ingest/embed/togetherai.py ADDED Viewed

@@ -0,0 +1,40 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from pydantic import Field, SecretStr
+from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig
+from unstructured_ingest.utils.dep_check import requires_dependencies
+if TYPE_CHECKING:
+    from together import Together
+class TogetherAIEmbeddingConfig(EmbeddingConfig):
+    api_key: SecretStr
+    embedder_model_name: str = Field(
+        default="togethercomputer/m2-bert-80M-8k-retrieval", alias="model_name"
+    )
+    @requires_dependencies(["together"], extras="togetherai")
+    def get_client(self) -> "Together":
+        from together import Together
+        return Together(api_key=self.api_key.get_secret_value())
+@dataclass
+class TogetherAIEmbeddingEncoder(BaseEmbeddingEncoder):
+    config: TogetherAIEmbeddingConfig
+    def embed_query(self, query: str) -> list[float]:
+        return self._embed_documents(elements=[query])[0]
+    def embed_documents(self, elements: list[dict]) -> list[dict]:
+        embeddings = self._embed_documents([e.get("text", "") for e in elements])
+        return self._add_embeddings_to_elements(elements, embeddings)
+    def _embed_documents(self, elements: list[str]) -> list[list[float]]:
+        client = self.config.get_client()
+        outputs = client.embeddings.create(model=self.config.embedder_model_name, input=elements)
+        return [outputs.data[i].embedding for i in range(len(elements))]

unstructured_ingest/embed/vertexai.py CHANGED Viewed

@@ -5,7 +5,6 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Annotated, Any, Optional
-import numpy as np
 from pydantic import Field, Secret, ValidationError
 from pydantic.functional_validators import BeforeValidator
@@ -56,17 +55,6 @@ class VertexAIEmbeddingConfig(EmbeddingConfig):
 class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
     config: VertexAIEmbeddingConfig
-    def get_exemplary_embedding(self) -> list[float]:
-        return self.embed_query(query="A sample query.")
-    def num_of_dimensions(self) -> tuple[int, ...]:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.shape(exemplary_embedding)
-    def is_unit_vector(self) -> bool:
-        exemplary_embedding = self.get_exemplary_embedding()
-        return np.isclose(np.linalg.norm(exemplary_embedding), 1.0)
     def embed_query(self, query):
         return self._embed_documents(elements=[query])[0]
@@ -86,11 +74,3 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder):
         inputs = [TextEmbeddingInput(text=element) for element in elements]
         embeddings = client.get_embeddings(inputs)
         return [e.values for e in embeddings]
-    def _add_embeddings_to_elements(self, elements, embeddings) -> list[dict]:
-        assert len(elements) == len(embeddings)
-        elements_w_embedding = []
-        for i, element in enumerate(elements):
-            element["embeddings"] = embeddings[i]
-            elements_w_embedding.append(element)
-        return elements

unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl

Potentially problematic release.

unstructured-ingest 0.0.24__py3-none-any.whl → 0.1.0__py3-none-any.whl