PyPI - orca-sdk - Versions diffs - 0.1.0__py3-none-any.whl - Mend

orca-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

orca_sdk/embedding_model.py ADDED Viewed

@@ -0,0 +1,336 @@
+from __future__ import annotations
+from abc import abstractmethod
+from datetime import datetime
+from typing import TYPE_CHECKING, Sequence, cast, overload
+from ._generated_api_client.api import (
+    create_finetuned_embedding_model,
+    delete_finetuned_embedding_model,
+    embed_with_finetuned_model_gpu,
+    embed_with_pretrained_model_gpu,
+    get_finetuned_embedding_model,
+    get_pretrained_embedding_model,
+    list_finetuned_embedding_models,
+    list_pretrained_embedding_models,
+)
+from ._generated_api_client.models import (
+    EmbeddingFinetuningMethod,
+    EmbedRequest,
+    FinetunedEmbeddingModelMetadata,
+    FinetuneEmbeddingModelRequest,
+    FinetuneEmbeddingModelRequestTrainingArgs,
+    PretrainedEmbeddingModelMetadata,
+    PretrainedEmbeddingModelName,
+)
+from ._utils.common import CreateMode, DropMode
+from ._utils.task import TaskStatus, wait_for_task
+from .datasource import Datasource
+if TYPE_CHECKING:
+    from .labeled_memoryset import LabeledMemoryset
+class _EmbeddingModel:
+    name: str
+    embedding_dim: int
+    max_seq_length: int
+    uses_context: bool
+    def __init__(self, *, name: str, embedding_dim: int, max_seq_length: int, uses_context: bool):
+        self.name = name
+        self.embedding_dim = embedding_dim
+        self.max_seq_length = max_seq_length
+        self.uses_context = uses_context
+    @classmethod
+    @abstractmethod
+    def all(cls) -> Sequence[_EmbeddingModel]:
+        pass
+    @overload
+    def embed(self, value: str, max_seq_length: int | None = None) -> list[float]:
+        pass
+    @overload
+    def embed(self, value: list[str], max_seq_length: int | None = None) -> list[list[float]]:
+        pass
+    def embed(self, value: str | list[str], max_seq_length: int | None = None) -> list[float] | list[list[float]]:
+        request = EmbedRequest(values=value if isinstance(value, list) else [value], max_seq_length=max_seq_length)
+        if isinstance(self, PretrainedEmbeddingModel):
+            embeddings = embed_with_pretrained_model_gpu(self._model_name, body=request)
+        elif isinstance(self, FinetunedEmbeddingModel):
+            embeddings = embed_with_finetuned_model_gpu(self.id, body=request)
+        else:
+            raise ValueError("Invalid embedding model")
+        return embeddings if isinstance(value, list) else embeddings[0]
+class _PretrainedEmbeddingModelMeta(type):
+    def __getattr__(cls, name: str) -> PretrainedEmbeddingModel:
+        if cls != FinetunedEmbeddingModel and name in PretrainedEmbeddingModelName.__members__:
+            return PretrainedEmbeddingModel._get(name)
+        else:
+            raise AttributeError(f"'{cls.__name__}' object has no attribute '{name}'")
+class PretrainedEmbeddingModel(_EmbeddingModel, metaclass=_PretrainedEmbeddingModelMeta):
+    """
+    A pretrained embedding model
+    **Models:**
+    OrcaCloud supports a select number of small to medium sized embedding models that perform well on the
+        [Hugging Face MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
+        These can be accessed as class attributes. We currently support:
+    - **`CDE_SMALL`**: Context-aware CDE small model from Hugging Face ([jxm/cde-small-v1](https://huggingface.co/jxm/cde-small-v1))
+    - **`CLIP_BASE`**: Multi-modal CLIP model from Hugging Face ([sentence-transformers/clip-ViT-L-14](https://huggingface.co/sentence-transformers/clip-ViT-L-14))
+    - **`GTE_BASE`**: Alibaba's GTE model from Hugging Face ([Alibaba-NLP/gte-base-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-base-en-v1.5))
+    Examples:
+        >>> PretrainedEmbeddingModel.CDE_SMALL
+        PretrainedEmbeddingModel({name: CDE_SMALL, embedding_dim: 768, max_seq_length: 512})
+    Attributes:
+        name: Name of the pretrained embedding model
+        embedding_dim: Dimension of the embeddings that are generated by the model
+        max_seq_length: Maximum input length (in tokens not characters) that this model can process. Inputs that are longer will be truncated during the embedding process
+        uses_context: Whether the pretrained embedding model uses context
+    """
+    _model_name: PretrainedEmbeddingModelName
+    def __init__(self, metadata: PretrainedEmbeddingModelMetadata):
+        # for internal use only, do not document
+        self._model_name = metadata.name
+        super().__init__(
+            name=metadata.name.value,
+            embedding_dim=metadata.embedding_dim,
+            max_seq_length=metadata.max_seq_length,
+            uses_context=metadata.uses_context,
+        )
+    def __eq__(self, other) -> bool:
+        return isinstance(other, PretrainedEmbeddingModel) and self.name == other.name
+    def __repr__(self) -> str:
+        return f"PretrainedEmbeddingModel({{name: {self.name}, embedding_dim: {self.embedding_dim}, max_seq_length: {self.max_seq_length}}})"
+    @classmethod
+    def all(cls) -> list[PretrainedEmbeddingModel]:
+        """
+        List all pretrained embedding models in the OrcaCloud
+        Returns:
+            A list of all pretrained embedding models available in the OrcaCloud
+        """
+        return [cls(metadata) for metadata in list_pretrained_embedding_models()]
+    _instances: dict[str, PretrainedEmbeddingModel] = {}
+    @classmethod
+    def _get(cls, name: PretrainedEmbeddingModelName | str) -> PretrainedEmbeddingModel:
+        # for internal use only, do not document - we want people to use dot notation to get the model
+        if str(name) not in cls._instances:
+            cls._instances[str(name)] = cls(get_pretrained_embedding_model(cast(PretrainedEmbeddingModelName, name)))
+        return cls._instances[str(name)]
+    @classmethod
+    def exists(cls, name: str) -> bool:
+        """
+        Check if a pretrained embedding model exists by name
+        Params:
+            name: The name of the pretrained embedding model
+        Returns:
+            True if the pretrained embedding model exists, False otherwise
+        """
+        return name in PretrainedEmbeddingModelName
+    def finetune(
+        self,
+        name: str,
+        train_datasource: Datasource | LabeledMemoryset,
+        *,
+        eval_datasource: Datasource | None = None,
+        label_column: str = "label",
+        value_column: str = "value",
+        training_method: EmbeddingFinetuningMethod | str = EmbeddingFinetuningMethod.CLASSIFICATION,
+        training_args: dict | None = None,
+        if_exists: CreateMode = "error",
+    ) -> FinetunedEmbeddingModel:
+        """
+        Finetune an embedding model
+        Params:
+            name: Name of the finetuned embedding model
+            train_datasource: Data to train on
+            eval_datasource: Optionally provide data to evaluate on
+            label_column: Column name of the label
+            value_column: Column name of the value
+            training_method: Training method to use
+            training_args: Optional override for Hugging Face [`TrainingArguments`](transformers.TrainingArguments).
+                If not provided, reasonable training arguments will be used for the specified training method
+            if_exists: What to do if a finetuned embedding model with the same name already exists, defaults to
+                `"error"`. Other option is `"open"` to open the existing finetuned embedding model.
+        Returns:
+            The finetuned embedding model
+        Raises:
+            ValueError: If the finetuned embedding model already exists and `if_exists` is `"error"` or if it is `"open"`
+                but the base model param does not match the existing model
+        Examples:
+            >>> datasource = Datasource.open("my_datasource")
+            >>> model = PretrainedEmbeddingModel.CLIP_BASE
+            >>> model.finetune("my_finetuned_model", datasource)
+        """
+        exists = FinetunedEmbeddingModel.exists(name)
+        if exists and if_exists == "error":
+            raise ValueError(f"Finetuned embedding model '{name}' already exists")
+        elif exists and if_exists == "open":
+            existing = FinetunedEmbeddingModel.open(name)
+            if existing.base_model_name != self._model_name:
+                raise ValueError(f"Finetuned embedding model '{name}' already exists, but with different base model")
+            return existing
+        from .labeled_memoryset import LabeledMemoryset
+        train_datasource_id = train_datasource.id if isinstance(train_datasource, Datasource) else None
+        train_memoryset_id = train_datasource.id if isinstance(train_datasource, LabeledMemoryset) else None
+        assert train_datasource_id is not None or train_memoryset_id is not None
+        res = create_finetuned_embedding_model(
+            body=FinetuneEmbeddingModelRequest(
+                name=name,
+                base_model=self._model_name,
+                train_memoryset_id=train_memoryset_id,
+                train_datasource_id=train_datasource_id,
+                eval_datasource_id=eval_datasource.id if eval_datasource is not None else None,
+                label_column=label_column,
+                value_column=value_column,
+                training_method=EmbeddingFinetuningMethod(training_method),
+                training_args=(FinetuneEmbeddingModelRequestTrainingArgs.from_dict(training_args or {})),
+            ),
+        )
+        wait_for_task(res.finetuning_task_id, description="Finetuning embedding model")
+        return FinetunedEmbeddingModel.open(res.id)
+class FinetunedEmbeddingModel(_EmbeddingModel):
+    """
+    A finetuned embedding model in the OrcaCloud
+    Attributes:
+        name: Name of the finetuned embedding model
+        embedding_dim: Dimension of the embeddings that are generated by the model
+        max_seq_length: Maximum input length (in tokens not characters) that this model can process. Inputs that are longer will be truncated during the embedding process
+        uses_context: Whether the model uses the memoryset to contextualize embeddings (acts akin to inverse document frequency in TFIDF features)
+        id: Unique identifier of the finetuned embedding model
+        base_model: Base model the finetuned embedding model was trained on
+        created_at: When the model was finetuned
+    """
+    id: str
+    created_at: datetime
+    updated_at: datetime
+    _status: TaskStatus
+    def __init__(self, metadata: FinetunedEmbeddingModelMetadata):
+        # for internal use only, do not document
+        self.id = metadata.id
+        self.created_at = metadata.created_at
+        self.updated_at = metadata.updated_at
+        self.base_model_name = metadata.base_model
+        self._status = metadata.finetuning_status
+        super().__init__(
+            name=metadata.name,
+            embedding_dim=metadata.embedding_dim,
+            max_seq_length=metadata.max_seq_length,
+            uses_context=metadata.uses_context,
+        )
+    def __eq__(self, other) -> bool:
+        return isinstance(other, FinetunedEmbeddingModel) and self.id == other.id
+    def __repr__(self) -> str:
+        return (
+            "FinetunedEmbeddingModel({\n"
+            f"    name: {self.name},\n"
+            f"    embedding_dim: {self.embedding_dim},\n"
+            f"    max_seq_length: {self.max_seq_length},\n"
+            f"    status: {self._status}\n"
+            f"    base_model: PretrainedEmbeddingModel.{self.base_model_name.value}\n"
+            "})"
+        )
+    @property
+    def base_model(self) -> PretrainedEmbeddingModel:
+        """Pretrained model the finetuned embedding model was based on"""
+        return PretrainedEmbeddingModel._get(self.base_model_name)
+    @classmethod
+    def all(cls) -> list[FinetunedEmbeddingModel]:
+        """
+        List all finetuned embedding model handles in the OrcaCloud
+        Returns:
+            A list of all finetuned embedding model handles in the OrcaCloud
+        """
+        return [cls(metadata) for metadata in list_finetuned_embedding_models()]
+    @classmethod
+    def open(cls, name: str) -> FinetunedEmbeddingModel:
+        """
+        Get a handle to a finetuned embedding model in the OrcaCloud
+        Params:
+            name: The name or unique identifier of a finetuned embedding model
+        Returns:
+            A handle to the finetuned embedding model in the OrcaCloud
+        Raises:
+            LookupError: If the finetuned embedding model does not exist
+        """
+        return cls(get_finetuned_embedding_model(name))
+    @classmethod
+    def exists(cls, name_or_id: str) -> bool:
+        """
+        Check if a finetuned embedding model with the given name or id exists.
+        Params:
+            name_or_id: The name or id of the finetuned embedding model
+        Returns:
+            True if the finetuned embedding model exists, False otherwise
+        """
+        try:
+            cls.open(name_or_id)
+            return True
+        except LookupError:
+            return False
+    @classmethod
+    def drop(cls, name_or_id: str, *, if_not_exists: DropMode = "error"):
+        """
+        Delete the finetuned embedding model from the OrcaCloud
+        Params:
+            name_or_id: The name or id of the finetuned embedding model
+        Raises:
+            LookupError: If the finetuned embedding model does not exist and `if_not_exists` is `"error"`
+        """
+        try:
+            delete_finetuned_embedding_model(name_or_id)
+        except LookupError:
+            if if_not_exists == "error":
+                raise

orca_sdk/embedding_model_test.py ADDED Viewed

@@ -0,0 +1,173 @@
+from uuid import uuid4
+import pytest
+from .datasource import Datasource
+from .embedding_model import (
+    FinetunedEmbeddingModel,
+    PretrainedEmbeddingModel,
+    PretrainedEmbeddingModelName,
+    TaskStatus,
+)
+from .labeled_memoryset import LabeledMemoryset
+def test_open_pretrained_model():
+    model = PretrainedEmbeddingModel.GTE_BASE
+    assert model is not None
+    assert isinstance(model, PretrainedEmbeddingModel)
+    assert model.name == "GTE_BASE"
+    assert model.embedding_dim == 768
+    assert model.max_seq_length == 8192
+    assert model is PretrainedEmbeddingModel.GTE_BASE
+def test_open_pretrained_model_unauthenticated(unauthenticated):
+    """Embedding with a pretrained model raises a ValueError matching "Invalid API key" under the `unauthenticated` fixture"""
+    # NOTE(review): the `unauthenticated` fixture is defined outside this file, presumably it swaps in invalid credentials - confirm
+    # NOTE(review): despite its name this test calls `embed`, just like `test_embed_text_unauthenticated`
+    with pytest.raises(ValueError, match="Invalid API key"):
+        PretrainedEmbeddingModel.GTE_BASE.embed("I love this airline")
+def test_open_pretrained_model_not_found():
+    """Fetching the name "INVALID_MODEL" through the internal `_get` is expected to raise a LookupError"""
+    with pytest.raises(LookupError):
+        PretrainedEmbeddingModel._get("INVALID_MODEL")
+def test_all_pretrained_models():
+    models = PretrainedEmbeddingModel.all()
+    assert len(models) == len(PretrainedEmbeddingModelName)
+    assert all(m.name in PretrainedEmbeddingModelName.__members__ for m in models)
+def test_embed_text():
+    embedding = PretrainedEmbeddingModel.GTE_BASE.embed("I love this airline", max_seq_length=32)
+    assert embedding is not None
+    assert isinstance(embedding, list)
+    assert len(embedding) == 768
+    assert isinstance(embedding[0], float)
+def test_embed_text_unauthenticated(unauthenticated):
+    """Embedding with `max_seq_length` set raises a ValueError matching "Invalid API key" under the `unauthenticated` fixture"""
+    # NOTE(review): the `unauthenticated` fixture is defined outside this file, presumably it swaps in invalid credentials - confirm
+    with pytest.raises(ValueError, match="Invalid API key"):
+        PretrainedEmbeddingModel.GTE_BASE.embed("I love this airline", max_seq_length=32)
+@pytest.fixture(scope="session")
+def finetuned_model(datasource) -> FinetunedEmbeddingModel:
+    return PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model", datasource)
+def test_finetune_model_with_datasource(finetuned_model: FinetunedEmbeddingModel):
+    assert finetuned_model is not None
+    assert finetuned_model.name == "test_finetuned_model"
+    assert finetuned_model.base_model == PretrainedEmbeddingModel.DISTILBERT
+    assert finetuned_model.embedding_dim == 768
+    assert finetuned_model.max_seq_length == 512
+    assert finetuned_model._status == TaskStatus.COMPLETED
+def test_finetune_model_with_memoryset(memoryset: LabeledMemoryset):
+    finetuned_model = PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model_from_memoryset", memoryset)
+    assert finetuned_model is not None
+    assert finetuned_model.name == "test_finetuned_model_from_memoryset"
+    assert finetuned_model.base_model == PretrainedEmbeddingModel.DISTILBERT
+    assert finetuned_model.embedding_dim == 768
+    assert finetuned_model.max_seq_length == 512
+    assert finetuned_model._status == TaskStatus.COMPLETED
+def test_finetune_model_already_exists_error(datasource: Datasource, finetuned_model):
+    with pytest.raises(ValueError):
+        PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model", datasource)
+    with pytest.raises(ValueError):
+        PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model", datasource, if_exists="error")
+def test_finetune_model_already_exists_return(datasource: Datasource, finetuned_model):
+    with pytest.raises(ValueError):
+        PretrainedEmbeddingModel.GTE_BASE.finetune("test_finetuned_model", datasource, if_exists="open")
+    new_model = PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model", datasource, if_exists="open")
+    assert new_model is not None
+    assert new_model.name == "test_finetuned_model"
+    assert new_model.base_model == PretrainedEmbeddingModel.DISTILBERT
+    assert new_model.embedding_dim == 768
+    assert new_model.max_seq_length == 512
+    assert new_model._status == TaskStatus.COMPLETED
+def test_finetune_model_unauthenticated(unauthenticated, datasource: Datasource):
+    """Starting a finetuning run raises a ValueError matching "Invalid API key" under the `unauthenticated` fixture"""
+    # NOTE(review): the `unauthenticated` fixture is defined outside this file, presumably it swaps in invalid credentials - confirm
+    with pytest.raises(ValueError, match="Invalid API key"):
+        PretrainedEmbeddingModel.DISTILBERT.finetune("test_finetuned_model_unauthenticated", datasource)
+def test_use_finetuned_model_in_memoryset(datasource: Datasource, finetuned_model: FinetunedEmbeddingModel):
+    memoryset = LabeledMemoryset.create(
+        "test_memoryset_finetuned_model",
+        datasource,
+        embedding_model=finetuned_model,
+        value_column="text",
+    )
+    assert memoryset is not None
+    assert memoryset.name == "test_memoryset_finetuned_model"
+    assert memoryset.embedding_model == finetuned_model
+    assert memoryset.length == datasource.length
+def test_open_finetuned_model(finetuned_model: FinetunedEmbeddingModel):
+    model = FinetunedEmbeddingModel.open(finetuned_model.name)
+    assert isinstance(model, FinetunedEmbeddingModel)
+    assert model.id == finetuned_model.id
+    assert model.name == finetuned_model.name
+    assert model.base_model == PretrainedEmbeddingModel.DISTILBERT
+    assert model.embedding_dim == 768
+    assert model.max_seq_length == 512
+    assert model == finetuned_model
+def test_embed_finetuned_model(finetuned_model: FinetunedEmbeddingModel):
+    embedding = finetuned_model.embed("I love this airline")
+    assert embedding is not None
+    assert isinstance(embedding, list)
+    assert len(embedding) == 768
+    assert isinstance(embedding[0], float)
+def test_all_finetuned_models(finetuned_model: FinetunedEmbeddingModel):
+    models = FinetunedEmbeddingModel.all()
+    assert len(models) > 0
+    assert any(model.name == finetuned_model.name for model in models)
+def test_all_finetuned_models_unauthenticated(unauthenticated):
+    """Listing finetuned models raises a ValueError matching "Invalid API key" under the `unauthenticated` fixture"""
+    # NOTE(review): the `unauthenticated` fixture is defined outside this file, presumably it swaps in invalid credentials - confirm
+    with pytest.raises(ValueError, match="Invalid API key"):
+        FinetunedEmbeddingModel.all()
+def test_all_finetuned_models_unauthorized(unauthorized, finetuned_model: FinetunedEmbeddingModel):
+    """The fixture model must not show up in the listing while the `unauthorized` fixture is active"""
+    # NOTE(review): the `unauthorized` fixture is defined outside this file, presumably it switches to credentials of a different account - confirm
+    # membership is decided by `FinetunedEmbeddingModel.__eq__`, which compares model ids
+    assert finetuned_model not in FinetunedEmbeddingModel.all()
+def test_drop_finetuned_model(datasource: Datasource):
+    PretrainedEmbeddingModel.DISTILBERT.finetune("finetuned_model_to_delete", datasource)
+    assert FinetunedEmbeddingModel.open("finetuned_model_to_delete")
+    FinetunedEmbeddingModel.drop("finetuned_model_to_delete")
+    with pytest.raises(LookupError):
+        FinetunedEmbeddingModel.open("finetuned_model_to_delete")
+def test_drop_finetuned_model_unauthenticated(unauthenticated, datasource: Datasource):
+    with pytest.raises(ValueError, match="Invalid API key"):
+        PretrainedEmbeddingModel.DISTILBERT.finetune("finetuned_model_to_delete", datasource)
+def test_drop_finetuned_model_not_found():
+    with pytest.raises(LookupError):
+        FinetunedEmbeddingModel.drop(str(uuid4()))
+    # ignores error if specified
+    FinetunedEmbeddingModel.drop(str(uuid4()), if_not_exists="ignore")
+def test_drop_finetuned_model_unauthorized(unauthorized, finetuned_model: FinetunedEmbeddingModel):
+    """Dropping the fixture model by id raises a LookupError while the `unauthorized` fixture is active"""
+    # NOTE(review): the `unauthorized` fixture is defined outside this file, presumably it switches to credentials of a different account - confirm
+    with pytest.raises(LookupError):
+        FinetunedEmbeddingModel.drop(finetuned_model.id)