eval_studio_client-0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (470)
  1. eval_studio_client/__about__.py +1 -0
  2. eval_studio_client/__init__.py +4 -0
  3. eval_studio_client/api/__init__.py +180 -0
  4. eval_studio_client/api/api/__init__.py +20 -0
  5. eval_studio_client/api/api/dashboard_service_api.py +2142 -0
  6. eval_studio_client/api/api/document_service_api.py +1868 -0
  7. eval_studio_client/api/api/evaluation_service_api.py +1603 -0
  8. eval_studio_client/api/api/evaluator_service_api.py +1343 -0
  9. eval_studio_client/api/api/info_service_api.py +275 -0
  10. eval_studio_client/api/api/leaderboard_service_api.py +3336 -0
  11. eval_studio_client/api/api/model_service_api.py +2913 -0
  12. eval_studio_client/api/api/operation_progress_service_api.py +292 -0
  13. eval_studio_client/api/api/operation_service_api.py +1359 -0
  14. eval_studio_client/api/api/perturbation_service_api.py +321 -0
  15. eval_studio_client/api/api/perturbator_service_api.py +532 -0
  16. eval_studio_client/api/api/test_case_service_api.py +1913 -0
  17. eval_studio_client/api/api/test_class_service_api.py +532 -0
  18. eval_studio_client/api/api/test_lab_service_api.py +634 -0
  19. eval_studio_client/api/api/test_service_api.py +2712 -0
  20. eval_studio_client/api/api/who_am_i_service_api.py +275 -0
  21. eval_studio_client/api/api_client.py +770 -0
  22. eval_studio_client/api/api_response.py +21 -0
  23. eval_studio_client/api/configuration.py +436 -0
  24. eval_studio_client/api/docs/DashboardServiceApi.md +549 -0
  25. eval_studio_client/api/docs/DocumentServiceApi.md +478 -0
  26. eval_studio_client/api/docs/EvaluationServiceApi.md +332 -0
  27. eval_studio_client/api/docs/EvaluatorServiceApi.md +345 -0
  28. eval_studio_client/api/docs/InfoServiceApi.md +71 -0
  29. eval_studio_client/api/docs/LeaderboardServiceApi.md +835 -0
  30. eval_studio_client/api/docs/ModelServiceApi.md +750 -0
  31. eval_studio_client/api/docs/OperationProgressServiceApi.md +75 -0
  32. eval_studio_client/api/docs/OperationServiceApi.md +345 -0
  33. eval_studio_client/api/docs/PerturbationServiceApi.md +78 -0
  34. eval_studio_client/api/docs/PerturbationServiceCreatePerturbationRequest.md +31 -0
  35. eval_studio_client/api/docs/PerturbatorServiceApi.md +138 -0
  36. eval_studio_client/api/docs/ProtobufAny.md +30 -0
  37. eval_studio_client/api/docs/RequiredTheDashboardToUpdate.md +41 -0
  38. eval_studio_client/api/docs/RequiredTheDocumentToUpdate.md +38 -0
  39. eval_studio_client/api/docs/RequiredTheLeaderboardToUpdate.md +54 -0
  40. eval_studio_client/api/docs/RequiredTheModelToUpdate.md +41 -0
  41. eval_studio_client/api/docs/RequiredTheOperationToFinalize.md +39 -0
  42. eval_studio_client/api/docs/RequiredTheOperationToUpdate.md +39 -0
  43. eval_studio_client/api/docs/RequiredTheTestCaseToUpdate.md +39 -0
  44. eval_studio_client/api/docs/RequiredTheTestToUpdate.md +39 -0
  45. eval_studio_client/api/docs/RpcStatus.md +32 -0
  46. eval_studio_client/api/docs/TestCaseServiceApi.md +486 -0
  47. eval_studio_client/api/docs/TestCaseServiceBatchDeleteTestCasesRequest.md +29 -0
  48. eval_studio_client/api/docs/TestClassServiceApi.md +138 -0
  49. eval_studio_client/api/docs/TestLabServiceApi.md +151 -0
  50. eval_studio_client/api/docs/TestServiceApi.md +689 -0
  51. eval_studio_client/api/docs/TestServicePerturbTestRequest.md +31 -0
  52. eval_studio_client/api/docs/V1alphaBatchCreateLeaderboardsRequest.md +31 -0
  53. eval_studio_client/api/docs/V1alphaBatchCreateLeaderboardsResponse.md +29 -0
  54. eval_studio_client/api/docs/V1alphaBatchDeleteDashboardsRequest.md +29 -0
  55. eval_studio_client/api/docs/V1alphaBatchDeleteDashboardsResponse.md +29 -0
  56. eval_studio_client/api/docs/V1alphaBatchDeleteDocumentsRequest.md +29 -0
  57. eval_studio_client/api/docs/V1alphaBatchDeleteDocumentsResponse.md +29 -0
  58. eval_studio_client/api/docs/V1alphaBatchDeleteEvaluatorsRequest.md +29 -0
  59. eval_studio_client/api/docs/V1alphaBatchDeleteEvaluatorsResponse.md +29 -0
  60. eval_studio_client/api/docs/V1alphaBatchDeleteLeaderboardsRequest.md +30 -0
  61. eval_studio_client/api/docs/V1alphaBatchDeleteLeaderboardsResponse.md +29 -0
  62. eval_studio_client/api/docs/V1alphaBatchDeleteModelsRequest.md +29 -0
  63. eval_studio_client/api/docs/V1alphaBatchDeleteModelsResponse.md +29 -0
  64. eval_studio_client/api/docs/V1alphaBatchDeleteTestCasesResponse.md +29 -0
  65. eval_studio_client/api/docs/V1alphaBatchDeleteTestsRequest.md +30 -0
  66. eval_studio_client/api/docs/V1alphaBatchDeleteTestsResponse.md +29 -0
  67. eval_studio_client/api/docs/V1alphaBatchGetDashboardsResponse.md +29 -0
  68. eval_studio_client/api/docs/V1alphaBatchGetDocumentsResponse.md +29 -0
  69. eval_studio_client/api/docs/V1alphaBatchGetLeaderboardsResponse.md +29 -0
  70. eval_studio_client/api/docs/V1alphaBatchGetModelsResponse.md +29 -0
  71. eval_studio_client/api/docs/V1alphaBatchGetOperationsResponse.md +29 -0
  72. eval_studio_client/api/docs/V1alphaBatchGetTestsResponse.md +29 -0
  73. eval_studio_client/api/docs/V1alphaBatchImportLeaderboardRequest.md +37 -0
  74. eval_studio_client/api/docs/V1alphaBatchImportLeaderboardResponse.md +29 -0
  75. eval_studio_client/api/docs/V1alphaBatchImportTestsRequest.md +32 -0
  76. eval_studio_client/api/docs/V1alphaBatchImportTestsResponse.md +29 -0
  77. eval_studio_client/api/docs/V1alphaCheckBaseModelsResponse.md +30 -0
  78. eval_studio_client/api/docs/V1alphaCollectionInfo.md +33 -0
  79. eval_studio_client/api/docs/V1alphaCreateDashboardResponse.md +29 -0
  80. eval_studio_client/api/docs/V1alphaCreateDocumentResponse.md +29 -0
  81. eval_studio_client/api/docs/V1alphaCreateEvaluationRequest.md +37 -0
  82. eval_studio_client/api/docs/V1alphaCreateEvaluatorResponse.md +29 -0
  83. eval_studio_client/api/docs/V1alphaCreateLeaderboardRequest.md +29 -0
  84. eval_studio_client/api/docs/V1alphaCreateLeaderboardResponse.md +29 -0
  85. eval_studio_client/api/docs/V1alphaCreateLeaderboardWithoutCacheResponse.md +29 -0
  86. eval_studio_client/api/docs/V1alphaCreateModelResponse.md +29 -0
  87. eval_studio_client/api/docs/V1alphaCreatePerturbationResponse.md +29 -0
  88. eval_studio_client/api/docs/V1alphaCreateTestCaseResponse.md +29 -0
  89. eval_studio_client/api/docs/V1alphaCreateTestLabResponse.md +29 -0
  90. eval_studio_client/api/docs/V1alphaCreateTestResponse.md +29 -0
  91. eval_studio_client/api/docs/V1alphaDashboard.md +41 -0
  92. eval_studio_client/api/docs/V1alphaDashboardStatus.md +12 -0
  93. eval_studio_client/api/docs/V1alphaDeleteDashboardResponse.md +29 -0
  94. eval_studio_client/api/docs/V1alphaDeleteDocumentResponse.md +29 -0
  95. eval_studio_client/api/docs/V1alphaDeleteEvaluatorResponse.md +29 -0
  96. eval_studio_client/api/docs/V1alphaDeleteLeaderboardResponse.md +29 -0
  97. eval_studio_client/api/docs/V1alphaDeleteModelResponse.md +29 -0
  98. eval_studio_client/api/docs/V1alphaDeleteTestCaseResponse.md +29 -0
  99. eval_studio_client/api/docs/V1alphaDeleteTestResponse.md +29 -0
  100. eval_studio_client/api/docs/V1alphaDocument.md +38 -0
  101. eval_studio_client/api/docs/V1alphaEvaluationTest.md +32 -0
  102. eval_studio_client/api/docs/V1alphaEvaluator.md +45 -0
  103. eval_studio_client/api/docs/V1alphaEvaluatorParamType.md +12 -0
  104. eval_studio_client/api/docs/V1alphaEvaluatorParameter.md +40 -0
  105. eval_studio_client/api/docs/V1alphaEvaluatorView.md +12 -0
  106. eval_studio_client/api/docs/V1alphaFinalizeOperationResponse.md +29 -0
  107. eval_studio_client/api/docs/V1alphaFindAllTestCasesByIDResponse.md +29 -0
  108. eval_studio_client/api/docs/V1alphaFindTestLabResponse.md +29 -0
  109. eval_studio_client/api/docs/V1alphaGetDashboardResponse.md +29 -0
  110. eval_studio_client/api/docs/V1alphaGetDocumentResponse.md +29 -0
  111. eval_studio_client/api/docs/V1alphaGetEvaluatorResponse.md +29 -0
  112. eval_studio_client/api/docs/V1alphaGetInfoResponse.md +29 -0
  113. eval_studio_client/api/docs/V1alphaGetLeaderboardResponse.md +29 -0
  114. eval_studio_client/api/docs/V1alphaGetModelResponse.md +29 -0
  115. eval_studio_client/api/docs/V1alphaGetOperationProgressByParentResponse.md +29 -0
  116. eval_studio_client/api/docs/V1alphaGetOperationResponse.md +29 -0
  117. eval_studio_client/api/docs/V1alphaGetPerturbatorResponse.md +29 -0
  118. eval_studio_client/api/docs/V1alphaGetTestCaseResponse.md +29 -0
  119. eval_studio_client/api/docs/V1alphaGetTestClassResponse.md +29 -0
  120. eval_studio_client/api/docs/V1alphaGetTestResponse.md +29 -0
  121. eval_studio_client/api/docs/V1alphaImportEvaluationRequest.md +33 -0
  122. eval_studio_client/api/docs/V1alphaImportLeaderboardRequest.md +37 -0
  123. eval_studio_client/api/docs/V1alphaImportLeaderboardResponse.md +29 -0
  124. eval_studio_client/api/docs/V1alphaInfo.md +35 -0
  125. eval_studio_client/api/docs/V1alphaInsight.md +40 -0
  126. eval_studio_client/api/docs/V1alphaLeaderboard.md +54 -0
  127. eval_studio_client/api/docs/V1alphaLeaderboardStatus.md +12 -0
  128. eval_studio_client/api/docs/V1alphaLeaderboardType.md +12 -0
  129. eval_studio_client/api/docs/V1alphaLeaderboardView.md +12 -0
  130. eval_studio_client/api/docs/V1alphaListBaseModelsResponse.md +29 -0
  131. eval_studio_client/api/docs/V1alphaListDashboardsResponse.md +29 -0
  132. eval_studio_client/api/docs/V1alphaListDocumentsResponse.md +29 -0
  133. eval_studio_client/api/docs/V1alphaListEvaluatorsResponse.md +29 -0
  134. eval_studio_client/api/docs/V1alphaListLLMModelsResponse.md +29 -0
  135. eval_studio_client/api/docs/V1alphaListLeaderboardsResponse.md +30 -0
  136. eval_studio_client/api/docs/V1alphaListModelCollectionsResponse.md +29 -0
  137. eval_studio_client/api/docs/V1alphaListModelsResponse.md +29 -0
  138. eval_studio_client/api/docs/V1alphaListMostRecentDashboardsResponse.md +29 -0
  139. eval_studio_client/api/docs/V1alphaListMostRecentLeaderboardsResponse.md +29 -0
  140. eval_studio_client/api/docs/V1alphaListMostRecentModelsResponse.md +29 -0
  141. eval_studio_client/api/docs/V1alphaListMostRecentTestsResponse.md +29 -0
  142. eval_studio_client/api/docs/V1alphaListOperationsResponse.md +29 -0
  143. eval_studio_client/api/docs/V1alphaListPerturbatorsResponse.md +29 -0
  144. eval_studio_client/api/docs/V1alphaListRAGCollectionsResponse.md +29 -0
  145. eval_studio_client/api/docs/V1alphaListTestCasesResponse.md +29 -0
  146. eval_studio_client/api/docs/V1alphaListTestClassesResponse.md +29 -0
  147. eval_studio_client/api/docs/V1alphaListTestsResponse.md +29 -0
  148. eval_studio_client/api/docs/V1alphaModel.md +42 -0
  149. eval_studio_client/api/docs/V1alphaModelType.md +12 -0
  150. eval_studio_client/api/docs/V1alphaOperation.md +40 -0
  151. eval_studio_client/api/docs/V1alphaOperationProgress.md +32 -0
  152. eval_studio_client/api/docs/V1alphaPerturbTestResponse.md +29 -0
  153. eval_studio_client/api/docs/V1alphaPerturbator.md +39 -0
  154. eval_studio_client/api/docs/V1alphaPerturbatorConfiguration.md +32 -0
  155. eval_studio_client/api/docs/V1alphaPerturbatorIntensity.md +11 -0
  156. eval_studio_client/api/docs/V1alphaProblemAndAction.md +39 -0
  157. eval_studio_client/api/docs/V1alphaTest.md +40 -0
  158. eval_studio_client/api/docs/V1alphaTestCase.md +40 -0
  159. eval_studio_client/api/docs/V1alphaTestCaseRelationship.md +31 -0
  160. eval_studio_client/api/docs/V1alphaTestClass.md +41 -0
  161. eval_studio_client/api/docs/V1alphaTestClassType.md +12 -0
  162. eval_studio_client/api/docs/V1alphaTestLab.md +41 -0
  163. eval_studio_client/api/docs/V1alphaUpdateDashboardResponse.md +29 -0
  164. eval_studio_client/api/docs/V1alphaUpdateDocumentResponse.md +29 -0
  165. eval_studio_client/api/docs/V1alphaUpdateLeaderboardResponse.md +29 -0
  166. eval_studio_client/api/docs/V1alphaUpdateModelResponse.md +29 -0
  167. eval_studio_client/api/docs/V1alphaUpdateOperationResponse.md +29 -0
  168. eval_studio_client/api/docs/V1alphaUpdateTestCaseResponse.md +29 -0
  169. eval_studio_client/api/docs/V1alphaUpdateTestResponse.md +29 -0
  170. eval_studio_client/api/docs/V1alphaWhoAmIResponse.md +31 -0
  171. eval_studio_client/api/docs/WhoAmIServiceApi.md +72 -0
  172. eval_studio_client/api/exceptions.py +199 -0
  173. eval_studio_client/api/models/__init__.py +148 -0
  174. eval_studio_client/api/models/perturbation_service_create_perturbation_request.py +115 -0
  175. eval_studio_client/api/models/protobuf_any.py +100 -0
  176. eval_studio_client/api/models/required_the_dashboard_to_update.py +127 -0
  177. eval_studio_client/api/models/required_the_document_to_update.py +116 -0
  178. eval_studio_client/api/models/required_the_leaderboard_to_update.py +178 -0
  179. eval_studio_client/api/models/required_the_model_to_update.py +127 -0
  180. eval_studio_client/api/models/required_the_operation_to_finalize.py +129 -0
  181. eval_studio_client/api/models/required_the_operation_to_update.py +129 -0
  182. eval_studio_client/api/models/required_the_test_case_to_update.py +120 -0
  183. eval_studio_client/api/models/required_the_test_to_update.py +122 -0
  184. eval_studio_client/api/models/rpc_status.py +99 -0
  185. eval_studio_client/api/models/test_case_service_batch_delete_test_cases_request.py +87 -0
  186. eval_studio_client/api/models/test_service_perturb_test_request.py +99 -0
  187. eval_studio_client/api/models/v1alpha_batch_create_leaderboards_request.py +99 -0
  188. eval_studio_client/api/models/v1alpha_batch_create_leaderboards_response.py +91 -0
  189. eval_studio_client/api/models/v1alpha_batch_delete_dashboards_request.py +87 -0
  190. eval_studio_client/api/models/v1alpha_batch_delete_dashboards_response.py +95 -0
  191. eval_studio_client/api/models/v1alpha_batch_delete_documents_request.py +87 -0
  192. eval_studio_client/api/models/v1alpha_batch_delete_documents_response.py +95 -0
  193. eval_studio_client/api/models/v1alpha_batch_delete_evaluators_request.py +87 -0
  194. eval_studio_client/api/models/v1alpha_batch_delete_evaluators_response.py +95 -0
  195. eval_studio_client/api/models/v1alpha_batch_delete_leaderboards_request.py +90 -0
  196. eval_studio_client/api/models/v1alpha_batch_delete_leaderboards_response.py +95 -0
  197. eval_studio_client/api/models/v1alpha_batch_delete_models_request.py +87 -0
  198. eval_studio_client/api/models/v1alpha_batch_delete_models_response.py +95 -0
  199. eval_studio_client/api/models/v1alpha_batch_delete_test_cases_response.py +95 -0
  200. eval_studio_client/api/models/v1alpha_batch_delete_tests_request.py +89 -0
  201. eval_studio_client/api/models/v1alpha_batch_delete_tests_response.py +95 -0
  202. eval_studio_client/api/models/v1alpha_batch_get_dashboards_response.py +95 -0
  203. eval_studio_client/api/models/v1alpha_batch_get_documents_response.py +95 -0
  204. eval_studio_client/api/models/v1alpha_batch_get_leaderboards_response.py +95 -0
  205. eval_studio_client/api/models/v1alpha_batch_get_models_response.py +95 -0
  206. eval_studio_client/api/models/v1alpha_batch_get_operations_response.py +95 -0
  207. eval_studio_client/api/models/v1alpha_batch_get_tests_response.py +95 -0
  208. eval_studio_client/api/models/v1alpha_batch_import_leaderboard_request.py +104 -0
  209. eval_studio_client/api/models/v1alpha_batch_import_leaderboard_response.py +91 -0
  210. eval_studio_client/api/models/v1alpha_batch_import_tests_request.py +93 -0
  211. eval_studio_client/api/models/v1alpha_batch_import_tests_response.py +95 -0
  212. eval_studio_client/api/models/v1alpha_check_base_models_response.py +89 -0
  213. eval_studio_client/api/models/v1alpha_collection_info.py +93 -0
  214. eval_studio_client/api/models/v1alpha_create_dashboard_response.py +91 -0
  215. eval_studio_client/api/models/v1alpha_create_document_response.py +91 -0
  216. eval_studio_client/api/models/v1alpha_create_evaluation_request.py +115 -0
  217. eval_studio_client/api/models/v1alpha_create_evaluator_response.py +91 -0
  218. eval_studio_client/api/models/v1alpha_create_leaderboard_request.py +91 -0
  219. eval_studio_client/api/models/v1alpha_create_leaderboard_response.py +91 -0
  220. eval_studio_client/api/models/v1alpha_create_leaderboard_without_cache_response.py +91 -0
  221. eval_studio_client/api/models/v1alpha_create_model_response.py +91 -0
  222. eval_studio_client/api/models/v1alpha_create_perturbation_response.py +87 -0
  223. eval_studio_client/api/models/v1alpha_create_test_case_response.py +91 -0
  224. eval_studio_client/api/models/v1alpha_create_test_lab_response.py +91 -0
  225. eval_studio_client/api/models/v1alpha_create_test_response.py +91 -0
  226. eval_studio_client/api/models/v1alpha_dashboard.py +131 -0
  227. eval_studio_client/api/models/v1alpha_dashboard_status.py +39 -0
  228. eval_studio_client/api/models/v1alpha_delete_dashboard_response.py +91 -0
  229. eval_studio_client/api/models/v1alpha_delete_document_response.py +91 -0
  230. eval_studio_client/api/models/v1alpha_delete_evaluator_response.py +91 -0
  231. eval_studio_client/api/models/v1alpha_delete_leaderboard_response.py +91 -0
  232. eval_studio_client/api/models/v1alpha_delete_model_response.py +91 -0
  233. eval_studio_client/api/models/v1alpha_delete_test_case_response.py +91 -0
  234. eval_studio_client/api/models/v1alpha_delete_test_response.py +91 -0
  235. eval_studio_client/api/models/v1alpha_document.py +120 -0
  236. eval_studio_client/api/models/v1alpha_evaluation_test.py +107 -0
  237. eval_studio_client/api/models/v1alpha_evaluator.py +155 -0
  238. eval_studio_client/api/models/v1alpha_evaluator_param_type.py +42 -0
  239. eval_studio_client/api/models/v1alpha_evaluator_parameter.py +126 -0
  240. eval_studio_client/api/models/v1alpha_evaluator_view.py +38 -0
  241. eval_studio_client/api/models/v1alpha_finalize_operation_response.py +91 -0
  242. eval_studio_client/api/models/v1alpha_find_all_test_cases_by_id_response.py +95 -0
  243. eval_studio_client/api/models/v1alpha_find_test_lab_response.py +91 -0
  244. eval_studio_client/api/models/v1alpha_get_dashboard_response.py +91 -0
  245. eval_studio_client/api/models/v1alpha_get_document_response.py +91 -0
  246. eval_studio_client/api/models/v1alpha_get_evaluator_response.py +91 -0
  247. eval_studio_client/api/models/v1alpha_get_info_response.py +91 -0
  248. eval_studio_client/api/models/v1alpha_get_leaderboard_response.py +91 -0
  249. eval_studio_client/api/models/v1alpha_get_model_response.py +91 -0
  250. eval_studio_client/api/models/v1alpha_get_operation_progress_by_parent_response.py +91 -0
  251. eval_studio_client/api/models/v1alpha_get_operation_response.py +91 -0
  252. eval_studio_client/api/models/v1alpha_get_perturbator_response.py +91 -0
  253. eval_studio_client/api/models/v1alpha_get_test_case_response.py +91 -0
  254. eval_studio_client/api/models/v1alpha_get_test_class_response.py +91 -0
  255. eval_studio_client/api/models/v1alpha_get_test_response.py +91 -0
  256. eval_studio_client/api/models/v1alpha_import_evaluation_request.py +99 -0
  257. eval_studio_client/api/models/v1alpha_import_leaderboard_request.py +104 -0
  258. eval_studio_client/api/models/v1alpha_import_leaderboard_response.py +91 -0
  259. eval_studio_client/api/models/v1alpha_info.py +99 -0
  260. eval_studio_client/api/models/v1alpha_insight.py +107 -0
  261. eval_studio_client/api/models/v1alpha_leaderboard.py +182 -0
  262. eval_studio_client/api/models/v1alpha_leaderboard_status.py +39 -0
  263. eval_studio_client/api/models/v1alpha_leaderboard_type.py +39 -0
  264. eval_studio_client/api/models/v1alpha_leaderboard_view.py +39 -0
  265. eval_studio_client/api/models/v1alpha_list_base_models_response.py +87 -0
  266. eval_studio_client/api/models/v1alpha_list_dashboards_response.py +95 -0
  267. eval_studio_client/api/models/v1alpha_list_documents_response.py +95 -0
  268. eval_studio_client/api/models/v1alpha_list_evaluators_response.py +95 -0
  269. eval_studio_client/api/models/v1alpha_list_leaderboards_response.py +97 -0
  270. eval_studio_client/api/models/v1alpha_list_llm_models_response.py +87 -0
  271. eval_studio_client/api/models/v1alpha_list_model_collections_response.py +95 -0
  272. eval_studio_client/api/models/v1alpha_list_models_response.py +95 -0
  273. eval_studio_client/api/models/v1alpha_list_most_recent_dashboards_response.py +95 -0
  274. eval_studio_client/api/models/v1alpha_list_most_recent_leaderboards_response.py +95 -0
  275. eval_studio_client/api/models/v1alpha_list_most_recent_models_response.py +95 -0
  276. eval_studio_client/api/models/v1alpha_list_most_recent_tests_response.py +95 -0
  277. eval_studio_client/api/models/v1alpha_list_operations_response.py +95 -0
  278. eval_studio_client/api/models/v1alpha_list_perturbators_response.py +95 -0
  279. eval_studio_client/api/models/v1alpha_list_rag_collections_response.py +95 -0
  280. eval_studio_client/api/models/v1alpha_list_test_cases_response.py +95 -0
  281. eval_studio_client/api/models/v1alpha_list_test_classes_response.py +95 -0
  282. eval_studio_client/api/models/v1alpha_list_tests_response.py +95 -0
  283. eval_studio_client/api/models/v1alpha_model.py +131 -0
  284. eval_studio_client/api/models/v1alpha_model_type.py +46 -0
  285. eval_studio_client/api/models/v1alpha_operation.py +133 -0
  286. eval_studio_client/api/models/v1alpha_operation_progress.py +99 -0
  287. eval_studio_client/api/models/v1alpha_perturb_test_response.py +91 -0
  288. eval_studio_client/api/models/v1alpha_perturbator.py +122 -0
  289. eval_studio_client/api/models/v1alpha_perturbator_configuration.py +92 -0
  290. eval_studio_client/api/models/v1alpha_perturbator_intensity.py +39 -0
  291. eval_studio_client/api/models/v1alpha_problem_and_action.py +129 -0
  292. eval_studio_client/api/models/v1alpha_test.py +126 -0
  293. eval_studio_client/api/models/v1alpha_test_case.py +124 -0
  294. eval_studio_client/api/models/v1alpha_test_case_relationship.py +91 -0
  295. eval_studio_client/api/models/v1alpha_test_class.py +127 -0
  296. eval_studio_client/api/models/v1alpha_test_class_type.py +42 -0
  297. eval_studio_client/api/models/v1alpha_test_lab.py +137 -0
  298. eval_studio_client/api/models/v1alpha_update_dashboard_response.py +91 -0
  299. eval_studio_client/api/models/v1alpha_update_document_response.py +91 -0
  300. eval_studio_client/api/models/v1alpha_update_leaderboard_response.py +91 -0
  301. eval_studio_client/api/models/v1alpha_update_model_response.py +91 -0
  302. eval_studio_client/api/models/v1alpha_update_operation_response.py +91 -0
  303. eval_studio_client/api/models/v1alpha_update_test_case_response.py +91 -0
  304. eval_studio_client/api/models/v1alpha_update_test_response.py +91 -0
  305. eval_studio_client/api/models/v1alpha_who_am_i_response.py +91 -0
  306. eval_studio_client/api/rest.py +257 -0
  307. eval_studio_client/api/test/__init__.py +0 -0
  308. eval_studio_client/api/test/test_dashboard_service_api.py +79 -0
  309. eval_studio_client/api/test/test_document_service_api.py +73 -0
  310. eval_studio_client/api/test/test_evaluation_service_api.py +55 -0
  311. eval_studio_client/api/test/test_evaluator_service_api.py +61 -0
  312. eval_studio_client/api/test/test_info_service_api.py +37 -0
  313. eval_studio_client/api/test/test_leaderboard_service_api.py +103 -0
  314. eval_studio_client/api/test/test_model_service_api.py +97 -0
  315. eval_studio_client/api/test/test_operation_progress_service_api.py +37 -0
  316. eval_studio_client/api/test/test_operation_service_api.py +61 -0
  317. eval_studio_client/api/test/test_perturbation_service_api.py +37 -0
  318. eval_studio_client/api/test/test_perturbation_service_create_perturbation_request.py +79 -0
  319. eval_studio_client/api/test/test_perturbator_service_api.py +43 -0
  320. eval_studio_client/api/test/test_protobuf_any.py +51 -0
  321. eval_studio_client/api/test/test_required_the_dashboard_to_update.py +64 -0
  322. eval_studio_client/api/test/test_required_the_document_to_update.py +59 -0
  323. eval_studio_client/api/test/test_required_the_leaderboard_to_update.py +115 -0
  324. eval_studio_client/api/test/test_required_the_model_to_update.py +63 -0
  325. eval_studio_client/api/test/test_required_the_operation_to_finalize.py +71 -0
  326. eval_studio_client/api/test/test_required_the_operation_to_update.py +71 -0
  327. eval_studio_client/api/test/test_required_the_test_case_to_update.py +63 -0
  328. eval_studio_client/api/test/test_required_the_test_to_update.py +65 -0
  329. eval_studio_client/api/test/test_rpc_status.py +57 -0
  330. eval_studio_client/api/test/test_test_case_service_api.py +73 -0
  331. eval_studio_client/api/test/test_test_case_service_batch_delete_test_cases_request.py +53 -0
  332. eval_studio_client/api/test/test_test_class_service_api.py +43 -0
  333. eval_studio_client/api/test/test_test_lab_service_api.py +43 -0
  334. eval_studio_client/api/test/test_test_service_api.py +91 -0
  335. eval_studio_client/api/test/test_test_service_perturb_test_request.py +58 -0
  336. eval_studio_client/api/test/test_v1alpha_batch_create_leaderboards_request.py +119 -0
  337. eval_studio_client/api/test/test_v1alpha_batch_create_leaderboards_response.py +71 -0
  338. eval_studio_client/api/test/test_v1alpha_batch_delete_dashboards_request.py +53 -0
  339. eval_studio_client/api/test/test_v1alpha_batch_delete_dashboards_response.py +68 -0
  340. eval_studio_client/api/test/test_v1alpha_batch_delete_documents_request.py +53 -0
  341. eval_studio_client/api/test/test_v1alpha_batch_delete_documents_response.py +63 -0
  342. eval_studio_client/api/test/test_v1alpha_batch_delete_evaluators_request.py +53 -0
  343. eval_studio_client/api/test/test_v1alpha_batch_delete_evaluators_response.py +91 -0
  344. eval_studio_client/api/test/test_v1alpha_batch_delete_leaderboards_request.py +54 -0
  345. eval_studio_client/api/test/test_v1alpha_batch_delete_leaderboards_response.py +116 -0
  346. eval_studio_client/api/test/test_v1alpha_batch_delete_models_request.py +53 -0
  347. eval_studio_client/api/test/test_v1alpha_batch_delete_models_response.py +67 -0
  348. eval_studio_client/api/test/test_v1alpha_batch_delete_test_cases_response.py +67 -0
  349. eval_studio_client/api/test/test_v1alpha_batch_delete_tests_request.py +54 -0
  350. eval_studio_client/api/test/test_v1alpha_batch_delete_tests_response.py +69 -0
  351. eval_studio_client/api/test/test_v1alpha_batch_get_dashboards_response.py +68 -0
  352. eval_studio_client/api/test/test_v1alpha_batch_get_documents_response.py +63 -0
  353. eval_studio_client/api/test/test_v1alpha_batch_get_leaderboards_response.py +116 -0
  354. eval_studio_client/api/test/test_v1alpha_batch_get_models_response.py +67 -0
  355. eval_studio_client/api/test/test_v1alpha_batch_get_operations_response.py +73 -0
  356. eval_studio_client/api/test/test_v1alpha_batch_get_tests_response.py +69 -0
  357. eval_studio_client/api/test/test_v1alpha_batch_import_leaderboard_request.py +61 -0
  358. eval_studio_client/api/test/test_v1alpha_batch_import_leaderboard_response.py +71 -0
  359. eval_studio_client/api/test/test_v1alpha_batch_import_tests_request.py +54 -0
  360. eval_studio_client/api/test/test_v1alpha_batch_import_tests_response.py +69 -0
  361. eval_studio_client/api/test/test_v1alpha_check_base_models_response.py +52 -0
  362. eval_studio_client/api/test/test_v1alpha_collection_info.py +54 -0
  363. eval_studio_client/api/test/test_v1alpha_create_dashboard_response.py +66 -0
  364. eval_studio_client/api/test/test_v1alpha_create_document_response.py +61 -0
  365. eval_studio_client/api/test/test_v1alpha_create_evaluation_request.py +107 -0
  366. eval_studio_client/api/test/test_v1alpha_create_evaluator_response.py +89 -0
  367. eval_studio_client/api/test/test_v1alpha_create_leaderboard_request.py +114 -0
  368. eval_studio_client/api/test/test_v1alpha_create_leaderboard_response.py +71 -0
  369. eval_studio_client/api/test/test_v1alpha_create_leaderboard_without_cache_response.py +71 -0
  370. eval_studio_client/api/test/test_v1alpha_create_model_response.py +65 -0
  371. eval_studio_client/api/test/test_v1alpha_create_perturbation_response.py +51 -0
  372. eval_studio_client/api/test/test_v1alpha_create_test_case_response.py +65 -0
  373. eval_studio_client/api/test/test_v1alpha_create_test_lab_response.py +68 -0
  374. eval_studio_client/api/test/test_v1alpha_create_test_response.py +67 -0
  375. eval_studio_client/api/test/test_v1alpha_dashboard.py +65 -0
  376. eval_studio_client/api/test/test_v1alpha_dashboard_status.py +33 -0
  377. eval_studio_client/api/test/test_v1alpha_delete_dashboard_response.py +66 -0
  378. eval_studio_client/api/test/test_v1alpha_delete_document_response.py +61 -0
  379. eval_studio_client/api/test/test_v1alpha_delete_evaluator_response.py +89 -0
  380. eval_studio_client/api/test/test_v1alpha_delete_leaderboard_response.py +114 -0
  381. eval_studio_client/api/test/test_v1alpha_delete_model_response.py +65 -0
  382. eval_studio_client/api/test/test_v1alpha_delete_test_case_response.py +65 -0
  383. eval_studio_client/api/test/test_v1alpha_delete_test_response.py +67 -0
  384. eval_studio_client/api/test/test_v1alpha_document.py +60 -0
  385. eval_studio_client/api/test/test_v1alpha_evaluation_test.py +76 -0
  386. eval_studio_client/api/test/test_v1alpha_evaluator.py +91 -0
  387. eval_studio_client/api/test/test_v1alpha_evaluator_param_type.py +33 -0
  388. eval_studio_client/api/test/test_v1alpha_evaluator_parameter.py +68 -0
  389. eval_studio_client/api/test/test_v1alpha_evaluator_view.py +33 -0
  390. eval_studio_client/api/test/test_v1alpha_finalize_operation_response.py +71 -0
  391. eval_studio_client/api/test/test_v1alpha_find_all_test_cases_by_id_response.py +67 -0
  392. eval_studio_client/api/test/test_v1alpha_find_test_lab_response.py +68 -0
  393. eval_studio_client/api/test/test_v1alpha_get_dashboard_response.py +66 -0
  394. eval_studio_client/api/test/test_v1alpha_get_document_response.py +61 -0
  395. eval_studio_client/api/test/test_v1alpha_get_evaluator_response.py +89 -0
  396. eval_studio_client/api/test/test_v1alpha_get_info_response.py +60 -0
  397. eval_studio_client/api/test/test_v1alpha_get_leaderboard_response.py +114 -0
  398. eval_studio_client/api/test/test_v1alpha_get_model_response.py +65 -0
  399. eval_studio_client/api/test/test_v1alpha_get_operation_progress_by_parent_response.py +55 -0
  400. eval_studio_client/api/test/test_v1alpha_get_operation_response.py +71 -0
  401. eval_studio_client/api/test/test_v1alpha_get_perturbator_response.py +64 -0
  402. eval_studio_client/api/test/test_v1alpha_get_test_case_response.py +65 -0
  403. eval_studio_client/api/test/test_v1alpha_get_test_class_response.py +70 -0
  404. eval_studio_client/api/test/test_v1alpha_get_test_response.py +67 -0
  405. eval_studio_client/api/test/test_v1alpha_import_evaluation_request.py +73 -0
  406. eval_studio_client/api/test/test_v1alpha_import_leaderboard_request.py +59 -0
  407. eval_studio_client/api/test/test_v1alpha_import_leaderboard_response.py +71 -0
  408. eval_studio_client/api/test/test_v1alpha_info.py +59 -0
  409. eval_studio_client/api/test/test_v1alpha_insight.py +67 -0
  410. eval_studio_client/api/test/test_v1alpha_leaderboard.py +116 -0
  411. eval_studio_client/api/test/test_v1alpha_leaderboard_status.py +33 -0
  412. eval_studio_client/api/test/test_v1alpha_leaderboard_type.py +33 -0
  413. eval_studio_client/api/test/test_v1alpha_leaderboard_view.py +33 -0
  414. eval_studio_client/api/test/test_v1alpha_list_base_models_response.py +53 -0
  415. eval_studio_client/api/test/test_v1alpha_list_dashboards_response.py +68 -0
  416. eval_studio_client/api/test/test_v1alpha_list_documents_response.py +63 -0
  417. eval_studio_client/api/test/test_v1alpha_list_evaluators_response.py +91 -0
  418. eval_studio_client/api/test/test_v1alpha_list_leaderboards_response.py +117 -0
  419. eval_studio_client/api/test/test_v1alpha_list_llm_models_response.py +53 -0
  420. eval_studio_client/api/test/test_v1alpha_list_model_collections_response.py +57 -0
  421. eval_studio_client/api/test/test_v1alpha_list_models_response.py +67 -0
  422. eval_studio_client/api/test/test_v1alpha_list_most_recent_dashboards_response.py +68 -0
  423. eval_studio_client/api/test/test_v1alpha_list_most_recent_leaderboards_response.py +116 -0
  424. eval_studio_client/api/test/test_v1alpha_list_most_recent_models_response.py +67 -0
  425. eval_studio_client/api/test/test_v1alpha_list_most_recent_tests_response.py +69 -0
  426. eval_studio_client/api/test/test_v1alpha_list_operations_response.py +73 -0
  427. eval_studio_client/api/test/test_v1alpha_list_perturbators_response.py +66 -0
  428. eval_studio_client/api/test/test_v1alpha_list_rag_collections_response.py +57 -0
  429. eval_studio_client/api/test/test_v1alpha_list_test_cases_response.py +67 -0
  430. eval_studio_client/api/test/test_v1alpha_list_test_classes_response.py +72 -0
  431. eval_studio_client/api/test/test_v1alpha_list_tests_response.py +69 -0
  432. eval_studio_client/api/test/test_v1alpha_model.py +64 -0
  433. eval_studio_client/api/test/test_v1alpha_model_type.py +33 -0
  434. eval_studio_client/api/test/test_v1alpha_operation.py +72 -0
  435. eval_studio_client/api/test/test_v1alpha_operation_progress.py +54 -0
  436. eval_studio_client/api/test/test_v1alpha_perturb_test_response.py +67 -0
  437. eval_studio_client/api/test/test_v1alpha_perturbator.py +63 -0
  438. eval_studio_client/api/test/test_v1alpha_perturbator_configuration.py +53 -0
  439. eval_studio_client/api/test/test_v1alpha_perturbator_intensity.py +33 -0
  440. eval_studio_client/api/test/test_v1alpha_problem_and_action.py +65 -0
  441. eval_studio_client/api/test/test_v1alpha_test.py +66 -0
  442. eval_studio_client/api/test/test_v1alpha_test_case.py +64 -0
  443. eval_studio_client/api/test/test_v1alpha_test_case_relationship.py +53 -0
  444. eval_studio_client/api/test/test_v1alpha_test_class.py +69 -0
  445. eval_studio_client/api/test/test_v1alpha_test_class_type.py +33 -0
  446. eval_studio_client/api/test/test_v1alpha_test_lab.py +67 -0
  447. eval_studio_client/api/test/test_v1alpha_update_dashboard_response.py +66 -0
  448. eval_studio_client/api/test/test_v1alpha_update_document_response.py +61 -0
  449. eval_studio_client/api/test/test_v1alpha_update_leaderboard_response.py +114 -0
  450. eval_studio_client/api/test/test_v1alpha_update_model_response.py +65 -0
  451. eval_studio_client/api/test/test_v1alpha_update_operation_response.py +71 -0
  452. eval_studio_client/api/test/test_v1alpha_update_test_case_response.py +65 -0
  453. eval_studio_client/api/test/test_v1alpha_update_test_response.py +67 -0
  454. eval_studio_client/api/test/test_v1alpha_who_am_i_response.py +53 -0
  455. eval_studio_client/api/test/test_who_am_i_service_api.py +38 -0
  456. eval_studio_client/client.py +98 -0
  457. eval_studio_client/dashboards.py +187 -0
  458. eval_studio_client/documents.py +95 -0
  459. eval_studio_client/evaluators.py +65 -0
  460. eval_studio_client/gen/openapiv2/eval_studio.swagger.json +6043 -0
  461. eval_studio_client/insights.py +35 -0
  462. eval_studio_client/leaderboards.py +207 -0
  463. eval_studio_client/models.py +522 -0
  464. eval_studio_client/perturbators.py +101 -0
  465. eval_studio_client/problems.py +50 -0
  466. eval_studio_client/test_labs.py +319 -0
  467. eval_studio_client/tests.py +369 -0
  468. eval_studio_client-0.7.0.dist-info/METADATA +18 -0
  469. eval_studio_client-0.7.0.dist-info/RECORD +470 -0
  470. eval_studio_client-0.7.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,319 @@
1
+ import dataclasses
2
+ import enum
3
+ import json
4
+ from typing import List
5
+ from typing import Optional
6
+ from typing import Union
7
+ import uuid
8
+
9
+ from eval_studio_client import api
10
+ from eval_studio_client import evaluators
11
+ from eval_studio_client import leaderboards as l10s
12
+ from eval_studio_client.api import models as apiModels
13
+
14
+
15
class ModelType(enum.Enum):
    """Closed set of model/RAG-system connection types supported by a Test Lab.

    The enum value is the wire identifier sent to the Eval Studio backend.
    """

    h2ogpte = "h2ogpte"  # h2oGPTe RAG
    h2ogpte_llm = "h2ogpte_llm"  # h2oGPTe-hosted LLM
    h2ogpt = "h2ogpt"  # h2oGPT-hosted LLM
    h2ollmops = "h2ollmops"  # H2O LLMOps-hosted LLM
    openai_rag = "openai_rag"  # OpenAI RAG
    openai_llm = "openai_llm"  # OpenAI-hosted LLM
    azure_openai_llm = "azure_openai_llm"  # MS Azure hosted OpenAI LLM
    amazon_bedrock = "amazon_bedrock"  # Amazon Bedrock
24
+
25
+
26
@dataclasses.dataclass
class TestLab:
    """Represents an Eval Studio Test Lab, which can directly be evaluated,
    without a need to contact the LLM/RAG system. This object contains all the
    information needed for the evaluation, such as prompt, actual answer and
    retrieved contexts, for all of the models.

    Attributes:
        name (str): The name of the test lab.
        description (str): The description of the test lab.
        models: Definitions of connections to the different models or RAG
            systems; each model carries its own evaluation inputs.
    """

    name: str
    description: str = ""
    _models: List["TestLabModel"] = dataclasses.field(default_factory=list)
    # Optional API client; server-side operations (evaluate) require it.
    _client: Optional[api.ApiClient] = None

    # Prevent pytest from collecting this class as a test.
    __test__ = False

    def __post_init__(self):
        # The leaderboard API stub is only available when a client is bound.
        if self._client:
            self._leaderboard_api = api.LeaderboardServiceApi(self._client)

    @property
    def models(self) -> List["TestLabModel"]:
        """Models registered in this test lab."""
        return self._models

    def add_model(
        self,
        name: str,
        model_type: ModelType,
        llm_model_name: str,
        collection_id: str = "",
        collection_name: str = "",
        documents: Optional[List[str]] = None,
    ) -> "TestLabModel":
        """Registers a new model to the Test Lab.

        Args:
            name (str): Human readable name of the model.
            model_type (ModelType): The type of the model. One of `ModelType` values.
            llm_model_name (str): Identification of the LLM models used,
                e.g. "h2oai/h2ogpt-4096-llama2-13b-chat"
            collection_id (str, optional): ID of the existing collection in the RAG
                system, which produced the answers.
            collection_name (str, optional): Name of the existing collection in the RAG
                system, which produced the answers.
            documents (Optional[List[str]], optional): List of document URLs used
                in the RAG evaluation. These can later be reused.

        Returns:
            TestLabModel: New instance of TestLabModel.
        """
        # Generated key links the model definition with its inputs in the JSON lab.
        key = str(uuid.uuid4())
        _m = TestLabModel(
            name=name,
            key=key,
            model_type=model_type.value,
            llm_model_name=llm_model_name,
            collection_id=collection_id,
            collection_name=collection_name,
            documents=documents or [],
        )
        self._models.append(_m)
        return _m

    def evaluate(self, evaluator: evaluators.Evaluator) -> Optional[l10s.Leaderboard]:
        """Runs an evaluation for the test lab.

        Args:
            evaluator: The evaluator to use for the evaluation.

        Returns:
            The resulting Leaderboard, or None if it could not be retrieved.

        Raises:
            RuntimeError: If the test lab is not associated with an API client.
        """
        # Without a client there is no leaderboard API stub; fail with a clear
        # error instead of an AttributeError (consistent with Test.perturb).
        if self._client is None:
            raise RuntimeError("Client is not set.")

        req = apiModels.V1alphaImportLeaderboardRequest(
            testLabJson=self.json(),
            evaluator=evaluator.key,
            model=None,
            leaderboardDisplayName=self.name,
            leaderboardDescription=self.description or "",
            testDisplayName=f"{self.name}-Test",
            testDescription=self.description or "",
        )
        res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
        if res and res.operation:
            return self._get_leaderboard_from_operation(res.operation)

        return None

    def json(self) -> str:
        """Serializes the whole test lab (models and their inputs) to JSON."""
        raw_inputs = []
        dataset = []
        for m in self.models:
            raw_inputs.extend(m.raw_inputs)
            dataset.extend(m.dataset)

        lab = {
            "name": self.name,
            "description": self.description,
            "raw_dataset": {"inputs": raw_inputs},
            "dataset": {"inputs": dataset},
            "models": [m.to_dict() for m in self.models],
            "llm_model_names": self._llm_model_names(),
        }

        return json.dumps(lab, indent=4, sort_keys=True)

    def _get_leaderboard_from_operation(
        self, operation: apiModels.V1alphaOperation
    ) -> Optional[l10s.Leaderboard]:
        """Retrieves the leaderboard from the operation, which created it.

        Args:
            operation: The operation that created the leaderboard.

        Raises:
            RuntimeError: If the operation carries no metadata to resolve
                the leaderboard from.
        """
        if not operation.metadata:
            raise RuntimeError("Not possible to retrieve leaderboard from operation")

        leaderboard_id = operation.metadata.to_dict().get("leaderboard")
        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leaderboard_id)
        if res and res.leaderboard:
            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)

        return None

    def _llm_model_names(self) -> List[str]:
        """LLM identifiers of all registered models, in registration order."""
        return [m.llm_model_name for m in self.models]
154
+
155
+
156
@dataclasses.dataclass
class TestLabModel:
    """Represents a model, which is used in the testing. This object contains
    the model key, the model name and the model type.
    """

    # Human readable name of the model
    name: str
    # The unique identification of the model to link with inputs
    key: str
    # One of the `ModelType` values; validated in `__post_init__`.
    model_type: str
    # Identification of the LLM used, e.g. "h2oai/h2ogpt-4096-llama2-13b-chat".
    llm_model_name: str
    collection_id: str = ""
    collection_name: str = ""
    documents: List[str] = dataclasses.field(default_factory=list)
    connection: str = ""
    _inputs: List["_TestLabInput"] = dataclasses.field(default_factory=list)

    # Prevent pytest from collecting this class as a test.
    __test__ = False

    def __post_init__(self):
        self.validate_model_type()

    @property
    def raw_inputs(self) -> List[dict]:
        """Inputs serialized without model outputs or retrieved contexts."""
        return [i.to_raw_input_dict() for i in self._inputs]

    @property
    def dataset(self) -> List[dict]:
        """Inputs serialized with the full evaluation data (outputs, contexts)."""
        return [i.to_dataset_dict() for i in self._inputs]

    def add_input(
        self,
        prompt: str,
        actual_output: str,
        corpus: Optional[List[str]] = None,
        context: Optional[List[str]] = None,
        categories: Union[str, List[str]] = "",
        expected_output: str = "",
        output_constraints: Optional[List[str]] = None,
        actual_duration: float = 0.0,
        cost: float = 0.0,
        output_condition: str = "",
    ) -> "_TestLabInput":
        """Add an evaluation input, which contains all the info relevant for the
        evaluation, to avoid calling the RAG/LLM itself.

        Args:
            prompt (str): Prompt or input to the RAG/LLM.
            actual_output (str): Actual output from the RAG/LLM.
            corpus (Optional[List[str]], optional): List of document URLs used in the RAG.
            context (Optional[List[str]], optional): List of retrieved contexts.
            categories (Union[str, List[str]]): List of categories/tags for the input.
            expected_output (str): Expected output from the RAG/LLM.
            output_constraints (List[str]): List of constraints for the output,
                such as expected tokens in the answer.
            actual_duration (float, optional): Duration of the inference of the answer.
            cost (float, optional): Cost estimate of the inference.
            output_condition (str, optional): Output condition is a logical expression
                used to set the expectation on the output. The expression is in
                Google's filtering language format defined in
                https://google.aip.dev/160#logical-operators .

        Returns:
            TestLabInput instance.
        """
        i = _TestLabInput(
            prompt=prompt,
            corpus=corpus,
            context=context,
            categories=categories,
            expected_output=expected_output,
            output_constraints=output_constraints,
            output_condition=output_condition,
            actual_output=actual_output,
            actual_duration=actual_duration,
            cost=cost,
            model_key=self.key,
        )
        self._inputs.append(i)
        return i

    def to_dict(self) -> dict:
        """Serializes the model definition (without its inputs) to a plain dict."""
        return {
            "name": self.name,
            "key": self.key,
            "model_type": self.model_type,
            "collection_id": self.collection_id,
            "collection_name": self.collection_name,
            "llm_model_name": self.llm_model_name,
            "documents": self.documents or [],
            "connection": self.connection,
        }

    def validate_model_type(self):
        """Validates that `model_type` is one of the supported `ModelType` values.

        Raises:
            ValueError: If `model_type` is not a known model type.
        """
        # Iterate the enum directly; wrapping it in set() was redundant and made
        # the error-message ordering nondeterministic.
        valid_values = [e.value for e in ModelType]
        if self.model_type not in valid_values:
            raise ValueError(
                f"Invalid model type: {self.model_type}. Valid values: {valid_values}"
            )
256
+
257
+
258
+ @dataclasses.dataclass
259
+ class _TestLabInput:
260
+ """Represents a single input for the testing, which is basically a `TestCase`,
261
+ with more information.
262
+ """
263
+
264
+ # The input prompt
265
+ prompt: str
266
+ corpus: Optional[List[str]] = None
267
+ context: Optional[List[str]] = None
268
+ categories: Union[str, List[str]] = ""
269
+ expected_output: str = ""
270
+ output_constraints: Optional[List[str]] = None
271
+ output_condition: str = ""
272
+ actual_output: str = ""
273
+ actual_duration: float = 0.0
274
+ cost: float = 0.0
275
+ model_key: str = ""
276
+
277
+ def to_raw_input_dict(self) -> dict:
278
+ return {
279
+ "input": self.prompt,
280
+ "corpus": self.corpus or [],
281
+ "context": [],
282
+ "categories": self.categories,
283
+ "expected_output": self.expected_output,
284
+ "output_constraints": self.output_constraints or [],
285
+ "output_condition": self.output_condition or "",
286
+ "actual_output": "",
287
+ "actual_duration": 0.0,
288
+ "cost": 0.0,
289
+ "model_key": self.model_key,
290
+ }
291
+
292
+ def to_dataset_dict(self) -> dict:
293
+ return {
294
+ "input": self.prompt,
295
+ "corpus": self.corpus or [],
296
+ "context": self.context or [],
297
+ "categories": self.categories,
298
+ "expected_output": self.expected_output,
299
+ "output_constraints": self.output_constraints or [],
300
+ "output_condition": self.output_condition or "",
301
+ "actual_output": self.actual_output,
302
+ "actual_duration": self.actual_duration,
303
+ "cost": self.cost,
304
+ "model_key": self.model_key,
305
+ }
306
+
307
+
308
class _TestLabs:
    """Factory for `TestLab` objects bound to a shared API client."""

    def __init__(self, client: api.ApiClient):
        self._client = client

    def create(self, name: str, description: str = "") -> TestLab:
        """Create a new Test Lab instance.

        Args:
            name: Name of the test lab.
            description: Description of the test lab.
        """
        return TestLab(name=name, description=description, _client=self._client)
@@ -0,0 +1,369 @@
1
+ import dataclasses
2
+ import datetime
3
+ import json
4
+ from typing import List
5
+ from typing import Optional
6
+ from typing import Union
7
+
8
+ from eval_studio_client import api
9
+ from eval_studio_client import documents as d7s
10
+ from eval_studio_client import perturbators as p10s
11
+ from eval_studio_client.api import models
12
+
13
+
14
@dataclasses.dataclass
class TestCase:
    """Represents a single test case, which contains tested prompt, expected answer
    and set of constraints.

    Attributes:
        key (str): Generated ID of the test case.
        prompt (str): Prompt of the test case.
        answer (str): Expected answer of the test case.
        constraints (List[str]): String tokens expected in the actual answer.
            Note: all of the constraints in the list are concatenated using AND
            operator, which means actual answer need to contain all of the tokens.
        condition (str): Logical expression constraining the actual answer.
        create_time (datetime): Timestamp of the test case creation.
        update_time (datetime): Timestamp of the last test case update.
    """

    key: str
    prompt: str
    answer: str
    constraints: List[str]
    condition: str
    create_time: Optional[datetime.datetime] = None
    update_time: Optional[datetime.datetime] = None

    def to_api_proto(self) -> models.V1alphaTestCase:
        """Converts the client TestCase to an API TestCase."""
        payload = {
            "prompt": self.prompt,
            "answer": self.answer,
            "constraints": self.constraints,
            "condition": self.condition,
        }
        return models.V1alphaTestCase(**payload)

    @staticmethod
    def _from_api_test_case(api_test_case: models.V1alphaTestCase) -> "TestCase":
        """Builds a client TestCase from its API representation.

        Missing API fields are normalized to empty strings / lists.
        """
        return TestCase(
            key=api_test_case.name or "",
            prompt=api_test_case.prompt or "",
            answer=api_test_case.answer or "",
            constraints=api_test_case.constraints or [],
            condition=api_test_case.condition or "",
        )
56
+
57
+
58
@dataclasses.dataclass
class Test:
    """Represents a test, which contains a set of test cases and optionally
    also documents for evaluating RAG systems.

    Attributes:
        key (str): Generated ID of the test.
        name (str): Name of the test.
        description (str): Description of the test.
        create_time (datetime): Timestamp of the test creation.
        update_time (datetime): Timestamp of the last test update.
    """

    key: str
    name: str
    description: str
    # Resource names of documents currently linked to the test.
    _document_names: List[str]
    create_time: Optional[datetime.datetime] = None
    update_time: Optional[datetime.datetime] = None
    _client: Optional[api.ApiClient] = None

    def __post_init__(self):
        # API stubs are only available when the test is bound to a client.
        if self._client:
            self._test_api = api.TestServiceApi(self._client)
            self._test_case_api = api.TestCaseServiceApi(self._client)
            self._document_api = api.DocumentServiceApi(self._client)

    @property
    def test_cases(self) -> List[TestCase]:
        """Retrieves all test cases in the test."""
        res = self._test_case_api.test_case_service_list_test_cases(self.key)
        if res and res.test_cases:
            return [TestCase._from_api_test_case(tc) for tc in res.test_cases]

        return []

    @property
    def documents(self) -> List[d7s.Document]:
        """Retrieves all documents attached to the test."""
        if not self._document_names:
            return []

        res = self._document_api.document_service_batch_get_documents(
            self._document_names
        )
        if res and res.documents:
            return [
                d7s.Document._from_api_document(d, self._client) for d in res.documents
            ]

        return []

    def perturb(
        self,
        new_test_name: str,
        perturbators: Union[p10s.Perturbator, str, List[Union[p10s.Perturbator, str]]],
        new_test_description: str = "",
    ) -> "Test":
        """Creates new Test by perturbing this test using the given Perturbators.

        Args:
            new_test_name (str): Name of the newly created test.
            perturbators (Perturbator, List[Perturbator], str or List[str]): List of
                Perturbators or their keys used to perturbate this Test.
            new_test_description (str): Optional description of the newly created test.

        Raises:
            RuntimeError: If the test is not bound to a client, or the server
                does not return the perturbed test.
            ValueError: If the new test name or the perturbators are missing.
        """

        if self._client is None:
            raise RuntimeError("Client is not set.")

        if not new_test_name:
            raise ValueError("New test name must be provided.")

        if not perturbators:
            raise ValueError("Perturbators must be provided.")

        # Accept a single perturbator (or key) as well as a list of them.
        if isinstance(perturbators, (p10s.Perturbator, str)):
            perturbators_to_run = [perturbators]
        else:
            perturbators_to_run = perturbators

        configs = [_PerturbatorConfiguration(p) for p in perturbators_to_run]

        req = models.TestServicePerturbTestRequest(
            perturbatorConfigurations=[c.to_api_proto() for c in configs],
            newTestDisplayName=new_test_name,
            newTestDescription=new_test_description,
        )
        resp = self._test_api.test_service_perturb_test(self.key, req)
        # Guard against an empty response instead of failing later with an
        # opaque AttributeError inside _from_api_test (consistent with the
        # `if res and res.X` checks used elsewhere in this class).
        if not resp or not resp.test:
            raise RuntimeError("Failed to perturb the test.")

        return Test._from_api_test(resp.test, self._client)

    def delete(self, force=False):
        """Deletes the test.

        Args:
            force (bool): If True, test cases will be deleted as well.
        """
        self._test_api.test_service_delete_test(self.key, force=force)

    def create_test_case(
        self,
        prompt: str,
        answer: str,
        constraints: Optional[List[str]] = None,
        condition: str = "",
    ) -> Optional[TestCase]:
        """Creates a new test case in the test.

        Args:
            prompt (str): Prompt of the test case.
            answer (str): Expected answer of the test case.
            constraints (List[str]): String tokens expected in the actual answer.
                Note: all of the constraints in the list are concatenated using AND
                operator, which means actual answer need to contain all of the tokens.
            condition (str): Test case output condition, in a form logical expression.
                The format of the string is defined by the Google's filtering language.
                (ref. https://google.aip.dev/160#logical-operators)

        Returns:
            The created TestCase, or None when the server returns no test case.
        """
        case = TestCase(
            key="",
            prompt=prompt,
            answer=answer,
            constraints=constraints or [],
            condition=condition,
        )
        res = self._test_case_api.test_case_service_create_test_case(
            parent=self.key, test_case=case.to_api_proto()
        )
        if res and res.test_case:
            return TestCase._from_api_test_case(res.test_case)

        return None

    def remove_test_case(self, test_case_key: str):
        """Removes a test case from the test.

        Args:
            test_case_key (str): Resource name of the test case to be removed.
        """
        self._test_case_api.test_case_service_delete_test_case(test_case_key)

    def create_document(
        self, name: str, url: str, description: Optional[str] = None
    ) -> Optional[d7s.Document]:
        """Creates a new document and attaches it to the test.

        Args:
            name (str): Name of the document.
            url (str): URL of the document.
            description (str): Description of the document.

        Raises:
            RuntimeError: If the document could not be created or linked;
                on a link failure the newly created document is deleted again.
        """
        doc = d7s.Document("", name, description or "", url)
        res = self._document_api.document_service_create_document(doc.to_api_proto())
        if res and res.document:
            doc = d7s.Document._from_api_document(res.document, self._client)

        try:
            self.link_document(doc)
        except ValueError as err:
            # link_document rejects documents without a resource name, which
            # means the create call above did not return one.
            raise RuntimeError("Failed to create the document.") from err
        except Exception as err:
            # Roll back the orphaned document before surfacing the failure.
            doc.delete()
            raise RuntimeError("Failed to link the document to the test.") from err

        return doc

    def link_document(self, document: d7s.Document):
        """Attaches an existing document to the test.

        Args:
            document (Document): Document to be attached to the test.

        Raises:
            ValueError: If the document has no resource name.
            RuntimeError: If the server update fails (local state is rolled back).
        """
        if not document.key:
            raise ValueError("Document must have a resource name.")

        self._document_names.append(document.key)
        try:
            self._test_api.test_service_update_test(
                test_name=self.key,
                test=models.RequiredTheTestToUpdate(documents=self._document_names),
            )
        except Exception as err:
            # Keep the local document list in sync with the server state.
            self._document_names.remove(document.key)
            raise RuntimeError("Failed to link the document to the test.") from err

    def unlink_document(self, document_key: str):
        """Deletes a document attached to the test.

        Args:
            document_key (str): Resource name of the document to be detached from the test.

        Raises:
            ValueError: If the document is not attached to the test.
            RuntimeError: If the server update fails (local state is rolled back).
        """
        try:
            self._document_names.remove(document_key)
        except ValueError as err:
            raise ValueError(
                f"Document {document_key} is not attached to the test."
            ) from err

        try:
            self._test_api.test_service_update_test(
                test_name=self.key,
                test=models.RequiredTheTestToUpdate(documents=self._document_names),
            )
        except Exception as err:
            # Keep the local document list in sync with the server state.
            self._document_names.append(document_key)
            raise RuntimeError("Failed to unlink the document from the test.") from err

    @staticmethod
    def _from_api_test(api_test: models.V1alphaTest, client: api.ApiClient) -> "Test":
        """Builds a client Test from its API representation."""
        return Test(
            key=api_test.name or "",
            name=api_test.display_name or "",
            description=api_test.description or "",
            create_time=api_test.create_time,
            update_time=api_test.update_time,
            _document_names=api_test.documents or [],
            _client=client,
        )
276
+
277
+
278
class _Tests:
    """Entry point for listing, creating, deleting and importing tests."""

    def __init__(self, client: api.ApiClient):
        self._client = client
        self._api = api.TestServiceApi(client)

    def list(self) -> List[Test]:
        """Retrieves all user tests in the Eval Studio."""
        res = self._api.test_service_list_tests()
        if not res or not res.tests:
            return []

        return [Test._from_api_test(t, self._client) for t in res.tests]

    def create(
        self,
        name: str,
        description: Optional[str] = "",
        documents: Optional[List[d7s.Document]] = None,
    ) -> Optional[Test]:
        """Creates a new test in the Eval Studio.

        Args:
            name (str): Name of the test.
            description (str): Description of the test.
            documents (optional): List of `Document`s to be attached to the test.
        """
        document_keys = [d.key for d in documents] if documents else None
        api_test = models.V1alphaTest(
            display_name=name, description=description, documents=document_keys
        )
        res = self._api.test_service_create_test(api_test)
        if not res or not res.test:
            return None

        return Test._from_api_test(res.test, self._client)

    def delete(self, key: str):
        """Deletes the test with given resource name.

        Args:
            key (str): Resource name of the test to be deleted.
        """
        self._api.test_service_delete_test(key)

    def import_test_suite(
        self, test_suite: str, name_prefix: Optional[str] = None
    ) -> List[Test]:
        """Imports a list of tests (Test Suite) from a JSON.

        Args:
            test_suite (str): JSON string of the test suite.
            name_prefix (str): Optional prefix to name the imported tests.
        """
        req = models.V1alphaBatchImportTestsRequest(
            testsJson=test_suite, testDisplayNamePrefix=name_prefix or None
        )
        res = self._api.test_service_batch_import_tests(req)
        if not res or not res.tests:
            return []

        return [Test._from_api_test(t, self._client) for t in res.tests]
339
+
340
+
341
class _PerturbatorConfiguration:
    """Represents the configuration of a perturbator to use during the perturbation process.

    Attributes:
        perturbator (Perturbator or str): Perturbator to use or its key.
    """

    def __init__(self, perturbator: Union[p10s.Perturbator, str]):
        if isinstance(perturbator, p10s.Perturbator):
            # A full Perturbator carries its own intensity and parameters.
            self.name = perturbator.key
            self.intensity = perturbator.intensity
            self.params = perturbator.params
        else:
            # A bare key defaults to medium intensity with no parameters.
            self.name = perturbator
            self.intensity = p10s.PerturbatorIntensity.medium
            self.params = None

    def to_api_proto(self) -> models.V1alphaPerturbatorConfiguration:
        """Converts the client PerturbatorConfiguration to an API PerturbatorConfiguration."""
        encoded_params = json.dumps(self.params) if self.params else None
        return models.V1alphaPerturbatorConfiguration(
            name=self.name,
            intensity=self.intensity.to_api_proto(),
            params=encoded_params,
        )
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.3
2
+ Name: eval-studio-client
3
+ Version: 0.7.0
4
+ Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
5
+ Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
6
+ Author-email: "H2O.ai" <support@h2o.ai>
7
+ License-Expression: MIT
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Programming Language :: Python
10
+ Classifier: Programming Language :: Python :: 3.9
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Requires-Python: >=3.9
14
+ Requires-Dist: h2o-authn<3.0.0,>=2.0.0
15
+ Requires-Dist: pydantic>=2
16
+ Requires-Dist: python-dateutil>=2.5.3
17
+ Requires-Dist: typing-extensions>=4.7.1
18
+ Requires-Dist: urllib3<2.3.0,>=1.26.19