eval-studio-client 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their public registries. It is provided for informational purposes only and reflects the changes between the two published versions.
Files changed (179)
  1. eval_studio_client/api/__init__.py +43 -0
  2. eval_studio_client/api/api/__init__.py +5 -0
  3. eval_studio_client/api/api/human_calibration_service_api.py +304 -0
  4. eval_studio_client/api/api/perturbator_service_api.py +268 -1
  5. eval_studio_client/api/api/prompt_library_service_api.py +669 -0
  6. eval_studio_client/api/api/test_service_api.py +568 -0
  7. eval_studio_client/api/api/workflow_edge_service_api.py +296 -0
  8. eval_studio_client/api/api/workflow_node_service_api.py +1634 -0
  9. eval_studio_client/api/api/workflow_service_api.py +1609 -0
  10. eval_studio_client/api/docs/HumanCalibrationServiceApi.md +77 -0
  11. eval_studio_client/api/docs/PerturbationServiceCreatePerturbationRequest.md +1 -0
  12. eval_studio_client/api/docs/PerturbatorServiceApi.md +33 -3
  13. eval_studio_client/api/docs/PromptGenerationServiceAutoGeneratePromptsRequest.md +2 -1
  14. eval_studio_client/api/docs/PromptLibraryServiceApi.md +155 -0
  15. eval_studio_client/api/docs/ProtobufNullValue.md +12 -0
  16. eval_studio_client/api/docs/RequiredTheTestCaseToUpdate.md +1 -0
  17. eval_studio_client/api/docs/RequiredTheUpdatedWorkflow.md +44 -0
  18. eval_studio_client/api/docs/RequiredTheUpdatedWorkflowNode.md +44 -0
  19. eval_studio_client/api/docs/TestServiceApi.md +140 -0
  20. eval_studio_client/api/docs/TestServiceGenerateTestCasesRequest.md +1 -0
  21. eval_studio_client/api/docs/TestServiceImportTestCasesFromLibraryRequest.md +32 -0
  22. eval_studio_client/api/docs/TestServiceListTestCaseLibraryItemsRequest.md +35 -0
  23. eval_studio_client/api/docs/TestServicePerturbTestRequest.md +1 -0
  24. eval_studio_client/api/docs/V1BatchDeleteWorkflowsRequest.md +29 -0
  25. eval_studio_client/api/docs/V1BatchDeleteWorkflowsResponse.md +29 -0
  26. eval_studio_client/api/docs/V1BatchGetWorkflowEdgesResponse.md +29 -0
  27. eval_studio_client/api/docs/V1BatchGetWorkflowNodesResponse.md +29 -0
  28. eval_studio_client/api/docs/V1CreateEvaluationRequest.md +1 -0
  29. eval_studio_client/api/docs/V1CreateWorkflowResponse.md +29 -0
  30. eval_studio_client/api/docs/V1DeleteWorkflowNodeResponse.md +29 -0
  31. eval_studio_client/api/docs/V1DeleteWorkflowResponse.md +29 -0
  32. eval_studio_client/api/docs/V1EstimateThresholdRequest.md +33 -0
  33. eval_studio_client/api/docs/V1GetWorkflowNodePrerequisitesResponse.md +30 -0
  34. eval_studio_client/api/docs/V1GetWorkflowNodeResponse.md +29 -0
  35. eval_studio_client/api/docs/V1GetWorkflowResponse.md +29 -0
  36. eval_studio_client/api/docs/V1ImportEvaluationRequest.md +1 -0
  37. eval_studio_client/api/docs/V1ImportTestCasesFromLibraryResponse.md +29 -0
  38. eval_studio_client/api/docs/V1ImportTestCasesRequest.md +33 -0
  39. eval_studio_client/api/docs/V1LabeledTestCase.md +31 -0
  40. eval_studio_client/api/docs/V1ListPromptLibraryItemsResponse.md +29 -0
  41. eval_studio_client/api/docs/V1ListTestCaseLibraryItemsResponse.md +29 -0
  42. eval_studio_client/api/docs/V1ListWorkflowsResponse.md +29 -0
  43. eval_studio_client/api/docs/V1ProcessWorkflowNodeResponse.md +29 -0
  44. eval_studio_client/api/docs/V1PromptLibraryItem.md +42 -0
  45. eval_studio_client/api/docs/V1TestCase.md +1 -0
  46. eval_studio_client/api/docs/V1TestSuiteEvaluates.md +11 -0
  47. eval_studio_client/api/docs/V1UpdateWorkflowNodeResponse.md +29 -0
  48. eval_studio_client/api/docs/V1UpdateWorkflowResponse.md +29 -0
  49. eval_studio_client/api/docs/V1Workflow.md +46 -0
  50. eval_studio_client/api/docs/V1WorkflowEdge.md +40 -0
  51. eval_studio_client/api/docs/V1WorkflowEdgeType.md +12 -0
  52. eval_studio_client/api/docs/V1WorkflowNode.md +46 -0
  53. eval_studio_client/api/docs/V1WorkflowNodeArtifact.md +40 -0
  54. eval_studio_client/api/docs/V1WorkflowNodeArtifacts.md +29 -0
  55. eval_studio_client/api/docs/V1WorkflowNodeAttributes.md +30 -0
  56. eval_studio_client/api/docs/V1WorkflowNodeStatus.md +12 -0
  57. eval_studio_client/api/docs/V1WorkflowNodeType.md +12 -0
  58. eval_studio_client/api/docs/V1WorkflowNodeView.md +12 -0
  59. eval_studio_client/api/docs/V1WorkflowType.md +12 -0
  60. eval_studio_client/api/docs/WorkflowEdgeServiceApi.md +76 -0
  61. eval_studio_client/api/docs/WorkflowNodeServiceApi.md +423 -0
  62. eval_studio_client/api/docs/WorkflowServiceApi.md +417 -0
  63. eval_studio_client/api/models/__init__.py +38 -0
  64. eval_studio_client/api/models/perturbation_service_create_perturbation_request.py +8 -2
  65. eval_studio_client/api/models/prompt_generation_service_auto_generate_prompts_request.py +5 -3
  66. eval_studio_client/api/models/protobuf_null_value.py +36 -0
  67. eval_studio_client/api/models/required_the_test_case_to_update.py +6 -2
  68. eval_studio_client/api/models/required_the_updated_workflow.py +152 -0
  69. eval_studio_client/api/models/required_the_updated_workflow_node.py +152 -0
  70. eval_studio_client/api/models/test_service_generate_test_cases_request.py +4 -2
  71. eval_studio_client/api/models/test_service_import_test_cases_from_library_request.py +93 -0
  72. eval_studio_client/api/models/test_service_list_test_case_library_items_request.py +99 -0
  73. eval_studio_client/api/models/test_service_perturb_test_request.py +4 -2
  74. eval_studio_client/api/models/v1_batch_delete_workflows_request.py +87 -0
  75. eval_studio_client/api/models/v1_batch_delete_workflows_response.py +95 -0
  76. eval_studio_client/api/models/v1_batch_get_workflow_edges_response.py +95 -0
  77. eval_studio_client/api/models/v1_batch_get_workflow_nodes_response.py +95 -0
  78. eval_studio_client/api/models/v1_create_evaluation_request.py +7 -2
  79. eval_studio_client/api/models/v1_create_workflow_response.py +91 -0
  80. eval_studio_client/api/models/v1_delete_workflow_node_response.py +91 -0
  81. eval_studio_client/api/models/v1_delete_workflow_response.py +91 -0
  82. eval_studio_client/api/models/v1_estimate_threshold_request.py +103 -0
  83. eval_studio_client/api/models/v1_get_workflow_node_prerequisites_response.py +89 -0
  84. eval_studio_client/api/models/v1_get_workflow_node_response.py +91 -0
  85. eval_studio_client/api/models/v1_get_workflow_response.py +91 -0
  86. eval_studio_client/api/models/v1_import_evaluation_request.py +7 -2
  87. eval_studio_client/api/models/v1_import_test_cases_from_library_response.py +91 -0
  88. eval_studio_client/api/models/v1_import_test_cases_request.py +95 -0
  89. eval_studio_client/api/models/v1_labeled_test_case.py +91 -0
  90. eval_studio_client/api/models/v1_list_prompt_library_items_response.py +95 -0
  91. eval_studio_client/api/models/v1_list_test_case_library_items_response.py +95 -0
  92. eval_studio_client/api/models/v1_list_workflows_response.py +95 -0
  93. eval_studio_client/api/models/v1_process_workflow_node_response.py +91 -0
  94. eval_studio_client/api/models/v1_prompt_library_item.py +129 -0
  95. eval_studio_client/api/models/v1_test_case.py +6 -2
  96. eval_studio_client/api/models/v1_test_suite_evaluates.py +39 -0
  97. eval_studio_client/api/models/v1_update_workflow_node_response.py +91 -0
  98. eval_studio_client/api/models/v1_update_workflow_response.py +91 -0
  99. eval_studio_client/api/models/v1_workflow.py +156 -0
  100. eval_studio_client/api/models/v1_workflow_edge.py +123 -0
  101. eval_studio_client/api/models/v1_workflow_edge_type.py +37 -0
  102. eval_studio_client/api/models/v1_workflow_node.py +156 -0
  103. eval_studio_client/api/models/v1_workflow_node_artifact.py +122 -0
  104. eval_studio_client/api/models/v1_workflow_node_artifacts.py +97 -0
  105. eval_studio_client/api/models/v1_workflow_node_attributes.py +87 -0
  106. eval_studio_client/api/models/v1_workflow_node_status.py +40 -0
  107. eval_studio_client/api/models/v1_workflow_node_type.py +41 -0
  108. eval_studio_client/api/models/v1_workflow_node_view.py +38 -0
  109. eval_studio_client/api/models/v1_workflow_type.py +37 -0
  110. eval_studio_client/api/test/test_human_calibration_service_api.py +38 -0
  111. eval_studio_client/api/test/test_perturbation_service_create_perturbation_request.py +20 -2
  112. eval_studio_client/api/test/test_prompt_generation_service_auto_generate_prompts_request.py +4 -1
  113. eval_studio_client/api/test/test_prompt_library_service_api.py +43 -0
  114. eval_studio_client/api/test/test_protobuf_null_value.py +33 -0
  115. eval_studio_client/api/test/test_required_the_test_case_to_update.py +4 -1
  116. eval_studio_client/api/test/test_required_the_updated_workflow.py +88 -0
  117. eval_studio_client/api/test/test_required_the_updated_workflow_node.py +80 -0
  118. eval_studio_client/api/test/test_test_service_api.py +12 -0
  119. eval_studio_client/api/test/test_test_service_generate_test_cases_request.py +4 -1
  120. eval_studio_client/api/test/test_test_service_import_test_cases_from_library_request.py +56 -0
  121. eval_studio_client/api/test/test_test_service_list_test_case_library_items_request.py +63 -0
  122. eval_studio_client/api/test/test_test_service_perturb_test_request.py +4 -1
  123. eval_studio_client/api/test/test_v1_batch_delete_test_cases_response.py +4 -1
  124. eval_studio_client/api/test/test_v1_batch_delete_workflows_request.py +53 -0
  125. eval_studio_client/api/test/test_v1_batch_delete_workflows_response.py +92 -0
  126. eval_studio_client/api/test/test_v1_batch_get_workflow_edges_response.py +64 -0
  127. eval_studio_client/api/test/test_v1_batch_get_workflow_nodes_response.py +84 -0
  128. eval_studio_client/api/test/test_v1_create_evaluation_request.py +20 -2
  129. eval_studio_client/api/test/test_v1_create_test_case_response.py +4 -1
  130. eval_studio_client/api/test/test_v1_create_workflow_response.py +90 -0
  131. eval_studio_client/api/test/test_v1_delete_test_case_response.py +4 -1
  132. eval_studio_client/api/test/test_v1_delete_workflow_node_response.py +82 -0
  133. eval_studio_client/api/test/test_v1_delete_workflow_response.py +90 -0
  134. eval_studio_client/api/test/test_v1_estimate_threshold_request.py +60 -0
  135. eval_studio_client/api/test/test_v1_evaluation_test.py +4 -1
  136. eval_studio_client/api/test/test_v1_find_all_test_cases_by_id_response.py +4 -1
  137. eval_studio_client/api/test/test_v1_get_test_case_response.py +4 -1
  138. eval_studio_client/api/test/test_v1_get_workflow_node_prerequisites_response.py +56 -0
  139. eval_studio_client/api/test/test_v1_get_workflow_node_response.py +82 -0
  140. eval_studio_client/api/test/test_v1_get_workflow_response.py +90 -0
  141. eval_studio_client/api/test/test_v1_import_evaluation_request.py +16 -1
  142. eval_studio_client/api/test/test_v1_import_test_cases_from_library_response.py +71 -0
  143. eval_studio_client/api/test/test_v1_import_test_cases_request.py +57 -0
  144. eval_studio_client/api/test/test_v1_labeled_test_case.py +53 -0
  145. eval_studio_client/api/test/test_v1_list_prompt_library_items_response.py +71 -0
  146. eval_studio_client/api/test/test_v1_list_test_case_library_items_response.py +71 -0
  147. eval_studio_client/api/test/test_v1_list_test_cases_response.py +4 -1
  148. eval_studio_client/api/test/test_v1_list_workflows_response.py +92 -0
  149. eval_studio_client/api/test/test_v1_process_workflow_node_response.py +71 -0
  150. eval_studio_client/api/test/test_v1_prompt_library_item.py +68 -0
  151. eval_studio_client/api/test/test_v1_test_case.py +4 -1
  152. eval_studio_client/api/test/test_v1_test_suite_evaluates.py +33 -0
  153. eval_studio_client/api/test/test_v1_update_test_case_response.py +4 -1
  154. eval_studio_client/api/test/test_v1_update_workflow_node_response.py +82 -0
  155. eval_studio_client/api/test/test_v1_update_workflow_response.py +90 -0
  156. eval_studio_client/api/test/test_v1_workflow.py +89 -0
  157. eval_studio_client/api/test/test_v1_workflow_edge.py +61 -0
  158. eval_studio_client/api/test/test_v1_workflow_edge_type.py +33 -0
  159. eval_studio_client/api/test/test_v1_workflow_node.py +81 -0
  160. eval_studio_client/api/test/test_v1_workflow_node_artifact.py +61 -0
  161. eval_studio_client/api/test/test_v1_workflow_node_artifacts.py +64 -0
  162. eval_studio_client/api/test/test_v1_workflow_node_attributes.py +51 -0
  163. eval_studio_client/api/test/test_v1_workflow_node_status.py +33 -0
  164. eval_studio_client/api/test/test_v1_workflow_node_type.py +33 -0
  165. eval_studio_client/api/test/test_v1_workflow_node_view.py +33 -0
  166. eval_studio_client/api/test/test_v1_workflow_type.py +33 -0
  167. eval_studio_client/api/test/test_workflow_edge_service_api.py +38 -0
  168. eval_studio_client/api/test/test_workflow_node_service_api.py +73 -0
  169. eval_studio_client/api/test/test_workflow_service_api.py +73 -0
  170. eval_studio_client/client.py +7 -0
  171. eval_studio_client/dashboards.py +29 -0
  172. eval_studio_client/gen/openapiv2/eval_studio.swagger.json +2665 -794
  173. eval_studio_client/leaderboards.py +123 -0
  174. eval_studio_client/models.py +3 -42
  175. eval_studio_client/test_labs.py +49 -21
  176. eval_studio_client/tests.py +188 -1
  177. {eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/METADATA +1 -2
  178. {eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/RECORD +179 -50
  179. {eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/WHEEL +1 -1
eval_studio_client/leaderboards.py

@@ -1,6 +1,7 @@
 import dataclasses
 import datetime
 import json
+import os
 import time
 from typing import Dict
 from typing import List
@@ -33,6 +34,7 @@ class Leaderboard:
     update_time: Optional[datetime.datetime] = None
     problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
     insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+    summary: Optional[str] = None
     existing_collection: Optional[str] = None
     _report: Optional[str] = None
     _leaderboard: Optional[str] = None
@@ -86,6 +88,42 @@ class Leaderboard:
         if self._client:
             self._leaderboard_api.leaderboard_service_delete_leaderboard(self.key)

+    def download_result(self, dest: str):
+        """Downloads the leaderboard result to a JSON file.
+
+        Args:
+            dest (str): The destination path for the report.
+        """
+        if not os.path.exists(dest):
+            raise ValueError("Destination path does not exist.")
+
+        if os.path.isdir(dest):
+            dest = os.path.join(dest, "results.json")
+
+        if self._client and self.finished:
+            headers: Dict[str, str] = {}
+            url = urljoin(
+                self._client.configuration.host, f"/content/{self.key}/results"
+            )
+            self._client.update_params_for_auth(
+                headers=headers,
+                queries=[],
+                auth_settings=[],
+                resource_path=url,
+                method="GET",
+                body=None,
+            )
+            response = urllib3.request("GET", url, headers=headers)
+
+            if response.status == 200:
+                with open(dest, "wb") as f:
+                    f.write(response.data)
+                return
+            else:
+                raise RuntimeError("Failed to retrieve leaderboard result.")
+
+        raise ValueError("Cannot download result for unfinished leaderboard.")
+
     def download_report(self, dest: str):
         """Downloads the leaderboard report to a zip file.

@@ -114,6 +152,30 @@ class Leaderboard:

         raise ValueError("Cannot download report for unfinished leaderboard.")

+    def get_result_json(self) -> str:
+        """Retrieves the leaderboard result as a JSON string."""
+        if self._client and self.finished:
+            headers: Dict[str, str] = {}
+            url = urljoin(
+                self._client.configuration.host, f"/content/{self.key}/results"
+            )
+            self._client.update_params_for_auth(
+                headers=headers,
+                queries=[],
+                auth_settings=[],
+                resource_path=url,
+                method="GET",
+                body=None,
+            )
+            response = urllib3.request("GET", url, headers=headers)
+
+            if response.status == 200:
+                return str(response.data)
+            else:
+                raise RuntimeError("Failed to retrieve leaderboard result.")
+
+        raise ValueError("Cannot download result for unfinished leaderboard.")
+
     def get_table(self) -> LeaderboardTable:
         """Retrieves the leaderboard table."""
         if self._client and self.finished:
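The new download_result and get_result_json methods both read the leaderboard result from the /content/{key}/results endpoint of the connected host. A minimal usage sketch follows; how the Leaderboard handle lb is obtained is an assumption, only the two methods themselves come from the diff above.

    import os

    def save_and_show_result(lb, dest_dir: str) -> None:
        # download_result() requires an existing destination path.
        os.makedirs(dest_dir, exist_ok=True)
        lb.download_result(dest_dir)        # writes <dest_dir>/results.json
        print(lb.get_result_json()[:200])   # same payload returned as a string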
@@ -170,6 +232,7 @@ class Leaderboard:
         """Refresh the leaderboard with the latest API data."""
         self.key = api_leaderboard.name or ""
         self.update_time = api_leaderboard.update_time
+        self.summary = api_leaderboard.leaderboard_summary
         self._leaderboard = api_leaderboard.leaderboard_table
         self._report = api_leaderboard.leaderboard_report or ""
         self._status = api_leaderboard.status
@@ -192,6 +255,7 @@ class Leaderboard:
             update_time=api_leaderboard.update_time,
             problems=problems,
             insights=insights,
+            summary=api_leaderboard.leaderboard_summary,
             existing_collection=api_leaderboard.h2ogpte_collection or None,
             _evaluator_name=api_leaderboard.evaluator or "",
             _test_names=api_leaderboard.tests or [],
@@ -208,3 +272,62 @@ class Leaderboard:
             models.V1LeaderboardStatus.LEADERBOARD_STATUS_COMPLETED,
             models.V1LeaderboardStatus.LEADERBOARD_STATUS_FAILED,
         ]
+
+    @staticmethod
+    def from_operation(
+        operation: models.V1Operation, client: Optional[api.ApiClient]
+    ) -> Optional["Leaderboard"]:
+        """Retrieves the leaderboard from the operation, which created it.
+
+        Args:
+            operation: The operation that created the dashboard.
+            client: The API client to use for the leaderboard retrieval.
+
+        Returns:
+            Leaderboard: The leaderboard instance created by the operation.
+        """
+        if not client:
+            raise RuntimeError("API Client is not provided")
+
+        if not operation.metadata:
+            raise RuntimeError(
+                "Operation metadata missing, it's not possible to retrieve leaderboard from operation"
+            )
+
+        leaderboard_api = api.LeaderboardServiceApi(client)
+        leadeboard_id = operation.metadata.to_dict().get("leaderboard", "")
+        res = leaderboard_api.leaderboard_service_get_leaderboard(str(leadeboard_id))
+        if res and res.leaderboard:
+            return Leaderboard._from_api_leaderboard(res.leaderboard, client)
+
+        return None
+
+
+class _Leaderboards:
+    def __init__(self, client: api.ApiClient):
+        self._client = client
+        self._api = api.LeaderboardServiceApi(client)
+
+    def get(self, key: str) -> Leaderboard:
+        """Gets an individual leaderboard with a given key from Eval Studio.
+
+        Args:
+            key: The leaderboard resource name to retrieve.
+        """
+        res = self._api.leaderboard_service_get_leaderboard(key)
+        if res and res.leaderboard:
+            return Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
+
+        raise KeyError("Leaderboard not found.")
+
+    def list(self) -> List[Leaderboard]:
+        """Lists all user leaderboards in Eval Studio."""
+        res = self._api.leaderboard_service_list_leaderboards()
+        if res:
+            res_leaderboards = res.leaderboards or []
+            return [
+                Leaderboard._from_api_leaderboard(lb, self._client)
+                for lb in res_leaderboards
+            ]
+
+        return []
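Leaderboard.from_operation replaces the per-class _get_leaderboard_from_operation helpers that are removed from models.py and test_labs.py below, so any long-running import operation can be resolved to a Leaderboard in one place. A sketch under that assumption, with the surrounding request and API objects treated as given:

    from eval_studio_client import leaderboards as l10s

    def leaderboard_from_import(leaderboard_api, client, request):
        # Launch the import and resolve the returned operation to a Leaderboard.
        res = leaderboard_api.leaderboard_service_import_leaderboard(request)
        if res and res.operation:
            return l10s.Leaderboard.from_operation(res.operation, client)
        return None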
eval_studio_client/models.py

@@ -168,7 +168,7 @@ class Model:
         )

         if res and res.operation:
-            return self._get_leaderboard_from_operation(res.operation)
+            return l10s.Leaderboard.from_operation(res.operation, self._client)

         return None

@@ -226,7 +226,7 @@ class Model:
         )

         if res and res.operation:
-            return self._get_dashboard_from_operation(res.operation)
+            return d8s.Dashboard.from_operation(res.operation, self._client)

         return None

@@ -257,7 +257,7 @@ class Model:
         )
         res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
         if res and res.operation:
-            return self._get_leaderboard_from_operation(res.operation)
+            return l10s.Leaderboard.from_operation(res.operation, self._client)

         return None

@@ -273,45 +273,6 @@ class Model:

         raise RuntimeError("Failed to list base models")

-    def _get_leaderboard_from_operation(
-        self, operation: models.V1Operation
-    ) -> Optional[l10s.Leaderboard]:
-        """Retrieves the leaderboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the leaderboard.
-        """
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve leaderboard from operation")
-
-        leadeboard_id = operation.metadata.to_dict().get("leaderboard")
-        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leadeboard_id)
-        if res and res.leaderboard:
-            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
-
-        return None
-
-    def _get_dashboard_from_operation(
-        self, operation: models.V1Operation
-    ) -> Optional[d8s.Dashboard]:
-        """Retrieves the dashboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the dashboard.
-        """
-        if not self._client:
-            raise RuntimeError("Client is not set.")
-
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve dashboard from operation")
-
-        dashboard_id = operation.metadata.to_dict().get("dashboard")
-        res = self._dashboard_api.dashboard_service_get_dashboard(dashboard_id)
-        if res and res.dashboard:
-            return d8s.Dashboard._from_api_dashboard(res.dashboard, self._client)
-
-        return None
-
     @staticmethod
     def _from_api_model(api_model: models.V1Model, client: api.ApiClient) -> "Model":
         """Converts the API model to the client model."""
eval_studio_client/test_labs.py

@@ -7,7 +7,8 @@ from typing import Union
 import uuid

 from eval_studio_client import api
-from eval_studio_client import evaluators
+from eval_studio_client import dashboards
+from eval_studio_client import evaluators as e8s
 from eval_studio_client import leaderboards as l10s
 from eval_studio_client.api import models as apiModels

@@ -92,11 +93,56 @@ class TestLab:
             self._models.append(_m)
         return _m

-    def evaluate(self, evaluator: evaluators.Evaluator) -> Optional[l10s.Leaderboard]:
+    def evaluate(
+        self,
+        evaluators: Union[e8s.Evaluator, List[e8s.Evaluator]],
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+    ) -> Optional[dashboards.Dashboard]:
         """Runs an evaluation for the test lab.

+        Args:
+            evaluators (Union[e8s.Evaluator, List[e8s.Evaluator]]): One or many evaluators
+                used to evaluate the test lab.
+            name (str, optional): Optional name for the evaluation.
+            description (str, optional): Optional description for the evaluation.
+
+        Returns:
+            Dashboard: Evaluation dashboard instance. In case launching of evaluation
+                fails, `None` is returned.
+        """
+        _evaluators = (
+            [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators
+        )
+        name = name or self.name or "Imported Dashboard"
+        description = description or self.description or ""
+        req = apiModels.V1BatchImportLeaderboardRequest(
+            testLabJson=self.json(),
+            evaluators=[e.key for e in _evaluators],
+            model=None,
+            dashboardDisplayName=name,
+            dashboardDescription=description,
+            testDisplayName=f"{name} - Test",
+            testDescription=f"Test suite for {description}",
+        )
+        res = self._leaderboard_api.leaderboard_service_batch_import_leaderboard(req)
+
+        if res and res.operation:
+            return dashboards.Dashboard.from_operation(res.operation, self._client)
+
+        return None
+
+    def create_leaderboard(
+        self, evaluator: e8s.Evaluator
+    ) -> Optional[l10s.Leaderboard]:
+        """Creates a single leaderboard for the test lab.
+
         Args:
             evaluator: The evaluator to use for the evaluation.
+
+        Returns:
+            Leaderboard: Single evaluation leaderboard instance.
+                In case launching of evaluation fails, `None` is returned.
         """
         req = apiModels.V1ImportLeaderboardRequest(
             testLabJson=self.json(),
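TestLab.evaluate now accepts one or many evaluators and returns a Dashboard, while the previous single-leaderboard behaviour moves to create_leaderboard. A usage sketch, assuming lab is an existing TestLab and rag_eval, safety_eval are Evaluator instances obtained elsewhere:

    # Multi-evaluator run: returns a Dashboard (or None if launching fails).
    dashboard = lab.evaluate(
        [rag_eval, safety_eval],
        name="Nightly RAG evaluation",
        description="Accuracy and safety checks",
    )

    # Single-evaluator run: returns a Leaderboard instead of a Dashboard.
    leaderboard = lab.create_leaderboard(rag_eval)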
@@ -109,7 +155,7 @@ class TestLab:
         )
         res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
         if res and res.operation:
-            return self._get_leaderboard_from_operation(res.operation)
+            return l10s.Leaderboard.from_operation(res.operation, self._client)

         return None

@@ -131,24 +177,6 @@ class TestLab:

         return json.dumps(lab, indent=4, sort_keys=True)

-    def _get_leaderboard_from_operation(
-        self, operation: apiModels.V1Operation
-    ) -> Optional[l10s.Leaderboard]:
-        """Retrieves the leaderboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the leaderboard.
-        """
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve leaderboard from operation")
-
-        leadeboard_id = operation.metadata.to_dict().get("leaderboard")
-        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leadeboard_id)
-        if res and res.leaderboard:
-            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
-
-        return None
-
     def _llm_model_names(self) -> List[str]:
         return [m.llm_model_name for m in self.models]

eval_studio_client/tests.py

@@ -118,6 +118,71 @@ class _TestCaseGenerationHandle:
         )


+@dataclasses.dataclass
+class _TestCaseLibraryGetHandle(_TestCaseGenerationHandle):
+
+    @staticmethod
+    def _from_operation(
+        res: (
+            models.V1ImportTestCasesFromLibraryResponse | models.V1GetOperationResponse
+        ),
+    ) -> "_TestCaseLibraryGetHandle":
+        """Converts an API operation to prompt library handle."""
+        op: models.V1Operation | None = res.operation
+        if not op:
+            return _TestCaseLibraryGetHandle(name=None)
+
+        # progress
+        if hasattr(op, "metadata") and op.metadata:
+            meta_dict = op.metadata.to_dict() or {}
+        else:
+            meta_dict = {}
+
+        return _TestCaseLibraryGetHandle(
+            name=op.name,
+            progress=meta_dict.get("progress"),
+            progress_message=meta_dict.get("progressMessage"),
+            error=op.error,
+            done=op.done,
+        )
+
+
+@dataclasses.dataclass
+class TestCaseLibraryItem:
+    """Represents a single test case library item - test suite."""
+
+    key: str
+    name: str
+    description: str
+    test_suite_url: str
+    test_count: int
+    test_case_count: int
+    evaluates: List[str]
+    categories: List[str]
+
+    @staticmethod
+    def _from_api_items(
+        api_items: List[models.V1PromptLibraryItem],
+    ) -> List["TestCaseLibraryItem"]:
+        return (
+            [
+                TestCaseLibraryItem(
+                    key=api_item.name or "",
+                    name=api_item.display_name or "",
+                    description=api_item.description or "",
+                    test_suite_url=api_item.test_suite_url or "",
+                    test_count=api_item.test_count or 0,
+                    test_case_count=api_item.test_case_count or 0,
+                    evaluates=list(api_item.evaluates) if api_item.evaluates else [],
+                    categories=list(api_item.categories) if api_item.categories else [],
+                )
+                for api_item in api_items
+            ]
+            if api_items
+            else []
+        )
+
+
 @dataclasses.dataclass
 class TestCase:
     """Represents a single test case, which contains tested prompt, expected answer
@@ -183,6 +248,7 @@ class Test:
     update_time: Optional[datetime.datetime] = None
     _client: Optional[api.ApiClient] = None
     _gen_tc_op_name: Optional[str] = None
+    _lib_tc_op_name: Optional[str] = None

    def __post_init__(self):
        if self._client:
@@ -267,7 +333,7 @@ class Test:

         Args:
             count (int): Number of test cases to generate (generator may return fewer
-            prompts).
+                prompts).
             model (str): Model to use for generating the prompts.
             base_llm_model (str): Base LLM model to use for generating the prompts.
             generators (List[TestCaseGenerator]): Methods to use for generation.
@@ -342,6 +408,127 @@ class Test:

         raise TimeoutError("Waiting timeout has been reached.")

+    def list_test_suite_library_items(
+        self,
+        filter_by_categories: Optional[List[str]] = None,
+        filter_by_purposes: Optional[List[str]] = None,
+        filter_by_evaluates: Optional[List[str]] = None,
+        filter_by_origin: Optional[str] = None,
+        filter_by_test_case_count: Optional[int] = None,
+        filter_by_test_count: Optional[int] = None,
+        filter_by_fts: Optional[str] = None,
+    ) -> List[TestCaseLibraryItem]:
+        """Retrieves a list of all available items - suites of tests - in the library.
+
+        Args:
+            filter_by_categories (List[str]): List of categories to filter
+                the library items.
+            filter_by_purposes (List[str]): List of purposes to filter
+                the library items.
+            filter_by_evaluates (List[str]): List of evaluates to filter
+                the library items.
+            filter_by_origin (str): Origin to filter the library items.
+            filter_by_test_case_count (int): Test case count to filter
+                the library items.
+            filter_by_test_count (int): Test count to filter the library items.
+            filter_by_fts (str): FTS to filter the library items - phrase to search for.
+
+        Returns:
+            List[TestCaseLibraryItem]: List of library items.
+        """
+        req = models.TestServiceListTestCaseLibraryItemsRequest(
+            filter_by_categories=filter_by_categories,
+            filter_by_purposes=filter_by_purposes,
+            filter_by_evaluates=filter_by_evaluates,
+            filter_by_origin=filter_by_origin,
+            filter_by_test_case_count=filter_by_test_case_count,
+            filter_by_test_count=filter_by_test_count,
+            filter_by_fts=filter_by_fts,
+        )
+
+        res = self._test_api.test_service_list_test_case_library_items(self.key, req)
+        if res and res.prompt_library_items:
+            return TestCaseLibraryItem._from_api_items(res.prompt_library_items)
+
+        return []
+
+    def add_library_test_cases(
+        self, test_suite_url: str, count: int, test_document_urls: Optional[List[str]]
+    ) -> None:
+        """Sample test cases from the test suite library and add them to the test.
+
+        Args:
+            test_suite_url (str): The URL of the library test suite to get TestCases
+                from (sample).
+            count (int): The number of TestCases to get from the library.
+            test_document_urls (List[str]): The list of target Test corpus
+                document URLs to skip when returning library TestCases corpus.
+        """
+        req = models.TestServiceImportTestCasesFromLibraryRequest(
+            test_suite_url=test_suite_url,
+            count=count,
+            test_document_urls=test_document_urls,
+        )

+        res = self._test_api.test_service_import_test_cases_from_library(self.key, req)
+
+        op: models.V1Operation | None = res.operation
+        self._lib_tc_op_name = op.name if op else None
+
+    def wait_for_library_test_case_get(
+        self, timeout: Optional[float] = None, verbose: bool = False
+    ) -> None:
+        """Waits for the library test cases(s) sampling to finish.
+
+        Args:
+            timeout (float): The maximum time to wait in seconds.
+            verbose (bool): If True, prints the status of the handle while waiting.
+        """
+        if not self._lib_tc_op_name:
+            raise ValueError(
+                "There is no ongoing getting of test case(s) from the library - "
+                "the operation name is not set."
+            )
+
+        if verbose:
+            print(
+                f"Waiting for getting library test case(s) operation to finish "
+                f"({self._lib_tc_op_name}):"
+            )
+        if self._client:
+            # exponential backoff
+            wait_time = 1.0
+            wait_coef = 1.6
+            wait_max = 8.0
+            wait_total = 0.0
+            timeout = timeout or float(2 * 24 * 60 * 60)  # 2 days
+            progress_bar = utils.ProgressBar()
+            while wait_total < timeout:
+                handle = _TestCaseLibraryGetHandle._from_operation(
+                    self._operation_api.operation_service_get_operation(
+                        self._lib_tc_op_name
+                    )
+                )
+
+                if verbose:
+                    progress_bar.update(handle.progress or 0, handle.progress_message)
+
+                if handle.done:
+                    if handle.error:
+                        raise RuntimeError(
+                            f"Getting of library test case(s) failed: {handle.error}"
+                        )
+                    return
+
+                wait_time *= wait_coef
+                time.sleep(min(wait_time, wait_max))
+        else:
+            raise ValueError(
+                "Unable to establish a connection to the Eval Studio host."
+            )
+
+        raise TimeoutError("Waiting timeout has been reached.")
+
     def delete(self, force=False):
         """Deletes the test.

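The new Test methods cover browsing the test-suite library, sampling cases from a chosen suite into the test, and waiting for the sampling operation to finish. A sketch, assuming test is an existing Test object; the three method calls are the ones added above:

    items = test.list_test_suite_library_items(filter_by_fts="fairness")
    if items:
        suite = items[0]
        print(suite.name, suite.test_case_count)

        # Sample five cases from the chosen suite, then block until the
        # server-side operation completes (or the timeout is reached).
        test.add_library_test_cases(suite.test_suite_url, count=5, test_document_urls=None)
        test.wait_for_library_test_case_get(timeout=600, verbose=True)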
{eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/METADATA

@@ -1,10 +1,9 @@
 Metadata-Version: 2.3
 Name: eval-studio-client
-Version: 1.0.0
+Version: 1.0.1
 Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
 Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
 Author-email: "H2O.ai" <support@h2o.ai>
-License: MIT
 Classifier: Development Status :: 4 - Beta
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3.9