eval-studio-client 0.8.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ from eval_studio_client import api
  from eval_studio_client import insights as i6s
  from eval_studio_client import leaderboards as l10s
  from eval_studio_client import problems as p6s
+ from eval_studio_client import utils
  from eval_studio_client.api import models


@@ -41,6 +42,7 @@ class Dashboard:
          self._dashboard_api = api.DashboardServiceApi(self._client)
          self._leaderboard_api = api.LeaderboardServiceApi(self._client)
          self._info_api = api.InfoServiceApi(self._client)
+         self._operation_api = api.OperationServiceApi(self._client)

      @property
      def leaderboards(self) -> Optional[List[l10s.Leaderboard]]:
@@ -118,36 +120,56 @@ class Dashboard:
          else:
              raise ValueError("Cannot establish connection to Eval Studio host.")

-     def wait_to_finish(self, timeout: Optional[float] = None):
+     def wait_to_finish(self, timeout: Optional[float] = None, verbose: bool = False):
          """Waits for the dashboard to finish.

          Args:
              timeout: The maximum time to wait in seconds.
+             verbose (bool): If True, prints the status of the evaluation while waiting.
          """
          timeout = timeout or float("inf")
+         progress_bar = utils.ProgressBar()
          if self.finished:
              return

+         if not self._create_operation:
+             # This means that the evaluation has no assigned operation, thus cannot poll.
+             raise RuntimeError("Failed to retrieve running evaluation info.")
+
          if self._client:
              ctr = 0
              while ctr < timeout:
-                 lbs = self.leaderboards
-                 if lbs:
-                     if all(lb.finished for lb in lbs):
-                         return
-
-                 ctr += 1
-                 time.sleep(1)
+                 op = self._operation_api.operation_service_get_operation(
+                     self._create_operation
+                 )
+                 if not op or not op.operation:
+                     raise RuntimeError(
+                         "Failed to retrieve running evaluation progress."
+                     )
+
+                 if verbose:
+                     if not op.operation.metadata:
+                         raise RuntimeError(
+                             "Failed to retrieve running evaluation progress details."
+                         )
+
+                     op_meta = op.operation.metadata.to_dict()
+                     progress = op_meta.get("progress", 0)
+                     progress_msg = op_meta.get("progressMessage", "Running")
+                     progress_bar.update(progress, progress_msg)
+
+                 if op.operation.done:
+                     return
+
+                 ctr += 1
+                 time.sleep(1)
          else:
              raise ValueError("Cannot establish connection to Eval Studio host.")

          raise TimeoutError("Waiting timeout has been reached.")

-     def show(self):
-         """Opens the evaluation in the default web browser.
-
-         NOTE: This functionality is primarily for interactive use in Jupyter notebooks.
-         """
+     def show(self) -> str:
+         """Prints the endpoint URL of the evaluation dashboard."""
          if self._client:
              info_res = self._info_api.info_service_get_info()
              if not info_res or not info_res.info:
@@ -155,11 +177,8 @@ class Dashboard:

              host = info_res.info.base_url
              url = urllib.parse.urljoin(host, self.key)
-
-             # NOTE: Local import is used to avoid problems for users outside Jupyter environment.
-             import webbrowser
-
-             webbrowser.open(url)
+             print(f"Open following url to access evaluation dashboard: \n\n{url}")
+             return url
          else:
              raise ValueError("Cannot establish connection to Eval Studio host.")
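
The hunks above appear to come from eval_studio_client/dashboards.py: Dashboard.wait_to_finish now polls the dashboard's create operation through the new OperationServiceApi and accepts a verbose flag that renders a utils.ProgressBar, while Dashboard.show prints and returns the dashboard URL instead of opening a web browser. A minimal usage sketch under those assumptions; dashboard stands for an already created Dashboard instance obtained elsewhere:

    # Block until the evaluation finishes, rendering a progress bar on stdout.
    dashboard.wait_to_finish(timeout=3600, verbose=True)

    # show() no longer opens a browser; it prints the dashboard URL and returns it.
    url = dashboard.show()
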
@@ -33,11 +33,13 @@ class Leaderboard:
      update_time: Optional[datetime.datetime] = None
      problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
      insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+     existing_collection: Optional[str] = None
      _report: Optional[str] = None
      _leaderboard: Optional[str] = None
      _model_name: Optional[str] = None
      _status: Optional[models.V1LeaderboardStatus] = None
      _client: Optional[api.ApiClient] = None
+     _operation: Optional[str] = None

      def __post_init__(self):
          self._evaluator_api = api.EvaluatorServiceApi(self._client)
@@ -161,6 +163,7 @@ class Leaderboard:
              evaluator=self._evaluator_name,
              tests=self._test_names,
              model=self._model_name,
+             h2ogpte_collection=self.existing_collection or None,
          )

      def _update_result(self, api_leaderboard: models.V1Leaderboard):
@@ -189,12 +192,14 @@ class Leaderboard:
              update_time=api_leaderboard.update_time,
              problems=problems,
              insights=insights,
+             existing_collection=api_leaderboard.h2ogpte_collection or None,
              _evaluator_name=api_leaderboard.evaluator or "",
              _test_names=api_leaderboard.tests or [],
              _report=api_leaderboard.leaderboard_report or "",
              _leaderboard=api_leaderboard.leaderboard_table,
              _status=api_leaderboard.status,
              _client=client,
+             _operation=api_leaderboard.create_operation or None,
          )

      @staticmethod
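
These leaderboard hunks (apparently eval_studio_client/leaderboards.py) thread a new public existing_collection field through the API round trip as h2ogpte_collection and keep the creating operation's name in the private _operation field. A small sketch of reading the new field, assuming a dashboard fetched as above:

    for lb in dashboard.leaderboards or []:
        # Set only when the leaderboard was evaluated against a pre-existing
        # H2OGPTe/Bedrock collection; otherwise None.
        print(lb.name, lb.existing_collection)
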
@@ -22,6 +22,28 @@ DEFAULT_RAG_MODEL_KEY = "models/defaultRAGModel"
  DEFAULT_LLM_MODEL_KEY = "models/defaultLLMModel"


+ @dataclasses.dataclass
+ class CollectionInfo:
+     """Represents the information about a collection in the H2OGPTE
+     or a Knowledge Base in Amazon Bedrock.
+     """
+
+     key: str
+     name: str
+     description: str
+
+     def __str__(self):
+         return f"{self.name} ({self.key})"
+
+     @staticmethod
+     def _from_api_collection_info(api_col: models.V1CollectionInfo) -> "CollectionInfo":
+         return CollectionInfo(
+             key=api_col.id or "",
+             name=api_col.display_name or "",
+             description=api_col.description or "",
+         )
+
+
  @dataclasses.dataclass
  class Model:
      """Represents Eval Studio connection to an external RAG/LLM system.
@@ -80,6 +102,28 @@ class Model:

          return result

+     @property
+     def base_models(self) -> List[str]:
+         """List of base LLM models available to use e.g. for the evaluation."""
+         res = self._model_api.model_service_list_base_models(self.key)
+         if res and res.base_models:
+             return [str(m) for m in res.base_models]
+
+         raise RuntimeError("Failed to list base models")
+
+     @property
+     def collections(self) -> List[CollectionInfo]:
+         """List of collections available for evaluation.
+
+         NOTE: This is currently supported only for H2OGPTe and Amazon Bedrock RAG
+         model hosts.
+         """
+         res = self._model_api.model_service_list_model_collections(self.key)
+         if res and res.collections:
+             return list(res.collections)
+
+         raise RuntimeError("Failed to list model host collections")
+
      def create_leaderboard(
          self,
          name: str,
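
The models.py hunks introduce a module-level CollectionInfo dataclass (key, name, description, with __str__ rendering "name (key)") plus two read-only Model properties, base_models and collections, both of which raise RuntimeError when the underlying listing fails. A usage sketch, assuming model is a connected Model instance; per the docstring, collections is only supported for H2OGPTe and Amazon Bedrock hosts:

    for llm in model.base_models:
        print(llm)   # base LLMs that can be passed as base_models=... below

    for col in model.collections:
        print(col)   # collections that can serve as existing_collection
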
@@ -88,20 +132,27 @@ class Model:
          description: Optional[str] = None,
          base_models: Optional[List[str]] = None,
          use_cache: bool = True,
+         existing_collection: Optional[str] = None,
      ) -> Optional[l10s.Leaderboard]:
          """Runs a new evaluation for the model and creates a new leaderboard.

          Args:
+             name: The name of the leaderboard.
              evaluator: The evaluator to use for the evaluation.
              test_suite: The list of tests used to evaluate the model.
+             description (optional): The description of the leaderboard.
              base_models (optional): The base LLM models to use for the evaluation.
              use_cache (optional): Whether to use the cached answers if available.
+             existing_collection (str): ID or the resource name of the existing
+                 collection, which will be used as a corpus for evaluation.
+                 NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
          """
          lb = l10s.Leaderboard(
              key="",
              name=name,
              description=description or "",
              base_models=base_models or [],
+             existing_collection=existing_collection,
              _model_name=self.key,
              _evaluator_name=evaluator.key,
              _test_names=[t.key for t in test_suite],
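
create_leaderboard gains an existing_collection argument, stored on the Leaderboard and ultimately sent as the h2ogpte_collection API field. A hedged sketch; model, evaluator and test are assumed to exist already and the collection ID is purely illustrative:

    lb = model.create_leaderboard(
        name="RAG accuracy",
        evaluator=evaluator,
        test_suite=[test],
        existing_collection="collections/123",  # H2OGPTe / Amazon Bedrock hosts only
    )
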
@@ -128,6 +179,7 @@ class Model:
          test_suites: Union[tests.Test, List[tests.Test]],
          description: Optional[str] = None,
          base_models: Optional[List[str]] = None,
+         existing_collection: Optional[str] = None,
      ) -> Optional[d8s.Dashboard]:
          """Runs a new evaluation for the model and creates a new dashboard.

@@ -136,6 +188,9 @@ class Model:
              test_suites: The test(s) used to evaluate the model.
              description (optional): The description of the dashboard.
              base_models (optional): The base LLM models to use for the evaluation.
+             existing_collection (str): ID or the resource name of the existing
+                 collection, which will be used as a corpus for evaluation.
+                 NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
          """
          _evaluators = (
              [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators
@@ -151,6 +206,7 @@ class Model:
                  name=f"{name} - {evaluator.name}",
                  description=description or "",
                  base_models=base_models or [],
+                 existing_collection=existing_collection,
                  _model_name=self.key,
                  _evaluator_name=evaluator.key,
                  _test_names=[t.key for t in _test_suites],
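
create_dashboard accepts the same existing_collection argument and forwards it to each per-evaluator leaderboard it creates. A sketch with the same placeholder objects and ID:

    dashboard = model.create_dashboard(
        name="Release check",
        evaluators=[evaluator],
        test_suites=[test],
        existing_collection="collections/123",
    )
    dashboard.wait_to_finish(verbose=True)
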
@@ -11,6 +11,7 @@ from typing import Union
  from eval_studio_client import api
  from eval_studio_client import documents as d7s
  from eval_studio_client import perturbators as p10s
+ from eval_studio_client import utils
  from eval_studio_client.api import models


@@ -85,15 +86,9 @@ class TestCaseGenerator(enum.Enum):


  @dataclasses.dataclass
- class TestCaseGenerationHandle:
+ class _TestCaseGenerationHandle:

      name: Any | None
-     create_time: Optional[datetime.datetime] = None
-     creator: Optional[str] = None
-     update_time: Optional[datetime.datetime] = None
-     updater: Optional[str] = None
-     delete_time: Optional[datetime.datetime] = None
-     deleter: Optional[str] = None
      progress: Optional[float] = None
      progress_message: Optional[str] = None
      error: Optional[models.RpcStatus] = None
@@ -102,11 +97,11 @@ class TestCaseGenerationHandle:
      @staticmethod
      def _from_operation(
          res: models.V1GenerateTestCasesResponse | models.V1GetOperationResponse,
-     ) -> "TestCaseGenerationHandle":
+     ) -> "_TestCaseGenerationHandle":
          """Converts an API operation to prompt generation handle."""
          op: models.V1Operation | None = res.operation
          if not op:
-             return TestCaseGenerationHandle(name=None)
+             return _TestCaseGenerationHandle(name=None)

          # progress
          if hasattr(op, "metadata") and op.metadata:
@@ -114,14 +109,8 @@ class TestCaseGenerationHandle:
          else:
              meta_dict = {}

-         return TestCaseGenerationHandle(
+         return _TestCaseGenerationHandle(
              name=op.name,
-             create_time=op.create_time,
-             creator=op.creator,
-             update_time=op.update_time,
-             updater=op.updater,
-             delete_time=op.delete_time,
-             deleter=op.deleter,
              progress=meta_dict.get("progress"),
              progress_message=meta_dict.get("progressMessage"),
              error=op.error,
@@ -193,6 +182,7 @@ class Test:
      create_time: Optional[datetime.datetime] = None
      update_time: Optional[datetime.datetime] = None
      _client: Optional[api.ApiClient] = None
+     _gen_tc_op_name: Optional[str] = None

      def __post_init__(self):
          if self._client:
@@ -271,7 +261,8 @@ class Test:
          model: Optional[str] = None,
          base_llm_model: Optional[str] = None,
          generators: Optional[List[TestCaseGenerator]] = None,
-     ) -> "TestCaseGenerationHandle":
+         existing_collection: Optional[str] = None,
+     ) -> None:
          """Generates test cases based on the documents of the Test.

          Args:
@@ -280,6 +271,9 @@ class Test:
              model (str): Model to use for generating the prompts.
              base_llm_model (str): Base LLM model to use for generating the prompts.
              generators (List[TestCaseGenerator]): Methods to use for generation.
+             existing_collection (str): ID or the resource name of the existing
+                 collection, from which prompts will be generated.
+                 NOTE: This option works only for the H2OGPTe model host ATM.
          """

          req = models.TestServiceGenerateTestCasesRequest(
@@ -287,61 +281,64 @@ class Test:
              model=model or None,
              base_llm_model=base_llm_model or None,
              generators=[g.to_api_proto() for g in generators] if generators else None,
+             h2ogpte_collection_id=existing_collection or None,
          )

          res = self._test_api.test_service_generate_test_cases(self.key, req)

-         return TestCaseGenerationHandle._from_operation(res)
+         op: models.V1Operation | None = res.operation
+         self._gen_tc_op_name = op.name if op else None

      def wait_for_test_case_generation(
-         self,
-         handle: TestCaseGenerationHandle,
-         timeout: Optional[float] = None,
-         verbose: bool = False,
-     ) -> TestCaseGenerationHandle:
+         self, timeout: Optional[float] = None, verbose: bool = False
+     ) -> None:
          """Waits for the test case generation to finish.

          Args:
-             handle (TestCaseGenerationHandle): Handle of the test case generation.
              timeout (float): The maximum time to wait in seconds.
              verbose (bool): If True, prints the status of the handle while waiting.
          """
-         if not handle.name:
-             raise ValueError("Test case generation handle is not valid.")
-         elif handle.done:
-             return handle
+         if not self._gen_tc_op_name:
+             raise ValueError(
+                 "There is no ongoing test case generation - the operation name is not "
+                 "set."
+             )

          if verbose:
-             print(f"Waiting for test case generation to finish ({handle.name}):")
+             print(
+                 f"Waiting for test case generation to finish ({self._gen_tc_op_name}):"
+             )
          if self._client:
              # exponential backoff
              wait_time = 1.0
              wait_coef = 1.6
-             wait_max = 20.0
+             wait_max = 8.0
              wait_total = 0.0
              timeout = timeout or float(2 * 24 * 60 * 60) # 2 days
+             progress_bar = utils.ProgressBar()
              while wait_total < timeout:
-                 handle = TestCaseGenerationHandle._from_operation(
-                     self._operation_api.operation_service_get_operation(handle.name)
+                 handle = _TestCaseGenerationHandle._from_operation(
+                     self._operation_api.operation_service_get_operation(
+                         self._gen_tc_op_name
+                     )
                  )

                  if verbose:
-                     if handle.progress or handle.progress_message:
-                         progress = (
-                             int(handle.progress * 100.0) if handle.progress else 0
-                         )
-                         msg = f"{progress:>2}% - '{handle.progress_message}'"
-                     else:
-                         msg = " 0% - 'Initializing'"
-                     print(f" {msg}")
+                     progress_bar.update(handle.progress or 0, handle.progress_message)

                  if handle.done:
-                     return handle
+                     if handle.error:
+                         raise RuntimeError(
+                             f"Test case generation failed: {handle.error}"
+                         )
+                     return

                  wait_time *= wait_coef
                  time.sleep(min(wait_time, wait_max))
          else:
-             raise ValueError("Cannot establish connection to Eval Studio host.")
+             raise ValueError(
+                 "Unable to establish a connection to the Eval Studio host."
+             )

          raise TimeoutError("Waiting timeout has been reached.")
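
In tests.py, generate_test_cases no longer returns a handle: it stores the operation name on the Test instance (_gen_tc_op_name), and wait_for_test_case_generation polls that stored operation, raising RuntimeError when the operation finishes with an error. A sketch of the new call sequence, assuming test is an existing Test; arguments not visible in these hunks are omitted and the collection ID is illustrative:

    # Kick off generation; the operation name is kept on the Test instance.
    test.generate_test_cases(existing_collection="collections/123")

    # Poll the stored operation with a progress bar; raises on failure or timeout.
    test.wait_for_test_case_generation(timeout=1800, verbose=True)
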
@@ -0,0 +1,26 @@
+ from typing import Optional
+
+
+ class ProgressBar:
+     def __init__(self):
+         self.progress = 0.0
+         self.progress_message = "Initializing"
+         self._progress_max = 1.0
+
+     def update(self, progress: float, message: Optional[str] = None):
+         try:
+             self.progress = float(str(progress))
+         except ValueError:
+             self.progress = 0.0
+
+         if message:
+             self.progress_message = message or ""
+
+         self.print()
+
+     def print(self):
+         print(" " * len(self.progress_message), end="\r")
+         p_progress = int(self.progress / self._progress_max * 100)
+         p_hashes = p_progress // 5
+         p_msg = f" {p_progress:>3}% |{'#' * p_hashes:<20}| {self.progress_message}"
+         print(p_msg, end="\r")
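
The new eval_studio_client/utils.py adds the ProgressBar used by both waiting helpers: a single-line, carriage-return renderer that expects progress in the 0.0-1.0 range (_progress_max is 1.0). A standalone sketch:

    from eval_studio_client import utils

    bar = utils.ProgressBar()
    bar.update(0.25, "Generating answers")  # renders roughly:  25% |#####               | Generating answers
    bar.update(1.0, "Done")
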
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: eval-studio-client
- Version: 0.8.2
+ Version: 1.0.0
  Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
  Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
  Author-email: "H2O.ai" <support@h2o.ai>
@@ -1,16 +1,17 @@
  eval_studio_client/__about__.py,sha256=7TnXVu0lNAY4UdQ_2iwTlAENGdigMUVBy6UmtWGB6sQ,30
  eval_studio_client/__init__.py,sha256=v8lXY_l4j3lAbIfW21nZFeWZX0sl4nKHbB29h2qYVU8,207
  eval_studio_client/client.py,sha256=khRFtcFNZHAMe1bA7SyvoLOPHVZQ2XJOZ3UB3gX8EKs,3307
- eval_studio_client/dashboards.py,sha256=S35kude0FSn-v0t-H1N6aHhsNhlmIgF3duKR8TUfKes,7331
+ eval_studio_client/dashboards.py,sha256=TBMiO4OvTnWYSVuj2-EBxSdKQtEAb_HXgc9gXtRnu-s,8381
  eval_studio_client/documents.py,sha256=fjsbHnqZnouu0stCf_p15RgoszkY4_gIsbX1hiw7Xv8,3076
  eval_studio_client/evaluators.py,sha256=blJlWMswIGr1u6TQDiiO-fInYVnkBT0Y02J57o8Z094,2100
  eval_studio_client/insights.py,sha256=bhe6XBVJ61-2bcDdNe6HiZsu0sly8LeoYAKo1GkgK08,1199
- eval_studio_client/leaderboards.py,sha256=UZItYErAGRXDsae61iMnHXXjoAUFSPL-HTQ_eQnkIJI,7746
- eval_studio_client/models.py,sha256=4OFASuJF1OvIdVODqUk4Uv70cojIJ9CFz3U1nmPFJwI,19137
+ eval_studio_client/leaderboards.py,sha256=NHko_kuPIXnbBdEDMK1MHQmHJRCHA7_Q1wx4eqBvBF8,8035
+ eval_studio_client/models.py,sha256=nW1Wk6L89iWSjhMVk_sKmxSomKX3b6ANALbwWvbJ7Uk,21346
  eval_studio_client/perturbators.py,sha256=CtcWqEgPGpOcDHvYAQBlNDKnS-ZDBkL7Y_Ygsgpvikw,3133
  eval_studio_client/problems.py,sha256=rdGIfo7AqyxGhWMpbIDX1WXFoQvzKktKAWDKRde5VbY,1515
  eval_studio_client/test_labs.py,sha256=IEY98Ocu7WQcxZN_jy5YthVBoHAgHjgA2T93U7q0eYE,11260
- eval_studio_client/tests.py,sha256=n14-zM2J9oUKgKZQm2xjtg7f8MWxnL2Ov00jQqMP8fw,22512
+ eval_studio_client/tests.py,sha256=_Qu6X4FoocYJ-liClXLQqIR91P7GjWmxpeyDhRl5JXI,22393
+ eval_studio_client/utils.py,sha256=e5bsQVgNHYNSqSOthxlmncerPdgbvWwQaY_C-libuXk,764
  eval_studio_client/api/__init__.py,sha256=Ef5qooH4SLfYUqVBJl79oRKWYnXryDPZV4IXGfvG1Wc,15269
  eval_studio_client/api/api_client.py,sha256=yFQKmCsVhswcTbdGY4lf-61mf8FVm3Kfon8Qhe1sPKw,26431
  eval_studio_client/api/api_response.py,sha256=eMxw1mpmJcoGZ3gs9z6jM4oYoZ10Gjk333s9sKxGv7s,652
@@ -480,6 +481,6 @@ eval_studio_client/api/test/test_v1_update_test_response.py,sha256=pqTwL9SgoOM9k
  eval_studio_client/api/test/test_v1_who_am_i_response.py,sha256=bNbjL5-b-4asyziW6znJhuU2yrzd9RgJa2ZiNw3e6YA,1523
  eval_studio_client/api/test/test_who_am_i_service_api.py,sha256=gYWKFamJMyVne2QaOSPz6WEkxExRuAphMGKf1nFayLU,898
  eval_studio_client/gen/openapiv2/eval_studio.swagger.json,sha256=2jOBBxQ2H2mS9C_nlqoTrTiYMmCLaUFQym6su3fXJ8I,210976
- eval_studio_client-0.8.2.dist-info/METADATA,sha256=hza1__A-Rky7RO8E8KyQmkb-KXvODW1wNLZPAWCJWBk,707
- eval_studio_client-0.8.2.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
- eval_studio_client-0.8.2.dist-info/RECORD,,
+ eval_studio_client-1.0.0.dist-info/METADATA,sha256=l8XLUMIu-W4pHRG8fs1IZek_bGIEiFtDRyPjPGkpQrY,707
+ eval_studio_client-1.0.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ eval_studio_client-1.0.0.dist-info/RECORD,,