eval-studio-client 0.8.2__py3-none-any.whl → 1.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_studio_client/leaderboards.py +3 -0
- eval_studio_client/models.py +56 -0
- eval_studio_client/tests.py +21 -7
- {eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/METADATA +1 -1
- {eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/RECORD +6 -6
- {eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/WHEEL +0 -0
eval_studio_client/leaderboards.py
CHANGED

@@ -33,6 +33,7 @@ class Leaderboard:
     update_time: Optional[datetime.datetime] = None
     problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
     insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+    existing_collection: Optional[str] = None
     _report: Optional[str] = None
     _leaderboard: Optional[str] = None
     _model_name: Optional[str] = None

@@ -161,6 +162,7 @@ class Leaderboard:
             evaluator=self._evaluator_name,
             tests=self._test_names,
             model=self._model_name,
+            h2ogpte_collection=self.existing_collection or None,
         )

     def _update_result(self, api_leaderboard: models.V1Leaderboard):

@@ -189,6 +191,7 @@ class Leaderboard:
             update_time=api_leaderboard.update_time,
             problems=problems,
             insights=insights,
+            existing_collection=api_leaderboard.h2ogpte_collection or None,
             _evaluator_name=api_leaderboard.evaluator or "",
             _test_names=api_leaderboard.tests or [],
             _report=api_leaderboard.leaderboard_report or "",
eval_studio_client/models.py
CHANGED
@@ -22,6 +22,28 @@ DEFAULT_RAG_MODEL_KEY = "models/defaultRAGModel"
 DEFAULT_LLM_MODEL_KEY = "models/defaultLLMModel"


+@dataclasses.dataclass
+class CollectionInfo:
+    """Represents the information about a collection in the H2OGPTE
+    or a Knowledge Base in Amazon Bedrock.
+    """
+
+    key: str
+    name: str
+    description: str
+
+    def __str__(self):
+        return f"{self.name} ({self.key})"
+
+    @staticmethod
+    def _from_api_collection_info(api_col: models.V1CollectionInfo) -> "CollectionInfo":
+        return CollectionInfo(
+            key=api_col.id or "",
+            name=api_col.display_name or "",
+            description=api_col.description or "",
+        )
+
+
 @dataclasses.dataclass
 class Model:
     """Represents Eval Studio connection to an external RAG/LLM system.
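CollectionInfo is a plain value object; its __str__ makes lists of collections readable when printed. A quick sketch (constructed directly here, though the client normally builds instances via _from_api_collection_info):

    info = CollectionInfo(key="abc-123", name="Contracts", description="Signed PDFs")
    print(info)   # Contracts (abc-123)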
@@ -80,6 +102,28 @@ class Model:

         return result

+    @property
+    def base_models(self) -> List[str]:
+        """List of base LLM models available to use e.g. for the evaluation."""
+        res = self._model_api.model_service_list_base_models(self.key)
+        if res and res.base_models:
+            return [str(m) for m in res.base_models]
+
+        raise RuntimeError("Failed to list base models")
+
+    @property
+    def collections(self) -> List[CollectionInfo]:
+        """List of collections available for evaluation.
+
+        NOTE: This is currently supported only for H2OGPTe and Amazon Bedrock RAG
+        model hosts.
+        """
+        res = self._model_api.model_service_list_model_collections(self.key)
+        if res and res.collections:
+            return list(res.collections)
+
+        raise RuntimeError("Failed to list model host collections")
+
     def create_leaderboard(
         self,
         name: str,
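Together, the two new properties let a caller discover the host-side inventory before starting an evaluation. A hedged usage sketch (how `model` is obtained is outside this diff):

    model = ...                       # a models.Model bound to an H2OGPTe or Bedrock host
    print(model.base_models)          # e.g. ["gpt-4o", ...]; raises RuntimeError on failure
    for col in model.collections:     # CollectionInfo values
        print(col)                    # rendered as "name (key)"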
@@ -88,20 +132,27 @@ class Model:
         description: Optional[str] = None,
         base_models: Optional[List[str]] = None,
         use_cache: bool = True,
+        existing_collection: Optional[str] = None,
     ) -> Optional[l10s.Leaderboard]:
         """Runs a new evaluation for the model and creates a new leaderboard.

         Args:
+            name: The name of the leaderboard.
             evaluator: The evaluator to use for the evaluation.
             test_suite: The list of tests used to evaluate the model.
+            description (optional): The description of the leaderboard.
             base_models (optional): The base LLM models to use for the evaluation.
             use_cache (optional): Whether to use the cached answers if available.
+            existing_collection (str): ID or the resource name of the existing
+                collection, which will be used as a corpus for evaluation.
+                NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
         """
         lb = l10s.Leaderboard(
             key="",
             name=name,
             description=description or "",
             base_models=base_models or [],
+            existing_collection=existing_collection,
             _model_name=self.key,
             _evaluator_name=evaluator.key,
             _test_names=[t.key for t in test_suite],
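In practice the new parameter points the evaluation at a pre-existing corpus instead of uploaded documents. A sketch, with the evaluator and test objects as placeholders rather than names from this diff:

    lb = model.create_leaderboard(
        name="contracts-eval",
        evaluator=my_evaluator,           # an evaluators.Evaluator
        test_suite=my_tests,              # a list of tests.Test
        existing_collection="abc-123",    # ID or resource name; H2OGPTe/Bedrock hosts only
    )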
@@ -128,6 +179,7 @@ class Model:
         test_suites: Union[tests.Test, List[tests.Test]],
         description: Optional[str] = None,
         base_models: Optional[List[str]] = None,
+        existing_collection: Optional[str] = None,
     ) -> Optional[d8s.Dashboard]:
         """Runs a new evaluation for the model and creates a new dashboard.

@@ -136,6 +188,9 @@ class Model:
             test_suites: The test(s) used to evaluate the model.
             description (optional): The description of the dashboard.
             base_models (optional): The base LLM models to use for the evaluation.
+            existing_collection (str): ID or the resource name of the existing
+                collection, which will be used as a corpus for evaluation.
+                NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
         """
         _evaluators = (
             [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators

@@ -151,6 +206,7 @@ class Model:
                 name=f"{name} - {evaluator.name}",
                 description=description or "",
                 base_models=base_models or [],
+                existing_collection=existing_collection,
                 _model_name=self.key,
                 _evaluator_name=evaluator.key,
                 _test_names=[t.key for t in _test_suites],
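create_dashboard threads the same value into every per-evaluator leaderboard it fans out (the existing_collection=existing_collection line above). Sketch, again with placeholder objects:

    dash = model.create_dashboard(
        name="q3-review",
        evaluators=[accuracy_eval, toxicity_eval],   # one leaderboard per evaluator
        test_suites=my_tests,
        existing_collection="abc-123",               # shared corpus for all leaderboards
    )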
eval_studio_client/tests.py
CHANGED
@@ -271,6 +271,7 @@ class Test:
         model: Optional[str] = None,
         base_llm_model: Optional[str] = None,
         generators: Optional[List[TestCaseGenerator]] = None,
+        existing_collection: Optional[str] = None,
     ) -> "TestCaseGenerationHandle":
         """Generates test cases based on the documents of the Test.

@@ -280,6 +281,9 @@ class Test:
             model (str): Model to use for generating the prompts.
             base_llm_model (str): Base LLM model to use for generating the prompts.
             generators (List[TestCaseGenerator]): Methods to use for generation.
+            existing_collection (str): ID or the resource name of the existing
+                collection, from which prompts will be generated.
+                NOTE: This option works only for the H2OGPTe model host ATM.
         """

         req = models.TestServiceGenerateTestCasesRequest(

@@ -287,6 +291,7 @@ class Test:
             model=model or None,
             base_llm_model=base_llm_model or None,
             generators=[g.to_api_proto() for g in generators] if generators else None,
+            h2ogpte_collection_id=existing_collection or None,
         )

         res = self._test_api.test_service_generate_test_cases(self.key, req)
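On the wire the parameter becomes h2ogpte_collection_id, so test-case prompts are mined from the named collection rather than from the Test's own documents. A sketch (the Test instance and the model resource name are placeholders):

    handle = my_test.generate_test_cases(
        model="models/my-h2ogpte-model",    # placeholder resource name
        existing_collection="abc-123",      # H2OGPTe only, per the docstring note
    )
    # handle is a TestCaseGenerationHandle; the loop in the next hunk polls it to completion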
@@ -317,23 +322,32 @@ class Test:
         # exponential backoff
         wait_time = 1.0
         wait_coef = 1.6
-        wait_max =
+        wait_max = 8.0
         wait_total = 0.0
         timeout = timeout or float(2 * 24 * 60 * 60)  # 2 days
+        # progress
+        p_max = 1.0
+        p_msg = ""
         while wait_total < timeout:
             handle = TestCaseGenerationHandle._from_operation(
                 self._operation_api.operation_service_get_operation(handle.name)
             )

             if verbose:
+                print(" " * len(p_msg), end="\r")
                 if handle.progress or handle.progress_message:
-
-
-
-
+                    try:
+                        h_progress = float(str(handle.progress))
+                    except ValueError:
+                        h_progress = 0.0
+                    h_msg = handle.progress_message or "Processing"
                 else:
-
-
+                    h_progress = 0.0
+                    h_msg = "Initializing"
+                p_progress = int(h_progress / p_max * 100)
+                p_hashes = p_progress // 5
+                p_msg = f" {p_progress:>3}% |{'#' * p_hashes:<20}| {h_msg}"
+                print(p_msg, end="\r")

             if handle.done:
                 return handle
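The rewritten polling loop adds a textual progress bar: progress arrives as a 0.0-1.0 value, p_max scales it to a percentage, every 5% earns one '#' in a fixed 20-character gutter, and the previous render is blanked with spaces because a bare carriage return does not erase leftover characters. The same math as a standalone sketch:

    for h_progress in (0.0, 0.42, 1.0):
        p_progress = int(h_progress / 1.0 * 100)   # p_max is 1.0 in the diff above
        p_hashes = p_progress // 5                 # 0..20 hash marks
        print(f" {p_progress:>3}% |{'#' * p_hashes:<20}| Processing")
    #    0% |                    | Processing
    #   42% |########            | Processing
    #  100% |####################| Processing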
{eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-studio-client
-Version: 0.8.2
+Version: 1.0.0a1
 Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
 Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
 Author-email: "H2O.ai" <support@h2o.ai>
{eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/RECORD
CHANGED

@@ -5,12 +5,12 @@ eval_studio_client/dashboards.py,sha256=S35kude0FSn-v0t-H1N6aHhsNhlmIgF3duKR8TUf
 eval_studio_client/documents.py,sha256=fjsbHnqZnouu0stCf_p15RgoszkY4_gIsbX1hiw7Xv8,3076
 eval_studio_client/evaluators.py,sha256=blJlWMswIGr1u6TQDiiO-fInYVnkBT0Y02J57o8Z094,2100
 eval_studio_client/insights.py,sha256=bhe6XBVJ61-2bcDdNe6HiZsu0sly8LeoYAKo1GkgK08,1199
-eval_studio_client/leaderboards.py,sha256=
-eval_studio_client/models.py,sha256=
+eval_studio_client/leaderboards.py,sha256=5S4cJVS8bX_KoRcT_75eXxrDY-xdfkQdehwGgIgIBfU,7933
+eval_studio_client/models.py,sha256=nW1Wk6L89iWSjhMVk_sKmxSomKX3b6ANALbwWvbJ7Uk,21346
 eval_studio_client/perturbators.py,sha256=CtcWqEgPGpOcDHvYAQBlNDKnS-ZDBkL7Y_Ygsgpvikw,3133
 eval_studio_client/problems.py,sha256=rdGIfo7AqyxGhWMpbIDX1WXFoQvzKktKAWDKRde5VbY,1515
 eval_studio_client/test_labs.py,sha256=IEY98Ocu7WQcxZN_jy5YthVBoHAgHjgA2T93U7q0eYE,11260
-eval_studio_client/tests.py,sha256=
+eval_studio_client/tests.py,sha256=xMKI3OC-dRHlss484gkuLWcF-XFuLZxx7-XMIuNmAxU,23236
 eval_studio_client/api/__init__.py,sha256=Ef5qooH4SLfYUqVBJl79oRKWYnXryDPZV4IXGfvG1Wc,15269
 eval_studio_client/api/api_client.py,sha256=yFQKmCsVhswcTbdGY4lf-61mf8FVm3Kfon8Qhe1sPKw,26431
 eval_studio_client/api/api_response.py,sha256=eMxw1mpmJcoGZ3gs9z6jM4oYoZ10Gjk333s9sKxGv7s,652

@@ -480,6 +480,6 @@ eval_studio_client/api/test/test_v1_update_test_response.py,sha256=pqTwL9SgoOM9k
 eval_studio_client/api/test/test_v1_who_am_i_response.py,sha256=bNbjL5-b-4asyziW6znJhuU2yrzd9RgJa2ZiNw3e6YA,1523
 eval_studio_client/api/test/test_who_am_i_service_api.py,sha256=gYWKFamJMyVne2QaOSPz6WEkxExRuAphMGKf1nFayLU,898
 eval_studio_client/gen/openapiv2/eval_studio.swagger.json,sha256=2jOBBxQ2H2mS9C_nlqoTrTiYMmCLaUFQym6su3fXJ8I,210976
-eval_studio_client-0.8.2.dist-info/METADATA,sha256=
-eval_studio_client-0.8.2.dist-info/WHEEL,sha256=
-eval_studio_client-0.8.2.dist-info/RECORD,,
+eval_studio_client-1.0.0a1.dist-info/METADATA,sha256=rX1UrncVa_ayrO30V9oeNhTjqV1EWNyBFOvL2q8YJ9c,709
+eval_studio_client-1.0.0a1.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+eval_studio_client-1.0.0a1.dist-info/RECORD,,
{eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/WHEEL

File without changes