eval-studio-client 0.8.2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ from eval_studio_client import api
  from eval_studio_client import insights as i6s
  from eval_studio_client import leaderboards as l10s
  from eval_studio_client import problems as p6s
+ from eval_studio_client import utils
  from eval_studio_client.api import models


@@ -41,6 +42,7 @@ class Dashboard:
          self._dashboard_api = api.DashboardServiceApi(self._client)
          self._leaderboard_api = api.LeaderboardServiceApi(self._client)
          self._info_api = api.InfoServiceApi(self._client)
+         self._operation_api = api.OperationServiceApi(self._client)

      @property
      def leaderboards(self) -> Optional[List[l10s.Leaderboard]]:
@@ -118,36 +120,56 @@ class Dashboard:
          else:
              raise ValueError("Cannot establish connection to Eval Studio host.")

-     def wait_to_finish(self, timeout: Optional[float] = None):
+     def wait_to_finish(self, timeout: Optional[float] = None, verbose: bool = False):
          """Waits for the dashboard to finish.

          Args:
              timeout: The maximum time to wait in seconds.
+             verbose (bool): If True, prints the status of the evaluation while waiting.
          """
          timeout = timeout or float("inf")
+         progress_bar = utils.ProgressBar()
          if self.finished:
              return

+         if not self._create_operation:
+             # This means that the evaluation has no assigned operation, thus cannot poll.
+             raise RuntimeError("Failed to retrieve running evaluation info.")
+
          if self._client:
              ctr = 0
              while ctr < timeout:
-                 lbs = self.leaderboards
-                 if lbs:
-                     if all(lb.finished for lb in lbs):
-                         return
-
-                 ctr += 1
-                 time.sleep(1)
+                 op = self._operation_api.operation_service_get_operation(
+                     self._create_operation
+                 )
+                 if not op or not op.operation:
+                     raise RuntimeError(
+                         "Failed to retrieve running evaluation progress."
+                     )
+
+                 if verbose:
+                     if not op.operation.metadata:
+                         raise RuntimeError(
+                             "Failed to retrieve running evaluation progress details."
+                         )
+
+                     op_meta = op.operation.metadata.to_dict()
+                     progress = op_meta.get("progress", 0)
+                     progress_msg = op_meta.get("progressMessage", "Running")
+                     progress_bar.update(progress, progress_msg)
+
+                 if op.operation.done:
+                     return
+
+                 ctr += 1
+                 time.sleep(1)
          else:
              raise ValueError("Cannot establish connection to Eval Studio host.")

          raise TimeoutError("Waiting timeout has been reached.")

-     def show(self):
-         """Opens the evaluation in the default web browser.
-
-         NOTE: This functionality is primarily for interactive use in Jupyter notebooks.
-         """
+     def show(self) -> str:
+         """Prints the endpoint URL of the evaluation dashboard."""
          if self._client:
              info_res = self._info_api.info_service_get_info()
              if not info_res or not info_res.info:
@@ -155,11 +177,8 @@ class Dashboard:

              host = info_res.info.base_url
              url = urllib.parse.urljoin(host, self.key)
-
-             # NOTE: Local import is used to avoid problems for users outside Jupyter environment.
-             import webbrowser
-
-             webbrowser.open(url)
+             print(f"Open following url to access evaluation dashboard: \n\n{url}")
+             return url
          else:
              raise ValueError("Cannot establish connection to Eval Studio host.")
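
The hunks above appear to come from eval_studio_client/dashboards.py: Dashboard.wait_to_finish now polls the dashboard's create operation through the new OperationServiceApi and accepts a verbose flag that renders a utils.ProgressBar, while Dashboard.show prints and returns the dashboard URL instead of opening a web browser. A minimal usage sketch under those assumptions; dashboard stands for an already created Dashboard instance obtained elsewhere:

    # Block until the evaluation finishes, rendering a progress bar on stdout.
    dashboard.wait_to_finish(timeout=3600, verbose=True)

    # show() no longer opens a browser; it prints the dashboard URL and returns it.
    url = dashboard.show()
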
@@ -33,11 +33,13 @@ class Leaderboard:
      update_time: Optional[datetime.datetime] = None
      problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
      insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+     existing_collection: Optional[str] = None
      _report: Optional[str] = None
      _leaderboard: Optional[str] = None
      _model_name: Optional[str] = None
      _status: Optional[models.V1LeaderboardStatus] = None
      _client: Optional[api.ApiClient] = None
+     _operation: Optional[str] = None

      def __post_init__(self):
          self._evaluator_api = api.EvaluatorServiceApi(self._client)
@@ -161,6 +163,7 @@ class Leaderboard:
              evaluator=self._evaluator_name,
              tests=self._test_names,
              model=self._model_name,
+             h2ogpte_collection=self.existing_collection or None,
          )

      def _update_result(self, api_leaderboard: models.V1Leaderboard):
@@ -189,12 +192,14 @@ class Leaderboard:
              update_time=api_leaderboard.update_time,
              problems=problems,
              insights=insights,
+             existing_collection=api_leaderboard.h2ogpte_collection or None,
              _evaluator_name=api_leaderboard.evaluator or "",
              _test_names=api_leaderboard.tests or [],
              _report=api_leaderboard.leaderboard_report or "",
              _leaderboard=api_leaderboard.leaderboard_table,
              _status=api_leaderboard.status,
              _client=client,
+             _operation=api_leaderboard.create_operation or None,
          )

      @staticmethod
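
These leaderboard hunks (apparently eval_studio_client/leaderboards.py) thread a new public existing_collection field through the API round trip as h2ogpte_collection and keep the creating operation's name in the private _operation field. A small sketch of reading the new field, assuming a dashboard fetched as above:

    for lb in dashboard.leaderboards or []:
        # Set only when the leaderboard was evaluated against a pre-existing
        # H2OGPTe/Bedrock collection; otherwise None.
        print(lb.name, lb.existing_collection)
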
@@ -22,6 +22,28 @@ DEFAULT_RAG_MODEL_KEY = "models/defaultRAGModel"
  DEFAULT_LLM_MODEL_KEY = "models/defaultLLMModel"


+ @dataclasses.dataclass
+ class CollectionInfo:
+     """Represents the information about a collection in the H2OGPTE
+     or a Knowledge Base in Amazon Bedrock.
+     """
+
+     key: str
+     name: str
+     description: str
+
+     def __str__(self):
+         return f"{self.name} ({self.key})"
+
+     @staticmethod
+     def _from_api_collection_info(api_col: models.V1CollectionInfo) -> "CollectionInfo":
+         return CollectionInfo(
+             key=api_col.id or "",
+             name=api_col.display_name or "",
+             description=api_col.description or "",
+         )
+
+
  @dataclasses.dataclass
  class Model:
      """Represents Eval Studio connection to an external RAG/LLM system.
@@ -80,6 +102,28 @@ class Model:

          return result

+     @property
+     def base_models(self) -> List[str]:
+         """List of base LLM models available to use e.g. for the evaluation."""
+         res = self._model_api.model_service_list_base_models(self.key)
+         if res and res.base_models:
+             return [str(m) for m in res.base_models]
+
+         raise RuntimeError("Failed to list base models")
+
+     @property
+     def collections(self) -> List[CollectionInfo]:
+         """List of collections available for evaluation.
+
+         NOTE: This is currently supported only for H2OGPTe and Amazon Bedrock RAG
+         model hosts.
+         """
+         res = self._model_api.model_service_list_model_collections(self.key)
+         if res and res.collections:
+             return list(res.collections)
+
+         raise RuntimeError("Failed to list model host collections")
+
      def create_leaderboard(
          self,
          name: str,
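
The models.py hunks introduce a module-level CollectionInfo dataclass (key, name, description, with __str__ rendering "name (key)") plus two read-only Model properties, base_models and collections, both of which raise RuntimeError when the underlying listing fails. A usage sketch, assuming model is a connected Model instance; per the docstring, collections is only supported for H2OGPTe and Amazon Bedrock hosts:

    for llm in model.base_models:
        print(llm)   # base LLMs that can be passed as base_models=... below

    for col in model.collections:
        print(col)   # collections that can serve as existing_collection
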
@@ -88,20 +132,27 @@ class Model:
          description: Optional[str] = None,
          base_models: Optional[List[str]] = None,
          use_cache: bool = True,
+         existing_collection: Optional[str] = None,
      ) -> Optional[l10s.Leaderboard]:
          """Runs a new evaluation for the model and creates a new leaderboard.

          Args:
+             name: The name of the leaderboard.
              evaluator: The evaluator to use for the evaluation.
              test_suite: The list of tests used to evaluate the model.
+             description (optional): The description of the leaderboard.
              base_models (optional): The base LLM models to use for the evaluation.
              use_cache (optional): Whether to use the cached answers if available.
+             existing_collection (str): ID or the resource name of the existing
+                 collection, which will be used as a corpus for evaluation.
+                 NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
          """
          lb = l10s.Leaderboard(
              key="",
              name=name,
              description=description or "",
              base_models=base_models or [],
+             existing_collection=existing_collection,
              _model_name=self.key,
              _evaluator_name=evaluator.key,
              _test_names=[t.key for t in test_suite],
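
create_leaderboard gains an existing_collection argument, stored on the Leaderboard and ultimately sent as the h2ogpte_collection API field. A hedged sketch; model, evaluator and test are assumed to exist already and the collection ID is purely illustrative:

    lb = model.create_leaderboard(
        name="RAG accuracy",
        evaluator=evaluator,
        test_suite=[test],
        existing_collection="collections/123",  # H2OGPTe / Amazon Bedrock hosts only
    )
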
@@ -128,6 +179,7 @@ class Model:
          test_suites: Union[tests.Test, List[tests.Test]],
          description: Optional[str] = None,
          base_models: Optional[List[str]] = None,
+         existing_collection: Optional[str] = None,
      ) -> Optional[d8s.Dashboard]:
          """Runs a new evaluation for the model and creates a new dashboard.

@@ -136,6 +188,9 @@ class Model:
              test_suites: The test(s) used to evaluate the model.
              description (optional): The description of the dashboard.
              base_models (optional): The base LLM models to use for the evaluation.
+             existing_collection (str): ID or the resource name of the existing
+                 collection, which will be used as a corpus for evaluation.
+                 NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
          """
          _evaluators = (
              [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators
@@ -151,6 +206,7 @@ class Model:
                  name=f"{name} - {evaluator.name}",
                  description=description or "",
                  base_models=base_models or [],
+                 existing_collection=existing_collection,
                  _model_name=self.key,
                  _evaluator_name=evaluator.key,
                  _test_names=[t.key for t in _test_suites],
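
create_dashboard accepts the same existing_collection argument and forwards it to each per-evaluator leaderboard it creates. A sketch with the same placeholder objects and ID:

    dashboard = model.create_dashboard(
        name="Release check",
        evaluators=[evaluator],
        test_suites=[test],
        existing_collection="collections/123",
    )
    dashboard.wait_to_finish(verbose=True)
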
@@ -11,6 +11,7 @@ from typing import Union
  from eval_studio_client import api
  from eval_studio_client import documents as d7s
  from eval_studio_client import perturbators as p10s
+ from eval_studio_client import utils
  from eval_studio_client.api import models


@@ -85,15 +86,9 @@ class TestCaseGenerator(enum.Enum):


  @dataclasses.dataclass
- class TestCaseGenerationHandle:
+ class _TestCaseGenerationHandle:

      name: Any | None
-     create_time: Optional[datetime.datetime] = None
-     creator: Optional[str] = None
-     update_time: Optional[datetime.datetime] = None
-     updater: Optional[str] = None
-     delete_time: Optional[datetime.datetime] = None
-     deleter: Optional[str] = None
      progress: Optional[float] = None
      progress_message: Optional[str] = None
      error: Optional[models.RpcStatus] = None
@@ -102,11 +97,11 @@ class TestCaseGenerationHandle:
      @staticmethod
      def _from_operation(
          res: models.V1GenerateTestCasesResponse | models.V1GetOperationResponse,
-     ) -> "TestCaseGenerationHandle":
+     ) -> "_TestCaseGenerationHandle":
          """Converts an API operation to prompt generation handle."""
          op: models.V1Operation | None = res.operation
          if not op:
-             return TestCaseGenerationHandle(name=None)
+             return _TestCaseGenerationHandle(name=None)

          # progress
          if hasattr(op, "metadata") and op.metadata:
@@ -114,14 +109,8 @@ class TestCaseGenerationHandle:
          else:
              meta_dict = {}

-         return TestCaseGenerationHandle(
+         return _TestCaseGenerationHandle(
              name=op.name,
-             create_time=op.create_time,
-             creator=op.creator,
-             update_time=op.update_time,
-             updater=op.updater,
-             delete_time=op.delete_time,
-             deleter=op.deleter,
              progress=meta_dict.get("progress"),
              progress_message=meta_dict.get("progressMessage"),
              error=op.error,
@@ -193,6 +182,7 @@ class Test:
      create_time: Optional[datetime.datetime] = None
      update_time: Optional[datetime.datetime] = None
      _client: Optional[api.ApiClient] = None
+     _gen_tc_op_name: Optional[str] = None

      def __post_init__(self):
          if self._client:
@@ -271,7 +261,8 @@ class Test:
          model: Optional[str] = None,
          base_llm_model: Optional[str] = None,
          generators: Optional[List[TestCaseGenerator]] = None,
-     ) -> "TestCaseGenerationHandle":
+         existing_collection: Optional[str] = None,
+     ) -> None:
          """Generates test cases based on the documents of the Test.

          Args:
@@ -280,6 +271,9 @@ class Test:
              model (str): Model to use for generating the prompts.
              base_llm_model (str): Base LLM model to use for generating the prompts.
              generators (List[TestCaseGenerator]): Methods to use for generation.
+             existing_collection (str): ID or the resource name of the existing
+                 collection, from which prompts will be generated.
+                 NOTE: This option works only for the H2OGPTe model host ATM.
          """

          req = models.TestServiceGenerateTestCasesRequest(
@@ -287,61 +281,64 @@ class Test:
              model=model or None,
              base_llm_model=base_llm_model or None,
              generators=[g.to_api_proto() for g in generators] if generators else None,
+             h2ogpte_collection_id=existing_collection or None,
          )

          res = self._test_api.test_service_generate_test_cases(self.key, req)

-         return TestCaseGenerationHandle._from_operation(res)
+         op: models.V1Operation | None = res.operation
+         self._gen_tc_op_name = op.name if op else None

      def wait_for_test_case_generation(
-         self,
-         handle: TestCaseGenerationHandle,
-         timeout: Optional[float] = None,
-         verbose: bool = False,
-     ) -> TestCaseGenerationHandle:
+         self, timeout: Optional[float] = None, verbose: bool = False
+     ) -> None:
          """Waits for the test case generation to finish.

          Args:
-             handle (TestCaseGenerationHandle): Handle of the test case generation.
              timeout (float): The maximum time to wait in seconds.
              verbose (bool): If True, prints the status of the handle while waiting.
          """
-         if not handle.name:
-             raise ValueError("Test case generation handle is not valid.")
-         elif handle.done:
-             return handle
+         if not self._gen_tc_op_name:
+             raise ValueError(
+                 "There is no ongoing test case generation - the operation name is not "
+                 "set."
+             )

          if verbose:
-             print(f"Waiting for test case generation to finish ({handle.name}):")
+             print(
+                 f"Waiting for test case generation to finish ({self._gen_tc_op_name}):"
+             )
          if self._client:
              # exponential backoff
              wait_time = 1.0
              wait_coef = 1.6
-             wait_max = 20.0
+             wait_max = 8.0
              wait_total = 0.0
              timeout = timeout or float(2 * 24 * 60 * 60) # 2 days
+             progress_bar = utils.ProgressBar()
              while wait_total < timeout:
-                 handle = TestCaseGenerationHandle._from_operation(
-                     self._operation_api.operation_service_get_operation(handle.name)
+                 handle = _TestCaseGenerationHandle._from_operation(
+                     self._operation_api.operation_service_get_operation(
+                         self._gen_tc_op_name
+                     )
                  )

                  if verbose:
-                     if handle.progress or handle.progress_message:
-                         progress = (
-                             int(handle.progress * 100.0) if handle.progress else 0
-                         )
-                         msg = f"{progress:>2}% - '{handle.progress_message}'"
-                     else:
-                         msg = " 0% - 'Initializing'"
-                     print(f" {msg}")
+                     progress_bar.update(handle.progress or 0, handle.progress_message)

                  if handle.done:
-                     return handle
+                     if handle.error:
+                         raise RuntimeError(
+                             f"Test case generation failed: {handle.error}"
+                         )
+                     return

                  wait_time *= wait_coef
                  time.sleep(min(wait_time, wait_max))
          else:
-             raise ValueError("Cannot establish connection to Eval Studio host.")
+             raise ValueError(
+                 "Unable to establish a connection to the Eval Studio host."
+             )

          raise TimeoutError("Waiting timeout has been reached.")
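
In tests.py, generate_test_cases no longer returns a handle: it stores the operation name on the Test instance (_gen_tc_op_name), and wait_for_test_case_generation polls that stored operation, raising RuntimeError when the operation finishes with an error. A sketch of the new call sequence, assuming test is an existing Test; arguments not visible in these hunks are omitted and the collection ID is illustrative:

    # Kick off generation; the operation name is kept on the Test instance.
    test.generate_test_cases(existing_collection="collections/123")

    # Poll the stored operation with a progress bar; raises on failure or timeout.
    test.wait_for_test_case_generation(timeout=1800, verbose=True)
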
@@ -0,0 +1,26 @@
+ from typing import Optional
+
+
+ class ProgressBar:
+     def __init__(self):
+         self.progress = 0.0
+         self.progress_message = "Initializing"
+         self._progress_max = 1.0
+
+     def update(self, progress: float, message: Optional[str] = None):
+         try:
+             self.progress = float(str(progress))
+         except ValueError:
+             self.progress = 0.0
+
+         if message:
+             self.progress_message = message or ""
+
+         self.print()
+
+     def print(self):
+         print(" " * len(self.progress_message), end="\r")
+         p_progress = int(self.progress / self._progress_max * 100)
+         p_hashes = p_progress // 5
+         p_msg = f" {p_progress:>3}% |{'#' * p_hashes:<20}| {self.progress_message}"
+         print(p_msg, end="\r")
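
The new eval_studio_client/utils.py adds the ProgressBar used by both waiting helpers: a single-line, carriage-return renderer that expects progress in the 0.0-1.0 range (_progress_max is 1.0). A standalone sketch:

    from eval_studio_client import utils

    bar = utils.ProgressBar()
    bar.update(0.25, "Generating answers")  # renders roughly:  25% |#####               | Generating answers
    bar.update(1.0, "Done")
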
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: eval-studio-client
- Version: 0.8.2
+ Version: 1.0.0
  Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
  Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
  Author-email: "H2O.ai" <support@h2o.ai>
@@ -1,16 +1,17 @@
  eval_studio_client/__about__.py,sha256=7TnXVu0lNAY4UdQ_2iwTlAENGdigMUVBy6UmtWGB6sQ,30
  eval_studio_client/__init__.py,sha256=v8lXY_l4j3lAbIfW21nZFeWZX0sl4nKHbB29h2qYVU8,207
  eval_studio_client/client.py,sha256=khRFtcFNZHAMe1bA7SyvoLOPHVZQ2XJOZ3UB3gX8EKs,3307
- eval_studio_client/dashboards.py,sha256=S35kude0FSn-v0t-H1N6aHhsNhlmIgF3duKR8TUfKes,7331
+ eval_studio_client/dashboards.py,sha256=TBMiO4OvTnWYSVuj2-EBxSdKQtEAb_HXgc9gXtRnu-s,8381
  eval_studio_client/documents.py,sha256=fjsbHnqZnouu0stCf_p15RgoszkY4_gIsbX1hiw7Xv8,3076
  eval_studio_client/evaluators.py,sha256=blJlWMswIGr1u6TQDiiO-fInYVnkBT0Y02J57o8Z094,2100
  eval_studio_client/insights.py,sha256=bhe6XBVJ61-2bcDdNe6HiZsu0sly8LeoYAKo1GkgK08,1199
- eval_studio_client/leaderboards.py,sha256=UZItYErAGRXDsae61iMnHXXjoAUFSPL-HTQ_eQnkIJI,7746
- eval_studio_client/models.py,sha256=4OFASuJF1OvIdVODqUk4Uv70cojIJ9CFz3U1nmPFJwI,19137
+ eval_studio_client/leaderboards.py,sha256=NHko_kuPIXnbBdEDMK1MHQmHJRCHA7_Q1wx4eqBvBF8,8035
+ eval_studio_client/models.py,sha256=nW1Wk6L89iWSjhMVk_sKmxSomKX3b6ANALbwWvbJ7Uk,21346
  eval_studio_client/perturbators.py,sha256=CtcWqEgPGpOcDHvYAQBlNDKnS-ZDBkL7Y_Ygsgpvikw,3133
  eval_studio_client/problems.py,sha256=rdGIfo7AqyxGhWMpbIDX1WXFoQvzKktKAWDKRde5VbY,1515
  eval_studio_client/test_labs.py,sha256=IEY98Ocu7WQcxZN_jy5YthVBoHAgHjgA2T93U7q0eYE,11260
- eval_studio_client/tests.py,sha256=n14-zM2J9oUKgKZQm2xjtg7f8MWxnL2Ov00jQqMP8fw,22512
+ eval_studio_client/tests.py,sha256=_Qu6X4FoocYJ-liClXLQqIR91P7GjWmxpeyDhRl5JXI,22393
+ eval_studio_client/utils.py,sha256=e5bsQVgNHYNSqSOthxlmncerPdgbvWwQaY_C-libuXk,764
  eval_studio_client/api/__init__.py,sha256=Ef5qooH4SLfYUqVBJl79oRKWYnXryDPZV4IXGfvG1Wc,15269
  eval_studio_client/api/api_client.py,sha256=yFQKmCsVhswcTbdGY4lf-61mf8FVm3Kfon8Qhe1sPKw,26431
  eval_studio_client/api/api_response.py,sha256=eMxw1mpmJcoGZ3gs9z6jM4oYoZ10Gjk333s9sKxGv7s,652
@@ -480,6 +481,6 @@ eval_studio_client/api/test/test_v1_update_test_response.py,sha256=pqTwL9SgoOM9k
  eval_studio_client/api/test/test_v1_who_am_i_response.py,sha256=bNbjL5-b-4asyziW6znJhuU2yrzd9RgJa2ZiNw3e6YA,1523
  eval_studio_client/api/test/test_who_am_i_service_api.py,sha256=gYWKFamJMyVne2QaOSPz6WEkxExRuAphMGKf1nFayLU,898
  eval_studio_client/gen/openapiv2/eval_studio.swagger.json,sha256=2jOBBxQ2H2mS9C_nlqoTrTiYMmCLaUFQym6su3fXJ8I,210976
- eval_studio_client-0.8.2.dist-info/METADATA,sha256=hza1__A-Rky7RO8E8KyQmkb-KXvODW1wNLZPAWCJWBk,707
- eval_studio_client-0.8.2.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
- eval_studio_client-0.8.2.dist-info/RECORD,,
+ eval_studio_client-1.0.0.dist-info/METADATA,sha256=l8XLUMIu-W4pHRG8fs1IZek_bGIEiFtDRyPjPGkpQrY,707
+ eval_studio_client-1.0.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ eval_studio_client-1.0.0.dist-info/RECORD,,