eval-studio-client 0.8.2__py3-none-any.whl → 1.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_studio_client/leaderboards.py +3 -0
- eval_studio_client/models.py +56 -0
- eval_studio_client/tests.py +21 -7
- {eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/METADATA +1 -1
- {eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/RECORD +6 -6
- {eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/WHEEL +0 -0
eval_studio_client/leaderboards.py
CHANGED

@@ -33,6 +33,7 @@ class Leaderboard:
     update_time: Optional[datetime.datetime] = None
     problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
     insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+    existing_collection: Optional[str] = None
     _report: Optional[str] = None
     _leaderboard: Optional[str] = None
     _model_name: Optional[str] = None

@@ -161,6 +162,7 @@ class Leaderboard:
             evaluator=self._evaluator_name,
             tests=self._test_names,
             model=self._model_name,
+            h2ogpte_collection=self.existing_collection or None,
         )

     def _update_result(self, api_leaderboard: models.V1Leaderboard):

@@ -189,6 +191,7 @@ class Leaderboard:
             update_time=api_leaderboard.update_time,
             problems=problems,
             insights=insights,
+            existing_collection=api_leaderboard.h2ogpte_collection or None,
             _evaluator_name=api_leaderboard.evaluator or "",
             _test_names=api_leaderboard.tests or [],
             _report=api_leaderboard.leaderboard_report or "",
eval_studio_client/models.py
CHANGED
@@ -22,6 +22,28 @@ DEFAULT_RAG_MODEL_KEY = "models/defaultRAGModel"
 DEFAULT_LLM_MODEL_KEY = "models/defaultLLMModel"


+@dataclasses.dataclass
+class CollectionInfo:
+    """Represents the information about a collection in the H2OGPTE
+    or a Knowledge Base in Amazon Bedrock.
+    """
+
+    key: str
+    name: str
+    description: str
+
+    def __str__(self):
+        return f"{self.name} ({self.key})"
+
+    @staticmethod
+    def _from_api_collection_info(api_col: models.V1CollectionInfo) -> "CollectionInfo":
+        return CollectionInfo(
+            key=api_col.id or "",
+            name=api_col.display_name or "",
+            description=api_col.description or "",
+        )
+
+
 @dataclasses.dataclass
 class Model:
     """Represents Eval Studio connection to an external RAG/LLM system.
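CollectionInfo is a plain value object; its __str__ makes lists of collections readable when printed. A quick sketch (constructed directly here, though the client normally builds instances via _from_api_collection_info):

    info = CollectionInfo(key="abc-123", name="Contracts", description="Signed PDFs")
    print(info)   # Contracts (abc-123)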
@@ -80,6 +102,28 @@ class Model:

         return result

+    @property
+    def base_models(self) -> List[str]:
+        """List of base LLM models available to use e.g. for the evaluation."""
+        res = self._model_api.model_service_list_base_models(self.key)
+        if res and res.base_models:
+            return [str(m) for m in res.base_models]
+
+        raise RuntimeError("Failed to list base models")
+
+    @property
+    def collections(self) -> List[CollectionInfo]:
+        """List of collections available for evaluation.
+
+        NOTE: This is currently supported only for H2OGPTe and Amazon Bedrock RAG
+        model hosts.
+        """
+        res = self._model_api.model_service_list_model_collections(self.key)
+        if res and res.collections:
+            return list(res.collections)
+
+        raise RuntimeError("Failed to list model host collections")
+
     def create_leaderboard(
         self,
         name: str,
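Together, the two new properties let a caller discover the host-side inventory before starting an evaluation. A hedged usage sketch (how `model` is obtained is outside this diff):

    model = ...                       # a models.Model bound to an H2OGPTe or Bedrock host
    print(model.base_models)          # e.g. ["gpt-4o", ...]; raises RuntimeError on failure
    for col in model.collections:     # CollectionInfo values
        print(col)                    # rendered as "name (key)"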
@@ -88,20 +132,27 @@ class Model:
         description: Optional[str] = None,
         base_models: Optional[List[str]] = None,
         use_cache: bool = True,
+        existing_collection: Optional[str] = None,
     ) -> Optional[l10s.Leaderboard]:
         """Runs a new evaluation for the model and creates a new leaderboard.

         Args:
+            name: The name of the leaderboard.
             evaluator: The evaluator to use for the evaluation.
             test_suite: The list of tests used to evaluate the model.
+            description (optional): The description of the leaderboard.
             base_models (optional): The base LLM models to use for the evaluation.
             use_cache (optional): Whether to use the cached answers if available.
+            existing_collection (str): ID or the resource name of the existing
+                collection, which will be used as a corpus for evaluation.
+                NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
         """
         lb = l10s.Leaderboard(
             key="",
             name=name,
             description=description or "",
             base_models=base_models or [],
+            existing_collection=existing_collection,
             _model_name=self.key,
             _evaluator_name=evaluator.key,
             _test_names=[t.key for t in test_suite],
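In practice the new parameter points the evaluation at a pre-existing corpus instead of uploaded documents. A sketch, with the evaluator and test objects as placeholders rather than names from this diff:

    lb = model.create_leaderboard(
        name="contracts-eval",
        evaluator=my_evaluator,           # an evaluators.Evaluator
        test_suite=my_tests,              # a list of tests.Test
        existing_collection="abc-123",    # ID or resource name; H2OGPTe/Bedrock hosts only
    )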
@@ -128,6 +179,7 @@ class Model:
         test_suites: Union[tests.Test, List[tests.Test]],
         description: Optional[str] = None,
         base_models: Optional[List[str]] = None,
+        existing_collection: Optional[str] = None,
     ) -> Optional[d8s.Dashboard]:
         """Runs a new evaluation for the model and creates a new dashboard.

@@ -136,6 +188,9 @@ class Model:
             test_suites: The test(s) used to evaluate the model.
             description (optional): The description of the dashboard.
             base_models (optional): The base LLM models to use for the evaluation.
+            existing_collection (str): ID or the resource name of the existing
+                collection, which will be used as a corpus for evaluation.
+                NOTE: This option works only for the H2OGPTe and Amazon Bedrock model hosts ATM.
         """
         _evaluators = (
             [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators

@@ -151,6 +206,7 @@ class Model:
                 name=f"{name} - {evaluator.name}",
                 description=description or "",
                 base_models=base_models or [],
+                existing_collection=existing_collection,
                 _model_name=self.key,
                 _evaluator_name=evaluator.key,
                 _test_names=[t.key for t in _test_suites],
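create_dashboard threads the same value into every per-evaluator leaderboard it fans out (the existing_collection=existing_collection line above). Sketch, again with placeholder objects:

    dash = model.create_dashboard(
        name="q3-review",
        evaluators=[accuracy_eval, toxicity_eval],   # one leaderboard per evaluator
        test_suites=my_tests,
        existing_collection="abc-123",               # shared corpus for all leaderboards
    )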
eval_studio_client/tests.py
CHANGED
@@ -271,6 +271,7 @@ class Test:
         model: Optional[str] = None,
         base_llm_model: Optional[str] = None,
         generators: Optional[List[TestCaseGenerator]] = None,
+        existing_collection: Optional[str] = None,
     ) -> "TestCaseGenerationHandle":
         """Generates test cases based on the documents of the Test.

@@ -280,6 +281,9 @@ class Test:
             model (str): Model to use for generating the prompts.
             base_llm_model (str): Base LLM model to use for generating the prompts.
             generators (List[TestCaseGenerator]): Methods to use for generation.
+            existing_collection (str): ID or the resource name of the existing
+                collection, from which prompts will be generated.
+                NOTE: This option works only for the H2OGPTe model host ATM.
         """

         req = models.TestServiceGenerateTestCasesRequest(

@@ -287,6 +291,7 @@ class Test:
             model=model or None,
             base_llm_model=base_llm_model or None,
             generators=[g.to_api_proto() for g in generators] if generators else None,
+            h2ogpte_collection_id=existing_collection or None,
         )

         res = self._test_api.test_service_generate_test_cases(self.key, req)
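On the wire the parameter becomes h2ogpte_collection_id, so test-case prompts are mined from the named collection rather than from the Test's own documents. A sketch (the Test instance and the model resource name are placeholders):

    handle = my_test.generate_test_cases(
        model="models/my-h2ogpte-model",    # placeholder resource name
        existing_collection="abc-123",      # H2OGPTe only, per the docstring note
    )
    # handle is a TestCaseGenerationHandle; the loop in the next hunk polls it to completion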
@@ -317,23 +322,32 @@ class Test:
         # exponential backoff
         wait_time = 1.0
         wait_coef = 1.6
-        wait_max =
+        wait_max = 8.0
         wait_total = 0.0
         timeout = timeout or float(2 * 24 * 60 * 60)  # 2 days
+        # progress
+        p_max = 1.0
+        p_msg = ""
         while wait_total < timeout:
             handle = TestCaseGenerationHandle._from_operation(
                 self._operation_api.operation_service_get_operation(handle.name)
             )

             if verbose:
+                print(" " * len(p_msg), end="\r")
                 if handle.progress or handle.progress_message:
-
-
-
-
+                    try:
+                        h_progress = float(str(handle.progress))
+                    except ValueError:
+                        h_progress = 0.0
+                    h_msg = handle.progress_message or "Processing"
                 else:
-
-
+                    h_progress = 0.0
+                    h_msg = "Initializing"
+                p_progress = int(h_progress / p_max * 100)
+                p_hashes = p_progress // 5
+                p_msg = f" {p_progress:>3}% |{'#' * p_hashes:<20}| {h_msg}"
+                print(p_msg, end="\r")

             if handle.done:
                 return handle
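The rewritten polling loop adds a textual progress bar: progress arrives as a 0.0-1.0 value, p_max scales it to a percentage, every 5% earns one '#' in a fixed 20-character gutter, and the previous render is blanked with spaces because a bare carriage return does not erase leftover characters. The same math as a standalone sketch:

    for h_progress in (0.0, 0.42, 1.0):
        p_progress = int(h_progress / 1.0 * 100)   # p_max is 1.0 in the diff above
        p_hashes = p_progress // 5                 # 0..20 hash marks
        print(f" {p_progress:>3}% |{'#' * p_hashes:<20}| Processing")
    #    0% |                    | Processing
    #   42% |########            | Processing
    #  100% |####################| Processing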
{eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-studio-client
-Version: 0.8.2
+Version: 1.0.0a1
 Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
 Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
 Author-email: "H2O.ai" <support@h2o.ai>
{eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/RECORD
CHANGED

@@ -5,12 +5,12 @@ eval_studio_client/dashboards.py,sha256=S35kude0FSn-v0t-H1N6aHhsNhlmIgF3duKR8TUf
 eval_studio_client/documents.py,sha256=fjsbHnqZnouu0stCf_p15RgoszkY4_gIsbX1hiw7Xv8,3076
 eval_studio_client/evaluators.py,sha256=blJlWMswIGr1u6TQDiiO-fInYVnkBT0Y02J57o8Z094,2100
 eval_studio_client/insights.py,sha256=bhe6XBVJ61-2bcDdNe6HiZsu0sly8LeoYAKo1GkgK08,1199
-eval_studio_client/leaderboards.py,sha256=
-eval_studio_client/models.py,sha256=
+eval_studio_client/leaderboards.py,sha256=5S4cJVS8bX_KoRcT_75eXxrDY-xdfkQdehwGgIgIBfU,7933
+eval_studio_client/models.py,sha256=nW1Wk6L89iWSjhMVk_sKmxSomKX3b6ANALbwWvbJ7Uk,21346
 eval_studio_client/perturbators.py,sha256=CtcWqEgPGpOcDHvYAQBlNDKnS-ZDBkL7Y_Ygsgpvikw,3133
 eval_studio_client/problems.py,sha256=rdGIfo7AqyxGhWMpbIDX1WXFoQvzKktKAWDKRde5VbY,1515
 eval_studio_client/test_labs.py,sha256=IEY98Ocu7WQcxZN_jy5YthVBoHAgHjgA2T93U7q0eYE,11260
-eval_studio_client/tests.py,sha256=
+eval_studio_client/tests.py,sha256=xMKI3OC-dRHlss484gkuLWcF-XFuLZxx7-XMIuNmAxU,23236
 eval_studio_client/api/__init__.py,sha256=Ef5qooH4SLfYUqVBJl79oRKWYnXryDPZV4IXGfvG1Wc,15269
 eval_studio_client/api/api_client.py,sha256=yFQKmCsVhswcTbdGY4lf-61mf8FVm3Kfon8Qhe1sPKw,26431
 eval_studio_client/api/api_response.py,sha256=eMxw1mpmJcoGZ3gs9z6jM4oYoZ10Gjk333s9sKxGv7s,652

@@ -480,6 +480,6 @@ eval_studio_client/api/test/test_v1_update_test_response.py,sha256=pqTwL9SgoOM9k
 eval_studio_client/api/test/test_v1_who_am_i_response.py,sha256=bNbjL5-b-4asyziW6znJhuU2yrzd9RgJa2ZiNw3e6YA,1523
 eval_studio_client/api/test/test_who_am_i_service_api.py,sha256=gYWKFamJMyVne2QaOSPz6WEkxExRuAphMGKf1nFayLU,898
 eval_studio_client/gen/openapiv2/eval_studio.swagger.json,sha256=2jOBBxQ2H2mS9C_nlqoTrTiYMmCLaUFQym6su3fXJ8I,210976
-eval_studio_client-0.8.2.dist-info/METADATA,sha256=
-eval_studio_client-0.8.2.dist-info/WHEEL,sha256=
-eval_studio_client-0.8.2.dist-info/RECORD,,
+eval_studio_client-1.0.0a1.dist-info/METADATA,sha256=rX1UrncVa_ayrO30V9oeNhTjqV1EWNyBFOvL2q8YJ9c,709
+eval_studio_client-1.0.0a1.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+eval_studio_client-1.0.0a1.dist-info/RECORD,,
{eval_studio_client-0.8.2.dist-info → eval_studio_client-1.0.0a1.dist-info}/WHEEL

File without changes