eval-studio-client 1.0.0a1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_studio_client/api/__init__.py +43 -0
- eval_studio_client/api/api/__init__.py +5 -0
- eval_studio_client/api/api/human_calibration_service_api.py +304 -0
- eval_studio_client/api/api/perturbator_service_api.py +268 -1
- eval_studio_client/api/api/prompt_library_service_api.py +669 -0
- eval_studio_client/api/api/test_service_api.py +568 -0
- eval_studio_client/api/api/workflow_edge_service_api.py +296 -0
- eval_studio_client/api/api/workflow_node_service_api.py +1634 -0
- eval_studio_client/api/api/workflow_service_api.py +1609 -0
- eval_studio_client/api/docs/HumanCalibrationServiceApi.md +77 -0
- eval_studio_client/api/docs/PerturbationServiceCreatePerturbationRequest.md +1 -0
- eval_studio_client/api/docs/PerturbatorServiceApi.md +33 -3
- eval_studio_client/api/docs/PromptGenerationServiceAutoGeneratePromptsRequest.md +2 -1
- eval_studio_client/api/docs/PromptLibraryServiceApi.md +155 -0
- eval_studio_client/api/docs/ProtobufNullValue.md +12 -0
- eval_studio_client/api/docs/RequiredTheTestCaseToUpdate.md +1 -0
- eval_studio_client/api/docs/RequiredTheUpdatedWorkflow.md +44 -0
- eval_studio_client/api/docs/RequiredTheUpdatedWorkflowNode.md +44 -0
- eval_studio_client/api/docs/TestServiceApi.md +140 -0
- eval_studio_client/api/docs/TestServiceGenerateTestCasesRequest.md +1 -0
- eval_studio_client/api/docs/TestServiceImportTestCasesFromLibraryRequest.md +32 -0
- eval_studio_client/api/docs/TestServiceListTestCaseLibraryItemsRequest.md +35 -0
- eval_studio_client/api/docs/TestServicePerturbTestRequest.md +1 -0
- eval_studio_client/api/docs/V1BatchDeleteWorkflowsRequest.md +29 -0
- eval_studio_client/api/docs/V1BatchDeleteWorkflowsResponse.md +29 -0
- eval_studio_client/api/docs/V1BatchGetWorkflowEdgesResponse.md +29 -0
- eval_studio_client/api/docs/V1BatchGetWorkflowNodesResponse.md +29 -0
- eval_studio_client/api/docs/V1CreateEvaluationRequest.md +1 -0
- eval_studio_client/api/docs/V1CreateWorkflowResponse.md +29 -0
- eval_studio_client/api/docs/V1DeleteWorkflowNodeResponse.md +29 -0
- eval_studio_client/api/docs/V1DeleteWorkflowResponse.md +29 -0
- eval_studio_client/api/docs/V1EstimateThresholdRequest.md +33 -0
- eval_studio_client/api/docs/V1GetWorkflowNodePrerequisitesResponse.md +30 -0
- eval_studio_client/api/docs/V1GetWorkflowNodeResponse.md +29 -0
- eval_studio_client/api/docs/V1GetWorkflowResponse.md +29 -0
- eval_studio_client/api/docs/V1ImportEvaluationRequest.md +1 -0
- eval_studio_client/api/docs/V1ImportTestCasesFromLibraryResponse.md +29 -0
- eval_studio_client/api/docs/V1ImportTestCasesRequest.md +33 -0
- eval_studio_client/api/docs/V1LabeledTestCase.md +31 -0
- eval_studio_client/api/docs/V1ListPromptLibraryItemsResponse.md +29 -0
- eval_studio_client/api/docs/V1ListTestCaseLibraryItemsResponse.md +29 -0
- eval_studio_client/api/docs/V1ListWorkflowsResponse.md +29 -0
- eval_studio_client/api/docs/V1ProcessWorkflowNodeResponse.md +29 -0
- eval_studio_client/api/docs/V1PromptLibraryItem.md +42 -0
- eval_studio_client/api/docs/V1TestCase.md +1 -0
- eval_studio_client/api/docs/V1TestSuiteEvaluates.md +11 -0
- eval_studio_client/api/docs/V1UpdateWorkflowNodeResponse.md +29 -0
- eval_studio_client/api/docs/V1UpdateWorkflowResponse.md +29 -0
- eval_studio_client/api/docs/V1Workflow.md +46 -0
- eval_studio_client/api/docs/V1WorkflowEdge.md +40 -0
- eval_studio_client/api/docs/V1WorkflowEdgeType.md +12 -0
- eval_studio_client/api/docs/V1WorkflowNode.md +46 -0
- eval_studio_client/api/docs/V1WorkflowNodeArtifact.md +40 -0
- eval_studio_client/api/docs/V1WorkflowNodeArtifacts.md +29 -0
- eval_studio_client/api/docs/V1WorkflowNodeAttributes.md +30 -0
- eval_studio_client/api/docs/V1WorkflowNodeStatus.md +12 -0
- eval_studio_client/api/docs/V1WorkflowNodeType.md +12 -0
- eval_studio_client/api/docs/V1WorkflowNodeView.md +12 -0
- eval_studio_client/api/docs/V1WorkflowType.md +12 -0
- eval_studio_client/api/docs/WorkflowEdgeServiceApi.md +76 -0
- eval_studio_client/api/docs/WorkflowNodeServiceApi.md +423 -0
- eval_studio_client/api/docs/WorkflowServiceApi.md +417 -0
- eval_studio_client/api/models/__init__.py +38 -0
- eval_studio_client/api/models/perturbation_service_create_perturbation_request.py +8 -2
- eval_studio_client/api/models/prompt_generation_service_auto_generate_prompts_request.py +5 -3
- eval_studio_client/api/models/protobuf_null_value.py +36 -0
- eval_studio_client/api/models/required_the_test_case_to_update.py +6 -2
- eval_studio_client/api/models/required_the_updated_workflow.py +152 -0
- eval_studio_client/api/models/required_the_updated_workflow_node.py +152 -0
- eval_studio_client/api/models/test_service_generate_test_cases_request.py +4 -2
- eval_studio_client/api/models/test_service_import_test_cases_from_library_request.py +93 -0
- eval_studio_client/api/models/test_service_list_test_case_library_items_request.py +99 -0
- eval_studio_client/api/models/test_service_perturb_test_request.py +4 -2
- eval_studio_client/api/models/v1_batch_delete_workflows_request.py +87 -0
- eval_studio_client/api/models/v1_batch_delete_workflows_response.py +95 -0
- eval_studio_client/api/models/v1_batch_get_workflow_edges_response.py +95 -0
- eval_studio_client/api/models/v1_batch_get_workflow_nodes_response.py +95 -0
- eval_studio_client/api/models/v1_create_evaluation_request.py +7 -2
- eval_studio_client/api/models/v1_create_workflow_response.py +91 -0
- eval_studio_client/api/models/v1_delete_workflow_node_response.py +91 -0
- eval_studio_client/api/models/v1_delete_workflow_response.py +91 -0
- eval_studio_client/api/models/v1_estimate_threshold_request.py +103 -0
- eval_studio_client/api/models/v1_get_workflow_node_prerequisites_response.py +89 -0
- eval_studio_client/api/models/v1_get_workflow_node_response.py +91 -0
- eval_studio_client/api/models/v1_get_workflow_response.py +91 -0
- eval_studio_client/api/models/v1_import_evaluation_request.py +7 -2
- eval_studio_client/api/models/v1_import_test_cases_from_library_response.py +91 -0
- eval_studio_client/api/models/v1_import_test_cases_request.py +95 -0
- eval_studio_client/api/models/v1_labeled_test_case.py +91 -0
- eval_studio_client/api/models/v1_list_prompt_library_items_response.py +95 -0
- eval_studio_client/api/models/v1_list_test_case_library_items_response.py +95 -0
- eval_studio_client/api/models/v1_list_workflows_response.py +95 -0
- eval_studio_client/api/models/v1_process_workflow_node_response.py +91 -0
- eval_studio_client/api/models/v1_prompt_library_item.py +129 -0
- eval_studio_client/api/models/v1_test_case.py +6 -2
- eval_studio_client/api/models/v1_test_suite_evaluates.py +39 -0
- eval_studio_client/api/models/v1_update_workflow_node_response.py +91 -0
- eval_studio_client/api/models/v1_update_workflow_response.py +91 -0
- eval_studio_client/api/models/v1_workflow.py +156 -0
- eval_studio_client/api/models/v1_workflow_edge.py +123 -0
- eval_studio_client/api/models/v1_workflow_edge_type.py +37 -0
- eval_studio_client/api/models/v1_workflow_node.py +156 -0
- eval_studio_client/api/models/v1_workflow_node_artifact.py +122 -0
- eval_studio_client/api/models/v1_workflow_node_artifacts.py +97 -0
- eval_studio_client/api/models/v1_workflow_node_attributes.py +87 -0
- eval_studio_client/api/models/v1_workflow_node_status.py +40 -0
- eval_studio_client/api/models/v1_workflow_node_type.py +41 -0
- eval_studio_client/api/models/v1_workflow_node_view.py +38 -0
- eval_studio_client/api/models/v1_workflow_type.py +37 -0
- eval_studio_client/api/test/test_human_calibration_service_api.py +38 -0
- eval_studio_client/api/test/test_perturbation_service_create_perturbation_request.py +20 -2
- eval_studio_client/api/test/test_prompt_generation_service_auto_generate_prompts_request.py +4 -1
- eval_studio_client/api/test/test_prompt_library_service_api.py +43 -0
- eval_studio_client/api/test/test_protobuf_null_value.py +33 -0
- eval_studio_client/api/test/test_required_the_test_case_to_update.py +4 -1
- eval_studio_client/api/test/test_required_the_updated_workflow.py +88 -0
- eval_studio_client/api/test/test_required_the_updated_workflow_node.py +80 -0
- eval_studio_client/api/test/test_test_service_api.py +12 -0
- eval_studio_client/api/test/test_test_service_generate_test_cases_request.py +4 -1
- eval_studio_client/api/test/test_test_service_import_test_cases_from_library_request.py +56 -0
- eval_studio_client/api/test/test_test_service_list_test_case_library_items_request.py +63 -0
- eval_studio_client/api/test/test_test_service_perturb_test_request.py +4 -1
- eval_studio_client/api/test/test_v1_batch_delete_test_cases_response.py +4 -1
- eval_studio_client/api/test/test_v1_batch_delete_workflows_request.py +53 -0
- eval_studio_client/api/test/test_v1_batch_delete_workflows_response.py +92 -0
- eval_studio_client/api/test/test_v1_batch_get_workflow_edges_response.py +64 -0
- eval_studio_client/api/test/test_v1_batch_get_workflow_nodes_response.py +84 -0
- eval_studio_client/api/test/test_v1_create_evaluation_request.py +20 -2
- eval_studio_client/api/test/test_v1_create_test_case_response.py +4 -1
- eval_studio_client/api/test/test_v1_create_workflow_response.py +90 -0
- eval_studio_client/api/test/test_v1_delete_test_case_response.py +4 -1
- eval_studio_client/api/test/test_v1_delete_workflow_node_response.py +82 -0
- eval_studio_client/api/test/test_v1_delete_workflow_response.py +90 -0
- eval_studio_client/api/test/test_v1_estimate_threshold_request.py +60 -0
- eval_studio_client/api/test/test_v1_evaluation_test.py +4 -1
- eval_studio_client/api/test/test_v1_find_all_test_cases_by_id_response.py +4 -1
- eval_studio_client/api/test/test_v1_get_test_case_response.py +4 -1
- eval_studio_client/api/test/test_v1_get_workflow_node_prerequisites_response.py +56 -0
- eval_studio_client/api/test/test_v1_get_workflow_node_response.py +82 -0
- eval_studio_client/api/test/test_v1_get_workflow_response.py +90 -0
- eval_studio_client/api/test/test_v1_import_evaluation_request.py +16 -1
- eval_studio_client/api/test/test_v1_import_test_cases_from_library_response.py +71 -0
- eval_studio_client/api/test/test_v1_import_test_cases_request.py +57 -0
- eval_studio_client/api/test/test_v1_labeled_test_case.py +53 -0
- eval_studio_client/api/test/test_v1_list_prompt_library_items_response.py +71 -0
- eval_studio_client/api/test/test_v1_list_test_case_library_items_response.py +71 -0
- eval_studio_client/api/test/test_v1_list_test_cases_response.py +4 -1
- eval_studio_client/api/test/test_v1_list_workflows_response.py +92 -0
- eval_studio_client/api/test/test_v1_process_workflow_node_response.py +71 -0
- eval_studio_client/api/test/test_v1_prompt_library_item.py +68 -0
- eval_studio_client/api/test/test_v1_test_case.py +4 -1
- eval_studio_client/api/test/test_v1_test_suite_evaluates.py +33 -0
- eval_studio_client/api/test/test_v1_update_test_case_response.py +4 -1
- eval_studio_client/api/test/test_v1_update_workflow_node_response.py +82 -0
- eval_studio_client/api/test/test_v1_update_workflow_response.py +90 -0
- eval_studio_client/api/test/test_v1_workflow.py +89 -0
- eval_studio_client/api/test/test_v1_workflow_edge.py +61 -0
- eval_studio_client/api/test/test_v1_workflow_edge_type.py +33 -0
- eval_studio_client/api/test/test_v1_workflow_node.py +81 -0
- eval_studio_client/api/test/test_v1_workflow_node_artifact.py +61 -0
- eval_studio_client/api/test/test_v1_workflow_node_artifacts.py +64 -0
- eval_studio_client/api/test/test_v1_workflow_node_attributes.py +51 -0
- eval_studio_client/api/test/test_v1_workflow_node_status.py +33 -0
- eval_studio_client/api/test/test_v1_workflow_node_type.py +33 -0
- eval_studio_client/api/test/test_v1_workflow_node_view.py +33 -0
- eval_studio_client/api/test/test_v1_workflow_type.py +33 -0
- eval_studio_client/api/test/test_workflow_edge_service_api.py +38 -0
- eval_studio_client/api/test/test_workflow_node_service_api.py +73 -0
- eval_studio_client/api/test/test_workflow_service_api.py +73 -0
- eval_studio_client/client.py +7 -0
- eval_studio_client/dashboards.py +66 -18
- eval_studio_client/gen/openapiv2/eval_studio.swagger.json +2665 -794
- eval_studio_client/leaderboards.py +125 -0
- eval_studio_client/models.py +3 -42
- eval_studio_client/test_labs.py +49 -21
- eval_studio_client/tests.py +221 -51
- eval_studio_client/utils.py +26 -0
- {eval_studio_client-1.0.0a1.dist-info → eval_studio_client-1.0.1.dist-info}/METADATA +1 -2
- {eval_studio_client-1.0.0a1.dist-info → eval_studio_client-1.0.1.dist-info}/RECORD +180 -50
- {eval_studio_client-1.0.0a1.dist-info → eval_studio_client-1.0.1.dist-info}/WHEEL +1 -1
eval_studio_client/leaderboards.py
CHANGED

@@ -1,6 +1,7 @@
 import dataclasses
 import datetime
 import json
+import os
 import time
 from typing import Dict
 from typing import List
@@ -33,12 +34,14 @@ class Leaderboard:
     update_time: Optional[datetime.datetime] = None
     problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
     insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+    summary: Optional[str] = None
     existing_collection: Optional[str] = None
     _report: Optional[str] = None
     _leaderboard: Optional[str] = None
     _model_name: Optional[str] = None
     _status: Optional[models.V1LeaderboardStatus] = None
     _client: Optional[api.ApiClient] = None
+    _operation: Optional[str] = None
 
     def __post_init__(self):
         self._evaluator_api = api.EvaluatorServiceApi(self._client)
@@ -85,6 +88,42 @@ class Leaderboard:
         if self._client:
             self._leaderboard_api.leaderboard_service_delete_leaderboard(self.key)
 
+    def download_result(self, dest: str):
+        """Downloads the leaderboard result to a JSON file.
+
+        Args:
+            dest (str): The destination path for the report.
+        """
+        if not os.path.exists(dest):
+            raise ValueError("Destination path does not exist.")
+
+        if os.path.isdir(dest):
+            dest = os.path.join(dest, "results.json")
+
+        if self._client and self.finished:
+            headers: Dict[str, str] = {}
+            url = urljoin(
+                self._client.configuration.host, f"/content/{self.key}/results"
+            )
+            self._client.update_params_for_auth(
+                headers=headers,
+                queries=[],
+                auth_settings=[],
+                resource_path=url,
+                method="GET",
+                body=None,
+            )
+            response = urllib3.request("GET", url, headers=headers)
+
+            if response.status == 200:
+                with open(dest, "wb") as f:
+                    f.write(response.data)
+                return
+            else:
+                raise RuntimeError("Failed to retrieve leaderboard result.")
+
+        raise ValueError("Cannot download result for unfinished leaderboard.")
+
     def download_report(self, dest: str):
         """Downloads the leaderboard report to a zip file.
 
@@ -113,6 +152,30 @@ class Leaderboard:
 
         raise ValueError("Cannot download report for unfinished leaderboard.")
 
+    def get_result_json(self) -> str:
+        """Retrieves the leaderboard result as a JSON string."""
+        if self._client and self.finished:
+            headers: Dict[str, str] = {}
+            url = urljoin(
+                self._client.configuration.host, f"/content/{self.key}/results"
+            )
+            self._client.update_params_for_auth(
+                headers=headers,
+                queries=[],
+                auth_settings=[],
+                resource_path=url,
+                method="GET",
+                body=None,
+            )
+            response = urllib3.request("GET", url, headers=headers)
+
+            if response.status == 200:
+                return str(response.data)
+            else:
+                raise RuntimeError("Failed to retrieve leaderboard result.")
+
+        raise ValueError("Cannot download result for unfinished leaderboard.")
+
     def get_table(self) -> LeaderboardTable:
         """Retrieves the leaderboard table."""
         if self._client and self.finished:
@@ -169,6 +232,7 @@ class Leaderboard:
         """Refresh the leaderboard with the latest API data."""
         self.key = api_leaderboard.name or ""
         self.update_time = api_leaderboard.update_time
+        self.summary = api_leaderboard.leaderboard_summary
         self._leaderboard = api_leaderboard.leaderboard_table
         self._report = api_leaderboard.leaderboard_report or ""
         self._status = api_leaderboard.status
@@ -191,6 +255,7 @@ class Leaderboard:
             update_time=api_leaderboard.update_time,
             problems=problems,
             insights=insights,
+            summary=api_leaderboard.leaderboard_summary,
             existing_collection=api_leaderboard.h2ogpte_collection or None,
             _evaluator_name=api_leaderboard.evaluator or "",
             _test_names=api_leaderboard.tests or [],
@@ -198,6 +263,7 @@ class Leaderboard:
             _leaderboard=api_leaderboard.leaderboard_table,
             _status=api_leaderboard.status,
             _client=client,
+            _operation=api_leaderboard.create_operation or None,
         )
 
     @staticmethod
@@ -206,3 +272,62 @@ class Leaderboard:
             models.V1LeaderboardStatus.LEADERBOARD_STATUS_COMPLETED,
             models.V1LeaderboardStatus.LEADERBOARD_STATUS_FAILED,
         ]
+
+    @staticmethod
+    def from_operation(
+        operation: models.V1Operation, client: Optional[api.ApiClient]
+    ) -> Optional["Leaderboard"]:
+        """Retrieves the leaderboard from the operation, which created it.
+
+        Args:
+            operation: The operation that created the dashboard.
+            client: The API client to use for the leaderboard retrieval.
+
+        Returns:
+            Leaderboard: The leaderboard instance created by the operation.
+        """
+        if not client:
+            raise RuntimeError("API Client is not provided")
+
+        if not operation.metadata:
+            raise RuntimeError(
+                "Operation metadata missing, it's not possible to retrieve leaderboard from operation"
+            )
+
+        leaderboard_api = api.LeaderboardServiceApi(client)
+        leadeboard_id = operation.metadata.to_dict().get("leaderboard", "")
+        res = leaderboard_api.leaderboard_service_get_leaderboard(str(leadeboard_id))
+        if res and res.leaderboard:
+            return Leaderboard._from_api_leaderboard(res.leaderboard, client)
+
+        return None
+
+
+class _Leaderboards:
+    def __init__(self, client: api.ApiClient):
+        self._client = client
+        self._api = api.LeaderboardServiceApi(client)
+
+    def get(self, key: str) -> Leaderboard:
+        """Gets an individual leaderboard with a given key from Eval Studio.
+
+        Args:
+            key: The leaderboard resource name to retrieve.
+        """
+        res = self._api.leaderboard_service_get_leaderboard(key)
+        if res and res.leaderboard:
+            return Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
+
+        raise KeyError("Leaderboard not found.")
+
+    def list(self) -> List[Leaderboard]:
+        """Lists all user leaderboards in Eval Studio."""
+        res = self._api.leaderboard_service_list_leaderboards()
+        if res:
+            res_leaderboards = res.leaderboards or []
+            return [
+                Leaderboard._from_api_leaderboard(lb, self._client)
+                for lb in res_leaderboards
+            ]
+
+        return []
eval_studio_client/models.py
CHANGED

@@ -168,7 +168,7 @@ class Model:
         )
 
         if res and res.operation:
-            return
+            return l10s.Leaderboard.from_operation(res.operation, self._client)
 
         return None
 
@@ -226,7 +226,7 @@ class Model:
         )
 
         if res and res.operation:
-            return
+            return d8s.Dashboard.from_operation(res.operation, self._client)
 
         return None
 
@@ -257,7 +257,7 @@
         )
         res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
         if res and res.operation:
-            return
+            return l10s.Leaderboard.from_operation(res.operation, self._client)
 
         return None
 
@@ -273,45 +273,6 @@
 
         raise RuntimeError("Failed to list base models")
 
-    def _get_leaderboard_from_operation(
-        self, operation: models.V1Operation
-    ) -> Optional[l10s.Leaderboard]:
-        """Retrieves the leaderboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the leaderboard.
-        """
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve leaderboard from operation")
-
-        leadeboard_id = operation.metadata.to_dict().get("leaderboard")
-        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leadeboard_id)
-        if res and res.leaderboard:
-            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
-
-        return None
-
-    def _get_dashboard_from_operation(
-        self, operation: models.V1Operation
-    ) -> Optional[d8s.Dashboard]:
-        """Retrieves the dashboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the dashboard.
-        """
-        if not self._client:
-            raise RuntimeError("Client is not set.")
-
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve dashboard from operation")
-
-        dashboard_id = operation.metadata.to_dict().get("dashboard")
-        res = self._dashboard_api.dashboard_service_get_dashboard(dashboard_id)
-        if res and res.dashboard:
-            return d8s.Dashboard._from_api_dashboard(res.dashboard, self._client)
-
-        return None
-
     @staticmethod
     def _from_api_model(api_model: models.V1Model, client: api.ApiClient) -> "Model":
         """Converts the API model to the client model."""
eval_studio_client/test_labs.py
CHANGED

@@ -7,7 +7,8 @@ from typing import Union
 import uuid
 
 from eval_studio_client import api
-from eval_studio_client import
+from eval_studio_client import dashboards
+from eval_studio_client import evaluators as e8s
 from eval_studio_client import leaderboards as l10s
 from eval_studio_client.api import models as apiModels
 
@@ -92,11 +93,56 @@ class TestLab:
         self._models.append(_m)
         return _m
 
-    def evaluate(
+    def evaluate(
+        self,
+        evaluators: Union[e8s.Evaluator, List[e8s.Evaluator]],
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+    ) -> Optional[dashboards.Dashboard]:
         """Runs an evaluation for the test lab.
 
+        Args:
+            evaluators (Union[e8s.Evaluator, List[e8s.Evaluator]]): One or many evaluators
+                used to evaluate the test lab.
+            name (str, optional): Optional name for the evaluation.
+            description (str, optional): Optional description for the evaluation.
+
+        Returns:
+            Dashboard: Evaluation dashboard instance. In case launching of evaluation
+                fails, `None` is returned.
+        """
+        _evaluators = (
+            [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators
+        )
+        name = name or self.name or "Imported Dashboard"
+        description = description or self.description or ""
+        req = apiModels.V1BatchImportLeaderboardRequest(
+            testLabJson=self.json(),
+            evaluators=[e.key for e in _evaluators],
+            model=None,
+            dashboardDisplayName=name,
+            dashboardDescription=description,
+            testDisplayName=f"{name} - Test",
+            testDescription=f"Test suite for {description}",
+        )
+        res = self._leaderboard_api.leaderboard_service_batch_import_leaderboard(req)
+
+        if res and res.operation:
+            return dashboards.Dashboard.from_operation(res.operation, self._client)
+
+        return None
+
+    def create_leaderboard(
+        self, evaluator: e8s.Evaluator
+    ) -> Optional[l10s.Leaderboard]:
+        """Creates a single leaderboard for the test lab.
+
         Args:
             evaluator: The evaluator to use for the evaluation.
+
+        Returns:
+            Leaderboard: Single evaluation leaderboard instance.
+                In case launching of evaluation fails, `None` is returned.
         """
         req = apiModels.V1ImportLeaderboardRequest(
             testLabJson=self.json(),
@@ -109,7 +155,7 @@ class TestLab:
         )
         res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
         if res and res.operation:
-            return
+            return l10s.Leaderboard.from_operation(res.operation, self._client)
 
         return None
 
@@ -131,24 +177,6 @@ class TestLab:
 
         return json.dumps(lab, indent=4, sort_keys=True)
 
-    def _get_leaderboard_from_operation(
-        self, operation: apiModels.V1Operation
-    ) -> Optional[l10s.Leaderboard]:
-        """Retrieves the leaderboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the leaderboard.
-        """
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve leaderboard from operation")
-
-        leadeboard_id = operation.metadata.to_dict().get("leaderboard")
-        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leadeboard_id)
-        if res and res.leaderboard:
-            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
-
-        return None
-
     def _llm_model_names(self) -> List[str]:
         return [m.llm_model_name for m in self.models]
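
The reshaped `TestLab` API splits evaluation into two paths: `evaluate` accepts one or many evaluators and yields a `Dashboard`, while the single-evaluator behavior moved to `create_leaderboard`. A hedged usage sketch; `lab`, `rag_eval`, and `sec_eval` are assumed to exist (a `TestLab` plus two `Evaluator` instances obtained elsewhere, not shown in this diff):

```python
# Multi-evaluator path: returns a Dashboard, or None if the launch fails.
dashboard = lab.evaluate(
    [rag_eval, sec_eval],   # a single Evaluator is also accepted
    name="Nightly RAG run",
    description="Regression suite for the RAG pipeline",
)
if dashboard is None:
    raise RuntimeError("Failed to launch the evaluation.")

# Single-evaluator path: returns the created Leaderboard, or None if the
# launch fails (1.0.0a1 always returned None here due to the bare `return`).
leaderboard = lab.create_leaderboard(rag_eval)
```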