eval-studio-client 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their public registries. It is provided for informational purposes only and reflects the changes between the two published versions.
Files changed (179)
  1. eval_studio_client/api/__init__.py +43 -0
  2. eval_studio_client/api/api/__init__.py +5 -0
  3. eval_studio_client/api/api/human_calibration_service_api.py +304 -0
  4. eval_studio_client/api/api/perturbator_service_api.py +268 -1
  5. eval_studio_client/api/api/prompt_library_service_api.py +669 -0
  6. eval_studio_client/api/api/test_service_api.py +568 -0
  7. eval_studio_client/api/api/workflow_edge_service_api.py +296 -0
  8. eval_studio_client/api/api/workflow_node_service_api.py +1634 -0
  9. eval_studio_client/api/api/workflow_service_api.py +1609 -0
  10. eval_studio_client/api/docs/HumanCalibrationServiceApi.md +77 -0
  11. eval_studio_client/api/docs/PerturbationServiceCreatePerturbationRequest.md +1 -0
  12. eval_studio_client/api/docs/PerturbatorServiceApi.md +33 -3
  13. eval_studio_client/api/docs/PromptGenerationServiceAutoGeneratePromptsRequest.md +2 -1
  14. eval_studio_client/api/docs/PromptLibraryServiceApi.md +155 -0
  15. eval_studio_client/api/docs/ProtobufNullValue.md +12 -0
  16. eval_studio_client/api/docs/RequiredTheTestCaseToUpdate.md +1 -0
  17. eval_studio_client/api/docs/RequiredTheUpdatedWorkflow.md +44 -0
  18. eval_studio_client/api/docs/RequiredTheUpdatedWorkflowNode.md +44 -0
  19. eval_studio_client/api/docs/TestServiceApi.md +140 -0
  20. eval_studio_client/api/docs/TestServiceGenerateTestCasesRequest.md +1 -0
  21. eval_studio_client/api/docs/TestServiceImportTestCasesFromLibraryRequest.md +32 -0
  22. eval_studio_client/api/docs/TestServiceListTestCaseLibraryItemsRequest.md +35 -0
  23. eval_studio_client/api/docs/TestServicePerturbTestRequest.md +1 -0
  24. eval_studio_client/api/docs/V1BatchDeleteWorkflowsRequest.md +29 -0
  25. eval_studio_client/api/docs/V1BatchDeleteWorkflowsResponse.md +29 -0
  26. eval_studio_client/api/docs/V1BatchGetWorkflowEdgesResponse.md +29 -0
  27. eval_studio_client/api/docs/V1BatchGetWorkflowNodesResponse.md +29 -0
  28. eval_studio_client/api/docs/V1CreateEvaluationRequest.md +1 -0
  29. eval_studio_client/api/docs/V1CreateWorkflowResponse.md +29 -0
  30. eval_studio_client/api/docs/V1DeleteWorkflowNodeResponse.md +29 -0
  31. eval_studio_client/api/docs/V1DeleteWorkflowResponse.md +29 -0
  32. eval_studio_client/api/docs/V1EstimateThresholdRequest.md +33 -0
  33. eval_studio_client/api/docs/V1GetWorkflowNodePrerequisitesResponse.md +30 -0
  34. eval_studio_client/api/docs/V1GetWorkflowNodeResponse.md +29 -0
  35. eval_studio_client/api/docs/V1GetWorkflowResponse.md +29 -0
  36. eval_studio_client/api/docs/V1ImportEvaluationRequest.md +1 -0
  37. eval_studio_client/api/docs/V1ImportTestCasesFromLibraryResponse.md +29 -0
  38. eval_studio_client/api/docs/V1ImportTestCasesRequest.md +33 -0
  39. eval_studio_client/api/docs/V1LabeledTestCase.md +31 -0
  40. eval_studio_client/api/docs/V1ListPromptLibraryItemsResponse.md +29 -0
  41. eval_studio_client/api/docs/V1ListTestCaseLibraryItemsResponse.md +29 -0
  42. eval_studio_client/api/docs/V1ListWorkflowsResponse.md +29 -0
  43. eval_studio_client/api/docs/V1ProcessWorkflowNodeResponse.md +29 -0
  44. eval_studio_client/api/docs/V1PromptLibraryItem.md +42 -0
  45. eval_studio_client/api/docs/V1TestCase.md +1 -0
  46. eval_studio_client/api/docs/V1TestSuiteEvaluates.md +11 -0
  47. eval_studio_client/api/docs/V1UpdateWorkflowNodeResponse.md +29 -0
  48. eval_studio_client/api/docs/V1UpdateWorkflowResponse.md +29 -0
  49. eval_studio_client/api/docs/V1Workflow.md +46 -0
  50. eval_studio_client/api/docs/V1WorkflowEdge.md +40 -0
  51. eval_studio_client/api/docs/V1WorkflowEdgeType.md +12 -0
  52. eval_studio_client/api/docs/V1WorkflowNode.md +46 -0
  53. eval_studio_client/api/docs/V1WorkflowNodeArtifact.md +40 -0
  54. eval_studio_client/api/docs/V1WorkflowNodeArtifacts.md +29 -0
  55. eval_studio_client/api/docs/V1WorkflowNodeAttributes.md +30 -0
  56. eval_studio_client/api/docs/V1WorkflowNodeStatus.md +12 -0
  57. eval_studio_client/api/docs/V1WorkflowNodeType.md +12 -0
  58. eval_studio_client/api/docs/V1WorkflowNodeView.md +12 -0
  59. eval_studio_client/api/docs/V1WorkflowType.md +12 -0
  60. eval_studio_client/api/docs/WorkflowEdgeServiceApi.md +76 -0
  61. eval_studio_client/api/docs/WorkflowNodeServiceApi.md +423 -0
  62. eval_studio_client/api/docs/WorkflowServiceApi.md +417 -0
  63. eval_studio_client/api/models/__init__.py +38 -0
  64. eval_studio_client/api/models/perturbation_service_create_perturbation_request.py +8 -2
  65. eval_studio_client/api/models/prompt_generation_service_auto_generate_prompts_request.py +5 -3
  66. eval_studio_client/api/models/protobuf_null_value.py +36 -0
  67. eval_studio_client/api/models/required_the_test_case_to_update.py +6 -2
  68. eval_studio_client/api/models/required_the_updated_workflow.py +152 -0
  69. eval_studio_client/api/models/required_the_updated_workflow_node.py +152 -0
  70. eval_studio_client/api/models/test_service_generate_test_cases_request.py +4 -2
  71. eval_studio_client/api/models/test_service_import_test_cases_from_library_request.py +93 -0
  72. eval_studio_client/api/models/test_service_list_test_case_library_items_request.py +99 -0
  73. eval_studio_client/api/models/test_service_perturb_test_request.py +4 -2
  74. eval_studio_client/api/models/v1_batch_delete_workflows_request.py +87 -0
  75. eval_studio_client/api/models/v1_batch_delete_workflows_response.py +95 -0
  76. eval_studio_client/api/models/v1_batch_get_workflow_edges_response.py +95 -0
  77. eval_studio_client/api/models/v1_batch_get_workflow_nodes_response.py +95 -0
  78. eval_studio_client/api/models/v1_create_evaluation_request.py +7 -2
  79. eval_studio_client/api/models/v1_create_workflow_response.py +91 -0
  80. eval_studio_client/api/models/v1_delete_workflow_node_response.py +91 -0
  81. eval_studio_client/api/models/v1_delete_workflow_response.py +91 -0
  82. eval_studio_client/api/models/v1_estimate_threshold_request.py +103 -0
  83. eval_studio_client/api/models/v1_get_workflow_node_prerequisites_response.py +89 -0
  84. eval_studio_client/api/models/v1_get_workflow_node_response.py +91 -0
  85. eval_studio_client/api/models/v1_get_workflow_response.py +91 -0
  86. eval_studio_client/api/models/v1_import_evaluation_request.py +7 -2
  87. eval_studio_client/api/models/v1_import_test_cases_from_library_response.py +91 -0
  88. eval_studio_client/api/models/v1_import_test_cases_request.py +95 -0
  89. eval_studio_client/api/models/v1_labeled_test_case.py +91 -0
  90. eval_studio_client/api/models/v1_list_prompt_library_items_response.py +95 -0
  91. eval_studio_client/api/models/v1_list_test_case_library_items_response.py +95 -0
  92. eval_studio_client/api/models/v1_list_workflows_response.py +95 -0
  93. eval_studio_client/api/models/v1_process_workflow_node_response.py +91 -0
  94. eval_studio_client/api/models/v1_prompt_library_item.py +129 -0
  95. eval_studio_client/api/models/v1_test_case.py +6 -2
  96. eval_studio_client/api/models/v1_test_suite_evaluates.py +39 -0
  97. eval_studio_client/api/models/v1_update_workflow_node_response.py +91 -0
  98. eval_studio_client/api/models/v1_update_workflow_response.py +91 -0
  99. eval_studio_client/api/models/v1_workflow.py +156 -0
  100. eval_studio_client/api/models/v1_workflow_edge.py +123 -0
  101. eval_studio_client/api/models/v1_workflow_edge_type.py +37 -0
  102. eval_studio_client/api/models/v1_workflow_node.py +156 -0
  103. eval_studio_client/api/models/v1_workflow_node_artifact.py +122 -0
  104. eval_studio_client/api/models/v1_workflow_node_artifacts.py +97 -0
  105. eval_studio_client/api/models/v1_workflow_node_attributes.py +87 -0
  106. eval_studio_client/api/models/v1_workflow_node_status.py +40 -0
  107. eval_studio_client/api/models/v1_workflow_node_type.py +41 -0
  108. eval_studio_client/api/models/v1_workflow_node_view.py +38 -0
  109. eval_studio_client/api/models/v1_workflow_type.py +37 -0
  110. eval_studio_client/api/test/test_human_calibration_service_api.py +38 -0
  111. eval_studio_client/api/test/test_perturbation_service_create_perturbation_request.py +20 -2
  112. eval_studio_client/api/test/test_prompt_generation_service_auto_generate_prompts_request.py +4 -1
  113. eval_studio_client/api/test/test_prompt_library_service_api.py +43 -0
  114. eval_studio_client/api/test/test_protobuf_null_value.py +33 -0
  115. eval_studio_client/api/test/test_required_the_test_case_to_update.py +4 -1
  116. eval_studio_client/api/test/test_required_the_updated_workflow.py +88 -0
  117. eval_studio_client/api/test/test_required_the_updated_workflow_node.py +80 -0
  118. eval_studio_client/api/test/test_test_service_api.py +12 -0
  119. eval_studio_client/api/test/test_test_service_generate_test_cases_request.py +4 -1
  120. eval_studio_client/api/test/test_test_service_import_test_cases_from_library_request.py +56 -0
  121. eval_studio_client/api/test/test_test_service_list_test_case_library_items_request.py +63 -0
  122. eval_studio_client/api/test/test_test_service_perturb_test_request.py +4 -1
  123. eval_studio_client/api/test/test_v1_batch_delete_test_cases_response.py +4 -1
  124. eval_studio_client/api/test/test_v1_batch_delete_workflows_request.py +53 -0
  125. eval_studio_client/api/test/test_v1_batch_delete_workflows_response.py +92 -0
  126. eval_studio_client/api/test/test_v1_batch_get_workflow_edges_response.py +64 -0
  127. eval_studio_client/api/test/test_v1_batch_get_workflow_nodes_response.py +84 -0
  128. eval_studio_client/api/test/test_v1_create_evaluation_request.py +20 -2
  129. eval_studio_client/api/test/test_v1_create_test_case_response.py +4 -1
  130. eval_studio_client/api/test/test_v1_create_workflow_response.py +90 -0
  131. eval_studio_client/api/test/test_v1_delete_test_case_response.py +4 -1
  132. eval_studio_client/api/test/test_v1_delete_workflow_node_response.py +82 -0
  133. eval_studio_client/api/test/test_v1_delete_workflow_response.py +90 -0
  134. eval_studio_client/api/test/test_v1_estimate_threshold_request.py +60 -0
  135. eval_studio_client/api/test/test_v1_evaluation_test.py +4 -1
  136. eval_studio_client/api/test/test_v1_find_all_test_cases_by_id_response.py +4 -1
  137. eval_studio_client/api/test/test_v1_get_test_case_response.py +4 -1
  138. eval_studio_client/api/test/test_v1_get_workflow_node_prerequisites_response.py +56 -0
  139. eval_studio_client/api/test/test_v1_get_workflow_node_response.py +82 -0
  140. eval_studio_client/api/test/test_v1_get_workflow_response.py +90 -0
  141. eval_studio_client/api/test/test_v1_import_evaluation_request.py +16 -1
  142. eval_studio_client/api/test/test_v1_import_test_cases_from_library_response.py +71 -0
  143. eval_studio_client/api/test/test_v1_import_test_cases_request.py +57 -0
  144. eval_studio_client/api/test/test_v1_labeled_test_case.py +53 -0
  145. eval_studio_client/api/test/test_v1_list_prompt_library_items_response.py +71 -0
  146. eval_studio_client/api/test/test_v1_list_test_case_library_items_response.py +71 -0
  147. eval_studio_client/api/test/test_v1_list_test_cases_response.py +4 -1
  148. eval_studio_client/api/test/test_v1_list_workflows_response.py +92 -0
  149. eval_studio_client/api/test/test_v1_process_workflow_node_response.py +71 -0
  150. eval_studio_client/api/test/test_v1_prompt_library_item.py +68 -0
  151. eval_studio_client/api/test/test_v1_test_case.py +4 -1
  152. eval_studio_client/api/test/test_v1_test_suite_evaluates.py +33 -0
  153. eval_studio_client/api/test/test_v1_update_test_case_response.py +4 -1
  154. eval_studio_client/api/test/test_v1_update_workflow_node_response.py +82 -0
  155. eval_studio_client/api/test/test_v1_update_workflow_response.py +90 -0
  156. eval_studio_client/api/test/test_v1_workflow.py +89 -0
  157. eval_studio_client/api/test/test_v1_workflow_edge.py +61 -0
  158. eval_studio_client/api/test/test_v1_workflow_edge_type.py +33 -0
  159. eval_studio_client/api/test/test_v1_workflow_node.py +81 -0
  160. eval_studio_client/api/test/test_v1_workflow_node_artifact.py +61 -0
  161. eval_studio_client/api/test/test_v1_workflow_node_artifacts.py +64 -0
  162. eval_studio_client/api/test/test_v1_workflow_node_attributes.py +51 -0
  163. eval_studio_client/api/test/test_v1_workflow_node_status.py +33 -0
  164. eval_studio_client/api/test/test_v1_workflow_node_type.py +33 -0
  165. eval_studio_client/api/test/test_v1_workflow_node_view.py +33 -0
  166. eval_studio_client/api/test/test_v1_workflow_type.py +33 -0
  167. eval_studio_client/api/test/test_workflow_edge_service_api.py +38 -0
  168. eval_studio_client/api/test/test_workflow_node_service_api.py +73 -0
  169. eval_studio_client/api/test/test_workflow_service_api.py +73 -0
  170. eval_studio_client/client.py +7 -0
  171. eval_studio_client/dashboards.py +29 -0
  172. eval_studio_client/gen/openapiv2/eval_studio.swagger.json +2665 -794
  173. eval_studio_client/leaderboards.py +123 -0
  174. eval_studio_client/models.py +3 -42
  175. eval_studio_client/test_labs.py +49 -21
  176. eval_studio_client/tests.py +188 -1
  177. {eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/METADATA +1 -2
  178. {eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/RECORD +179 -50
  179. {eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/WHEEL +1 -1
eval_studio_client/leaderboards.py

@@ -1,6 +1,7 @@
 import dataclasses
 import datetime
 import json
+import os
 import time
 from typing import Dict
 from typing import List
@@ -33,6 +34,7 @@ class Leaderboard:
     update_time: Optional[datetime.datetime] = None
     problems: List[p6s.Problem] = dataclasses.field(default_factory=list)
     insights: List[i6s.Insight] = dataclasses.field(default_factory=list)
+    summary: Optional[str] = None
     existing_collection: Optional[str] = None
     _report: Optional[str] = None
     _leaderboard: Optional[str] = None
@@ -86,6 +88,42 @@ class Leaderboard:
         if self._client:
             self._leaderboard_api.leaderboard_service_delete_leaderboard(self.key)

+    def download_result(self, dest: str):
+        """Downloads the leaderboard result to a JSON file.
+
+        Args:
+            dest (str): The destination path for the report.
+        """
+        if not os.path.exists(dest):
+            raise ValueError("Destination path does not exist.")
+
+        if os.path.isdir(dest):
+            dest = os.path.join(dest, "results.json")
+
+        if self._client and self.finished:
+            headers: Dict[str, str] = {}
+            url = urljoin(
+                self._client.configuration.host, f"/content/{self.key}/results"
+            )
+            self._client.update_params_for_auth(
+                headers=headers,
+                queries=[],
+                auth_settings=[],
+                resource_path=url,
+                method="GET",
+                body=None,
+            )
+            response = urllib3.request("GET", url, headers=headers)
+
+            if response.status == 200:
+                with open(dest, "wb") as f:
+                    f.write(response.data)
+                return
+            else:
+                raise RuntimeError("Failed to retrieve leaderboard result.")
+
+        raise ValueError("Cannot download result for unfinished leaderboard.")
+
     def download_report(self, dest: str):
         """Downloads the leaderboard report to a zip file.

@@ -114,6 +152,30 @@ class Leaderboard:

         raise ValueError("Cannot download report for unfinished leaderboard.")

+    def get_result_json(self) -> str:
+        """Retrieves the leaderboard result as a JSON string."""
+        if self._client and self.finished:
+            headers: Dict[str, str] = {}
+            url = urljoin(
+                self._client.configuration.host, f"/content/{self.key}/results"
+            )
+            self._client.update_params_for_auth(
+                headers=headers,
+                queries=[],
+                auth_settings=[],
+                resource_path=url,
+                method="GET",
+                body=None,
+            )
+            response = urllib3.request("GET", url, headers=headers)
+
+            if response.status == 200:
+                return str(response.data)
+            else:
+                raise RuntimeError("Failed to retrieve leaderboard result.")
+
+        raise ValueError("Cannot download result for unfinished leaderboard.")
+
     def get_table(self) -> LeaderboardTable:
         """Retrieves the leaderboard table."""
         if self._client and self.finished:
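The new download_result and get_result_json methods both read the leaderboard result from the /content/{key}/results endpoint of the connected host. A minimal usage sketch follows; how the Leaderboard handle lb is obtained is an assumption, only the two methods themselves come from the diff above.

    import os

    def save_and_show_result(lb, dest_dir: str) -> None:
        # download_result() requires an existing destination path.
        os.makedirs(dest_dir, exist_ok=True)
        lb.download_result(dest_dir)        # writes <dest_dir>/results.json
        print(lb.get_result_json()[:200])   # same payload returned as a string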
@@ -170,6 +232,7 @@ class Leaderboard:
         """Refresh the leaderboard with the latest API data."""
         self.key = api_leaderboard.name or ""
         self.update_time = api_leaderboard.update_time
+        self.summary = api_leaderboard.leaderboard_summary
         self._leaderboard = api_leaderboard.leaderboard_table
         self._report = api_leaderboard.leaderboard_report or ""
         self._status = api_leaderboard.status
@@ -192,6 +255,7 @@ class Leaderboard:
             update_time=api_leaderboard.update_time,
             problems=problems,
             insights=insights,
+            summary=api_leaderboard.leaderboard_summary,
             existing_collection=api_leaderboard.h2ogpte_collection or None,
             _evaluator_name=api_leaderboard.evaluator or "",
             _test_names=api_leaderboard.tests or [],
@@ -208,3 +272,62 @@ class Leaderboard:
             models.V1LeaderboardStatus.LEADERBOARD_STATUS_COMPLETED,
             models.V1LeaderboardStatus.LEADERBOARD_STATUS_FAILED,
         ]
+
+    @staticmethod
+    def from_operation(
+        operation: models.V1Operation, client: Optional[api.ApiClient]
+    ) -> Optional["Leaderboard"]:
+        """Retrieves the leaderboard from the operation, which created it.
+
+        Args:
+            operation: The operation that created the dashboard.
+            client: The API client to use for the leaderboard retrieval.
+
+        Returns:
+            Leaderboard: The leaderboard instance created by the operation.
+        """
+        if not client:
+            raise RuntimeError("API Client is not provided")
+
+        if not operation.metadata:
+            raise RuntimeError(
+                "Operation metadata missing, it's not possible to retrieve leaderboard from operation"
+            )
+
+        leaderboard_api = api.LeaderboardServiceApi(client)
+        leadeboard_id = operation.metadata.to_dict().get("leaderboard", "")
+        res = leaderboard_api.leaderboard_service_get_leaderboard(str(leadeboard_id))
+        if res and res.leaderboard:
+            return Leaderboard._from_api_leaderboard(res.leaderboard, client)
+
+        return None
+
+
+class _Leaderboards:
+    def __init__(self, client: api.ApiClient):
+        self._client = client
+        self._api = api.LeaderboardServiceApi(client)
+
+    def get(self, key: str) -> Leaderboard:
+        """Gets an individual leaderboard with a given key from Eval Studio.
+
+        Args:
+            key: The leaderboard resource name to retrieve.
+        """
+        res = self._api.leaderboard_service_get_leaderboard(key)
+        if res and res.leaderboard:
+            return Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
+
+        raise KeyError("Leaderboard not found.")
+
+    def list(self) -> List[Leaderboard]:
+        """Lists all user leaderboards in Eval Studio."""
+        res = self._api.leaderboard_service_list_leaderboards()
+        if res:
+            res_leaderboards = res.leaderboards or []
+            return [
+                Leaderboard._from_api_leaderboard(lb, self._client)
+                for lb in res_leaderboards
+            ]
+
+        return []
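Leaderboard.from_operation replaces the per-class _get_leaderboard_from_operation helpers that are removed from models.py and test_labs.py below, so any long-running import operation can be resolved to a Leaderboard in one place. A sketch under that assumption, with the surrounding request and API objects treated as given:

    from eval_studio_client import leaderboards as l10s

    def leaderboard_from_import(leaderboard_api, client, request):
        # Launch the import and resolve the returned operation to a Leaderboard.
        res = leaderboard_api.leaderboard_service_import_leaderboard(request)
        if res and res.operation:
            return l10s.Leaderboard.from_operation(res.operation, client)
        return None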
eval_studio_client/models.py

@@ -168,7 +168,7 @@ class Model:
         )

         if res and res.operation:
-            return self._get_leaderboard_from_operation(res.operation)
+            return l10s.Leaderboard.from_operation(res.operation, self._client)

         return None

@@ -226,7 +226,7 @@ class Model:
         )

         if res and res.operation:
-            return self._get_dashboard_from_operation(res.operation)
+            return d8s.Dashboard.from_operation(res.operation, self._client)

         return None

@@ -257,7 +257,7 @@ class Model:
         )
         res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
         if res and res.operation:
-            return self._get_leaderboard_from_operation(res.operation)
+            return l10s.Leaderboard.from_operation(res.operation, self._client)

         return None

@@ -273,45 +273,6 @@ class Model:

         raise RuntimeError("Failed to list base models")

-    def _get_leaderboard_from_operation(
-        self, operation: models.V1Operation
-    ) -> Optional[l10s.Leaderboard]:
-        """Retrieves the leaderboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the leaderboard.
-        """
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve leaderboard from operation")
-
-        leadeboard_id = operation.metadata.to_dict().get("leaderboard")
-        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leadeboard_id)
-        if res and res.leaderboard:
-            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
-
-        return None
-
-    def _get_dashboard_from_operation(
-        self, operation: models.V1Operation
-    ) -> Optional[d8s.Dashboard]:
-        """Retrieves the dashboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the dashboard.
-        """
-        if not self._client:
-            raise RuntimeError("Client is not set.")
-
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve dashboard from operation")
-
-        dashboard_id = operation.metadata.to_dict().get("dashboard")
-        res = self._dashboard_api.dashboard_service_get_dashboard(dashboard_id)
-        if res and res.dashboard:
-            return d8s.Dashboard._from_api_dashboard(res.dashboard, self._client)
-
-        return None
-
     @staticmethod
     def _from_api_model(api_model: models.V1Model, client: api.ApiClient) -> "Model":
         """Converts the API model to the client model."""
eval_studio_client/test_labs.py

@@ -7,7 +7,8 @@ from typing import Union
 import uuid

 from eval_studio_client import api
-from eval_studio_client import evaluators
+from eval_studio_client import dashboards
+from eval_studio_client import evaluators as e8s
 from eval_studio_client import leaderboards as l10s
 from eval_studio_client.api import models as apiModels

@@ -92,11 +93,56 @@ class TestLab:
             self._models.append(_m)
         return _m

-    def evaluate(self, evaluator: evaluators.Evaluator) -> Optional[l10s.Leaderboard]:
+    def evaluate(
+        self,
+        evaluators: Union[e8s.Evaluator, List[e8s.Evaluator]],
+        name: Optional[str] = None,
+        description: Optional[str] = None,
+    ) -> Optional[dashboards.Dashboard]:
         """Runs an evaluation for the test lab.

+        Args:
+            evaluators (Union[e8s.Evaluator, List[e8s.Evaluator]]): One or many evaluators
+                used to evaluate the test lab.
+            name (str, optional): Optional name for the evaluation.
+            description (str, optional): Optional description for the evaluation.
+
+        Returns:
+            Dashboard: Evaluation dashboard instance. In case launching of evaluation
+                fails, `None` is returned.
+        """
+        _evaluators = (
+            [evaluators] if isinstance(evaluators, e8s.Evaluator) else evaluators
+        )
+        name = name or self.name or "Imported Dashboard"
+        description = description or self.description or ""
+        req = apiModels.V1BatchImportLeaderboardRequest(
+            testLabJson=self.json(),
+            evaluators=[e.key for e in _evaluators],
+            model=None,
+            dashboardDisplayName=name,
+            dashboardDescription=description,
+            testDisplayName=f"{name} - Test",
+            testDescription=f"Test suite for {description}",
+        )
+        res = self._leaderboard_api.leaderboard_service_batch_import_leaderboard(req)
+
+        if res and res.operation:
+            return dashboards.Dashboard.from_operation(res.operation, self._client)
+
+        return None
+
+    def create_leaderboard(
+        self, evaluator: e8s.Evaluator
+    ) -> Optional[l10s.Leaderboard]:
+        """Creates a single leaderboard for the test lab.
+
         Args:
             evaluator: The evaluator to use for the evaluation.
+
+        Returns:
+            Leaderboard: Single evaluation leaderboard instance.
+                In case launching of evaluation fails, `None` is returned.
         """
         req = apiModels.V1ImportLeaderboardRequest(
             testLabJson=self.json(),
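TestLab.evaluate now accepts one or many evaluators and returns a Dashboard, while the previous single-leaderboard behaviour moves to create_leaderboard. A usage sketch, assuming lab is an existing TestLab and rag_eval, safety_eval are Evaluator instances obtained elsewhere:

    # Multi-evaluator run: returns a Dashboard (or None if launching fails).
    dashboard = lab.evaluate(
        [rag_eval, safety_eval],
        name="Nightly RAG evaluation",
        description="Accuracy and safety checks",
    )

    # Single-evaluator run: returns a Leaderboard instead of a Dashboard.
    leaderboard = lab.create_leaderboard(rag_eval)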
@@ -109,7 +155,7 @@ class TestLab:
         )
         res = self._leaderboard_api.leaderboard_service_import_leaderboard(req)
         if res and res.operation:
-            return self._get_leaderboard_from_operation(res.operation)
+            return l10s.Leaderboard.from_operation(res.operation, self._client)

         return None

@@ -131,24 +177,6 @@ class TestLab:

         return json.dumps(lab, indent=4, sort_keys=True)

-    def _get_leaderboard_from_operation(
-        self, operation: apiModels.V1Operation
-    ) -> Optional[l10s.Leaderboard]:
-        """Retrieves the leaderboard from the operation, which created it.
-
-        Args:
-            operation: The operation that created the leaderboard.
-        """
-        if not operation.metadata:
-            raise RuntimeError("Not possible to retrieve leaderboard from operation")
-
-        leadeboard_id = operation.metadata.to_dict().get("leaderboard")
-        res = self._leaderboard_api.leaderboard_service_get_leaderboard(leadeboard_id)
-        if res and res.leaderboard:
-            return l10s.Leaderboard._from_api_leaderboard(res.leaderboard, self._client)
-
-        return None
-
     def _llm_model_names(self) -> List[str]:
         return [m.llm_model_name for m in self.models]

eval_studio_client/tests.py

@@ -118,6 +118,71 @@ class _TestCaseGenerationHandle:
         )


+@dataclasses.dataclass
+class _TestCaseLibraryGetHandle(_TestCaseGenerationHandle):
+
+    @staticmethod
+    def _from_operation(
+        res: (
+            models.V1ImportTestCasesFromLibraryResponse | models.V1GetOperationResponse
+        ),
+    ) -> "_TestCaseLibraryGetHandle":
+        """Converts an API operation to prompt library handle."""
+        op: models.V1Operation | None = res.operation
+        if not op:
+            return _TestCaseLibraryGetHandle(name=None)
+
+        # progress
+        if hasattr(op, "metadata") and op.metadata:
+            meta_dict = op.metadata.to_dict() or {}
+        else:
+            meta_dict = {}
+
+        return _TestCaseLibraryGetHandle(
+            name=op.name,
+            progress=meta_dict.get("progress"),
+            progress_message=meta_dict.get("progressMessage"),
+            error=op.error,
+            done=op.done,
+        )
+
+
+@dataclasses.dataclass
+class TestCaseLibraryItem:
+    """Represents a single test case library item - test suite."""
+
+    key: str
+    name: str
+    description: str
+    test_suite_url: str
+    test_count: int
+    test_case_count: int
+    evaluates: List[str]
+    categories: List[str]
+
+    @staticmethod
+    def _from_api_items(
+        api_items: List[models.V1PromptLibraryItem],
+    ) -> List["TestCaseLibraryItem"]:
+        return (
+            [
+                TestCaseLibraryItem(
+                    key=api_item.name or "",
+                    name=api_item.display_name or "",
+                    description=api_item.description or "",
+                    test_suite_url=api_item.test_suite_url or "",
+                    test_count=api_item.test_count or 0,
+                    test_case_count=api_item.test_case_count or 0,
+                    evaluates=list(api_item.evaluates) if api_item.evaluates else [],
+                    categories=list(api_item.categories) if api_item.categories else [],
+                )
+                for api_item in api_items
+            ]
+            if api_items
+            else []
+        )
+
+
 @dataclasses.dataclass
 class TestCase:
     """Represents a single test case, which contains tested prompt, expected answer
@@ -183,6 +248,7 @@ class Test:
     update_time: Optional[datetime.datetime] = None
     _client: Optional[api.ApiClient] = None
     _gen_tc_op_name: Optional[str] = None
+    _lib_tc_op_name: Optional[str] = None

    def __post_init__(self):
        if self._client:
@@ -267,7 +333,7 @@ class Test:

         Args:
             count (int): Number of test cases to generate (generator may return fewer
-            prompts).
+                prompts).
             model (str): Model to use for generating the prompts.
             base_llm_model (str): Base LLM model to use for generating the prompts.
             generators (List[TestCaseGenerator]): Methods to use for generation.
@@ -342,6 +408,127 @@ class Test:

         raise TimeoutError("Waiting timeout has been reached.")

+    def list_test_suite_library_items(
+        self,
+        filter_by_categories: Optional[List[str]] = None,
+        filter_by_purposes: Optional[List[str]] = None,
+        filter_by_evaluates: Optional[List[str]] = None,
+        filter_by_origin: Optional[str] = None,
+        filter_by_test_case_count: Optional[int] = None,
+        filter_by_test_count: Optional[int] = None,
+        filter_by_fts: Optional[str] = None,
+    ) -> List[TestCaseLibraryItem]:
+        """Retrieves a list of all available items - suites of tests - in the library.
+
+        Args:
+            filter_by_categories (List[str]): List of categories to filter
+                the library items.
+            filter_by_purposes (List[str]): List of purposes to filter
+                the library items.
+            filter_by_evaluates (List[str]): List of evaluates to filter
+                the library items.
+            filter_by_origin (str): Origin to filter the library items.
+            filter_by_test_case_count (int): Test case count to filter
+                the library items.
+            filter_by_test_count (int): Test count to filter the library items.
+            filter_by_fts (str): FTS to filter the library items - phrase to search for.
+
+        Returns:
+            List[TestCaseLibraryItem]: List of library items.
+        """
+        req = models.TestServiceListTestCaseLibraryItemsRequest(
+            filter_by_categories=filter_by_categories,
+            filter_by_purposes=filter_by_purposes,
+            filter_by_evaluates=filter_by_evaluates,
+            filter_by_origin=filter_by_origin,
+            filter_by_test_case_count=filter_by_test_case_count,
+            filter_by_test_count=filter_by_test_count,
+            filter_by_fts=filter_by_fts,
+        )
+
+        res = self._test_api.test_service_list_test_case_library_items(self.key, req)
+        if res and res.prompt_library_items:
+            return TestCaseLibraryItem._from_api_items(res.prompt_library_items)
+
+        return []
+
+    def add_library_test_cases(
+        self, test_suite_url: str, count: int, test_document_urls: Optional[List[str]]
+    ) -> None:
+        """Sample test cases from the test suite library and add them to the test.
+
+        Args:
+            test_suite_url (str): The URL of the library test suite to get TestCases
+                from (sample).
+            count (int): The number of TestCases to get from the library.
+            test_document_urls (List[str]): The list of target Test corpus
+                document URLs to skip when returning library TestCases corpus.
+        """
+        req = models.TestServiceImportTestCasesFromLibraryRequest(
+            test_suite_url=test_suite_url,
+            count=count,
+            test_document_urls=test_document_urls,
+        )

+        res = self._test_api.test_service_import_test_cases_from_library(self.key, req)
+
+        op: models.V1Operation | None = res.operation
+        self._lib_tc_op_name = op.name if op else None
+
+    def wait_for_library_test_case_get(
+        self, timeout: Optional[float] = None, verbose: bool = False
+    ) -> None:
+        """Waits for the library test cases(s) sampling to finish.
+
+        Args:
+            timeout (float): The maximum time to wait in seconds.
+            verbose (bool): If True, prints the status of the handle while waiting.
+        """
+        if not self._lib_tc_op_name:
+            raise ValueError(
+                "There is no ongoing getting of test case(s) from the library - "
+                "the operation name is not set."
+            )
+
+        if verbose:
+            print(
+                f"Waiting for getting library test case(s) operation to finish "
+                f"({self._lib_tc_op_name}):"
+            )
+        if self._client:
+            # exponential backoff
+            wait_time = 1.0
+            wait_coef = 1.6
+            wait_max = 8.0
+            wait_total = 0.0
+            timeout = timeout or float(2 * 24 * 60 * 60)  # 2 days
+            progress_bar = utils.ProgressBar()
+            while wait_total < timeout:
+                handle = _TestCaseLibraryGetHandle._from_operation(
+                    self._operation_api.operation_service_get_operation(
+                        self._lib_tc_op_name
+                    )
+                )
+
+                if verbose:
+                    progress_bar.update(handle.progress or 0, handle.progress_message)
+
+                if handle.done:
+                    if handle.error:
+                        raise RuntimeError(
+                            f"Getting of library test case(s) failed: {handle.error}"
+                        )
+                    return
+
+                wait_time *= wait_coef
+                time.sleep(min(wait_time, wait_max))
+        else:
+            raise ValueError(
+                "Unable to establish a connection to the Eval Studio host."
+            )
+
+        raise TimeoutError("Waiting timeout has been reached.")
+
     def delete(self, force=False):
         """Deletes the test.

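The new Test methods cover browsing the test-suite library, sampling cases from a chosen suite into the test, and waiting for the sampling operation to finish. A sketch, assuming test is an existing Test object; the three method calls are the ones added above:

    items = test.list_test_suite_library_items(filter_by_fts="fairness")
    if items:
        suite = items[0]
        print(suite.name, suite.test_case_count)

        # Sample five cases from the chosen suite, then block until the
        # server-side operation completes (or the timeout is reached).
        test.add_library_test_cases(suite.test_suite_url, count=5, test_document_urls=None)
        test.wait_for_library_test_case_get(timeout=600, verbose=True)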
{eval_studio_client-1.0.0.dist-info → eval_studio_client-1.0.1.dist-info}/METADATA

@@ -1,10 +1,9 @@
 Metadata-Version: 2.3
 Name: eval-studio-client
-Version: 1.0.0
+Version: 1.0.1
 Project-URL: Source, https://github.com/h2oai/eval-studio/tree/main/client-py/src/
 Project-URL: Issues, https://github.com/h2oai/eval-studio/issues
 Author-email: "H2O.ai" <support@h2o.ai>
-License: MIT
 Classifier: Development Status :: 4 - Beta
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3.9