scorable 1.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of scorable might be problematic. Click here for more details.

Files changed (228) hide show
  1. root/__about__.py +4 -0
  2. root/__init__.py +17 -0
  3. root/client.py +207 -0
  4. root/datasets.py +231 -0
  5. root/execution_logs.py +162 -0
  6. root/generated/__init__.py +0 -0
  7. root/generated/openapi_aclient/__init__.py +1 -0
  8. root/generated/openapi_aclient/api/__init__.py +1 -0
  9. root/generated/openapi_aclient/api/datasets_api.py +1274 -0
  10. root/generated/openapi_aclient/api/evaluators_api.py +3641 -0
  11. root/generated/openapi_aclient/api/execution_logs_api.py +751 -0
  12. root/generated/openapi_aclient/api/judges_api.py +3794 -0
  13. root/generated/openapi_aclient/api/models_api.py +1473 -0
  14. root/generated/openapi_aclient/api/objectives_api.py +1767 -0
  15. root/generated/openapi_aclient/api_client.py +662 -0
  16. root/generated/openapi_aclient/api_response.py +22 -0
  17. root/generated/openapi_aclient/configuration.py +470 -0
  18. root/generated/openapi_aclient/exceptions.py +197 -0
  19. root/generated/openapi_aclient/models/__init__.py +122 -0
  20. root/generated/openapi_aclient/models/data_set_create.py +118 -0
  21. root/generated/openapi_aclient/models/data_set_create_request.py +105 -0
  22. root/generated/openapi_aclient/models/data_set_list.py +129 -0
  23. root/generated/openapi_aclient/models/data_set_type.py +36 -0
  24. root/generated/openapi_aclient/models/dataset_range_request.py +93 -0
  25. root/generated/openapi_aclient/models/evaluator.py +273 -0
  26. root/generated/openapi_aclient/models/evaluator_calibration_output.py +101 -0
  27. root/generated/openapi_aclient/models/evaluator_calibration_result.py +134 -0
  28. root/generated/openapi_aclient/models/evaluator_demonstrations.py +107 -0
  29. root/generated/openapi_aclient/models/evaluator_demonstrations_request.py +107 -0
  30. root/generated/openapi_aclient/models/evaluator_execution_function_parameter_property_request.py +86 -0
  31. root/generated/openapi_aclient/models/evaluator_execution_function_parameter_request.py +109 -0
  32. root/generated/openapi_aclient/models/evaluator_execution_function_parameter_type_enum.py +35 -0
  33. root/generated/openapi_aclient/models/evaluator_execution_function_request.py +99 -0
  34. root/generated/openapi_aclient/models/evaluator_execution_functions_request.py +98 -0
  35. root/generated/openapi_aclient/models/evaluator_execution_functions_type_enum.py +35 -0
  36. root/generated/openapi_aclient/models/evaluator_execution_request.py +134 -0
  37. root/generated/openapi_aclient/models/evaluator_execution_result.py +114 -0
  38. root/generated/openapi_aclient/models/evaluator_inputs_value.py +100 -0
  39. root/generated/openapi_aclient/models/evaluator_inputs_value_items.py +89 -0
  40. root/generated/openapi_aclient/models/evaluator_list_output.py +198 -0
  41. root/generated/openapi_aclient/models/evaluator_reference.py +90 -0
  42. root/generated/openapi_aclient/models/evaluator_reference_request.py +90 -0
  43. root/generated/openapi_aclient/models/evaluator_request.py +194 -0
  44. root/generated/openapi_aclient/models/evaluator_result.py +110 -0
  45. root/generated/openapi_aclient/models/execution_log_details.py +291 -0
  46. root/generated/openapi_aclient/models/execution_log_details_evaluation_context.py +83 -0
  47. root/generated/openapi_aclient/models/execution_log_details_evaluator_latencies_inner.py +83 -0
  48. root/generated/openapi_aclient/models/execution_log_list.py +217 -0
  49. root/generated/openapi_aclient/models/execution_log_list_evaluation_context.py +83 -0
  50. root/generated/openapi_aclient/models/generation_model_params_request.py +93 -0
  51. root/generated/openapi_aclient/models/id.py +87 -0
  52. root/generated/openapi_aclient/models/input_variable.py +121 -0
  53. root/generated/openapi_aclient/models/input_variable_request.py +82 -0
  54. root/generated/openapi_aclient/models/judge.py +178 -0
  55. root/generated/openapi_aclient/models/judge_execution_request.py +114 -0
  56. root/generated/openapi_aclient/models/judge_execution_response.py +97 -0
  57. root/generated/openapi_aclient/models/judge_files_inner.py +84 -0
  58. root/generated/openapi_aclient/models/judge_generator_request.py +142 -0
  59. root/generated/openapi_aclient/models/judge_generator_response.py +88 -0
  60. root/generated/openapi_aclient/models/judge_invite_request.py +87 -0
  61. root/generated/openapi_aclient/models/judge_list.py +156 -0
  62. root/generated/openapi_aclient/models/judge_rectifier_request_request.py +114 -0
  63. root/generated/openapi_aclient/models/judge_rectifier_response.py +121 -0
  64. root/generated/openapi_aclient/models/judge_request.py +108 -0
  65. root/generated/openapi_aclient/models/model.py +126 -0
  66. root/generated/openapi_aclient/models/model_list.py +115 -0
  67. root/generated/openapi_aclient/models/model_params.py +89 -0
  68. root/generated/openapi_aclient/models/model_params_request.py +89 -0
  69. root/generated/openapi_aclient/models/model_request.py +118 -0
  70. root/generated/openapi_aclient/models/nested_evaluator.py +110 -0
  71. root/generated/openapi_aclient/models/nested_evaluator_objective.py +87 -0
  72. root/generated/openapi_aclient/models/nested_evaluator_request.py +92 -0
  73. root/generated/openapi_aclient/models/nested_objective_evaluator.py +105 -0
  74. root/generated/openapi_aclient/models/nested_objective_evaluator_request.py +92 -0
  75. root/generated/openapi_aclient/models/nested_objective_list.py +111 -0
  76. root/generated/openapi_aclient/models/nested_user_details.py +88 -0
  77. root/generated/openapi_aclient/models/nested_user_details_request.py +82 -0
  78. root/generated/openapi_aclient/models/nested_vector_objective.py +88 -0
  79. root/generated/openapi_aclient/models/nested_vector_objective_request.py +82 -0
  80. root/generated/openapi_aclient/models/objective.py +157 -0
  81. root/generated/openapi_aclient/models/objective_list.py +128 -0
  82. root/generated/openapi_aclient/models/objective_request.py +113 -0
  83. root/generated/openapi_aclient/models/objective_validator.py +100 -0
  84. root/generated/openapi_aclient/models/objective_validator_request.py +90 -0
  85. root/generated/openapi_aclient/models/paginated_data_set_list_list.py +111 -0
  86. root/generated/openapi_aclient/models/paginated_evaluator_list.py +111 -0
  87. root/generated/openapi_aclient/models/paginated_evaluator_list_output_list.py +111 -0
  88. root/generated/openapi_aclient/models/paginated_execution_log_list_list.py +111 -0
  89. root/generated/openapi_aclient/models/paginated_judge_list_list.py +111 -0
  90. root/generated/openapi_aclient/models/paginated_model_list_list.py +111 -0
  91. root/generated/openapi_aclient/models/paginated_objective_list.py +111 -0
  92. root/generated/openapi_aclient/models/paginated_objective_list_list.py +111 -0
  93. root/generated/openapi_aclient/models/patched_evaluator_request.py +194 -0
  94. root/generated/openapi_aclient/models/patched_judge_request.py +110 -0
  95. root/generated/openapi_aclient/models/patched_model_request.py +118 -0
  96. root/generated/openapi_aclient/models/patched_objective_request.py +113 -0
  97. root/generated/openapi_aclient/models/provider.py +99 -0
  98. root/generated/openapi_aclient/models/reference_variable.py +123 -0
  99. root/generated/openapi_aclient/models/reference_variable_request.py +83 -0
  100. root/generated/openapi_aclient/models/skill_execution_validator_result.py +130 -0
  101. root/generated/openapi_aclient/models/skill_test_data_request.py +107 -0
  102. root/generated/openapi_aclient/models/skill_test_data_request_dataset_range.py +93 -0
  103. root/generated/openapi_aclient/models/skill_test_input_request.py +171 -0
  104. root/generated/openapi_aclient/models/skill_type_enum.py +36 -0
  105. root/generated/openapi_aclient/models/status_change.py +84 -0
  106. root/generated/openapi_aclient/models/status_change_request.py +84 -0
  107. root/generated/openapi_aclient/models/status_change_status_enum.py +36 -0
  108. root/generated/openapi_aclient/models/status_enum.py +38 -0
  109. root/generated/openapi_aclient/models/validation_result_status.py +36 -0
  110. root/generated/openapi_aclient/models/visibility_enum.py +38 -0
  111. root/generated/openapi_aclient/rest.py +166 -0
  112. root/generated/openapi_aclient_README.md +239 -0
  113. root/generated/openapi_client/__init__.py +1 -0
  114. root/generated/openapi_client/api/__init__.py +1 -0
  115. root/generated/openapi_client/api/datasets_api.py +1274 -0
  116. root/generated/openapi_client/api/evaluators_api.py +3641 -0
  117. root/generated/openapi_client/api/execution_logs_api.py +751 -0
  118. root/generated/openapi_client/api/judges_api.py +3794 -0
  119. root/generated/openapi_client/api/models_api.py +1473 -0
  120. root/generated/openapi_client/api/objectives_api.py +1767 -0
  121. root/generated/openapi_client/api_client.py +659 -0
  122. root/generated/openapi_client/api_response.py +22 -0
  123. root/generated/openapi_client/configuration.py +474 -0
  124. root/generated/openapi_client/exceptions.py +197 -0
  125. root/generated/openapi_client/models/__init__.py +120 -0
  126. root/generated/openapi_client/models/data_set_create.py +118 -0
  127. root/generated/openapi_client/models/data_set_create_request.py +105 -0
  128. root/generated/openapi_client/models/data_set_list.py +129 -0
  129. root/generated/openapi_client/models/data_set_type.py +36 -0
  130. root/generated/openapi_client/models/dataset_range_request.py +93 -0
  131. root/generated/openapi_client/models/evaluator.py +273 -0
  132. root/generated/openapi_client/models/evaluator_calibration_output.py +101 -0
  133. root/generated/openapi_client/models/evaluator_calibration_result.py +134 -0
  134. root/generated/openapi_client/models/evaluator_demonstrations.py +107 -0
  135. root/generated/openapi_client/models/evaluator_demonstrations_request.py +107 -0
  136. root/generated/openapi_client/models/evaluator_execution_function_parameter_property_request.py +86 -0
  137. root/generated/openapi_client/models/evaluator_execution_function_parameter_request.py +109 -0
  138. root/generated/openapi_client/models/evaluator_execution_function_parameter_type_enum.py +35 -0
  139. root/generated/openapi_client/models/evaluator_execution_function_request.py +99 -0
  140. root/generated/openapi_client/models/evaluator_execution_functions_request.py +98 -0
  141. root/generated/openapi_client/models/evaluator_execution_functions_type_enum.py +35 -0
  142. root/generated/openapi_client/models/evaluator_execution_request.py +134 -0
  143. root/generated/openapi_client/models/evaluator_execution_result.py +114 -0
  144. root/generated/openapi_client/models/evaluator_inputs_value.py +100 -0
  145. root/generated/openapi_client/models/evaluator_inputs_value_items.py +89 -0
  146. root/generated/openapi_client/models/evaluator_list_output.py +198 -0
  147. root/generated/openapi_client/models/evaluator_reference.py +90 -0
  148. root/generated/openapi_client/models/evaluator_reference_request.py +90 -0
  149. root/generated/openapi_client/models/evaluator_request.py +194 -0
  150. root/generated/openapi_client/models/evaluator_result.py +110 -0
  151. root/generated/openapi_client/models/execution_log_details.py +291 -0
  152. root/generated/openapi_client/models/execution_log_details_evaluation_context.py +83 -0
  153. root/generated/openapi_client/models/execution_log_details_evaluator_latencies_inner.py +83 -0
  154. root/generated/openapi_client/models/execution_log_list.py +215 -0
  155. root/generated/openapi_client/models/execution_log_list_evaluation_context.py +83 -0
  156. root/generated/openapi_client/models/generation_model_params_request.py +93 -0
  157. root/generated/openapi_client/models/id.py +87 -0
  158. root/generated/openapi_client/models/input_variable.py +121 -0
  159. root/generated/openapi_client/models/input_variable_request.py +82 -0
  160. root/generated/openapi_client/models/judge.py +178 -0
  161. root/generated/openapi_client/models/judge_execution_request.py +114 -0
  162. root/generated/openapi_client/models/judge_execution_response.py +97 -0
  163. root/generated/openapi_client/models/judge_files_inner.py +84 -0
  164. root/generated/openapi_client/models/judge_generator_request.py +142 -0
  165. root/generated/openapi_client/models/judge_generator_response.py +88 -0
  166. root/generated/openapi_client/models/judge_invite_request.py +87 -0
  167. root/generated/openapi_client/models/judge_list.py +156 -0
  168. root/generated/openapi_client/models/judge_rectifier_request_request.py +114 -0
  169. root/generated/openapi_client/models/judge_rectifier_response.py +121 -0
  170. root/generated/openapi_client/models/judge_request.py +108 -0
  171. root/generated/openapi_client/models/model.py +126 -0
  172. root/generated/openapi_client/models/model_list.py +115 -0
  173. root/generated/openapi_client/models/model_params.py +89 -0
  174. root/generated/openapi_client/models/model_params_request.py +89 -0
  175. root/generated/openapi_client/models/model_request.py +118 -0
  176. root/generated/openapi_client/models/nested_evaluator.py +110 -0
  177. root/generated/openapi_client/models/nested_evaluator_objective.py +87 -0
  178. root/generated/openapi_client/models/nested_evaluator_request.py +92 -0
  179. root/generated/openapi_client/models/nested_objective_evaluator.py +105 -0
  180. root/generated/openapi_client/models/nested_objective_evaluator_request.py +92 -0
  181. root/generated/openapi_client/models/nested_objective_list.py +111 -0
  182. root/generated/openapi_client/models/nested_user_details.py +88 -0
  183. root/generated/openapi_client/models/nested_user_details_request.py +82 -0
  184. root/generated/openapi_client/models/nested_vector_objective.py +88 -0
  185. root/generated/openapi_client/models/nested_vector_objective_request.py +82 -0
  186. root/generated/openapi_client/models/objective.py +157 -0
  187. root/generated/openapi_client/models/objective_list.py +128 -0
  188. root/generated/openapi_client/models/objective_request.py +113 -0
  189. root/generated/openapi_client/models/objective_validator.py +100 -0
  190. root/generated/openapi_client/models/objective_validator_request.py +90 -0
  191. root/generated/openapi_client/models/paginated_data_set_list_list.py +111 -0
  192. root/generated/openapi_client/models/paginated_evaluator_list.py +111 -0
  193. root/generated/openapi_client/models/paginated_evaluator_list_output_list.py +111 -0
  194. root/generated/openapi_client/models/paginated_execution_log_list_list.py +111 -0
  195. root/generated/openapi_client/models/paginated_judge_list_list.py +111 -0
  196. root/generated/openapi_client/models/paginated_model_list_list.py +111 -0
  197. root/generated/openapi_client/models/paginated_objective_list.py +111 -0
  198. root/generated/openapi_client/models/paginated_objective_list_list.py +111 -0
  199. root/generated/openapi_client/models/patched_evaluator_request.py +194 -0
  200. root/generated/openapi_client/models/patched_judge_request.py +110 -0
  201. root/generated/openapi_client/models/patched_model_request.py +118 -0
  202. root/generated/openapi_client/models/patched_objective_request.py +113 -0
  203. root/generated/openapi_client/models/provider.py +99 -0
  204. root/generated/openapi_client/models/reference_variable.py +123 -0
  205. root/generated/openapi_client/models/reference_variable_request.py +83 -0
  206. root/generated/openapi_client/models/skill_execution_validator_result.py +130 -0
  207. root/generated/openapi_client/models/skill_test_data_request.py +107 -0
  208. root/generated/openapi_client/models/skill_test_data_request_dataset_range.py +93 -0
  209. root/generated/openapi_client/models/skill_test_input_request.py +171 -0
  210. root/generated/openapi_client/models/skill_type_enum.py +36 -0
  211. root/generated/openapi_client/models/status_change.py +84 -0
  212. root/generated/openapi_client/models/status_change_request.py +84 -0
  213. root/generated/openapi_client/models/status_change_status_enum.py +36 -0
  214. root/generated/openapi_client/models/status_enum.py +38 -0
  215. root/generated/openapi_client/models/validation_result_status.py +36 -0
  216. root/generated/openapi_client/models/visibility_enum.py +38 -0
  217. root/generated/openapi_client/rest.py +203 -0
  218. root/generated/openapi_client_README.md +238 -0
  219. root/judges.py +681 -0
  220. root/models.py +197 -0
  221. root/objectives.py +343 -0
  222. root/py.typed +0 -0
  223. root/skills.py +1707 -0
  224. root/utils.py +90 -0
  225. scorable-1.6.4.dist-info/METADATA +395 -0
  226. scorable-1.6.4.dist-info/RECORD +228 -0
  227. scorable-1.6.4.dist-info/WHEEL +4 -0
  228. scorable-1.6.4.dist-info/licenses/LICENSE +202 -0
root/skills.py ADDED
@@ -0,0 +1,1707 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import math
5
+ from collections import defaultdict
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from contextlib import AbstractAsyncContextManager, AbstractContextManager
8
+ from enum import Enum
9
+ from functools import partial
10
+ from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Literal, Optional, Union, cast
11
+
12
+ from pydantic import BaseModel, StrictStr
13
+
14
+ from root.generated.openapi_aclient.models.evaluator_request import EvaluatorRequest as AEvaluatorRequest
15
+ from root.generated.openapi_aclient.models.paginated_evaluator_list import (
16
+ PaginatedEvaluatorList as APaginatedEvaluatorList,
17
+ )
18
+ from root.generated.openapi_aclient.models.paginated_evaluator_list_output_list import (
19
+ PaginatedEvaluatorListOutputList as APaginatedEvaluatorListOutputList,
20
+ )
21
+ from root.generated.openapi_client.models.evaluator_request import EvaluatorRequest
22
+ from root.generated.openapi_client.models.paginated_evaluator_list import PaginatedEvaluatorList
23
+
24
+ from .generated.openapi_aclient import ApiClient as AApiClient
25
+ from .generated.openapi_aclient.api.evaluators_api import EvaluatorsApi as AEvaluatorsApi
26
+ from .generated.openapi_aclient.api.objectives_api import ObjectivesApi as AObjectivesApi
27
+ from .generated.openapi_aclient.models import (
28
+ EvaluatorDemonstrationsRequest as AEvaluatorDemonstrationsRequest,
29
+ )
30
+ from .generated.openapi_aclient.models import (
31
+ EvaluatorExecutionFunctionsRequest as AEvaluatorExecutionFunctionsRequest,
32
+ )
33
+ from .generated.openapi_aclient.models import (
34
+ EvaluatorExecutionRequest as AEvaluatorExecutionRequest,
35
+ )
36
+ from .generated.openapi_aclient.models import (
37
+ EvaluatorExecutionResult as AEvaluatorExecutionResult,
38
+ )
39
+ from .generated.openapi_aclient.models import (
40
+ ModelParamsRequest as AModelParamsRequest,
41
+ )
42
+ from .generated.openapi_aclient.models.evaluator import Evaluator as AOpenAPIEvaluator
43
+ from .generated.openapi_aclient.models.evaluator_calibration_output import (
44
+ EvaluatorCalibrationOutput as AEvaluatorCalibrationOutput,
45
+ )
46
+ from .generated.openapi_aclient.models.evaluator_list_output import EvaluatorListOutput as AEvaluatorListOutput
47
+ from .generated.openapi_aclient.models.input_variable_request import InputVariableRequest as AInputVariableRequest
48
+ from .generated.openapi_aclient.models.objective_request import ObjectiveRequest as AObjectiveRequest
49
+ from .generated.openapi_aclient.models.patched_evaluator_request import (
50
+ PatchedEvaluatorRequest as APatchedEvaluatorRequest,
51
+ )
52
+ from .generated.openapi_aclient.models.reference_variable_request import (
53
+ ReferenceVariableRequest as AReferenceVariableRequest,
54
+ )
55
+ from .generated.openapi_aclient.models.skill_test_data_request import SkillTestDataRequest as ASkillTestDataRequest
56
+ from .generated.openapi_aclient.models.skill_test_input_request import (
57
+ SkillTestInputRequest as ASkillTestInputRequest,
58
+ )
59
+ from .generated.openapi_client import ApiClient as ApiClient
60
+ from .generated.openapi_client.api.evaluators_api import EvaluatorsApi as EvaluatorsApi
61
+ from .generated.openapi_client.api.objectives_api import ObjectivesApi as ObjectivesApi
62
+ from .generated.openapi_client.models.evaluator_calibration_output import EvaluatorCalibrationOutput
63
+ from .generated.openapi_client.models.evaluator_demonstrations_request import (
64
+ EvaluatorDemonstrationsRequest,
65
+ )
66
+ from .generated.openapi_client.models.evaluator_execution_functions_request import (
67
+ EvaluatorExecutionFunctionsRequest,
68
+ )
69
+ from .generated.openapi_client.models.evaluator_execution_request import EvaluatorExecutionRequest
70
+ from .generated.openapi_client.models.evaluator_execution_result import EvaluatorExecutionResult
71
+ from .generated.openapi_client.models.evaluator_list_output import EvaluatorListOutput
72
+ from .generated.openapi_client.models.input_variable_request import InputVariableRequest
73
+ from .generated.openapi_client.models.model_params_request import ModelParamsRequest
74
+ from .generated.openapi_client.models.objective_request import ObjectiveRequest
75
+ from .generated.openapi_client.models.patched_evaluator_request import PatchedEvaluatorRequest
76
+ from .generated.openapi_client.models.reference_variable_request import ReferenceVariableRequest
77
+ from .generated.openapi_client.models.skill_test_data_request import SkillTestDataRequest
78
+ from .generated.openapi_client.models.skill_test_input_request import SkillTestInputRequest
79
+ from .utils import ClientContextCallable, aiterate_cursor_list, iterate_cursor_list, with_async_client, with_sync_client
80
+
81
+ if TYPE_CHECKING:
82
+ from .generated.openapi_aclient.models.evaluator import Evaluator as GeneratedEvaluator
83
+ from .generated.openapi_client.models.evaluator import Evaluator as SyncGeneratedEvaluator
84
+
85
+
86
+ ModelName = Union[
87
+ str,
88
+ Literal[
89
+ "root", # RS-chosen model
90
+ ],
91
+ ]
92
+
93
+
94
+ class ModelParams(BaseModel):
95
+ """
96
+ Additional model parameters.
97
+
98
+ All fields are made optional in practice.
99
+ """
100
+
101
+ temperature: Optional[float] = None
102
+
103
+
104
+ class ReferenceVariable(BaseModel):
105
+ """
106
+ Reference variable definition.
107
+
108
+ `name` within prompt gets populated with content from `dataset_id`.
109
+ """
110
+
111
+ name: str
112
+ dataset_id: str
113
+
114
+
115
+ class InputVariable(BaseModel):
116
+ """
117
+ Input variable definition.
118
+
119
+ `name` within prompt gets populated with the provided variable.
120
+ """
121
+
122
+ name: str
123
+
124
+
125
+ class EvaluatorDemonstration(BaseModel):
126
+ """
127
+ Evaluator demonstration
128
+
129
+ Demonstrations are used to train an evaluator to adjust its behavior.
130
+ """
131
+
132
+ request: Optional[str] = None
133
+ response: str
134
+ score: float
135
+ justification: Optional[str] = None
136
+
137
+
138
+ class ACalibrateBatchParameters:
139
+ def __init__(
140
+ self,
141
+ name: str,
142
+ prompt: str,
143
+ model: "ModelName",
144
+ pii_filter: bool = False,
145
+ reference_variables: Optional[Union[List["ReferenceVariable"], List["AReferenceVariableRequest"]]] = None,
146
+ input_variables: Optional[Union[List["InputVariable"], List["AInputVariableRequest"]]] = None,
147
+ ):
148
+ self.name = name
149
+ self.prompt = prompt
150
+ self.model = model
151
+ self.pii_filter = pii_filter
152
+ self.reference_variables = reference_variables
153
+ self.input_variables = input_variables
154
+
155
+
156
+ class ACalibrateBatchResult(BaseModel):
157
+ results: List[AEvaluatorCalibrationOutput]
158
+ rms_errors_model: Dict[str, float]
159
+ mae_errors_model: Dict[str, float]
160
+ rms_errors_prompt: Dict[str, float]
161
+ mae_errors_prompt: Dict[str, float]
162
+
163
+
164
+ class CalibrateBatchParameters:
165
+ def __init__(
166
+ self,
167
+ name: str,
168
+ prompt: str,
169
+ model: "ModelName",
170
+ pii_filter: bool = False,
171
+ reference_variables: Optional[Union[List["ReferenceVariable"], List["ReferenceVariableRequest"]]] = None,
172
+ input_variables: Optional[Union[List["InputVariable"], List["InputVariableRequest"]]] = None,
173
+ ):
174
+ self.name = name
175
+ self.prompt = prompt
176
+ self.model = model
177
+ self.pii_filter = pii_filter
178
+ self.reference_variables = reference_variables
179
+ self.input_variables = input_variables
180
+
181
+
182
+ class CalibrateBatchResult(BaseModel):
183
+ results: List[EvaluatorCalibrationOutput]
184
+ rms_errors_model: Dict[str, float]
185
+ mae_errors_model: Dict[str, float]
186
+ rms_errors_prompt: Dict[str, float]
187
+ mae_errors_prompt: Dict[str, float]
188
+
189
+
190
+ class Versions:
191
+ """
192
+ Version listing (sub)API
193
+
194
+ Note that this should not be directly instantiated.
195
+ """
196
+
197
+ def __init__(self, client_context: ClientContextCallable):
198
+ self.client_context = client_context
199
+
200
+ @with_sync_client
201
+ def list(self, evaluator_id: str, *, _client: ApiClient) -> PaginatedEvaluatorList:
202
+ """
203
+ List all versions of a evaluator.
204
+ """
205
+
206
+ api_instance = EvaluatorsApi(_client)
207
+ return api_instance.evaluators_versions_list(id=evaluator_id)
208
+
209
+ async def alist(self, evaluator_id: str) -> APaginatedEvaluatorList:
210
+ """
211
+ Asynchronously list all versions of a evaluator.
212
+ """
213
+
214
+ context = self.client_context()
215
+ assert isinstance(context, AbstractAsyncContextManager), "This method is not available in synchronous mode"
216
+ async with context as client:
217
+ api_instance = AEvaluatorsApi(client)
218
+ return await api_instance.evaluators_versions_list(id=evaluator_id)
219
+
220
+
221
+ class Evaluator(AOpenAPIEvaluator):
222
+ """
223
+ Wrapper for a single Evaluator.
224
+
225
+ For available attributes, please check the (automatically
226
+ generated) superclass documentation.
227
+ """
228
+
229
+ client_context: ClientContextCallable
230
+
231
+ @classmethod
232
+ def _wrap(
233
+ cls, apiobj: Union[AOpenAPIEvaluator, "SyncGeneratedEvaluator"], client_context: ClientContextCallable
234
+ ) -> "Evaluator": # noqa: E501
235
+ obj = cast(Evaluator, apiobj)
236
+ obj.__class__ = cls
237
+ obj.client_context = client_context
238
+ return obj
239
+
240
+ @with_sync_client
241
+ def run(
242
+ self,
243
+ response: Optional[str] = None,
244
+ request: Optional[str] = None,
245
+ contexts: Optional[List[str]] = None,
246
+ functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
247
+ expected_output: Optional[str] = None,
248
+ variables: Optional[dict[str, str]] = None,
249
+ tags: Optional[List[str]] = None,
250
+ *,
251
+ _client: ApiClient,
252
+ _request_timeout: Optional[int] = None,
253
+ ) -> EvaluatorExecutionResult:
254
+ """
255
+ Run the evaluator.
256
+
257
+ Args:
258
+ response: LLM output.
259
+ request: The prompt sent to the LLM.
260
+ contexts: Optional documents passed to RAG evaluators
261
+ functions: Optional function definitions to LLM tool call validation
262
+ expected_output: Optional expected output for the evaluator.
263
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
264
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
265
+ tags: Optional tags to add to the evaluator execution
266
+ """
267
+
268
+ if not response and not request:
269
+ raise ValueError("Either response or request must be provided")
270
+
271
+ api_instance = EvaluatorsApi(_client)
272
+
273
+ evaluator_execution_request = EvaluatorExecutionRequest(
274
+ evaluator_version_id=self.version_id,
275
+ request=request,
276
+ response=response,
277
+ contexts=contexts,
278
+ functions=functions,
279
+ expected_output=expected_output,
280
+ variables=variables,
281
+ tags=tags,
282
+ )
283
+ return api_instance.evaluators_execute_create(
284
+ id=self.id,
285
+ evaluator_execution_request=evaluator_execution_request,
286
+ _request_timeout=_request_timeout,
287
+ )
288
+
289
+
290
+ class AEvaluator(AOpenAPIEvaluator):
291
+ """
292
+ Wrapper for a single Evaluator.
293
+
294
+ For available attributes, please check the (automatically
295
+ generated) superclass documentation.
296
+ """
297
+
298
+ client_context: ClientContextCallable
299
+
300
+ @classmethod
301
+ async def _awrap(
302
+ cls, apiobj: Union[AOpenAPIEvaluator, "GeneratedEvaluator"], client_context: ClientContextCallable
303
+ ) -> "AEvaluator": # noqa: E501
304
+ obj = cast(AEvaluator, apiobj)
305
+ obj.__class__ = cls
306
+ obj.client_context = client_context
307
+ return obj
308
+
309
+ @with_async_client
310
+ async def arun(
311
+ self,
312
+ response: Optional[str] = None,
313
+ request: Optional[str] = None,
314
+ contexts: Optional[List[str]] = None,
315
+ functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
316
+ expected_output: Optional[str] = None,
317
+ variables: Optional[dict[str, str]] = None,
318
+ tags: Optional[List[str]] = None,
319
+ *,
320
+ _client: AApiClient,
321
+ _request_timeout: Optional[int] = None,
322
+ ) -> AEvaluatorExecutionResult:
323
+ """
324
+ Asynchronously run the evaluator.
325
+
326
+ Args:
327
+ response: LLM output.
328
+ request: The prompt sent to the LLM.
329
+ contexts: Optional documents passed to RAG evaluators
330
+ functions: Optional function definitions to LLM tool call validation
331
+ expected_output: Optional expected output for the evaluator.
332
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
333
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
334
+ tags: Optional tags to add to the evaluator execution
335
+ """
336
+
337
+ if not response and not request:
338
+ raise ValueError("Either response or request must be provided")
339
+
340
+ api_instance = AEvaluatorsApi(_client)
341
+
342
+ evaluator_execution_request = AEvaluatorExecutionRequest(
343
+ evaluator_version_id=self.version_id,
344
+ request=request,
345
+ response=response,
346
+ contexts=contexts,
347
+ functions=functions,
348
+ expected_output=expected_output,
349
+ variables=variables,
350
+ tags=tags,
351
+ )
352
+ return await api_instance.evaluators_execute_create(
353
+ id=self.id,
354
+ evaluator_execution_request=evaluator_execution_request,
355
+ _request_timeout=_request_timeout,
356
+ )
357
+
358
+
359
+ def _to_input_variables(
360
+ input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]],
361
+ ) -> List[InputVariableRequest]:
362
+ def _convert_to_generated_model(entry: Union[InputVariable, InputVariableRequest]) -> InputVariableRequest:
363
+ if not isinstance(entry, InputVariableRequest):
364
+ return InputVariableRequest(name=entry.name)
365
+ return entry
366
+
367
+ return [_convert_to_generated_model(entry) for entry in input_variables or {}]
368
+
369
+
370
+ def _to_model_params(model_params: Optional[Union[ModelParams, ModelParamsRequest]]) -> Optional[ModelParamsRequest]:
371
+ if isinstance(model_params, ModelParams):
372
+ return ModelParamsRequest(**model_params.model_dump())
373
+ return model_params
374
+
375
+
376
+ def _to_reference_variables(
377
+ reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]],
378
+ ) -> List[ReferenceVariableRequest]:
379
+ def _convert_to_generated_model(
380
+ entry: Union[ReferenceVariable, ReferenceVariableRequest],
381
+ ) -> ReferenceVariableRequest:
382
+ if not isinstance(entry, ReferenceVariableRequest):
383
+ return ReferenceVariableRequest(name=entry.name, dataset=entry.dataset_id)
384
+ return entry
385
+
386
+ return [_convert_to_generated_model(entry) for entry in reference_variables or {}]
387
+
388
+
389
+ def _to_evaluator_demonstrations(
390
+ input_variables: Optional[Union[List[EvaluatorDemonstration], List[EvaluatorDemonstrationsRequest]]],
391
+ ) -> List[EvaluatorDemonstrationsRequest]:
392
+ def _convert_dict(
393
+ entry: Union[EvaluatorDemonstration, EvaluatorDemonstrationsRequest],
394
+ ) -> EvaluatorDemonstrationsRequest:
395
+ if not isinstance(entry, EvaluatorDemonstrationsRequest):
396
+ return EvaluatorDemonstrationsRequest(
397
+ score=entry.score,
398
+ request=entry.request,
399
+ response=entry.response,
400
+ justification=entry.justification,
401
+ )
402
+ return entry
403
+
404
+ return [_convert_dict(entry) for entry in input_variables or {}]
405
+
406
+
407
+ def _ato_input_variables(
408
+ input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]],
409
+ ) -> List[AInputVariableRequest]:
410
+ def _convert_to_generated_model(entry: Union[InputVariable, AInputVariableRequest]) -> AInputVariableRequest:
411
+ if not isinstance(entry, AInputVariableRequest):
412
+ return AInputVariableRequest(name=entry.name)
413
+ return entry
414
+
415
+ return [_convert_to_generated_model(entry) for entry in input_variables or {}]
416
+
417
+
418
+ def _ato_model_params(model_params: Optional[Union[ModelParams, AModelParamsRequest]]) -> Optional[AModelParamsRequest]:
419
+ if isinstance(model_params, ModelParams):
420
+ return AModelParamsRequest(**model_params.model_dump())
421
+ return model_params
422
+
423
+
424
+ def _ato_reference_variables(
425
+ reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]],
426
+ ) -> List[AReferenceVariableRequest]:
427
+ def _convert_to_generated_model(
428
+ entry: Union[ReferenceVariable, AReferenceVariableRequest],
429
+ ) -> AReferenceVariableRequest:
430
+ if not isinstance(entry, AReferenceVariableRequest):
431
+ return AReferenceVariableRequest(name=entry.name, dataset=entry.dataset_id)
432
+ return entry
433
+
434
+ return [_convert_to_generated_model(entry) for entry in reference_variables or {}]
435
+
436
+
437
+ def _ato_evaluator_demonstrations(
438
+ input_variables: Optional[Union[List[EvaluatorDemonstration], List[AEvaluatorDemonstrationsRequest]]],
439
+ ) -> List[AEvaluatorDemonstrationsRequest]:
440
+ def _aconvert_dict(
441
+ entry: Union[EvaluatorDemonstration, AEvaluatorDemonstrationsRequest],
442
+ ) -> AEvaluatorDemonstrationsRequest:
443
+ if not isinstance(entry, AEvaluatorDemonstrationsRequest):
444
+ return AEvaluatorDemonstrationsRequest(
445
+ score=entry.score,
446
+ request=entry.request,
447
+ response=entry.response,
448
+ justification=entry.justification,
449
+ )
450
+ return entry
451
+
452
+ return [_aconvert_dict(entry) for entry in input_variables or {}]
453
+
454
+
455
+ class PresetEvaluatorRunner:
456
+ client_context: ClientContextCallable
457
+
458
+ def __init__(
459
+ self,
460
+ client_context: ClientContextCallable,
461
+ evaluator_id: str,
462
+ eval_name: str,
463
+ evaluator_version_id: Optional[str] = None,
464
+ ):
465
+ self.client_context = client_context
466
+ self.evaluator_id = evaluator_id
467
+ self.evaluator_version_id = evaluator_version_id
468
+ self.__name__ = eval_name
469
+
470
+ @with_sync_client
471
+ def __call__(
472
+ self,
473
+ response: Optional[str] = None,
474
+ request: Optional[str] = None,
475
+ contexts: Optional[List[str]] = None,
476
+ functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
477
+ expected_output: Optional[str] = None,
478
+ variables: Optional[dict[str, str]] = None,
479
+ tags: Optional[List[str]] = None,
480
+ *,
481
+ _client: ApiClient,
482
+ _request_timeout: Optional[int] = None,
483
+ ) -> EvaluatorExecutionResult:
484
+ """
485
+ Run the evaluator.
486
+
487
+ Args:
488
+ response: LLM output.
489
+ request: The prompt sent to the LLM.
490
+ contexts: Optional documents passed to RAG evaluators
491
+ functions: Optional function definitions to LLM tool call validation
492
+ expected_output: Optional expected output for the evaluator.
493
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
494
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
495
+ tags: Optional tags to add to the evaluator execution
496
+ """
497
+
498
+ if not response and not request:
499
+ raise ValueError("Either response or request must be provided")
500
+
501
+ api_instance = EvaluatorsApi(_client)
502
+
503
+ evaluator_execution_request = EvaluatorExecutionRequest(
504
+ evaluator_version_id=self.evaluator_version_id,
505
+ request=request,
506
+ response=response,
507
+ contexts=contexts,
508
+ functions=functions,
509
+ expected_output=expected_output,
510
+ variables=variables,
511
+ tags=tags,
512
+ )
513
+ return api_instance.evaluators_execute_create(
514
+ id=self.evaluator_id,
515
+ evaluator_execution_request=evaluator_execution_request,
516
+ _request_timeout=_request_timeout,
517
+ )
518
+
519
+
520
+ class APresetEvaluatorRunner:
521
+ client_context: ClientContextCallable
522
+
523
+ def __init__(
524
+ self,
525
+ client_context: ClientContextCallable,
526
+ evaluator_id: str,
527
+ eval_name: str,
528
+ evaluator_version_id: Optional[str] = None,
529
+ ):
530
+ self.client_context = client_context
531
+ self.evaluator_id = evaluator_id
532
+ self.evaluator_version_id = evaluator_version_id
533
+ self.__name__ = eval_name
534
+
535
+ @with_async_client
536
+ async def __call__(
537
+ self,
538
+ response: Optional[str] = None,
539
+ request: Optional[str] = None,
540
+ contexts: Optional[List[str]] = None,
541
+ functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
542
+ expected_output: Optional[str] = None,
543
+ variables: Optional[dict[str, str]] = None,
544
+ tags: Optional[List[str]] = None,
545
+ *,
546
+ _client: AApiClient,
547
+ _request_timeout: Optional[int] = None,
548
+ ) -> AEvaluatorExecutionResult:
549
+ """
550
+ Asynchronously run the evaluator.
551
+
552
+ Args:
553
+ response: LLM output.
554
+ request: The prompt sent to the LLM.
555
+ contexts: Optional documents passed to RAG evaluators
556
+ functions: Optional function definitions to LLM tool call validation
557
+ expected_output: Optional expected output for the evaluator.
558
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
559
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
560
+ tags: Optional tags to add to the evaluator execution
561
+ """
562
+
563
+ if not response and not request:
564
+ raise ValueError("Either response or request must be provided")
565
+
566
+ api_instance = AEvaluatorsApi(_client)
567
+
568
+ evaluator_execution_request = AEvaluatorExecutionRequest(
569
+ evaluator_version_id=self.evaluator_version_id,
570
+ request=request,
571
+ response=response,
572
+ contexts=contexts,
573
+ functions=functions,
574
+ expected_output=expected_output,
575
+ variables=variables,
576
+ tags=tags,
577
+ )
578
+ return await api_instance.evaluators_execute_create(
579
+ id=self.evaluator_id,
580
+ evaluator_execution_request=evaluator_execution_request,
581
+ _request_timeout=_request_timeout,
582
+ )
583
+
584
+
585
+ class Evaluators:
586
+ """Evaluators (sub) API
587
+
588
+ Note:
589
+
590
+ The construction of the API instance should be handled by
591
+ accesing an attribute of a :class:`root.client.RootSignals` instance.
592
+ """
593
+
594
+ def _validate_create_params_sanitize_name(
595
+ self, name: Optional[str], intent: Optional[str], objective_id: Optional[str]
596
+ ) -> str:
597
+ if objective_id is not None:
598
+ if intent:
599
+ raise ValueError("Supplying both objective_id and intent is not supported")
600
+ if name is None:
601
+ name = "<unnamed>"
602
+ return name
603
+
604
+ def __init__(self, client_context: ClientContextCallable):
605
+ self.client_context = client_context
606
+ self.versions = Versions(client_context)
607
+
608
+ def _to_objective_request(self, *, intent: Optional[str] = None) -> ObjectiveRequest:
609
+ return ObjectiveRequest(
610
+ intent=intent,
611
+ )
612
+
613
+ async def _ato_objective_request(self, *, intent: Optional[str] = None) -> AObjectiveRequest:
614
+ return AObjectiveRequest(
615
+ intent=intent,
616
+ )
617
+
618
+ @with_sync_client
619
+ def run(
620
+ self,
621
+ evaluator_id: str,
622
+ *,
623
+ request: Optional[str] = None,
624
+ response: Optional[str] = None,
625
+ contexts: Optional[List[str]] = None,
626
+ functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
627
+ expected_output: Optional[str] = None,
628
+ evaluator_version_id: Optional[str] = None,
629
+ variables: Optional[dict[str, str]] = None,
630
+ tags: Optional[List[str]] = None,
631
+ _request_timeout: Optional[int] = None,
632
+ _client: ApiClient,
633
+ ) -> EvaluatorExecutionResult:
634
+ """
635
+ Run the evaluator.
636
+
637
+ Args:
638
+ evaluator_id: The ID of the evaluator to run.
639
+ request: The prompt sent to the LLM.
640
+ response: LLM output.
641
+ contexts: Optional documents passed to RAG evaluators.
642
+ functions: Optional function definitions to LLM tool call validation.
643
+ expected_output: Optional expected output for the evaluator.
644
+ evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
645
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
646
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
647
+ tags: Optional tags to add to the evaluator execution
648
+ _request_timeout: Optional timeout for the request.
649
+ """
650
+
651
+ if not response and not request:
652
+ raise ValueError("Either response or request must be provided")
653
+
654
+ api_instance = EvaluatorsApi(_client)
655
+
656
+ evaluator_execution_request = EvaluatorExecutionRequest(
657
+ evaluator_version_id=evaluator_version_id,
658
+ request=request,
659
+ response=response,
660
+ contexts=contexts,
661
+ functions=functions,
662
+ expected_output=expected_output,
663
+ variables=variables,
664
+ tags=tags,
665
+ )
666
+ return api_instance.evaluators_execute_create(
667
+ id=evaluator_id,
668
+ evaluator_execution_request=evaluator_execution_request,
669
+ _request_timeout=_request_timeout,
670
+ )
671
+
672
+ @with_async_client
673
+ async def arun(
674
+ self,
675
+ evaluator_id: str,
676
+ *,
677
+ request: Optional[str] = None,
678
+ response: Optional[str] = None,
679
+ contexts: Optional[List[str]] = None,
680
+ functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
681
+ expected_output: Optional[str] = None,
682
+ evaluator_version_id: Optional[str] = None,
683
+ variables: Optional[dict[str, str]] = None,
684
+ tags: Optional[List[str]] = None,
685
+ _request_timeout: Optional[int] = None,
686
+ _client: AApiClient,
687
+ ) -> AEvaluatorExecutionResult:
688
+ """
689
+ Asynchronously run the evaluator.
690
+
691
+ Args:
692
+ evaluator_id: The ID of the evaluator to run.
693
+ request: The prompt sent to the LLM.
694
+ response: LLM output.
695
+ contexts: Optional documents passed to RAG evaluators.
696
+ functions: Optional function definitions to LLM tool call validation.
697
+ expected_output: Optional expected output for the evaluator.
698
+ evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
699
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
700
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
701
+ tags: Optional tags to add to the evaluator execution
702
+ _request_timeout: Optional timeout for the request.
703
+ """
704
+
705
+ if not response and not request:
706
+ raise ValueError("Either response or request must be provided")
707
+
708
+ api_instance = AEvaluatorsApi(_client)
709
+ evaluator_execution_request = AEvaluatorExecutionRequest(
710
+ evaluator_version_id=evaluator_version_id,
711
+ request=request,
712
+ response=response,
713
+ contexts=contexts,
714
+ functions=functions,
715
+ expected_output=expected_output,
716
+ variables=variables,
717
+ tags=tags,
718
+ )
719
+ return await api_instance.evaluators_execute_create(
720
+ id=evaluator_id,
721
+ evaluator_execution_request=evaluator_execution_request,
722
+ _request_timeout=_request_timeout,
723
+ )
724
+
725
+ @with_sync_client
726
+ def calibrate_existing(
727
+ self,
728
+ evaluator_id: str,
729
+ *,
730
+ test_dataset_id: Optional[str] = None,
731
+ test_data: Optional[List[List[str]]] = None,
732
+ _request_timeout: Optional[int] = None,
733
+ _client: ApiClient,
734
+ ) -> List[EvaluatorCalibrationOutput]:
735
+ """
736
+ Run calibration set on an existing evaluator.
737
+ """
738
+
739
+ if not test_dataset_id and not test_data:
740
+ raise ValueError("Either test_dataset_id or test_data must be provided")
741
+ if test_dataset_id and test_data:
742
+ raise ValueError("Only one of test_dataset_id or test_data must be provided")
743
+ api_instance = EvaluatorsApi(_client)
744
+ evaluator_test_request = SkillTestDataRequest(
745
+ test_dataset_id=test_dataset_id,
746
+ test_data=test_data,
747
+ )
748
+ return api_instance.evaluators_calibrate_create2(
749
+ evaluator_id, evaluator_test_request, _request_timeout=_request_timeout
750
+ )
751
+
752
+ @with_async_client
753
+ async def acalibrate_existing(
754
+ self,
755
+ evaluator_id: str,
756
+ *,
757
+ test_dataset_id: Optional[str] = None,
758
+ test_data: Optional[List[List[str]]] = None,
759
+ _request_timeout: Optional[int] = None,
760
+ _client: AApiClient,
761
+ ) -> List[AEvaluatorCalibrationOutput]:
762
+ """
763
+ Asynchronously run calibration set on an existing evaluator.
764
+ """
765
+
766
+ if not test_dataset_id and not test_data:
767
+ raise ValueError("Either test_dataset_id or test_data must be provided")
768
+ if test_dataset_id and test_data:
769
+ raise ValueError("Only one of test_dataset_id or test_data must be provided")
770
+ api_instance = AEvaluatorsApi(_client)
771
+ evaluator_test_request = ASkillTestDataRequest(
772
+ test_dataset_id=test_dataset_id,
773
+ test_data=test_data,
774
+ )
775
+ return await api_instance.evaluators_calibrate_create2(
776
+ evaluator_id, evaluator_test_request, _request_timeout=_request_timeout
777
+ )
778
+
779
+ @with_sync_client
780
+ def calibrate(
781
+ self,
782
+ *,
783
+ name: str,
784
+ test_dataset_id: Optional[str] = None,
785
+ test_data: Optional[List[List[str]]] = None,
786
+ prompt: str,
787
+ model: ModelName,
788
+ pii_filter: bool = False,
789
+ reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]] = None,
790
+ input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]] = None,
791
+ _request_timeout: Optional[int] = None,
792
+ _client: ApiClient,
793
+ ) -> List[EvaluatorCalibrationOutput]:
794
+ """
795
+ Run calibration set for an evaluator definition.
796
+ See the create evaluator method for more details on the parameters.
797
+ """
798
+
799
+ if not test_dataset_id and not test_data:
800
+ raise ValueError("Either test_dataset_id or test_data must be provided")
801
+ if test_dataset_id and test_data:
802
+ raise ValueError("Only one of test_dataset_id or test_data must be provided")
803
+ api_instance = EvaluatorsApi(_client)
804
+ evaluator_test_request = SkillTestInputRequest(
805
+ name=name,
806
+ test_dataset_id=test_dataset_id,
807
+ test_data=test_data,
808
+ prompt=prompt,
809
+ models=[model],
810
+ is_evaluator=True,
811
+ pii_filter=pii_filter,
812
+ objective=ObjectiveRequest(intent="Calibration"),
813
+ reference_variables=_to_reference_variables(reference_variables),
814
+ input_variables=_to_input_variables(input_variables),
815
+ )
816
+ return api_instance.evaluators_calibrate_create(evaluator_test_request, _request_timeout=_request_timeout)
817
+
818
+ @with_async_client
819
+ async def acalibrate(
820
+ self,
821
+ *,
822
+ name: str,
823
+ test_dataset_id: Optional[str] = None,
824
+ test_data: Optional[List[List[str]]] = None,
825
+ prompt: str,
826
+ model: ModelName,
827
+ pii_filter: bool = False,
828
+ reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]] = None,
829
+ input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]] = None,
830
+ _request_timeout: Optional[int] = None,
831
+ _client: AApiClient,
832
+ ) -> List[AEvaluatorCalibrationOutput]:
833
+ """
834
+ Asynchronously run calibration set for an evaluator definition.
835
+ See the create evaluator method for more details on the parameters.
836
+ """
837
+
838
+ if not test_dataset_id and not test_data:
839
+ raise ValueError("Either test_dataset_id or test_data must be provided")
840
+ if test_dataset_id and test_data:
841
+ raise ValueError("Only one of test_dataset_id or test_data must be provided")
842
+ api_instance = AEvaluatorsApi(_client)
843
+ evaluator_test_request = ASkillTestInputRequest(
844
+ name=name,
845
+ test_dataset_id=test_dataset_id,
846
+ test_data=test_data,
847
+ prompt=prompt,
848
+ models=[model],
849
+ is_evaluator=True,
850
+ pii_filter=pii_filter,
851
+ objective=AObjectiveRequest(intent="Calibration"),
852
+ reference_variables=_ato_reference_variables(reference_variables),
853
+ input_variables=_ato_input_variables(input_variables),
854
+ )
855
+ return await api_instance.evaluators_calibrate_create(evaluator_test_request, _request_timeout=_request_timeout)
856
+
857
+ def calibrate_batch(
858
+ self,
859
+ *,
860
+ evaluator_definitions: List[CalibrateBatchParameters],
861
+ test_dataset_id: Optional[str] = None,
862
+ test_data: Optional[List[List[str]]] = None,
863
+ parallel_requests: int = 1,
864
+ _request_timeout: Optional[int] = None,
865
+ ) -> CalibrateBatchResult:
866
+ """
867
+ Run calibration for a set of prompts and models
868
+
869
+ Args:
870
+ evaluator_definitions: List of evaluator definitions.
871
+ test_dataset_id: ID of the dataset to be used to test the evaluator.
872
+ test_data: Snapshot of data to be used to test the evaluator.
873
+ parallel_requests: Number of parallel requests.
874
+
875
+ Returns a model with the results and errors for each model and prompt.
876
+ """
877
+
878
+ if test_dataset_id and test_data:
879
+ raise ValueError("Only one of test_dataset_id or test_data must be provided")
880
+
881
+ model_errors: Dict[str, Dict[str, float]] = defaultdict(
882
+ lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
883
+ )
884
+ prompt_errors: Dict[str, Dict[str, float]] = defaultdict(
885
+ lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
886
+ )
887
+
888
+ all_results = []
889
+
890
+ use_thread_pool = parallel_requests > 1
891
+
892
+ def process_results(results: List[EvaluatorCalibrationOutput], param: CalibrateBatchParameters) -> None:
893
+ for result in results:
894
+ score = result.result.score or 0
895
+ expected_score = result.result.expected_score or 0
896
+ squared_error = (score - expected_score) ** 2
897
+ abs_error = abs(score - expected_score)
898
+
899
+ # TODO multiple thread race condition
900
+ model_errors[param.model]["sum_squared_errors"] += squared_error
901
+ model_errors[param.model]["abs_errors"] += abs_error
902
+ model_errors[param.model]["count"] += 1
903
+
904
+ prompt_errors[param.prompt]["sum_squared_errors"] += squared_error
905
+ prompt_errors[param.prompt]["abs_errors"] += abs_error
906
+ prompt_errors[param.prompt]["count"] += 1
907
+
908
+ all_results.append(result)
909
+
910
+ if use_thread_pool:
911
+ with ThreadPoolExecutor(max_workers=parallel_requests) as executor:
912
+ futures = {
913
+ executor.submit(
914
+ self.calibrate,
915
+ name=param.name,
916
+ test_dataset_id=test_dataset_id,
917
+ test_data=test_data,
918
+ prompt=param.prompt,
919
+ model=param.model,
920
+ pii_filter=param.pii_filter,
921
+ reference_variables=param.reference_variables,
922
+ input_variables=param.input_variables,
923
+ _request_timeout=_request_timeout,
924
+ ): param
925
+ for param in evaluator_definitions
926
+ }
927
+
928
+ for future in as_completed(futures):
929
+ param = futures[future]
930
+ try:
931
+ results = future.result()
932
+ process_results(results, param)
933
+ except Exception as exc:
934
+ raise ValueError(f"Calibration failed for {param.prompt} with model {param.model}") from exc
935
+ else:
936
+ for param in evaluator_definitions:
937
+ try:
938
+ results = self.calibrate(
939
+ name=param.name,
940
+ test_dataset_id=test_dataset_id,
941
+ test_data=test_data,
942
+ prompt=param.prompt,
943
+ model=param.model,
944
+ pii_filter=param.pii_filter,
945
+ reference_variables=param.reference_variables,
946
+ input_variables=param.input_variables,
947
+ _request_timeout=_request_timeout,
948
+ )
949
+ process_results(results, param)
950
+ except Exception as exc:
951
+ raise ValueError(f"Calibration failed for {param.prompt} with model {param.model}") from exc
952
+
953
+ rms_errors_model = {
954
+ model: math.sqrt(data["sum_squared_errors"] / data["count"])
955
+ for model, data in model_errors.items()
956
+ if data["count"] > 0
957
+ }
958
+
959
+ rms_errors_prompt = {
960
+ prompt: math.sqrt(data["sum_squared_errors"] / data["count"])
961
+ for prompt, data in prompt_errors.items()
962
+ if data["count"] > 0
963
+ }
964
+
965
+ mae_errors_model = {
966
+ model: data["abs_errors"] / data["count"] for model, data in model_errors.items() if data["count"] > 0
967
+ }
968
+
969
+ mae_errors_prompt = {
970
+ prompt: data["abs_errors"] / data["count"] for prompt, data in prompt_errors.items() if data["count"] > 0
971
+ }
972
+
973
+ return CalibrateBatchResult(
974
+ results=all_results,
975
+ rms_errors_model=rms_errors_model,
976
+ rms_errors_prompt=rms_errors_prompt,
977
+ mae_errors_model=mae_errors_model,
978
+ mae_errors_prompt=mae_errors_prompt,
979
+ )
980
+
981
+ async def acalibrate_batch(
982
+ self,
983
+ *,
984
+ evaluator_definitions: List[ACalibrateBatchParameters],
985
+ test_dataset_id: Optional[str] = None,
986
+ test_data: Optional[List[List[str]]] = None,
987
+ parallel_requests: int = 1,
988
+ _request_timeout: Optional[int] = None,
989
+ ) -> ACalibrateBatchResult:
990
+ """
991
+ Asynchronously run calibration for a set of prompts and models
992
+
993
+ Args:
994
+ evaluator_definitions: List of evaluator definitions.
995
+ test_dataset_id: ID of the dataset to be used to test the evaluator.
996
+ test_data: Snapshot of data to be used to test the evaluator.
997
+ parallel_requests: Number of parallel requests.
998
+
999
+ Returns a model with the results and errors for each model and prompt.
1000
+ """
1001
+
1002
+ if test_dataset_id and test_data:
1003
+ raise ValueError("Only one of test_dataset_id or test_data must be provided")
1004
+
1005
+ model_errors: Dict[str, Dict[str, float]] = defaultdict(
1006
+ lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
1007
+ )
1008
+ prompt_errors: Dict[str, Dict[str, float]] = defaultdict(
1009
+ lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
1010
+ )
1011
+
1012
+ all_results = []
1013
+
1014
+ async def process_results(results: List[AEvaluatorCalibrationOutput], param: ACalibrateBatchParameters) -> None:
1015
+ for result in results:
1016
+ score = result.result.score or 0
1017
+ expected_score = result.result.expected_score or 0
1018
+ squared_error = (score - expected_score) ** 2
1019
+ abs_error = abs(score - expected_score)
1020
+
1021
+ model_errors[param.model]["sum_squared_errors"] += squared_error
1022
+ model_errors[param.model]["abs_errors"] += abs_error
1023
+ model_errors[param.model]["count"] += 1
1024
+
1025
+ prompt_errors[param.prompt]["sum_squared_errors"] += squared_error
1026
+ prompt_errors[param.prompt]["abs_errors"] += abs_error
1027
+ prompt_errors[param.prompt]["count"] += 1
1028
+
1029
+ all_results.append(result)
1030
+
1031
+ sem = asyncio.Semaphore(parallel_requests)
1032
+
1033
+ async def bounded_calibrate(param: ACalibrateBatchParameters) -> None:
1034
+ async with sem:
1035
+ try:
1036
+ results = await self.acalibrate(
1037
+ name=param.name,
1038
+ test_dataset_id=test_dataset_id,
1039
+ test_data=test_data,
1040
+ prompt=param.prompt,
1041
+ model=param.model,
1042
+ pii_filter=param.pii_filter,
1043
+ reference_variables=param.reference_variables,
1044
+ input_variables=param.input_variables,
1045
+ _request_timeout=_request_timeout,
1046
+ )
1047
+ await process_results(results, param)
1048
+ except Exception as exc:
1049
+ raise ValueError(f"Calibration failed for {param.prompt} with model {param.model}") from exc
1050
+
1051
+ await asyncio.gather(*(bounded_calibrate(param) for param in evaluator_definitions))
1052
+
1053
+ rms_errors_model = {
1054
+ model: math.sqrt(data["sum_squared_errors"] / data["count"])
1055
+ for model, data in model_errors.items()
1056
+ if data["count"] > 0
1057
+ }
1058
+
1059
+ rms_errors_prompt = {
1060
+ prompt: math.sqrt(data["sum_squared_errors"] / data["count"])
1061
+ for prompt, data in prompt_errors.items()
1062
+ if data["count"] > 0
1063
+ }
1064
+
1065
+ mae_errors_model = {
1066
+ model: data["abs_errors"] / data["count"] for model, data in model_errors.items() if data["count"] > 0
1067
+ }
1068
+
1069
+ mae_errors_prompt = {
1070
+ prompt: data["abs_errors"] / data["count"] for prompt, data in prompt_errors.items() if data["count"] > 0
1071
+ }
1072
+
1073
+ return ACalibrateBatchResult(
1074
+ results=all_results,
1075
+ rms_errors_model=rms_errors_model,
1076
+ rms_errors_prompt=rms_errors_prompt,
1077
+ mae_errors_model=mae_errors_model,
1078
+ mae_errors_prompt=mae_errors_prompt,
1079
+ )
1080
+
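+ # Illustrative usage sketch (not part of the generated client): assuming a configured
+ # asynchronous Root Signals client exposed as `client`, with this resource available as
+ # `client.evaluators`, and illustrative model names, a batch calibration could look like:
+ #
+ #     params = [
+ #         ACalibrateBatchParameters(name="clarity-check", prompt="Is {output} clear?", model="gpt-4o"),
+ #         ACalibrateBatchParameters(name="clarity-check", prompt="Is {output} clear?", model="gpt-4o-mini"),
+ #     ]
+ #     result = await client.evaluators.acalibrate_batch(
+ #         evaluator_definitions=params,
+ #         test_dataset_id="<dataset-id>",
+ #         parallel_requests=4,
+ #     )
+ #     print(result.rms_errors_model)  # RMS error keyed by model name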
1081
+ @with_sync_client
1082
+ def get_by_name(
1083
+ self,
1084
+ name: str,
1085
+ *,
1086
+ _client: ApiClient,
1087
+ ) -> Evaluator:
1088
+ """Get an evaluator instance by name.
1089
+
1090
+ Args:
1091
+ name: The name of the evaluator to be fetched. Note that this only works for uniquely named evaluators.
1092
+ """
1093
+
1094
+ api_instance = EvaluatorsApi(_client)
1095
+
1096
+ evaluator_list: List[EvaluatorListOutput] = list(
1097
+ iterate_cursor_list(
1098
+ partial(api_instance.evaluators_list, name=name),
1099
+ limit=1,
1100
+ )
1101
+ )
1102
+
1103
+ if not evaluator_list:
1104
+ raise ValueError(f"No evaluator found with name '{name}'")
1105
+
1106
+ evaluator = evaluator_list[0]
1107
+ api_response = api_instance.evaluators_retrieve(id=evaluator.id)
1108
+
1109
+ return Evaluator._wrap(api_response, self.client_context)
1110
+
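+ # Illustrative usage sketch (assumes a configured synchronous client exposed as
+ # `client.evaluators`; the evaluator name is illustrative):
+ #
+ #     evaluator = client.evaluators.get_by_name("Helpfulness")
+ #     print(evaluator.id)
+ #
+ # A ValueError is raised if no evaluator matches the given name.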
1111
+ @with_async_client
1112
+ async def aget_by_name(
1113
+ self,
1114
+ name: str,
1115
+ *,
1116
+ _client: ApiClient,
1117
+ ) -> AEvaluator:
1118
+ """Asynchronously get an evaluator instance by name.
1119
+
1120
+ Args:
1121
+ name: The name of the evaluator to be fetched. Note that this only works for uniquely named evaluators.
1122
+ """
1123
+
1124
+ context = self.client_context()
1125
+
1126
+ assert isinstance(context, AbstractAsyncContextManager), "This method is not available in synchronous mode"
1127
+
1128
+ async with context as client:
1129
+ api_instance = AEvaluatorsApi(client)
1130
+
1131
+ evaluator_list: List[AEvaluatorListOutput] = []
1132
+ async for evaluator in aiterate_cursor_list( # type: ignore[var-annotated]
1133
+ partial(api_instance.evaluators_list, name=name),
1134
+ limit=1,
1135
+ ):
1136
+ evaluator_list.extend(evaluator)
1137
+
1138
+ if not evaluator_list:
1139
+ raise ValueError(f"No evaluator found with name '{name}'")
1140
+
1141
+ evaluator = evaluator_list[0]
1142
+ api_response = await api_instance.evaluators_retrieve(id=evaluator.id)
1143
+
1144
+ return await AEvaluator._awrap(api_response, self.client_context)
1145
+
1146
+ @with_sync_client
1147
+ def create(
1148
+ self,
1149
+ predicate: str = "",
1150
+ *,
1151
+ name: Optional[str] = None,
1152
+ intent: Optional[str] = None,
1153
+ model: Optional[ModelName] = None,
1154
+ fallback_models: Optional[List[ModelName]] = None,
1155
+ reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]] = None,
1156
+ input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]] = None,
1157
+ model_params: Optional[Union[ModelParams, ModelParamsRequest]] = None,
1158
+ evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
1159
+ objective_id: Optional[str] = None,
1160
+ overwrite: bool = False,
1161
+ _client: ApiClient,
1162
+ _request_timeout: Optional[int] = None,
1163
+ ) -> Evaluator:
1164
+ """Create a new evaluator and return the result
1165
+
1166
+ Args:
1167
+ predicate: The question / predicate that is provided to the semantic quantification layer,
1168
+ which transforms it into the final prompt before it is passed to the model
1169
+
1170
+ name: Name of the evaluator (defaulting to <unnamed>)
1171
+
1172
+ objective_id: Optional pre-existing objective id to assign to the evaluator.
1173
+
1174
+ intent: The intent of the evaluator (defaulting to name); not available if objective_id is set.
1175
+
1176
+ model: The model to use (defaults to 'root', which means the
1177
+ Root Signals default at the time of evaluator creation)
1178
+
1179
+ fallback_models: The fallback models to use in case the primary model fails.
1180
+
1181
+ reference_variables: An optional list of reference variables for
1182
+ the evaluator.
1183
+
1184
+ input_variables: An optional list of input variables for
1185
+ the evaluator.
1186
+
1187
+ model_params: An optional set of additional parameters to the model (e.g., temperature).
1188
+
1189
+ evaluator_demonstrations: An optional list of evaluator demonstrations to guide
1190
+ the evaluator's behavior.
1191
+
1192
+ overwrite: Whether to overwrite an evaluator with the same name if it exists.
1193
+ """
1194
+
1195
+ name = self._validate_create_params_sanitize_name(name, intent, objective_id)
1196
+ api_instance = EvaluatorsApi(_client)
1197
+ objective: Optional[ObjectiveRequest] = None
1198
+ if objective_id is None:
1199
+ if intent is None:
1200
+ intent = name
1201
+ objective = self._to_objective_request(
1202
+ intent=intent,
1203
+ )
1204
+ objectives_api_instance = ObjectivesApi(_client)
1205
+ objective_id = objectives_api_instance.objectives_create(objective_request=objective).id
1206
+
1207
+ evaluator_request = EvaluatorRequest(
1208
+ name=name,
1209
+ objective_id=objective_id,
1210
+ prompt=predicate,
1211
+ models=[model for model in [model] + (fallback_models or []) if model is not None],
1212
+ reference_variables=_to_reference_variables(reference_variables),
1213
+ input_variables=_to_input_variables(input_variables),
1214
+ model_params=_to_model_params(model_params),
1215
+ evaluator_demonstrations=_to_evaluator_demonstrations(evaluator_demonstrations),
1216
+ overwrite=overwrite,
1217
+ )
1218
+
1219
+ evaluator = api_instance.evaluators_create(
1220
+ evaluator_request=evaluator_request, _request_timeout=_request_timeout
1221
+ )
1222
+
1223
+ return Evaluator._wrap(evaluator, self.client_context)
1224
+
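+ # Illustrative usage sketch (assumes a configured synchronous client exposed as
+ # `client.evaluators`; the predicate text, name and intent are illustrative):
+ #
+ #     evaluator = client.evaluators.create(
+ #         "Is the response written in a polite and professional tone?",
+ #         name="Politeness check",
+ #         intent="Responses should stay polite and professional",
+ #     )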
1225
+ @with_async_client
1226
+ async def acreate(
1227
+ self,
1228
+ predicate: str = "",
1229
+ *,
1230
+ name: Optional[str] = None,
1231
+ intent: Optional[str] = None,
1232
+ model: Optional[ModelName] = None,
1233
+ fallback_models: Optional[List[ModelName]] = None,
1234
+ reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]] = None,
1235
+ input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]] = None,
1236
+ model_params: Optional[Union[ModelParams, AModelParamsRequest]] = None,
1237
+ evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
1238
+ objective_id: Optional[str] = None,
1239
+ overwrite: bool = False,
1240
+ _client: ApiClient,
1241
+ _request_timeout: Optional[int] = None,
1242
+ ) -> AEvaluator:
1243
+ """
1244
+ Asynchronously create a new evaluator and return the result
1245
+
1246
+ Args:
1247
+ predicate: The question / predicate that is provided to the semantic quantification layer,
1248
+ which transforms it into the final prompt before it is passed to the model
1249
+
1250
+ name: Name of the evaluator (defaulting to <unnamed>)
1251
+
1252
+ objective_id: Optional pre-existing objective id to assign to the evaluator.
1253
+
1254
+ intent: The intent of the evaluator (defaulting to name); not available if objective_id is set.
1255
+
1256
+ model: The model to use (defaults to 'root', which means the
1257
+ Root Signals default at the time of evaluator creation)
1258
+
1259
+ fallback_models: The fallback models to use in case the primary model fails.
1260
+
1261
+ reference_variables: An optional list of reference variables for
1262
+ the evaluator.
1263
+
1264
+ input_variables: An optional list of input variables for
1265
+ the evaluator.
1266
+
1267
+ model_params: An optional set of additional parameters to the model (e.g., temperature).
1268
+
1269
+ evaluator_demonstrations: An optional list of evaluator demonstrations to guide
1270
+ the evaluator's behavior.
1271
+
1272
+ overwrite: Whether to overwrite an evaluator with the same name if it exists.
1273
+ """
1274
+
1275
+ name = self._validate_create_params_sanitize_name(name, intent, objective_id)
1276
+ api_instance = AEvaluatorsApi(_client)
1277
+ objective: Optional[AObjectiveRequest] = None
1278
+ if objective_id is None:
1279
+ if intent is None:
1280
+ intent = name
1281
+ objective = await self._ato_objective_request(intent=intent)
1282
+ objectives_api_instance = AObjectivesApi(_client)
1283
+ new_objective = await objectives_api_instance.objectives_create(objective_request=objective)
1284
+ objective_id = new_objective.id
1285
+
1286
+ evaluator_request = AEvaluatorRequest(
1287
+ name=name,
1288
+ objective_id=objective_id,
1289
+ prompt=predicate,
1290
+ models=[model for model in [model] + (fallback_models or []) if model is not None],
1291
+ reference_variables=_ato_reference_variables(reference_variables),
1292
+ input_variables=_ato_input_variables(input_variables),
1293
+ model_params=_ato_model_params(model_params),
1294
+ evaluator_demonstrations=_ato_evaluator_demonstrations(evaluator_demonstrations),
1295
+ overwrite=overwrite,
1296
+ )
1297
+
1298
+ evaluator = await api_instance.evaluators_create(
1299
+ evaluator_request=evaluator_request, _request_timeout=_request_timeout
1300
+ )
1301
+
1302
+ return await AEvaluator._awrap(evaluator, self.client_context)
1303
+
1304
+ @with_sync_client
1305
+ def update(
1306
+ self,
1307
+ evaluator_id: str,
1308
+ *,
1309
+ change_note: Optional[str] = None,
1310
+ fallback_models: Optional[List[ModelName]] = None,
1311
+ input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]] = None,
1312
+ model: Optional[ModelName] = None,
1313
+ name: Optional[str] = None,
1314
+ predicate: Optional[str] = None,
1315
+ reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]] = None,
1316
+ model_params: Optional[Union[ModelParams, ModelParamsRequest]] = None,
1317
+ evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
1318
+ objective_id: Optional[str] = None,
1319
+ _request_timeout: Optional[int] = None,
1320
+ _client: ApiClient,
1321
+ ) -> Evaluator:
1322
+ """
1323
+ Update an evaluator and return the result
1324
+
1325
+ See the create method for more information on the arguments.
1326
+ """
1327
+
1328
+ api_instance = EvaluatorsApi(_client)
1329
+ request = PatchedEvaluatorRequest(
1330
+ change_note=change_note or "",
1331
+ input_variables=_to_input_variables(input_variables) if input_variables else None,
1332
+ models=[model for model in [model] + (fallback_models or []) if model is not None]
1333
+ if model or fallback_models
1334
+ else None,
1335
+ name=name,
1336
+ prompt=predicate,
1337
+ reference_variables=_to_reference_variables(reference_variables) if reference_variables else None,
1338
+ model_params=_to_model_params(model_params) if model_params else None,
1339
+ objective_id=objective_id,
1340
+ evaluator_demonstrations=_to_evaluator_demonstrations(evaluator_demonstrations)
1341
+ if evaluator_demonstrations
1342
+ else None,
1343
+ )
1344
+
1345
+ api_response = api_instance.evaluators_partial_update(
1346
+ id=evaluator_id,
1347
+ patched_evaluator_request=request,
1348
+ _request_timeout=_request_timeout,
1349
+ )
1350
+ return Evaluator._wrap(api_response, self.client_context)
1351
+
1352
+ @with_async_client
1353
+ async def aupdate(
1354
+ self,
1355
+ evaluator_id: str,
1356
+ *,
1357
+ change_note: Optional[str] = None,
1358
+ fallback_models: Optional[List[ModelName]] = None,
1359
+ input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]] = None,
1360
+ model: Optional[ModelName] = None,
1361
+ name: Optional[str] = None,
1362
+ predicate: Optional[str] = None,
1363
+ reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]] = None,
1364
+ model_params: Optional[Union[ModelParams, AModelParamsRequest]] = None,
1365
+ evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
1366
+ objective_id: Optional[str] = None,
1367
+ _request_timeout: Optional[int] = None,
1368
+ _client: AApiClient,
1369
+ ) -> AEvaluator:
1370
+ """
1371
+ Asynchronously update an evaluator and return the result
1372
+
1373
+ See the create method for more information on the arguments.
1374
+ """
1375
+ api_instance = AEvaluatorsApi(_client)
1376
+
1377
+ request = APatchedEvaluatorRequest(
1378
+ change_note=change_note or "",
1379
+ input_variables=_ato_input_variables(input_variables) if input_variables else None,
1380
+ models=[model for model in [model] + (fallback_models or []) if model is not None]
1381
+ if model or fallback_models
1382
+ else None,
1383
+ name=name,
1384
+ reference_variables=_ato_reference_variables(reference_variables) if reference_variables else None,
1385
+ model_params=_ato_model_params(model_params) if model_params else None,
1386
+ objective_id=objective_id,
1387
+ prompt=predicate,
1388
+ evaluator_demonstrations=_ato_evaluator_demonstrations(evaluator_demonstrations)
1389
+ if evaluator_demonstrations
1390
+ else None,
1391
+ )
1392
+ api_response = await api_instance.evaluators_partial_update(
1393
+ id=evaluator_id,
1394
+ patched_evaluator_request=request,
1395
+ _request_timeout=_request_timeout,
1396
+ )
1397
+ return await AEvaluator._awrap(api_response, self.client_context)
1398
+
1399
+ @with_sync_client
1400
+ def get(
1401
+ self,
1402
+ evaluator_id: str,
1403
+ *,
1404
+ _request_timeout: Optional[int] = None,
1405
+ _client: ApiClient,
1406
+ ) -> Evaluator:
1407
+ """
1408
+ Get an Evaluator instance by ID.
1409
+ """
1410
+
1411
+ api_instance = EvaluatorsApi(_client)
1412
+ api_response = api_instance.evaluators_retrieve(id=evaluator_id, _request_timeout=_request_timeout)
1413
+ return Evaluator._wrap(api_response, self.client_context)
1414
+
1415
+ @with_async_client
1416
+ async def aget(
1417
+ self,
1418
+ evaluator_id: str,
1419
+ *,
1420
+ _request_timeout: Optional[int] = None,
1421
+ _client: AApiClient,
1422
+ ) -> AEvaluator:
1423
+ """
1424
+ Asynchronously get an Evaluator instance by ID.
1425
+ """
1426
+
1427
+ api_instance = AEvaluatorsApi(_client)
1428
+ api_response = await api_instance.evaluators_retrieve(id=evaluator_id, _request_timeout=_request_timeout)
1429
+ return await AEvaluator._awrap(api_response, self.client_context)
1430
+
1431
+ @with_sync_client
1432
+ def list(
1433
+ self,
1434
+ search_term: Optional[str] = None,
1435
+ *,
1436
+ limit: int = 100,
1437
+ name: Optional[str] = None,
1438
+ only_root_evaluators: bool = False,
1439
+ _client: ApiClient,
1440
+ ) -> Iterator[EvaluatorListOutput]:
1441
+ """
1442
+ Iterate through the evaluators.
1443
+
1444
+ Args:
1445
+ search_term: Can be used to limit returned evaluators.
1446
+ limit: Number of entries to iterate through at most.
1447
+ name: Specific name the returned evaluators must match.
1448
+ only_root_evaluators: If True, return only Root Signals-defined evaluators.
1449
+ """
1450
+
1451
+ api_instance = EvaluatorsApi(_client)
1452
+ yield from iterate_cursor_list(
1453
+ partial(
1454
+ api_instance.evaluators_list,
1455
+ name=name,
1456
+ search=search_term,
1457
+ is_root_evaluator=True if only_root_evaluators else None,
1458
+ ),
1459
+ limit=limit,
1460
+ )
1461
+
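+ # Illustrative usage sketch (assumes `client.evaluators`; apart from `id`, which the SDK
+ # itself reads, the attributes printed here are assumptions about EvaluatorListOutput):
+ #
+ #     for item in client.evaluators.list(search_term="clarity", limit=20):
+ #         print(item.id, item.name)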
1462
+ async def alist(
1463
+ self,
1464
+ search_term: Optional[str] = None,
1465
+ *,
1466
+ limit: int = 100,
1467
+ name: Optional[str] = None,
1468
+ only_root_evaluators: bool = False,
1469
+ ) -> AsyncIterator[AEvaluatorListOutput]:
1470
+ """
1471
+ Asynchronously iterate through the evaluators.
1472
+
1473
+ Args:
1474
+ search_term: Can be used to limit returned evaluators.
1475
+ limit: Number of entries to iterate through at most.
1476
+ name: Specific name the returned evaluators must match.
1477
+ only_root_evaluators: If True, return only Root Signals-defined evaluators.
1478
+ """
1479
+
1480
+ context = self.client_context()
1481
+ assert isinstance(context, AbstractAsyncContextManager), "This method is not available in synchronous mode"
1482
+ async with context as client:
1483
+ api_instance = AEvaluatorsApi(client)
1484
+ partial_list = partial(
1485
+ api_instance.evaluators_list,
1486
+ name=name,
1487
+ search=search_term,
1488
+ is_root_evaluator=True if only_root_evaluators else None,
1489
+ )
1490
+
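+ # Manual cursor pagination: request up to `limit` entries per page, yield them, and follow
+ # the `next` cursor until the limit is exhausted or no further page is available.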
1491
+ cursor: Optional[StrictStr] = None
1492
+ while limit > 0:
1493
+ result: APaginatedEvaluatorListOutputList = await partial_list(page_size=limit, cursor=cursor)
1494
+ if not result.results:
1495
+ return
1496
+
1497
+ used_results = result.results[:limit]
1498
+ limit -= len(used_results)
1499
+ for used_result in used_results:
1500
+ yield used_result
1501
+
1502
+ if not (cursor := result.next):
1503
+ return
1504
+
1505
+ @with_sync_client
1506
+ def run_by_name(
1507
+ self,
1508
+ name: str,
1509
+ *,
1510
+ request: Optional[str] = None,
1511
+ response: Optional[str] = None,
1512
+ contexts: Optional[List[str]] = None,
1513
+ functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
1514
+ expected_output: Optional[str] = None,
1515
+ evaluator_version_id: Optional[str] = None,
1516
+ variables: Optional[dict[str, str]] = None,
1517
+ tags: Optional[List[str]] = None,
1518
+ _request_timeout: Optional[int] = None,
1519
+ _client: ApiClient,
1520
+ ) -> EvaluatorExecutionResult:
1521
+ """
1522
+ Run an evaluator by name.
1523
+
1524
+ Args:
1525
+ name: The name of the evaluator to run.
1526
+ request: The prompt sent to the LLM.
1527
+ response: LLM output.
1528
+ contexts: Optional documents passed to RAG evaluators.
1529
+ functions: Optional function definitions for LLM tool call validation.
1530
+ expected_output: Optional expected output for the evaluator.
1531
+ evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
1532
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
1533
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
1534
+ tags: Optional tags to add to the evaluator execution
1535
+ _request_timeout: Optional timeout for the request.
1536
+ """
1537
+
1538
+ if not response and not request:
1539
+ raise ValueError("Either response or request must be provided")
1540
+
1541
+ api_instance = EvaluatorsApi(_client)
1542
+
1543
+ evaluator_execution_request = EvaluatorExecutionRequest(
1544
+ evaluator_version_id=evaluator_version_id,
1545
+ request=request,
1546
+ response=response,
1547
+ contexts=contexts,
1548
+ functions=functions,
1549
+ expected_output=expected_output,
1550
+ variables=variables,
1551
+ tags=tags,
1552
+ )
1553
+ return api_instance.evaluators_execute_by_name_create(
1554
+ name=name,
1555
+ evaluator_execution_request=evaluator_execution_request,
1556
+ _request_timeout=_request_timeout,
1557
+ )
1558
+
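+ # Illustrative usage sketch (assumes `client.evaluators`; the evaluator name, texts and the
+ # `score` attribute on the execution result are assumptions):
+ #
+ #     result = client.evaluators.run_by_name(
+ #         "Clarity",
+ #         request="Explain photosynthesis to a 10-year-old.",
+ #         response="Plants use sunlight, water and air to make their own food.",
+ #     )
+ #     print(result.score)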
1559
+ @with_sync_client
1560
+ def delete(self, evaluator_id: str, *, _client: ApiClient) -> None:
1561
+ """
1562
+ Delete the evaluator.
1563
+ """
1564
+
1565
+ api_instance = EvaluatorsApi(_client)
1566
+ return api_instance.evaluators_destroy(id=evaluator_id)
1567
+
1568
+ @with_async_client
1569
+ async def adelete(self, evaluator_id: str, *, _client: AApiClient) -> None:
1570
+ """
1571
+ Asynchronously delete the evaluator.
1572
+ """
1573
+
1574
+ api_instance = AEvaluatorsApi(_client)
1575
+ return await api_instance.evaluators_destroy(id=evaluator_id)
1576
+
1577
+ @with_async_client
1578
+ async def arun_by_name(
1579
+ self,
1580
+ name: str,
1581
+ *,
1582
+ request: Optional[str] = None,
1583
+ response: Optional[str] = None,
1584
+ contexts: Optional[List[str]] = None,
1585
+ functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
1586
+ expected_output: Optional[str] = None,
1587
+ evaluator_version_id: Optional[str] = None,
1588
+ variables: Optional[dict[str, str]] = None,
1589
+ tags: Optional[List[str]] = None,
1590
+ _request_timeout: Optional[int] = None,
1591
+ _client: AApiClient,
1592
+ ) -> AEvaluatorExecutionResult:
1593
+ """
1594
+ Asynchronously run an evaluator by name.
1595
+
1596
+ Args:
1597
+ name: The name of the evaluator to run.
1598
+ request: The prompt sent to the LLM.
1599
+ response: LLM output.
1600
+ contexts: Optional documents passed to RAG evaluators.
1601
+ functions: Optional function definitions for LLM tool call validation.
1602
+ expected_output: Optional expected output for the evaluator.
1603
+ evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
1604
+ variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
1605
+ predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
1606
+ tags: Optional tags to add to the evaluator execution
1607
+ _request_timeout: Optional timeout for the request.
1608
+ """
1609
+
1610
+ if not response and not request:
1611
+ raise ValueError("Either response or request must be provided")
1612
+
1613
+ api_instance = AEvaluatorsApi(_client)
1614
+ evaluator_execution_request = AEvaluatorExecutionRequest(
1615
+ evaluator_version_id=evaluator_version_id,
1616
+ request=request,
1617
+ response=response,
1618
+ contexts=contexts,
1619
+ functions=functions,
1620
+ expected_output=expected_output,
1621
+ variables=variables,
1622
+ tags=tags,
1623
+ )
1624
+ return await api_instance.evaluators_execute_by_name_create(
1625
+ name=name,
1626
+ evaluator_execution_request=evaluator_execution_request,
1627
+ _request_timeout=_request_timeout,
1628
+ )
1629
+
1630
+ EvaluatorName = Literal[
1631
+ "Faithfulness",
1632
+ "Relevance",
1633
+ "Clarity",
1634
+ "Non_toxicity",
1635
+ "Helpfulness",
1636
+ "Politeness",
1637
+ "Formality",
1638
+ "Harmlessness",
1639
+ "Confidentiality",
1640
+ "Persuasiveness",
1641
+ "JSON_Empty_Values_Ratio",
1642
+ "JSON_Property_Name_Accuracy",
1643
+ "JSON_Property_Type_Accuracy",
1644
+ "JSON_Property_Completeness",
1645
+ "JSON_Content_Accuracy",
1646
+ "Context_Recall",
1647
+ "Answer_Correctness",
1648
+ "Answer_Semantic_Similarity",
1649
+ "Sentiment_recognition",
1650
+ "Safety_for_Children",
1651
+ "Precision",
1652
+ "Originality",
1653
+ "Engagingness",
1654
+ "Conciseness",
1655
+ "Coherence",
1656
+ "Quality_of_Writing_Professional",
1657
+ "Quality_of_Writing_Creative",
1658
+ "Truthfulness",
1659
+ "Context_Precision",
1660
+ "Answer_Relevance",
1661
+ ]
1662
+
1663
+ class Eval(Enum):
1664
+ # TODO: These eval names should be retrieved automatically from the API or a shared config file
1665
+ Faithfulness = "901794f9-634c-4852-9e41-7c558f1ff1ab"
1666
+ Relevance = "bd789257-f458-4e9e-8ce9-fa6e86dc3fb9"
1667
+ Clarity = "9976d9f3-7265-4732-b518-d61c2642b14e"
1668
+ Non_toxicity = "e296e374-7539-4eb2-a74a-47847dd26fb8"
1669
+ Helpfulness = "88bc92d5-bebf-45e4-9cd1-dfa33309c320"
1670
+ Politeness = "2856903a-e48c-4548-b3fe-520fd88c4f25"
1671
+ Formality = "8ab6cf1a-42b5-4a23-a15c-21372816483d"
1672
+ Harmlessness = "379fee0a-4fd1-4942-833b-7d78d78b334d"
1673
+ Confidentiality = "2eaa0a02-47a9-48f7-9b47-66ad257f93eb"
1674
+ Persuasiveness = "85bb6a74-f5dd-4130-8dcc-cffdf72327cc"
1675
+ JSON_Empty_Values_Ratio = "03829088-1799-438e-ae30-1db60832e52d"
1676
+ JSON_Property_Name_Accuracy = "740923aa-8ffd-49cc-a95d-14f831243b25"
1677
+ JSON_Property_Type_Accuracy = "eabc6924-1fec-4e96-82ce-c03bf415c885"
1678
+ JSON_Property_Completeness = "e5de37f7-d20c-420f-8072-f41dce96ecfc"
1679
+ JSON_Content_Accuracy = "b6a9aeff-c888-46d7-9e9c-7cf8cb461762"
1680
+ Context_Recall = "8bb60975-5062-4367-9fc6-a920044cba56"
1681
+ Answer_Correctness = "d4487568-4243-4da8-9c76-adbaf762dbe0"
1682
+ Answer_Semantic_Similarity = "ff350bce-4b07-4af7-9640-803c9d3c2ff9"
1683
+ Sentiment_recognition = "e3782c1e-eaf4-4b2d-8d26-53db2160f1fd"
1684
+ Safety_for_Children = "39a8b5ba-de77-4726-a6b0-621d40b3cdf5"
1685
+ Precision = "767bdd49-5f8c-48ca-8324-dfd6be7f8a79"
1686
+ Originality = "e72cb54f-548a-44f9-a6ca-4e14e5ade7f7"
1687
+ Engagingness = "64729487-d4a8-42d8-bd9e-72fd8390c134"
1688
+ Conciseness = "be828d33-158a-4e92-a2eb-f4d96c13f956"
1689
+ Coherence = "e599886c-c338-458f-91b3-5d7eba452618"
1690
+ Quality_of_Writing_Professional = "059affa9-2d1c-48de-8e97-f81dd3fc3cbe"
1691
+ Quality_of_Writing_Creative = "060abfb6-57c9-43b5-9a6d-8a1a9bb853b8"
1692
+ Truthfulness = "053df10f-b0c7-400b-892e-46ce3aa1e430"
1693
+ Context_Precision = "9d1e9a25-7e76-4771-b1e3-40825d7918c5"
1694
+ Answer_Relevance = "0907d422-e94f-4c9c-a63d-ec0eefd8a903"
1695
+ Compliance_Preview = "4613f248-b60e-403a-bcdc-157d1c44194a"
1696
+ Faithfulness_Swift = "a3a5e97b-7fcb-441e-92f2-6e59aa473b89"
1697
+ Truthfulness_Swift = "c8c65e61-2dc8-4f29-865a-a5e59127d208"
1698
+ Completeness = "f0832c32-6beb-4383-a1ea-cdeb883d9044"
1699
+
1700
+ def __getattr__(self, name: Union[EvaluatorName, str]) -> Union["PresetEvaluatorRunner", "APresetEvaluatorRunner"]:
1701
+ if name in self.Eval.__members__:
1702
+ context = self.client_context()
1703
+ if isinstance(context, AbstractContextManager):
1704
+ return PresetEvaluatorRunner(self.client_context, self.Eval.__members__[name].value, name)
1705
+ else:
1706
+ return APresetEvaluatorRunner(self.client_context, self.Eval.__members__[name].value, name)
1707
+ raise AttributeError(f"{name} is not a valid attribute")
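+ # Illustrative note on the attribute access above: every member of `Eval` is reachable as an
+ # attribute of this resource, e.g. (assuming a configured synchronous client exposed as
+ # `client.evaluators`):
+ #
+ #     runner = client.evaluators.Faithfulness  # PresetEvaluatorRunner bound to that preset's ID
+ #
+ # With an asynchronous client configuration the same access returns an APresetEvaluatorRunner.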