scorable-1.6.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of scorable might be problematic.
- root/__about__.py +4 -0
- root/__init__.py +17 -0
- root/client.py +207 -0
- root/datasets.py +231 -0
- root/execution_logs.py +162 -0
- root/generated/__init__.py +0 -0
- root/generated/openapi_aclient/__init__.py +1 -0
- root/generated/openapi_aclient/api/__init__.py +1 -0
- root/generated/openapi_aclient/api/datasets_api.py +1274 -0
- root/generated/openapi_aclient/api/evaluators_api.py +3641 -0
- root/generated/openapi_aclient/api/execution_logs_api.py +751 -0
- root/generated/openapi_aclient/api/judges_api.py +3794 -0
- root/generated/openapi_aclient/api/models_api.py +1473 -0
- root/generated/openapi_aclient/api/objectives_api.py +1767 -0
- root/generated/openapi_aclient/api_client.py +662 -0
- root/generated/openapi_aclient/api_response.py +22 -0
- root/generated/openapi_aclient/configuration.py +470 -0
- root/generated/openapi_aclient/exceptions.py +197 -0
- root/generated/openapi_aclient/models/__init__.py +122 -0
- root/generated/openapi_aclient/models/data_set_create.py +118 -0
- root/generated/openapi_aclient/models/data_set_create_request.py +105 -0
- root/generated/openapi_aclient/models/data_set_list.py +129 -0
- root/generated/openapi_aclient/models/data_set_type.py +36 -0
- root/generated/openapi_aclient/models/dataset_range_request.py +93 -0
- root/generated/openapi_aclient/models/evaluator.py +273 -0
- root/generated/openapi_aclient/models/evaluator_calibration_output.py +101 -0
- root/generated/openapi_aclient/models/evaluator_calibration_result.py +134 -0
- root/generated/openapi_aclient/models/evaluator_demonstrations.py +107 -0
- root/generated/openapi_aclient/models/evaluator_demonstrations_request.py +107 -0
- root/generated/openapi_aclient/models/evaluator_execution_function_parameter_property_request.py +86 -0
- root/generated/openapi_aclient/models/evaluator_execution_function_parameter_request.py +109 -0
- root/generated/openapi_aclient/models/evaluator_execution_function_parameter_type_enum.py +35 -0
- root/generated/openapi_aclient/models/evaluator_execution_function_request.py +99 -0
- root/generated/openapi_aclient/models/evaluator_execution_functions_request.py +98 -0
- root/generated/openapi_aclient/models/evaluator_execution_functions_type_enum.py +35 -0
- root/generated/openapi_aclient/models/evaluator_execution_request.py +134 -0
- root/generated/openapi_aclient/models/evaluator_execution_result.py +114 -0
- root/generated/openapi_aclient/models/evaluator_inputs_value.py +100 -0
- root/generated/openapi_aclient/models/evaluator_inputs_value_items.py +89 -0
- root/generated/openapi_aclient/models/evaluator_list_output.py +198 -0
- root/generated/openapi_aclient/models/evaluator_reference.py +90 -0
- root/generated/openapi_aclient/models/evaluator_reference_request.py +90 -0
- root/generated/openapi_aclient/models/evaluator_request.py +194 -0
- root/generated/openapi_aclient/models/evaluator_result.py +110 -0
- root/generated/openapi_aclient/models/execution_log_details.py +291 -0
- root/generated/openapi_aclient/models/execution_log_details_evaluation_context.py +83 -0
- root/generated/openapi_aclient/models/execution_log_details_evaluator_latencies_inner.py +83 -0
- root/generated/openapi_aclient/models/execution_log_list.py +217 -0
- root/generated/openapi_aclient/models/execution_log_list_evaluation_context.py +83 -0
- root/generated/openapi_aclient/models/generation_model_params_request.py +93 -0
- root/generated/openapi_aclient/models/id.py +87 -0
- root/generated/openapi_aclient/models/input_variable.py +121 -0
- root/generated/openapi_aclient/models/input_variable_request.py +82 -0
- root/generated/openapi_aclient/models/judge.py +178 -0
- root/generated/openapi_aclient/models/judge_execution_request.py +114 -0
- root/generated/openapi_aclient/models/judge_execution_response.py +97 -0
- root/generated/openapi_aclient/models/judge_files_inner.py +84 -0
- root/generated/openapi_aclient/models/judge_generator_request.py +142 -0
- root/generated/openapi_aclient/models/judge_generator_response.py +88 -0
- root/generated/openapi_aclient/models/judge_invite_request.py +87 -0
- root/generated/openapi_aclient/models/judge_list.py +156 -0
- root/generated/openapi_aclient/models/judge_rectifier_request_request.py +114 -0
- root/generated/openapi_aclient/models/judge_rectifier_response.py +121 -0
- root/generated/openapi_aclient/models/judge_request.py +108 -0
- root/generated/openapi_aclient/models/model.py +126 -0
- root/generated/openapi_aclient/models/model_list.py +115 -0
- root/generated/openapi_aclient/models/model_params.py +89 -0
- root/generated/openapi_aclient/models/model_params_request.py +89 -0
- root/generated/openapi_aclient/models/model_request.py +118 -0
- root/generated/openapi_aclient/models/nested_evaluator.py +110 -0
- root/generated/openapi_aclient/models/nested_evaluator_objective.py +87 -0
- root/generated/openapi_aclient/models/nested_evaluator_request.py +92 -0
- root/generated/openapi_aclient/models/nested_objective_evaluator.py +105 -0
- root/generated/openapi_aclient/models/nested_objective_evaluator_request.py +92 -0
- root/generated/openapi_aclient/models/nested_objective_list.py +111 -0
- root/generated/openapi_aclient/models/nested_user_details.py +88 -0
- root/generated/openapi_aclient/models/nested_user_details_request.py +82 -0
- root/generated/openapi_aclient/models/nested_vector_objective.py +88 -0
- root/generated/openapi_aclient/models/nested_vector_objective_request.py +82 -0
- root/generated/openapi_aclient/models/objective.py +157 -0
- root/generated/openapi_aclient/models/objective_list.py +128 -0
- root/generated/openapi_aclient/models/objective_request.py +113 -0
- root/generated/openapi_aclient/models/objective_validator.py +100 -0
- root/generated/openapi_aclient/models/objective_validator_request.py +90 -0
- root/generated/openapi_aclient/models/paginated_data_set_list_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_evaluator_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_evaluator_list_output_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_execution_log_list_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_judge_list_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_model_list_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_objective_list.py +111 -0
- root/generated/openapi_aclient/models/paginated_objective_list_list.py +111 -0
- root/generated/openapi_aclient/models/patched_evaluator_request.py +194 -0
- root/generated/openapi_aclient/models/patched_judge_request.py +110 -0
- root/generated/openapi_aclient/models/patched_model_request.py +118 -0
- root/generated/openapi_aclient/models/patched_objective_request.py +113 -0
- root/generated/openapi_aclient/models/provider.py +99 -0
- root/generated/openapi_aclient/models/reference_variable.py +123 -0
- root/generated/openapi_aclient/models/reference_variable_request.py +83 -0
- root/generated/openapi_aclient/models/skill_execution_validator_result.py +130 -0
- root/generated/openapi_aclient/models/skill_test_data_request.py +107 -0
- root/generated/openapi_aclient/models/skill_test_data_request_dataset_range.py +93 -0
- root/generated/openapi_aclient/models/skill_test_input_request.py +171 -0
- root/generated/openapi_aclient/models/skill_type_enum.py +36 -0
- root/generated/openapi_aclient/models/status_change.py +84 -0
- root/generated/openapi_aclient/models/status_change_request.py +84 -0
- root/generated/openapi_aclient/models/status_change_status_enum.py +36 -0
- root/generated/openapi_aclient/models/status_enum.py +38 -0
- root/generated/openapi_aclient/models/validation_result_status.py +36 -0
- root/generated/openapi_aclient/models/visibility_enum.py +38 -0
- root/generated/openapi_aclient/rest.py +166 -0
- root/generated/openapi_aclient_README.md +239 -0
- root/generated/openapi_client/__init__.py +1 -0
- root/generated/openapi_client/api/__init__.py +1 -0
- root/generated/openapi_client/api/datasets_api.py +1274 -0
- root/generated/openapi_client/api/evaluators_api.py +3641 -0
- root/generated/openapi_client/api/execution_logs_api.py +751 -0
- root/generated/openapi_client/api/judges_api.py +3794 -0
- root/generated/openapi_client/api/models_api.py +1473 -0
- root/generated/openapi_client/api/objectives_api.py +1767 -0
- root/generated/openapi_client/api_client.py +659 -0
- root/generated/openapi_client/api_response.py +22 -0
- root/generated/openapi_client/configuration.py +474 -0
- root/generated/openapi_client/exceptions.py +197 -0
- root/generated/openapi_client/models/__init__.py +120 -0
- root/generated/openapi_client/models/data_set_create.py +118 -0
- root/generated/openapi_client/models/data_set_create_request.py +105 -0
- root/generated/openapi_client/models/data_set_list.py +129 -0
- root/generated/openapi_client/models/data_set_type.py +36 -0
- root/generated/openapi_client/models/dataset_range_request.py +93 -0
- root/generated/openapi_client/models/evaluator.py +273 -0
- root/generated/openapi_client/models/evaluator_calibration_output.py +101 -0
- root/generated/openapi_client/models/evaluator_calibration_result.py +134 -0
- root/generated/openapi_client/models/evaluator_demonstrations.py +107 -0
- root/generated/openapi_client/models/evaluator_demonstrations_request.py +107 -0
- root/generated/openapi_client/models/evaluator_execution_function_parameter_property_request.py +86 -0
- root/generated/openapi_client/models/evaluator_execution_function_parameter_request.py +109 -0
- root/generated/openapi_client/models/evaluator_execution_function_parameter_type_enum.py +35 -0
- root/generated/openapi_client/models/evaluator_execution_function_request.py +99 -0
- root/generated/openapi_client/models/evaluator_execution_functions_request.py +98 -0
- root/generated/openapi_client/models/evaluator_execution_functions_type_enum.py +35 -0
- root/generated/openapi_client/models/evaluator_execution_request.py +134 -0
- root/generated/openapi_client/models/evaluator_execution_result.py +114 -0
- root/generated/openapi_client/models/evaluator_inputs_value.py +100 -0
- root/generated/openapi_client/models/evaluator_inputs_value_items.py +89 -0
- root/generated/openapi_client/models/evaluator_list_output.py +198 -0
- root/generated/openapi_client/models/evaluator_reference.py +90 -0
- root/generated/openapi_client/models/evaluator_reference_request.py +90 -0
- root/generated/openapi_client/models/evaluator_request.py +194 -0
- root/generated/openapi_client/models/evaluator_result.py +110 -0
- root/generated/openapi_client/models/execution_log_details.py +291 -0
- root/generated/openapi_client/models/execution_log_details_evaluation_context.py +83 -0
- root/generated/openapi_client/models/execution_log_details_evaluator_latencies_inner.py +83 -0
- root/generated/openapi_client/models/execution_log_list.py +215 -0
- root/generated/openapi_client/models/execution_log_list_evaluation_context.py +83 -0
- root/generated/openapi_client/models/generation_model_params_request.py +93 -0
- root/generated/openapi_client/models/id.py +87 -0
- root/generated/openapi_client/models/input_variable.py +121 -0
- root/generated/openapi_client/models/input_variable_request.py +82 -0
- root/generated/openapi_client/models/judge.py +178 -0
- root/generated/openapi_client/models/judge_execution_request.py +114 -0
- root/generated/openapi_client/models/judge_execution_response.py +97 -0
- root/generated/openapi_client/models/judge_files_inner.py +84 -0
- root/generated/openapi_client/models/judge_generator_request.py +142 -0
- root/generated/openapi_client/models/judge_generator_response.py +88 -0
- root/generated/openapi_client/models/judge_invite_request.py +87 -0
- root/generated/openapi_client/models/judge_list.py +156 -0
- root/generated/openapi_client/models/judge_rectifier_request_request.py +114 -0
- root/generated/openapi_client/models/judge_rectifier_response.py +121 -0
- root/generated/openapi_client/models/judge_request.py +108 -0
- root/generated/openapi_client/models/model.py +126 -0
- root/generated/openapi_client/models/model_list.py +115 -0
- root/generated/openapi_client/models/model_params.py +89 -0
- root/generated/openapi_client/models/model_params_request.py +89 -0
- root/generated/openapi_client/models/model_request.py +118 -0
- root/generated/openapi_client/models/nested_evaluator.py +110 -0
- root/generated/openapi_client/models/nested_evaluator_objective.py +87 -0
- root/generated/openapi_client/models/nested_evaluator_request.py +92 -0
- root/generated/openapi_client/models/nested_objective_evaluator.py +105 -0
- root/generated/openapi_client/models/nested_objective_evaluator_request.py +92 -0
- root/generated/openapi_client/models/nested_objective_list.py +111 -0
- root/generated/openapi_client/models/nested_user_details.py +88 -0
- root/generated/openapi_client/models/nested_user_details_request.py +82 -0
- root/generated/openapi_client/models/nested_vector_objective.py +88 -0
- root/generated/openapi_client/models/nested_vector_objective_request.py +82 -0
- root/generated/openapi_client/models/objective.py +157 -0
- root/generated/openapi_client/models/objective_list.py +128 -0
- root/generated/openapi_client/models/objective_request.py +113 -0
- root/generated/openapi_client/models/objective_validator.py +100 -0
- root/generated/openapi_client/models/objective_validator_request.py +90 -0
- root/generated/openapi_client/models/paginated_data_set_list_list.py +111 -0
- root/generated/openapi_client/models/paginated_evaluator_list.py +111 -0
- root/generated/openapi_client/models/paginated_evaluator_list_output_list.py +111 -0
- root/generated/openapi_client/models/paginated_execution_log_list_list.py +111 -0
- root/generated/openapi_client/models/paginated_judge_list_list.py +111 -0
- root/generated/openapi_client/models/paginated_model_list_list.py +111 -0
- root/generated/openapi_client/models/paginated_objective_list.py +111 -0
- root/generated/openapi_client/models/paginated_objective_list_list.py +111 -0
- root/generated/openapi_client/models/patched_evaluator_request.py +194 -0
- root/generated/openapi_client/models/patched_judge_request.py +110 -0
- root/generated/openapi_client/models/patched_model_request.py +118 -0
- root/generated/openapi_client/models/patched_objective_request.py +113 -0
- root/generated/openapi_client/models/provider.py +99 -0
- root/generated/openapi_client/models/reference_variable.py +123 -0
- root/generated/openapi_client/models/reference_variable_request.py +83 -0
- root/generated/openapi_client/models/skill_execution_validator_result.py +130 -0
- root/generated/openapi_client/models/skill_test_data_request.py +107 -0
- root/generated/openapi_client/models/skill_test_data_request_dataset_range.py +93 -0
- root/generated/openapi_client/models/skill_test_input_request.py +171 -0
- root/generated/openapi_client/models/skill_type_enum.py +36 -0
- root/generated/openapi_client/models/status_change.py +84 -0
- root/generated/openapi_client/models/status_change_request.py +84 -0
- root/generated/openapi_client/models/status_change_status_enum.py +36 -0
- root/generated/openapi_client/models/status_enum.py +38 -0
- root/generated/openapi_client/models/validation_result_status.py +36 -0
- root/generated/openapi_client/models/visibility_enum.py +38 -0
- root/generated/openapi_client/rest.py +203 -0
- root/generated/openapi_client_README.md +238 -0
- root/judges.py +681 -0
- root/models.py +197 -0
- root/objectives.py +343 -0
- root/py.typed +0 -0
- root/skills.py +1707 -0
- root/utils.py +90 -0
- scorable-1.6.4.dist-info/METADATA +395 -0
- scorable-1.6.4.dist-info/RECORD +228 -0
- scorable-1.6.4.dist-info/WHEEL +4 -0
- scorable-1.6.4.dist-info/licenses/LICENSE +202 -0
root/skills.py
ADDED
|
@@ -0,0 +1,1707 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import math
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from contextlib import AbstractAsyncContextManager, AbstractContextManager
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from functools import partial
|
|
10
|
+
from typing import TYPE_CHECKING, AsyncIterator, Dict, Iterator, List, Literal, Optional, Union, cast
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel, StrictStr
|
|
13
|
+
|
|
14
|
+
from root.generated.openapi_aclient.models.evaluator_request import EvaluatorRequest as AEvaluatorRequest
|
|
15
|
+
from root.generated.openapi_aclient.models.paginated_evaluator_list import (
|
|
16
|
+
PaginatedEvaluatorList as APaginatedEvaluatorList,
|
|
17
|
+
)
|
|
18
|
+
from root.generated.openapi_aclient.models.paginated_evaluator_list_output_list import (
|
|
19
|
+
PaginatedEvaluatorListOutputList as APaginatedEvaluatorListOutputList,
|
|
20
|
+
)
|
|
21
|
+
from root.generated.openapi_client.models.evaluator_request import EvaluatorRequest
|
|
22
|
+
from root.generated.openapi_client.models.paginated_evaluator_list import PaginatedEvaluatorList
|
|
23
|
+
|
|
24
|
+
from .generated.openapi_aclient import ApiClient as AApiClient
|
|
25
|
+
from .generated.openapi_aclient.api.evaluators_api import EvaluatorsApi as AEvaluatorsApi
|
|
26
|
+
from .generated.openapi_aclient.api.objectives_api import ObjectivesApi as AObjectivesApi
|
|
27
|
+
from .generated.openapi_aclient.models import (
|
|
28
|
+
EvaluatorDemonstrationsRequest as AEvaluatorDemonstrationsRequest,
|
|
29
|
+
)
|
|
30
|
+
from .generated.openapi_aclient.models import (
|
|
31
|
+
EvaluatorExecutionFunctionsRequest as AEvaluatorExecutionFunctionsRequest,
|
|
32
|
+
)
|
|
33
|
+
from .generated.openapi_aclient.models import (
|
|
34
|
+
EvaluatorExecutionRequest as AEvaluatorExecutionRequest,
|
|
35
|
+
)
|
|
36
|
+
from .generated.openapi_aclient.models import (
|
|
37
|
+
EvaluatorExecutionResult as AEvaluatorExecutionResult,
|
|
38
|
+
)
|
|
39
|
+
from .generated.openapi_aclient.models import (
|
|
40
|
+
ModelParamsRequest as AModelParamsRequest,
|
|
41
|
+
)
|
|
42
|
+
from .generated.openapi_aclient.models.evaluator import Evaluator as AOpenAPIEvaluator
|
|
43
|
+
from .generated.openapi_aclient.models.evaluator_calibration_output import (
|
|
44
|
+
EvaluatorCalibrationOutput as AEvaluatorCalibrationOutput,
|
|
45
|
+
)
|
|
46
|
+
from .generated.openapi_aclient.models.evaluator_list_output import EvaluatorListOutput as AEvaluatorListOutput
|
|
47
|
+
from .generated.openapi_aclient.models.input_variable_request import InputVariableRequest as AInputVariableRequest
|
|
48
|
+
from .generated.openapi_aclient.models.objective_request import ObjectiveRequest as AObjectiveRequest
|
|
49
|
+
from .generated.openapi_aclient.models.patched_evaluator_request import (
|
|
50
|
+
PatchedEvaluatorRequest as APatchedEvaluatorRequest,
|
|
51
|
+
)
|
|
52
|
+
from .generated.openapi_aclient.models.reference_variable_request import (
|
|
53
|
+
ReferenceVariableRequest as AReferenceVariableRequest,
|
|
54
|
+
)
|
|
55
|
+
from .generated.openapi_aclient.models.skill_test_data_request import SkillTestDataRequest as ASkillTestDataRequest
|
|
56
|
+
from .generated.openapi_aclient.models.skill_test_input_request import (
|
|
57
|
+
SkillTestInputRequest as ASkillTestInputRequest,
|
|
58
|
+
)
|
|
59
|
+
from .generated.openapi_client import ApiClient as ApiClient
|
|
60
|
+
from .generated.openapi_client.api.evaluators_api import EvaluatorsApi as EvaluatorsApi
|
|
61
|
+
from .generated.openapi_client.api.objectives_api import ObjectivesApi as ObjectivesApi
|
|
62
|
+
from .generated.openapi_client.models.evaluator_calibration_output import EvaluatorCalibrationOutput
|
|
63
|
+
from .generated.openapi_client.models.evaluator_demonstrations_request import (
|
|
64
|
+
EvaluatorDemonstrationsRequest,
|
|
65
|
+
)
|
|
66
|
+
from .generated.openapi_client.models.evaluator_execution_functions_request import (
|
|
67
|
+
EvaluatorExecutionFunctionsRequest,
|
|
68
|
+
)
|
|
69
|
+
from .generated.openapi_client.models.evaluator_execution_request import EvaluatorExecutionRequest
|
|
70
|
+
from .generated.openapi_client.models.evaluator_execution_result import EvaluatorExecutionResult
|
|
71
|
+
from .generated.openapi_client.models.evaluator_list_output import EvaluatorListOutput
|
|
72
|
+
from .generated.openapi_client.models.input_variable_request import InputVariableRequest
|
|
73
|
+
from .generated.openapi_client.models.model_params_request import ModelParamsRequest
|
|
74
|
+
from .generated.openapi_client.models.objective_request import ObjectiveRequest
|
|
75
|
+
from .generated.openapi_client.models.patched_evaluator_request import PatchedEvaluatorRequest
|
|
76
|
+
from .generated.openapi_client.models.reference_variable_request import ReferenceVariableRequest
|
|
77
|
+
from .generated.openapi_client.models.skill_test_data_request import SkillTestDataRequest
|
|
78
|
+
from .generated.openapi_client.models.skill_test_input_request import SkillTestInputRequest
|
|
79
|
+
from .utils import ClientContextCallable, aiterate_cursor_list, iterate_cursor_list, with_async_client, with_sync_client
|
|
80
|
+
|
|
81
|
+
if TYPE_CHECKING:
|
|
82
|
+
from .generated.openapi_aclient.models.evaluator import Evaluator as GeneratedEvaluator
|
|
83
|
+
from .generated.openapi_client.models.evaluator import Evaluator as SyncGeneratedEvaluator
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
ModelName = Union[
|
|
87
|
+
str,
|
|
88
|
+
Literal[
|
|
89
|
+
"root", # RS-chosen model
|
|
90
|
+
],
|
|
91
|
+
]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class ModelParams(BaseModel):
|
|
95
|
+
"""
|
|
96
|
+
Additional model parameters.
|
|
97
|
+
|
|
98
|
+
All fields are made optional in practice.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
temperature: Optional[float] = None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class ReferenceVariable(BaseModel):
|
|
105
|
+
"""
|
|
106
|
+
Reference variable definition.
|
|
107
|
+
|
|
108
|
+
`name` within prompt gets populated with content from `dataset_id`.
|
|
109
|
+
"""
|
|
110
|
+
|
|
111
|
+
name: str
|
|
112
|
+
dataset_id: str
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class InputVariable(BaseModel):
|
|
116
|
+
"""
|
|
117
|
+
Input variable definition.
|
|
118
|
+
|
|
119
|
+
`name` within prompt gets populated with the provided variable.
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
name: str
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class EvaluatorDemonstration(BaseModel):
|
|
126
|
+
"""
|
|
127
|
+
Evaluator demonstration
|
|
128
|
+
|
|
129
|
+
Demonstrations are used to train an evaluator to adjust its behavior.
|
|
130
|
+
"""
|
|
131
|
+
|
|
132
|
+
request: Optional[str] = None
|
|
133
|
+
response: str
|
|
134
|
+
score: float
|
|
135
|
+
justification: Optional[str] = None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class ACalibrateBatchParameters:
|
|
139
|
+
def __init__(
|
|
140
|
+
self,
|
|
141
|
+
name: str,
|
|
142
|
+
prompt: str,
|
|
143
|
+
model: "ModelName",
|
|
144
|
+
pii_filter: bool = False,
|
|
145
|
+
reference_variables: Optional[Union[List["ReferenceVariable"], List["AReferenceVariableRequest"]]] = None,
|
|
146
|
+
input_variables: Optional[Union[List["InputVariable"], List["AInputVariableRequest"]]] = None,
|
|
147
|
+
):
|
|
148
|
+
self.name = name
|
|
149
|
+
self.prompt = prompt
|
|
150
|
+
self.model = model
|
|
151
|
+
self.pii_filter = pii_filter
|
|
152
|
+
self.reference_variables = reference_variables
|
|
153
|
+
self.input_variables = input_variables
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class ACalibrateBatchResult(BaseModel):
|
|
157
|
+
results: List[AEvaluatorCalibrationOutput]
|
|
158
|
+
rms_errors_model: Dict[str, float]
|
|
159
|
+
mae_errors_model: Dict[str, float]
|
|
160
|
+
rms_errors_prompt: Dict[str, float]
|
|
161
|
+
mae_errors_prompt: Dict[str, float]
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class CalibrateBatchParameters:
|
|
165
|
+
def __init__(
|
|
166
|
+
self,
|
|
167
|
+
name: str,
|
|
168
|
+
prompt: str,
|
|
169
|
+
model: "ModelName",
|
|
170
|
+
pii_filter: bool = False,
|
|
171
|
+
reference_variables: Optional[Union[List["ReferenceVariable"], List["ReferenceVariableRequest"]]] = None,
|
|
172
|
+
input_variables: Optional[Union[List["InputVariable"], List["InputVariableRequest"]]] = None,
|
|
173
|
+
):
|
|
174
|
+
self.name = name
|
|
175
|
+
self.prompt = prompt
|
|
176
|
+
self.model = model
|
|
177
|
+
self.pii_filter = pii_filter
|
|
178
|
+
self.reference_variables = reference_variables
|
|
179
|
+
self.input_variables = input_variables
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class CalibrateBatchResult(BaseModel):
|
|
183
|
+
results: List[EvaluatorCalibrationOutput]
|
|
184
|
+
rms_errors_model: Dict[str, float]
|
|
185
|
+
mae_errors_model: Dict[str, float]
|
|
186
|
+
rms_errors_prompt: Dict[str, float]
|
|
187
|
+
mae_errors_prompt: Dict[str, float]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class Versions:
|
|
191
|
+
"""
|
|
192
|
+
Version listing (sub)API
|
|
193
|
+
|
|
194
|
+
Note that this should not be directly instantiated.
|
|
195
|
+
"""
|
|
196
|
+
|
|
197
|
+
def __init__(self, client_context: ClientContextCallable):
|
|
198
|
+
self.client_context = client_context
|
|
199
|
+
|
|
200
|
+
@with_sync_client
|
|
201
|
+
def list(self, evaluator_id: str, *, _client: ApiClient) -> PaginatedEvaluatorList:
|
|
202
|
+
"""
|
|
203
|
+
List all versions of a evaluator.
|
|
204
|
+
"""
|
|
205
|
+
|
|
206
|
+
api_instance = EvaluatorsApi(_client)
|
|
207
|
+
return api_instance.evaluators_versions_list(id=evaluator_id)
|
|
208
|
+
|
|
209
|
+
async def alist(self, evaluator_id: str) -> APaginatedEvaluatorList:
|
|
210
|
+
"""
|
|
211
|
+
Asynchronously list all versions of a evaluator.
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
context = self.client_context()
|
|
215
|
+
assert isinstance(context, AbstractAsyncContextManager), "This method is not available in synchronous mode"
|
|
216
|
+
async with context as client:
|
|
217
|
+
api_instance = AEvaluatorsApi(client)
|
|
218
|
+
return await api_instance.evaluators_versions_list(id=evaluator_id)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class Evaluator(AOpenAPIEvaluator):
|
|
222
|
+
"""
|
|
223
|
+
Wrapper for a single Evaluator.
|
|
224
|
+
|
|
225
|
+
For available attributes, please check the (automatically
|
|
226
|
+
generated) superclass documentation.
|
|
227
|
+
"""
|
|
228
|
+
|
|
229
|
+
client_context: ClientContextCallable
|
|
230
|
+
|
|
231
|
+
@classmethod
|
|
232
|
+
def _wrap(
|
|
233
|
+
cls, apiobj: Union[AOpenAPIEvaluator, "SyncGeneratedEvaluator"], client_context: ClientContextCallable
|
|
234
|
+
) -> "Evaluator": # noqa: E501
|
|
235
|
+
obj = cast(Evaluator, apiobj)
|
|
236
|
+
obj.__class__ = cls
|
|
237
|
+
obj.client_context = client_context
|
|
238
|
+
return obj
|
|
239
|
+
|
|
240
|
+
@with_sync_client
|
|
241
|
+
def run(
|
|
242
|
+
self,
|
|
243
|
+
response: Optional[str] = None,
|
|
244
|
+
request: Optional[str] = None,
|
|
245
|
+
contexts: Optional[List[str]] = None,
|
|
246
|
+
functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
|
|
247
|
+
expected_output: Optional[str] = None,
|
|
248
|
+
variables: Optional[dict[str, str]] = None,
|
|
249
|
+
tags: Optional[List[str]] = None,
|
|
250
|
+
*,
|
|
251
|
+
_client: ApiClient,
|
|
252
|
+
_request_timeout: Optional[int] = None,
|
|
253
|
+
) -> EvaluatorExecutionResult:
|
|
254
|
+
"""
|
|
255
|
+
Run the evaluator.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
response: LLM output.
|
|
259
|
+
request: The prompt sent to the LLM.
|
|
260
|
+
contexts: Optional documents passed to RAG evaluators
|
|
261
|
+
functions: Optional function definitions to LLM tool call validation
|
|
262
|
+
expected_output: Optional expected output for the evaluator.
|
|
263
|
+
variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
|
|
264
|
+
predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
|
|
265
|
+
tags: Optional tags to add to the evaluator execution
|
|
266
|
+
"""
|
|
267
|
+
|
|
268
|
+
if not response and not request:
|
|
269
|
+
raise ValueError("Either response or request must be provided")
|
|
270
|
+
|
|
271
|
+
api_instance = EvaluatorsApi(_client)
|
|
272
|
+
|
|
273
|
+
evaluator_execution_request = EvaluatorExecutionRequest(
|
|
274
|
+
evaluator_version_id=self.version_id,
|
|
275
|
+
request=request,
|
|
276
|
+
response=response,
|
|
277
|
+
contexts=contexts,
|
|
278
|
+
functions=functions,
|
|
279
|
+
expected_output=expected_output,
|
|
280
|
+
variables=variables,
|
|
281
|
+
tags=tags,
|
|
282
|
+
)
|
|
283
|
+
return api_instance.evaluators_execute_create(
|
|
284
|
+
id=self.id,
|
|
285
|
+
evaluator_execution_request=evaluator_execution_request,
|
|
286
|
+
_request_timeout=_request_timeout,
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class AEvaluator(AOpenAPIEvaluator):
|
|
291
|
+
"""
|
|
292
|
+
Wrapper for a single Evaluator.
|
|
293
|
+
|
|
294
|
+
For available attributes, please check the (automatically
|
|
295
|
+
generated) superclass documentation.
|
|
296
|
+
"""
|
|
297
|
+
|
|
298
|
+
client_context: ClientContextCallable
|
|
299
|
+
|
|
300
|
+
@classmethod
|
|
301
|
+
async def _awrap(
|
|
302
|
+
cls, apiobj: Union[AOpenAPIEvaluator, "GeneratedEvaluator"], client_context: ClientContextCallable
|
|
303
|
+
) -> "AEvaluator": # noqa: E501
|
|
304
|
+
obj = cast(AEvaluator, apiobj)
|
|
305
|
+
obj.__class__ = cls
|
|
306
|
+
obj.client_context = client_context
|
|
307
|
+
return obj
|
|
308
|
+
|
|
309
|
+
@with_async_client
|
|
310
|
+
async def arun(
|
|
311
|
+
self,
|
|
312
|
+
response: Optional[str] = None,
|
|
313
|
+
request: Optional[str] = None,
|
|
314
|
+
contexts: Optional[List[str]] = None,
|
|
315
|
+
functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
|
|
316
|
+
expected_output: Optional[str] = None,
|
|
317
|
+
variables: Optional[dict[str, str]] = None,
|
|
318
|
+
tags: Optional[List[str]] = None,
|
|
319
|
+
*,
|
|
320
|
+
_client: AApiClient,
|
|
321
|
+
_request_timeout: Optional[int] = None,
|
|
322
|
+
) -> AEvaluatorExecutionResult:
|
|
323
|
+
"""
|
|
324
|
+
Asynchronously run the evaluator.
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
response: LLM output.
|
|
328
|
+
request: The prompt sent to the LLM.
|
|
329
|
+
contexts: Optional documents passed to RAG evaluators
|
|
330
|
+
functions: Optional function definitions to LLM tool call validation
|
|
331
|
+
expected_output: Optional expected output for the evaluator.
|
|
332
|
+
variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
|
|
333
|
+
predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
|
|
334
|
+
tags: Optional tags to add to the evaluator execution
|
|
335
|
+
"""
|
|
336
|
+
|
|
337
|
+
if not response and not request:
|
|
338
|
+
raise ValueError("Either response or request must be provided")
|
|
339
|
+
|
|
340
|
+
api_instance = AEvaluatorsApi(_client)
|
|
341
|
+
|
|
342
|
+
evaluator_execution_request = AEvaluatorExecutionRequest(
|
|
343
|
+
evaluator_version_id=self.version_id,
|
|
344
|
+
request=request,
|
|
345
|
+
response=response,
|
|
346
|
+
contexts=contexts,
|
|
347
|
+
functions=functions,
|
|
348
|
+
expected_output=expected_output,
|
|
349
|
+
variables=variables,
|
|
350
|
+
tags=tags,
|
|
351
|
+
)
|
|
352
|
+
return await api_instance.evaluators_execute_create(
|
|
353
|
+
id=self.id,
|
|
354
|
+
evaluator_execution_request=evaluator_execution_request,
|
|
355
|
+
_request_timeout=_request_timeout,
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _to_input_variables(
|
|
360
|
+
input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]],
|
|
361
|
+
) -> List[InputVariableRequest]:
|
|
362
|
+
def _convert_to_generated_model(entry: Union[InputVariable, InputVariableRequest]) -> InputVariableRequest:
|
|
363
|
+
if not isinstance(entry, InputVariableRequest):
|
|
364
|
+
return InputVariableRequest(name=entry.name)
|
|
365
|
+
return entry
|
|
366
|
+
|
|
367
|
+
return [_convert_to_generated_model(entry) for entry in input_variables or {}]
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _to_model_params(model_params: Optional[Union[ModelParams, ModelParamsRequest]]) -> Optional[ModelParamsRequest]:
|
|
371
|
+
if isinstance(model_params, ModelParams):
|
|
372
|
+
return ModelParamsRequest(**model_params.model_dump())
|
|
373
|
+
return model_params
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _to_reference_variables(
|
|
377
|
+
reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]],
|
|
378
|
+
) -> List[ReferenceVariableRequest]:
|
|
379
|
+
def _convert_to_generated_model(
|
|
380
|
+
entry: Union[ReferenceVariable, ReferenceVariableRequest],
|
|
381
|
+
) -> ReferenceVariableRequest:
|
|
382
|
+
if not isinstance(entry, ReferenceVariableRequest):
|
|
383
|
+
return ReferenceVariableRequest(name=entry.name, dataset=entry.dataset_id)
|
|
384
|
+
return entry
|
|
385
|
+
|
|
386
|
+
return [_convert_to_generated_model(entry) for entry in reference_variables or {}]
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _to_evaluator_demonstrations(
|
|
390
|
+
input_variables: Optional[Union[List[EvaluatorDemonstration], List[EvaluatorDemonstrationsRequest]]],
|
|
391
|
+
) -> List[EvaluatorDemonstrationsRequest]:
|
|
392
|
+
def _convert_dict(
|
|
393
|
+
entry: Union[EvaluatorDemonstration, EvaluatorDemonstrationsRequest],
|
|
394
|
+
) -> EvaluatorDemonstrationsRequest:
|
|
395
|
+
if not isinstance(entry, EvaluatorDemonstrationsRequest):
|
|
396
|
+
return EvaluatorDemonstrationsRequest(
|
|
397
|
+
score=entry.score,
|
|
398
|
+
request=entry.request,
|
|
399
|
+
response=entry.response,
|
|
400
|
+
justification=entry.justification,
|
|
401
|
+
)
|
|
402
|
+
return entry
|
|
403
|
+
|
|
404
|
+
return [_convert_dict(entry) for entry in input_variables or {}]
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _ato_input_variables(
|
|
408
|
+
input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]],
|
|
409
|
+
) -> List[AInputVariableRequest]:
|
|
410
|
+
def _convert_to_generated_model(entry: Union[InputVariable, AInputVariableRequest]) -> AInputVariableRequest:
|
|
411
|
+
if not isinstance(entry, AInputVariableRequest):
|
|
412
|
+
return AInputVariableRequest(name=entry.name)
|
|
413
|
+
return entry
|
|
414
|
+
|
|
415
|
+
return [_convert_to_generated_model(entry) for entry in input_variables or {}]
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _ato_model_params(model_params: Optional[Union[ModelParams, AModelParamsRequest]]) -> Optional[AModelParamsRequest]:
|
|
419
|
+
if isinstance(model_params, ModelParams):
|
|
420
|
+
return AModelParamsRequest(**model_params.model_dump())
|
|
421
|
+
return model_params
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def _ato_reference_variables(
|
|
425
|
+
reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]],
|
|
426
|
+
) -> List[AReferenceVariableRequest]:
|
|
427
|
+
def _convert_to_generated_model(
|
|
428
|
+
entry: Union[ReferenceVariable, AReferenceVariableRequest],
|
|
429
|
+
) -> AReferenceVariableRequest:
|
|
430
|
+
if not isinstance(entry, AReferenceVariableRequest):
|
|
431
|
+
return AReferenceVariableRequest(name=entry.name, dataset=entry.dataset_id)
|
|
432
|
+
return entry
|
|
433
|
+
|
|
434
|
+
return [_convert_to_generated_model(entry) for entry in reference_variables or {}]
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _ato_evaluator_demonstrations(
|
|
438
|
+
input_variables: Optional[Union[List[EvaluatorDemonstration], List[AEvaluatorDemonstrationsRequest]]],
|
|
439
|
+
) -> List[AEvaluatorDemonstrationsRequest]:
|
|
440
|
+
def _aconvert_dict(
|
|
441
|
+
entry: Union[EvaluatorDemonstration, AEvaluatorDemonstrationsRequest],
|
|
442
|
+
) -> AEvaluatorDemonstrationsRequest:
|
|
443
|
+
if not isinstance(entry, AEvaluatorDemonstrationsRequest):
|
|
444
|
+
return AEvaluatorDemonstrationsRequest(
|
|
445
|
+
score=entry.score,
|
|
446
|
+
request=entry.request,
|
|
447
|
+
response=entry.response,
|
|
448
|
+
justification=entry.justification,
|
|
449
|
+
)
|
|
450
|
+
return entry
|
|
451
|
+
|
|
452
|
+
return [_aconvert_dict(entry) for entry in input_variables or {}]
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
class PresetEvaluatorRunner:
|
|
456
|
+
client_context: ClientContextCallable
|
|
457
|
+
|
|
458
|
+
def __init__(
|
|
459
|
+
self,
|
|
460
|
+
client_context: ClientContextCallable,
|
|
461
|
+
evaluator_id: str,
|
|
462
|
+
eval_name: str,
|
|
463
|
+
evaluator_version_id: Optional[str] = None,
|
|
464
|
+
):
|
|
465
|
+
self.client_context = client_context
|
|
466
|
+
self.evaluator_id = evaluator_id
|
|
467
|
+
self.evaluator_version_id = evaluator_version_id
|
|
468
|
+
self.__name__ = eval_name
|
|
469
|
+
|
|
470
|
+
@with_sync_client
|
|
471
|
+
def __call__(
|
|
472
|
+
self,
|
|
473
|
+
response: Optional[str] = None,
|
|
474
|
+
request: Optional[str] = None,
|
|
475
|
+
contexts: Optional[List[str]] = None,
|
|
476
|
+
functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
|
|
477
|
+
expected_output: Optional[str] = None,
|
|
478
|
+
variables: Optional[dict[str, str]] = None,
|
|
479
|
+
tags: Optional[List[str]] = None,
|
|
480
|
+
*,
|
|
481
|
+
_client: ApiClient,
|
|
482
|
+
_request_timeout: Optional[int] = None,
|
|
483
|
+
) -> EvaluatorExecutionResult:
|
|
484
|
+
"""
|
|
485
|
+
Run the evaluator.
|
|
486
|
+
|
|
487
|
+
Args:
|
|
488
|
+
response: LLM output.
|
|
489
|
+
request: The prompt sent to the LLM.
|
|
490
|
+
contexts: Optional documents passed to RAG evaluators
|
|
491
|
+
functions: Optional function definitions to LLM tool call validation
|
|
492
|
+
expected_output: Optional expected output for the evaluator.
|
|
493
|
+
variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
|
|
494
|
+
predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
|
|
495
|
+
tags: Optional tags to add to the evaluator execution
|
|
496
|
+
"""
|
|
497
|
+
|
|
498
|
+
if not response and not request:
|
|
499
|
+
raise ValueError("Either response or request must be provided")
|
|
500
|
+
|
|
501
|
+
api_instance = EvaluatorsApi(_client)
|
|
502
|
+
|
|
503
|
+
evaluator_execution_request = EvaluatorExecutionRequest(
|
|
504
|
+
evaluator_version_id=self.evaluator_version_id,
|
|
505
|
+
request=request,
|
|
506
|
+
response=response,
|
|
507
|
+
contexts=contexts,
|
|
508
|
+
functions=functions,
|
|
509
|
+
expected_output=expected_output,
|
|
510
|
+
variables=variables,
|
|
511
|
+
tags=tags,
|
|
512
|
+
)
|
|
513
|
+
return api_instance.evaluators_execute_create(
|
|
514
|
+
id=self.evaluator_id,
|
|
515
|
+
evaluator_execution_request=evaluator_execution_request,
|
|
516
|
+
_request_timeout=_request_timeout,
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
class APresetEvaluatorRunner:
|
|
521
|
+
client_context: ClientContextCallable
|
|
522
|
+
|
|
523
|
+
def __init__(
|
|
524
|
+
self,
|
|
525
|
+
client_context: ClientContextCallable,
|
|
526
|
+
evaluator_id: str,
|
|
527
|
+
eval_name: str,
|
|
528
|
+
evaluator_version_id: Optional[str] = None,
|
|
529
|
+
):
|
|
530
|
+
self.client_context = client_context
|
|
531
|
+
self.evaluator_id = evaluator_id
|
|
532
|
+
self.evaluator_version_id = evaluator_version_id
|
|
533
|
+
self.__name__ = eval_name
|
|
534
|
+
|
|
535
|
+
@with_async_client
|
|
536
|
+
async def __call__(
|
|
537
|
+
self,
|
|
538
|
+
response: Optional[str] = None,
|
|
539
|
+
request: Optional[str] = None,
|
|
540
|
+
contexts: Optional[List[str]] = None,
|
|
541
|
+
functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
|
|
542
|
+
expected_output: Optional[str] = None,
|
|
543
|
+
variables: Optional[dict[str, str]] = None,
|
|
544
|
+
tags: Optional[List[str]] = None,
|
|
545
|
+
*,
|
|
546
|
+
_client: AApiClient,
|
|
547
|
+
_request_timeout: Optional[int] = None,
|
|
548
|
+
) -> AEvaluatorExecutionResult:
|
|
549
|
+
"""
|
|
550
|
+
Asynchronously run the evaluator.
|
|
551
|
+
|
|
552
|
+
Args:
|
|
553
|
+
response: LLM output.
|
|
554
|
+
request: The prompt sent to the LLM.
|
|
555
|
+
contexts: Optional documents passed to RAG evaluators
|
|
556
|
+
functions: Optional function definitions to LLM tool call validation
|
|
557
|
+
expected_output: Optional expected output for the evaluator.
|
|
558
|
+
variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
|
|
559
|
+
predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
|
|
560
|
+
tags: Optional tags to add to the evaluator execution
|
|
561
|
+
"""
|
|
562
|
+
|
|
563
|
+
if not response and not request:
|
|
564
|
+
raise ValueError("Either response or request must be provided")
|
|
565
|
+
|
|
566
|
+
api_instance = AEvaluatorsApi(_client)
|
|
567
|
+
|
|
568
|
+
evaluator_execution_request = AEvaluatorExecutionRequest(
|
|
569
|
+
evaluator_version_id=self.evaluator_version_id,
|
|
570
|
+
request=request,
|
|
571
|
+
response=response,
|
|
572
|
+
contexts=contexts,
|
|
573
|
+
functions=functions,
|
|
574
|
+
expected_output=expected_output,
|
|
575
|
+
variables=variables,
|
|
576
|
+
tags=tags,
|
|
577
|
+
)
|
|
578
|
+
return await api_instance.evaluators_execute_create(
|
|
579
|
+
id=self.evaluator_id,
|
|
580
|
+
evaluator_execution_request=evaluator_execution_request,
|
|
581
|
+
_request_timeout=_request_timeout,
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
class Evaluators:
|
|
586
|
+
"""Evaluators (sub) API
|
|
587
|
+
|
|
588
|
+
Note:
|
|
589
|
+
|
|
590
|
+
The construction of the API instance should be handled by
|
|
591
|
+
accesing an attribute of a :class:`root.client.RootSignals` instance.
|
|
592
|
+
"""
|
|
593
|
+
|
|
594
|
+
def _validate_create_params_sanitize_name(
|
|
595
|
+
self, name: Optional[str], intent: Optional[str], objective_id: Optional[str]
|
|
596
|
+
) -> str:
|
|
597
|
+
if objective_id is not None:
|
|
598
|
+
if intent:
|
|
599
|
+
raise ValueError("Supplying both objective_id and intent is not supported")
|
|
600
|
+
if name is None:
|
|
601
|
+
name = "<unnamed>"
|
|
602
|
+
return name
|
|
603
|
+
|
|
604
|
+
def __init__(self, client_context: ClientContextCallable):
|
|
605
|
+
self.client_context = client_context
|
|
606
|
+
self.versions = Versions(client_context)
|
|
607
|
+
|
|
608
|
+
def _to_objective_request(self, *, intent: Optional[str] = None) -> ObjectiveRequest:
|
|
609
|
+
return ObjectiveRequest(
|
|
610
|
+
intent=intent,
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
async def _ato_objective_request(self, *, intent: Optional[str] = None) -> AObjectiveRequest:
|
|
614
|
+
return AObjectiveRequest(
|
|
615
|
+
intent=intent,
|
|
616
|
+
)
|
|
617
|
+
|
|
618
|
+
@with_sync_client
|
|
619
|
+
def run(
|
|
620
|
+
self,
|
|
621
|
+
evaluator_id: str,
|
|
622
|
+
*,
|
|
623
|
+
request: Optional[str] = None,
|
|
624
|
+
response: Optional[str] = None,
|
|
625
|
+
contexts: Optional[List[str]] = None,
|
|
626
|
+
functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
|
|
627
|
+
expected_output: Optional[str] = None,
|
|
628
|
+
evaluator_version_id: Optional[str] = None,
|
|
629
|
+
variables: Optional[dict[str, str]] = None,
|
|
630
|
+
tags: Optional[List[str]] = None,
|
|
631
|
+
_request_timeout: Optional[int] = None,
|
|
632
|
+
_client: ApiClient,
|
|
633
|
+
) -> EvaluatorExecutionResult:
|
|
634
|
+
"""
|
|
635
|
+
Run the evaluator.
|
|
636
|
+
|
|
637
|
+
Args:
|
|
638
|
+
evaluator_id: The ID of the evaluator to run.
|
|
639
|
+
request: The prompt sent to the LLM.
|
|
640
|
+
response: LLM output.
|
|
641
|
+
contexts: Optional documents passed to RAG evaluators.
|
|
642
|
+
functions: Optional function definitions to LLM tool call validation.
|
|
643
|
+
expected_output: Optional expected output for the evaluator.
|
|
644
|
+
evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
|
|
645
|
+
variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
|
|
646
|
+
predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
|
|
647
|
+
tags: Optional tags to add to the evaluator execution
|
|
648
|
+
_request_timeout: Optional timeout for the request.
|
|
649
|
+
"""
|
|
650
|
+
|
|
651
|
+
if not response and not request:
|
|
652
|
+
raise ValueError("Either response or request must be provided")
|
|
653
|
+
|
|
654
|
+
api_instance = EvaluatorsApi(_client)
|
|
655
|
+
|
|
656
|
+
evaluator_execution_request = EvaluatorExecutionRequest(
|
|
657
|
+
evaluator_version_id=evaluator_version_id,
|
|
658
|
+
request=request,
|
|
659
|
+
response=response,
|
|
660
|
+
contexts=contexts,
|
|
661
|
+
functions=functions,
|
|
662
|
+
expected_output=expected_output,
|
|
663
|
+
variables=variables,
|
|
664
|
+
tags=tags,
|
|
665
|
+
)
|
|
666
|
+
return api_instance.evaluators_execute_create(
|
|
667
|
+
id=evaluator_id,
|
|
668
|
+
evaluator_execution_request=evaluator_execution_request,
|
|
669
|
+
_request_timeout=_request_timeout,
|
|
670
|
+
)
|
|
671
|
+
|
|
672
|
+
@with_async_client
|
|
673
|
+
async def arun(
|
|
674
|
+
self,
|
|
675
|
+
evaluator_id: str,
|
|
676
|
+
*,
|
|
677
|
+
request: Optional[str] = None,
|
|
678
|
+
response: Optional[str] = None,
|
|
679
|
+
contexts: Optional[List[str]] = None,
|
|
680
|
+
functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
|
|
681
|
+
expected_output: Optional[str] = None,
|
|
682
|
+
evaluator_version_id: Optional[str] = None,
|
|
683
|
+
variables: Optional[dict[str, str]] = None,
|
|
684
|
+
tags: Optional[List[str]] = None,
|
|
685
|
+
_request_timeout: Optional[int] = None,
|
|
686
|
+
_client: AApiClient,
|
|
687
|
+
) -> AEvaluatorExecutionResult:
|
|
688
|
+
"""
|
|
689
|
+
Asynchronously run the evaluator.
|
|
690
|
+
|
|
691
|
+
Args:
|
|
692
|
+
evaluator_id: The ID of the evaluator to run.
|
|
693
|
+
request: The prompt sent to the LLM.
|
|
694
|
+
response: LLM output.
|
|
695
|
+
contexts: Optional documents passed to RAG evaluators.
|
|
696
|
+
functions: Optional function definitions to LLM tool call validation.
|
|
697
|
+
expected_output: Optional expected output for the evaluator.
|
|
698
|
+
evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
|
|
699
|
+
variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
|
|
700
|
+
predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
|
|
701
|
+
tags: Optional tags to add to the evaluator execution
|
|
702
|
+
_request_timeout: Optional timeout for the request.
|
|
703
|
+
"""
|
|
704
|
+
|
|
705
|
+
if not response and not request:
|
|
706
|
+
raise ValueError("Either response or request must be provided")
|
|
707
|
+
|
|
708
|
+
api_instance = AEvaluatorsApi(_client)
|
|
709
|
+
evaluator_execution_request = AEvaluatorExecutionRequest(
|
|
710
|
+
evaluator_version_id=evaluator_version_id,
|
|
711
|
+
request=request,
|
|
712
|
+
response=response,
|
|
713
|
+
contexts=contexts,
|
|
714
|
+
functions=functions,
|
|
715
|
+
expected_output=expected_output,
|
|
716
|
+
variables=variables,
|
|
717
|
+
tags=tags,
|
|
718
|
+
)
|
|
719
|
+
return await api_instance.evaluators_execute_create(
|
|
720
|
+
id=evaluator_id,
|
|
721
|
+
evaluator_execution_request=evaluator_execution_request,
|
|
722
|
+
_request_timeout=_request_timeout,
|
|
723
|
+
)
|
|
724
|
+
|
|
725
|
+
@with_sync_client
|
|
726
|
+
def calibrate_existing(
|
|
727
|
+
self,
|
|
728
|
+
evaluator_id: str,
|
|
729
|
+
*,
|
|
730
|
+
test_dataset_id: Optional[str] = None,
|
|
731
|
+
test_data: Optional[List[List[str]]] = None,
|
|
732
|
+
_request_timeout: Optional[int] = None,
|
|
733
|
+
_client: ApiClient,
|
|
734
|
+
) -> List[EvaluatorCalibrationOutput]:
|
|
735
|
+
"""
|
|
736
|
+
Run calibration set on an existing evaluator.
|
|
737
|
+
"""
|
|
738
|
+
|
|
739
|
+
if not test_dataset_id and not test_data:
|
|
740
|
+
raise ValueError("Either test_dataset_id or test_data must be provided")
|
|
741
|
+
if test_dataset_id and test_data:
|
|
742
|
+
raise ValueError("Only one of test_dataset_id or test_data must be provided")
|
|
743
|
+
api_instance = EvaluatorsApi(_client)
|
|
744
|
+
evaluator_test_request = SkillTestDataRequest(
|
|
745
|
+
test_dataset_id=test_dataset_id,
|
|
746
|
+
test_data=test_data,
|
|
747
|
+
)
|
|
748
|
+
return api_instance.evaluators_calibrate_create2(
|
|
749
|
+
evaluator_id, evaluator_test_request, _request_timeout=_request_timeout
|
|
750
|
+
)
|
|
751
|
+
|
|
752
|
+
@with_async_client
|
|
753
|
+
async def acalibrate_existing(
|
|
754
|
+
self,
|
|
755
|
+
evaluator_id: str,
|
|
756
|
+
*,
|
|
757
|
+
test_dataset_id: Optional[str] = None,
|
|
758
|
+
test_data: Optional[List[List[str]]] = None,
|
|
759
|
+
_request_timeout: Optional[int] = None,
|
|
760
|
+
_client: AApiClient,
|
|
761
|
+
) -> List[AEvaluatorCalibrationOutput]:
|
|
762
|
+
"""
|
|
763
|
+
Asynchronously run calibration set on an existing evaluator.
|
|
764
|
+
"""
|
|
765
|
+
|
|
766
|
+
if not test_dataset_id and not test_data:
|
|
767
|
+
raise ValueError("Either test_dataset_id or test_data must be provided")
|
|
768
|
+
if test_dataset_id and test_data:
|
|
769
|
+
raise ValueError("Only one of test_dataset_id or test_data must be provided")
|
|
770
|
+
api_instance = AEvaluatorsApi(_client)
|
|
771
|
+
evaluator_test_request = ASkillTestDataRequest(
|
|
772
|
+
test_dataset_id=test_dataset_id,
|
|
773
|
+
test_data=test_data,
|
|
774
|
+
)
|
|
775
|
+
return await api_instance.evaluators_calibrate_create2(
|
|
776
|
+
evaluator_id, evaluator_test_request, _request_timeout=_request_timeout
|
|
777
|
+
)
|
|
778
|
+
|
|
779
|
+
@with_sync_client
|
|
780
|
+
def calibrate(
|
|
781
|
+
self,
|
|
782
|
+
*,
|
|
783
|
+
name: str,
|
|
784
|
+
test_dataset_id: Optional[str] = None,
|
|
785
|
+
test_data: Optional[List[List[str]]] = None,
|
|
786
|
+
prompt: str,
|
|
787
|
+
model: ModelName,
|
|
788
|
+
pii_filter: bool = False,
|
|
789
|
+
reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]] = None,
|
|
790
|
+
input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]] = None,
|
|
791
|
+
_request_timeout: Optional[int] = None,
|
|
792
|
+
_client: ApiClient,
|
|
793
|
+
) -> List[EvaluatorCalibrationOutput]:
|
|
794
|
+
"""
|
|
795
|
+
Run calibration set for an evaluator definition.
|
|
796
|
+
See the create evaluator method for more details on the parameters.
|
|
797
|
+
"""
|
|
798
|
+
|
|
799
|
+
if not test_dataset_id and not test_data:
|
|
800
|
+
raise ValueError("Either test_dataset_id or test_data must be provided")
|
|
801
|
+
if test_dataset_id and test_data:
|
|
802
|
+
raise ValueError("Only one of test_dataset_id or test_data must be provided")
|
|
803
|
+
api_instance = EvaluatorsApi(_client)
|
|
804
|
+
evaluator_test_request = SkillTestInputRequest(
|
|
805
|
+
name=name,
|
|
806
|
+
test_dataset_id=test_dataset_id,
|
|
807
|
+
test_data=test_data,
|
|
808
|
+
prompt=prompt,
|
|
809
|
+
models=[model],
|
|
810
|
+
is_evaluator=True,
|
|
811
|
+
pii_filter=pii_filter,
|
|
812
|
+
objective=ObjectiveRequest(intent="Calibration"),
|
|
813
|
+
reference_variables=_to_reference_variables(reference_variables),
|
|
814
|
+
input_variables=_to_input_variables(input_variables),
|
|
815
|
+
)
|
|
816
|
+
return api_instance.evaluators_calibrate_create(evaluator_test_request, _request_timeout=_request_timeout)
|
|
817
|
+
|
|
818
|
+
@with_async_client
|
|
819
|
+
async def acalibrate(
|
|
820
|
+
self,
|
|
821
|
+
*,
|
|
822
|
+
name: str,
|
|
823
|
+
test_dataset_id: Optional[str] = None,
|
|
824
|
+
test_data: Optional[List[List[str]]] = None,
|
|
825
|
+
prompt: str,
|
|
826
|
+
model: ModelName,
|
|
827
|
+
pii_filter: bool = False,
|
|
828
|
+
reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]] = None,
|
|
829
|
+
input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]] = None,
|
|
830
|
+
_request_timeout: Optional[int] = None,
|
|
831
|
+
_client: AApiClient,
|
|
832
|
+
) -> List[AEvaluatorCalibrationOutput]:
|
|
833
|
+
"""
|
|
834
|
+
Asynchronously run calibration set for an evaluator definition.
|
|
835
|
+
See the create evaluator method for more details on the parameters.
|
|
836
|
+
"""
|
|
837
|
+
|
|
838
|
+
if not test_dataset_id and not test_data:
|
|
839
|
+
raise ValueError("Either test_dataset_id or test_data must be provided")
|
|
840
|
+
if test_dataset_id and test_data:
|
|
841
|
+
            raise ValueError("Only one of test_dataset_id or test_data must be provided")
        api_instance = AEvaluatorsApi(_client)
        evaluator_test_request = ASkillTestInputRequest(
            name=name,
            test_dataset_id=test_dataset_id,
            test_data=test_data,
            prompt=prompt,
            models=[model],
            is_evaluator=True,
            pii_filter=pii_filter,
            objective=AObjectiveRequest(intent="Calibration"),
            reference_variables=_ato_reference_variables(reference_variables),
            input_variables=_ato_input_variables(input_variables),
        )
        return await api_instance.evaluators_calibrate_create(evaluator_test_request, _request_timeout=_request_timeout)

    def calibrate_batch(
        self,
        *,
        evaluator_definitions: List[CalibrateBatchParameters],
        test_dataset_id: Optional[str] = None,
        test_data: Optional[List[List[str]]] = None,
        parallel_requests: int = 1,
        _request_timeout: Optional[int] = None,
    ) -> CalibrateBatchResult:
        """
        Run calibration for a set of prompts and models

        Args:
            evaluator_definitions: List of evaluator definitions.
            test_dataset_id: ID of the dataset to be used to test the evaluator.
            test_data: Snapshot of data to be used to test the evaluator.
            parallel_requests: Number of parallel requests.

        Returns a model with the results and errors for each model and prompt.
        """

        if test_dataset_id and test_data:
            raise ValueError("Only one of test_dataset_id or test_data must be provided")

        model_errors: Dict[str, Dict[str, float]] = defaultdict(
            lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
        )
        prompt_errors: Dict[str, Dict[str, float]] = defaultdict(
            lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
        )

        all_results = []

        use_thread_pool = parallel_requests > 1

        def process_results(results: List[EvaluatorCalibrationOutput], param: CalibrateBatchParameters) -> None:
            for result in results:
                score = result.result.score or 0
                expected_score = result.result.expected_score or 0
                squared_error = (score - expected_score) ** 2
                abs_error = abs(score - expected_score)

                # TODO multiple thread race condition
                model_errors[param.model]["sum_squared_errors"] += squared_error
                model_errors[param.model]["abs_errors"] += abs_error
                model_errors[param.model]["count"] += 1

                prompt_errors[param.prompt]["sum_squared_errors"] += squared_error
                prompt_errors[param.prompt]["abs_errors"] += abs_error
                prompt_errors[param.prompt]["count"] += 1

                all_results.append(result)

        if use_thread_pool:
            with ThreadPoolExecutor(max_workers=parallel_requests) as executor:
                futures = {
                    executor.submit(
                        self.calibrate,
                        name=param.name,
                        test_dataset_id=test_dataset_id,
                        test_data=test_data,
                        prompt=param.prompt,
                        model=param.model,
                        pii_filter=param.pii_filter,
                        reference_variables=param.reference_variables,
                        input_variables=param.input_variables,
                        _request_timeout=_request_timeout,
                    ): param
                    for param in evaluator_definitions
                }

                for future in as_completed(futures):
                    param = futures[future]
                    try:
                        results = future.result()
                        process_results(results, param)
                    except Exception as exc:
                        raise ValueError(f"Calibration failed for {param.prompt} with model {param.model}") from exc
        else:
            for param in evaluator_definitions:
                try:
                    results = self.calibrate(
                        name=param.name,
                        test_dataset_id=test_dataset_id,
                        test_data=test_data,
                        prompt=param.prompt,
                        model=param.model,
                        pii_filter=param.pii_filter,
                        reference_variables=param.reference_variables,
                        input_variables=param.input_variables,
                        _request_timeout=_request_timeout,
                    )
                    process_results(results, param)
                except Exception as exc:
                    raise ValueError(f"Calibration failed for {param.prompt} with model {param.model}") from exc

        rms_errors_model = {
            model: math.sqrt(data["sum_squared_errors"] / data["count"])
            for model, data in model_errors.items()
            if data["count"] > 0
        }

        rms_errors_prompt = {
            prompt: math.sqrt(data["sum_squared_errors"] / data["count"])
            for prompt, data in prompt_errors.items()
            if data["count"] > 0
        }

        mae_errors_model = {
            model: data["abs_errors"] / data["count"] for model, data in model_errors.items() if data["count"] > 0
        }

        mae_errors_prompt = {
            prompt: data["abs_errors"] / data["count"] for prompt, data in prompt_errors.items() if data["count"] > 0
        }

        return CalibrateBatchResult(
            results=all_results,
            rms_errors_model=rms_errors_model,
            rms_errors_prompt=rms_errors_prompt,
            mae_errors_model=mae_errors_model,
            mae_errors_prompt=mae_errors_prompt,
        )
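
    # Usage sketch for calibrate_batch (illustrative only; the `client` entry point,
    # evaluator names, prompts, model name and dataset id below are assumptions,
    # not values taken from this package):
    #
    #     params = [
    #         CalibrateBatchParameters(name="clarity-v1", prompt="Is {output} clear?", model="gpt-4o"),
    #         CalibrateBatchParameters(name="clarity-v2", prompt="Rate the clarity of {output}.", model="gpt-4o"),
    #     ]
    #     result = client.evaluators.calibrate_batch(
    #         evaluator_definitions=params,
    #         test_dataset_id="<calibration dataset id>",
    #         parallel_requests=4,
    #     )
    #     print(result.rms_errors_prompt)  # lower RMS error = prompt tracks expected scores more closely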

    async def acalibrate_batch(
        self,
        *,
        evaluator_definitions: List[ACalibrateBatchParameters],
        test_dataset_id: Optional[str] = None,
        test_data: Optional[List[List[str]]] = None,
        parallel_requests: int = 1,
        _request_timeout: Optional[int] = None,
    ) -> ACalibrateBatchResult:
        """
        Asynchronously run calibration for a set of prompts and models

        Args:
            evaluator_definitions: List of evaluator definitions.
            test_dataset_id: ID of the dataset to be used to test the evaluator.
            test_data: Snapshot of data to be used to test the evaluator.
            parallel_requests: Number of parallel requests.

        Returns a model with the results and errors for each model and prompt.
        """

        if test_dataset_id and test_data:
            raise ValueError("Only one of test_dataset_id or test_data must be provided")

        model_errors: Dict[str, Dict[str, float]] = defaultdict(
            lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
        )
        prompt_errors: Dict[str, Dict[str, float]] = defaultdict(
            lambda: {"sum_squared_errors": 0, "abs_errors": 0, "count": 0}
        )

        all_results = []

        async def process_results(results: List[AEvaluatorCalibrationOutput], param: ACalibrateBatchParameters) -> None:
            for result in results:
                score = result.result.score or 0
                expected_score = result.result.expected_score or 0
                squared_error = (score - expected_score) ** 2
                abs_error = abs(score - expected_score)

                model_errors[param.model]["sum_squared_errors"] += squared_error
                model_errors[param.model]["abs_errors"] += abs_error
                model_errors[param.model]["count"] += 1

                prompt_errors[param.prompt]["sum_squared_errors"] += squared_error
                prompt_errors[param.prompt]["abs_errors"] += abs_error
                prompt_errors[param.prompt]["count"] += 1

                all_results.append(result)

        sem = asyncio.Semaphore(parallel_requests)

        async def bounded_calibrate(param: ACalibrateBatchParameters) -> None:
            async with sem:
                try:
                    results = await self.acalibrate(
                        name=param.name,
                        test_dataset_id=test_dataset_id,
                        test_data=test_data,
                        prompt=param.prompt,
                        model=param.model,
                        pii_filter=param.pii_filter,
                        reference_variables=param.reference_variables,
                        input_variables=param.input_variables,
                        _request_timeout=_request_timeout,
                    )
                    await process_results(results, param)
                except Exception as exc:
                    raise ValueError(f"Calibration failed for {param.prompt} with model {param.model}") from exc

        await asyncio.gather(*(bounded_calibrate(param) for param in evaluator_definitions))

        rms_errors_model = {
            model: math.sqrt(data["sum_squared_errors"] / data["count"])
            for model, data in model_errors.items()
            if data["count"] > 0
        }

        rms_errors_prompt = {
            prompt: math.sqrt(data["sum_squared_errors"] / data["count"])
            for prompt, data in prompt_errors.items()
            if data["count"] > 0
        }

        mae_errors_model = {
            model: data["abs_errors"] / data["count"] for model, data in model_errors.items() if data["count"] > 0
        }

        mae_errors_prompt = {
            prompt: data["abs_errors"] / data["count"] for prompt, data in prompt_errors.items() if data["count"] > 0
        }

        return ACalibrateBatchResult(
            results=all_results,
            rms_errors_model=rms_errors_model,
            rms_errors_prompt=rms_errors_prompt,
            mae_errors_model=mae_errors_model,
            mae_errors_prompt=mae_errors_prompt,
        )
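
    # Async usage sketch (assumed `client`, `params` and `rows` placeholders with the
    # same shapes as accepted by calibrate_batch above; concurrency is bounded by the
    # semaphore created from parallel_requests):
    #
    #     import asyncio
    #
    #     result = asyncio.run(
    #         client.evaluators.acalibrate_batch(evaluator_definitions=params, test_data=rows, parallel_requests=4)
    #     )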

    @with_sync_client
    def get_by_name(
        self,
        name: str,
        *,
        _client: ApiClient,
    ) -> Evaluator:
        """Get an evaluator instance by name.

        Args:
            name: The evaluator to be fetched. Note this only works for uniquely named evaluators.
        """

        api_instance = EvaluatorsApi(_client)

        evaluator_list: List[EvaluatorListOutput] = list(
            iterate_cursor_list(
                partial(api_instance.evaluators_list, name=name),
                limit=1,
            )
        )

        if not evaluator_list:
            raise ValueError(f"No evaluator found with name '{name}'")

        evaluator = evaluator_list[0]
        api_response = api_instance.evaluators_retrieve(id=evaluator.id)

        return Evaluator._wrap(api_response, self.client_context)
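
    # Usage sketch (assumes a client object exposing this group as `client.evaluators`;
    # the evaluator name is a placeholder):
    #
    #     evaluator = client.evaluators.get_by_name("My clarity evaluator")
    #     print(evaluator.id)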

    @with_async_client
    async def aget_by_name(
        self,
        name: str,
        *,
        _client: ApiClient,
    ) -> AEvaluator:
        """Asynchronously get an evaluator instance by name.

        Args:
            name: The evaluator to be fetched. Note this only works for uniquely named evaluators.
        """

        context = self.client_context()

        assert isinstance(context, AbstractAsyncContextManager), "This method is not available in synchronous mode"

        async with context as client:
            api_instance = AEvaluatorsApi(client)

            evaluator_list: List[AEvaluatorListOutput] = []
            async for evaluator in aiterate_cursor_list(  # type: ignore[var-annotated]
                partial(api_instance.evaluators_list, name=name),
                limit=1,
            ):
                evaluator_list.extend(evaluator)

            if not evaluator_list:
                raise ValueError(f"No evaluator found with name '{name}'")

            evaluator = evaluator_list[0]
            api_response = await api_instance.evaluators_retrieve(id=evaluator.id)

            return await AEvaluator._awrap(api_response, self.client_context)

    @with_sync_client
    def create(
        self,
        predicate: str = "",
        *,
        name: Optional[str] = None,
        intent: Optional[str] = None,
        model: Optional[ModelName] = None,
        fallback_models: Optional[List[ModelName]] = None,
        reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]] = None,
        input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]] = None,
        model_params: Optional[Union[ModelParams, ModelParamsRequest]] = None,
        evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
        objective_id: Optional[str] = None,
        overwrite: bool = False,
        _client: ApiClient,
        _request_timeout: Optional[int] = None,
    ) -> Evaluator:
        """Create a new evaluator and return the result

        Args:
            predicate: The question / predicate that is provided to the semantic quantification layer to
                transform it into a final prompt before being passed to the model

            name: Name of the evaluator (defaulting to <unnamed>)

            objective_id: Optional pre-existing objective id to assign to the evaluator.

            intent: The intent of the evaluator (defaulting to name); not available if objective_id is set.

            model: The model to use (defaults to 'root', which means
                Root Signals default at the time of evaluator creation)

            fallback_models: The fallback models to use in case the primary model fails.

            reference_variables: An optional list of reference variables for
                the evaluator.

            input_variables: An optional list of input variables for
                the evaluator.

            model_params: An optional set of additional parameters to the model (e.g., temperature).

            evaluator_demonstrations: An optional list of evaluator demonstrations to guide
                the evaluator's behavior.

            overwrite: Whether to overwrite an evaluator with the same name if it exists.
        """

        name = self._validate_create_params_sanitize_name(name, intent, objective_id)
        api_instance = EvaluatorsApi(_client)
        objective: Optional[ObjectiveRequest] = None
        if objective_id is None:
            if intent is None:
                intent = name
            objective = self._to_objective_request(
                intent=intent,
            )
            objectives_api_instance = ObjectivesApi(_client)
            objective_id = objectives_api_instance.objectives_create(objective_request=objective).id

        evaluator_request = EvaluatorRequest(
            name=name,
            objective_id=objective_id,
            prompt=predicate,
            models=[model for model in [model] + (fallback_models or []) if model is not None],
            reference_variables=_to_reference_variables(reference_variables),
            input_variables=_to_input_variables(input_variables),
            model_params=_to_model_params(model_params),
            evaluator_demonstrations=_to_evaluator_demonstrations(evaluator_demonstrations),
            overwrite=overwrite,
        )

        evaluator = api_instance.evaluators_create(
            evaluator_request=evaluator_request, _request_timeout=_request_timeout
        )

        return Evaluator._wrap(evaluator, self.client_context)
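
    # Usage sketch for create (the predicate, names and model below are illustrative
    # assumptions, not values from this package):
    #
    #     evaluator = client.evaluators.create(
    #         predicate="Is the response polite and professional? {output}",
    #         name="Politeness check",
    #         intent="Ensure responses stay polite",
    #         model="gpt-4o",
    #     )
    #     print(evaluator.id)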

    @with_async_client
    async def acreate(
        self,
        predicate: str = "",
        *,
        name: Optional[str] = None,
        intent: Optional[str] = None,
        model: Optional[ModelName] = None,
        fallback_models: Optional[List[ModelName]] = None,
        reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]] = None,
        input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]] = None,
        model_params: Optional[Union[ModelParams, AModelParamsRequest]] = None,
        evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
        objective_id: Optional[str] = None,
        overwrite: bool = False,
        _client: ApiClient,
        _request_timeout: Optional[int] = None,
    ) -> AEvaluator:
        """
        Asynchronously create a new evaluator and return the result

        Args:
            predicate: The question / predicate that is provided to the semantic quantification layer to
                transform it into a final prompt before being passed to the model

            name: Name of the evaluator (defaulting to <unnamed>)

            objective_id: Optional pre-existing objective id to assign to the evaluator.

            intent: The intent of the evaluator (defaulting to name); not available if objective_id is set.

            model: The model to use (defaults to 'root', which means
                Root Signals default at the time of evaluator creation)

            fallback_models: The fallback models to use in case the primary model fails.

            reference_variables: An optional list of reference variables for
                the evaluator.

            input_variables: An optional list of input variables for
                the evaluator.

            model_params: An optional set of additional parameters to the model (e.g., temperature).

            evaluator_demonstrations: An optional list of evaluator demonstrations to guide
                the evaluator's behavior.

            overwrite: Whether to overwrite an evaluator with the same name if it exists.
        """

        name = self._validate_create_params_sanitize_name(name, intent, objective_id)
        api_instance = AEvaluatorsApi(_client)
        objective: Optional[AObjectiveRequest] = None
        if objective_id is None:
            if intent is None:
                intent = name
            objective = await self._ato_objective_request(intent=intent)
            objectives_api_instance = AObjectivesApi(_client)
            new_objective = await objectives_api_instance.objectives_create(objective_request=objective)
            objective_id = new_objective.id

        evaluator_request = AEvaluatorRequest(
            name=name,
            objective_id=objective_id,
            prompt=predicate,
            models=[model for model in [model] + (fallback_models or []) if model is not None],
            reference_variables=_ato_reference_variables(reference_variables),
            input_variables=_ato_input_variables(input_variables),
            model_params=_ato_model_params(model_params),
            evaluator_demonstrations=_ato_evaluator_demonstrations(evaluator_demonstrations),
            overwrite=overwrite,
        )

        evaluator = await api_instance.evaluators_create(
            evaluator_request=evaluator_request, _request_timeout=_request_timeout
        )

        return await AEvaluator._awrap(evaluator, self.client_context)

    @with_sync_client
    def update(
        self,
        evaluator_id: str,
        *,
        change_note: Optional[str] = None,
        fallback_models: Optional[List[ModelName]] = None,
        input_variables: Optional[Union[List[InputVariable], List[InputVariableRequest]]] = None,
        model: Optional[ModelName] = None,
        name: Optional[str] = None,
        predicate: Optional[str] = None,
        reference_variables: Optional[Union[List[ReferenceVariable], List[ReferenceVariableRequest]]] = None,
        model_params: Optional[Union[ModelParams, ModelParamsRequest]] = None,
        evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
        objective_id: Optional[str] = None,
        _request_timeout: Optional[int] = None,
        _client: ApiClient,
    ) -> Evaluator:
        """
        Update an evaluator and return the result

        See the create method for more information on the arguments.
        """

        api_instance = EvaluatorsApi(_client)
        request = PatchedEvaluatorRequest(
            change_note=change_note or "",
            input_variables=_to_input_variables(input_variables) if input_variables else None,
            models=[model for model in [model] + (fallback_models or []) if model is not None]
            if model or fallback_models
            else None,
            name=name,
            prompt=predicate,
            reference_variables=_to_reference_variables(reference_variables) if reference_variables else None,
            model_params=_to_model_params(model_params) if model_params else None,
            objective_id=objective_id,
            evaluator_demonstrations=_to_evaluator_demonstrations(evaluator_demonstrations)
            if evaluator_demonstrations
            else None,
        )

        api_response = api_instance.evaluators_partial_update(
            id=evaluator_id,
            patched_evaluator_request=request,
            _request_timeout=_request_timeout,
        )
        return Evaluator._wrap(api_response, self.client_context)
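
    # Usage sketch for update (only the supplied fields are patched; the id and
    # predicate below are placeholders):
    #
    #     updated = client.evaluators.update(
    #         evaluator_id="<evaluator id>",
    #         predicate="Is the response polite, professional and concise? {output}",
    #         change_note="tighten wording",
    #     )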

    @with_async_client
    async def aupdate(
        self,
        evaluator_id: str,
        *,
        change_note: Optional[str] = None,
        fallback_models: Optional[List[ModelName]] = None,
        input_variables: Optional[Union[List[InputVariable], List[AInputVariableRequest]]] = None,
        model: Optional[ModelName] = None,
        name: Optional[str] = None,
        predicate: Optional[str] = None,
        reference_variables: Optional[Union[List[ReferenceVariable], List[AReferenceVariableRequest]]] = None,
        model_params: Optional[Union[ModelParams, AModelParamsRequest]] = None,
        evaluator_demonstrations: Optional[List[EvaluatorDemonstration]] = None,
        objective_id: Optional[str] = None,
        _request_timeout: Optional[int] = None,
        _client: AApiClient,
    ) -> AEvaluator:
        """
        Asynchronously update an evaluator and return the result

        See the create method for more information on the arguments.
        """
        api_instance = AEvaluatorsApi(_client)

        request = APatchedEvaluatorRequest(
            change_note=change_note or "",
            input_variables=_ato_input_variables(input_variables) if input_variables else None,
            models=[model for model in [model] + (fallback_models or []) if model is not None]
            if model or fallback_models
            else None,
            name=name,
            reference_variables=_ato_reference_variables(reference_variables) if reference_variables else None,
            model_params=_ato_model_params(model_params) if model_params else None,
            objective_id=objective_id,
            prompt=predicate,
            evaluator_demonstrations=_ato_evaluator_demonstrations(evaluator_demonstrations)
            if evaluator_demonstrations
            else None,
        )
        api_response = await api_instance.evaluators_partial_update(
            id=evaluator_id,
            patched_evaluator_request=request,
            _request_timeout=_request_timeout,
        )
        return await AEvaluator._awrap(api_response, self.client_context)

    @with_sync_client
    def get(
        self,
        evaluator_id: str,
        *,
        _request_timeout: Optional[int] = None,
        _client: ApiClient,
    ) -> Evaluator:
        """
        Get an Evaluator instance by ID.
        """

        api_instance = EvaluatorsApi(_client)
        api_response = api_instance.evaluators_retrieve(id=evaluator_id, _request_timeout=_request_timeout)
        return Evaluator._wrap(api_response, self.client_context)

    @with_async_client
    async def aget(
        self,
        evaluator_id: str,
        *,
        _request_timeout: Optional[int] = None,
        _client: AApiClient,
    ) -> AEvaluator:
        """
        Asynchronously get an Evaluator instance by ID.
        """

        api_instance = AEvaluatorsApi(_client)
        api_response = await api_instance.evaluators_retrieve(id=evaluator_id, _request_timeout=_request_timeout)
        return await AEvaluator._awrap(api_response, self.client_context)

    @with_sync_client
    def list(
        self,
        search_term: Optional[str] = None,
        *,
        limit: int = 100,
        name: Optional[str] = None,
        only_root_evaluators: bool = False,
        _client: ApiClient,
    ) -> Iterator[EvaluatorListOutput]:
        """
        Iterate through the evaluators.

        Args:
            search_term: Can be used to limit returned evaluators.
            limit: Number of entries to iterate through at most.
            name: Specific name the returned evaluators must match.
            only_root_evaluators: Returns only Root Signals defined evaluators.
        """

        api_instance = EvaluatorsApi(_client)
        yield from iterate_cursor_list(
            partial(
                api_instance.evaluators_list,
                name=name,
                search=search_term,
                is_root_evaluator=True if only_root_evaluators else None,
            ),
            limit=limit,
        )
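
    # Usage sketch for list (keyword names taken from the signature above; the search
    # term is a placeholder, and the `id`/`name` fields on the list output are assumed):
    #
    #     for summary in client.evaluators.list(search_term="clarity", limit=20):
    #         print(summary.id, summary.name)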

    async def alist(
        self,
        search_term: Optional[str] = None,
        *,
        limit: int = 100,
        name: Optional[str] = None,
        only_root_evaluators: bool = False,
    ) -> AsyncIterator[AEvaluatorListOutput]:
        """
        Asynchronously iterate through the evaluators.

        Args:
            search_term: Can be used to limit returned evaluators.
            limit: Number of entries to iterate through at most.
            name: Specific name the returned evaluators must match.
            only_root_evaluators: Returns only Root Signals defined evaluators.
        """

        context = self.client_context()
        assert isinstance(context, AbstractAsyncContextManager), "This method is not available in synchronous mode"
        async with context as client:
            api_instance = AEvaluatorsApi(client)
            partial_list = partial(
                api_instance.evaluators_list,
                name=name,
                search=search_term,
                is_root_evaluator=True if only_root_evaluators else None,
            )

            cursor: Optional[StrictStr] = None
            while limit > 0:
                result: APaginatedEvaluatorListOutputList = await partial_list(page_size=limit, cursor=cursor)
                if not result.results:
                    return

                used_results = result.results[:limit]
                limit -= len(used_results)
                for used_result in used_results:
                    yield used_result

                if not (cursor := result.next):
                    return

    @with_sync_client
    def run_by_name(
        self,
        name: str,
        *,
        request: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[List[str]] = None,
        functions: Optional[List[EvaluatorExecutionFunctionsRequest]] = None,
        expected_output: Optional[str] = None,
        evaluator_version_id: Optional[str] = None,
        variables: Optional[dict[str, str]] = None,
        tags: Optional[List[str]] = None,
        _request_timeout: Optional[int] = None,
        _client: ApiClient,
    ) -> EvaluatorExecutionResult:
        """
        Run an evaluator by name.

        Args:
            name: The name of the evaluator to run.
            request: The prompt sent to the LLM.
            response: LLM output.
            contexts: Optional documents passed to RAG evaluators.
            functions: Optional function definitions for LLM tool call validation.
            expected_output: Optional expected output for the evaluator.
            evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
            variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
                predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
            tags: Optional tags to add to the evaluator execution.
            _request_timeout: Optional timeout for the request.
        """

        if not response and not request:
            raise ValueError("Either response or request must be provided")

        api_instance = EvaluatorsApi(_client)

        evaluator_execution_request = EvaluatorExecutionRequest(
            evaluator_version_id=evaluator_version_id,
            request=request,
            response=response,
            contexts=contexts,
            functions=functions,
            expected_output=expected_output,
            variables=variables,
            tags=tags,
        )
        return api_instance.evaluators_execute_by_name_create(
            name=name,
            evaluator_execution_request=evaluator_execution_request,
            _request_timeout=_request_timeout,
        )
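
    # Usage sketch for run_by_name (the evaluator name and texts are placeholders;
    # a `score` field on the execution result is assumed, by analogy with the
    # calibration output model above):
    #
    #     result = client.evaluators.run_by_name(
    #         "Politeness check",
    #         request="Where is my order?",
    #         response="It shipped yesterday and should arrive tomorrow.",
    #         tags=["support-bot"],
    #     )
    #     print(result.score)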

    @with_sync_client
    def delete(self, evaluator_id: str, *, _client: ApiClient) -> None:
        """
        Delete the evaluator.
        """

        api_instance = EvaluatorsApi(_client)
        return api_instance.evaluators_destroy(id=evaluator_id)

    @with_async_client
    async def adelete(self, evaluator_id: str, *, _client: AApiClient) -> None:
        """
        Asynchronously delete the evaluator.
        """

        api_instance = AEvaluatorsApi(_client)
        return await api_instance.evaluators_destroy(id=evaluator_id)

    @with_async_client
    async def arun_by_name(
        self,
        name: str,
        *,
        request: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[List[str]] = None,
        functions: Optional[List[AEvaluatorExecutionFunctionsRequest]] = None,
        expected_output: Optional[str] = None,
        evaluator_version_id: Optional[str] = None,
        variables: Optional[dict[str, str]] = None,
        tags: Optional[List[str]] = None,
        _request_timeout: Optional[int] = None,
        _client: AApiClient,
    ) -> AEvaluatorExecutionResult:
        """
        Asynchronously run an evaluator by name.

        Args:
            name: The name of the evaluator to run.
            request: The prompt sent to the LLM.
            response: LLM output.
            contexts: Optional documents passed to RAG evaluators.
            functions: Optional function definitions for LLM tool call validation.
            expected_output: Optional expected output for the evaluator.
            evaluator_version_id: Version ID of the evaluator to run. If omitted, the latest version is used.
            variables: Optional additional variable mappings for the evaluator. For example, if the evaluator
                predicate is "evaluate the output based on {subject}: {output}", then variables={"subject": "clarity"}.
            tags: Optional tags to add to the evaluator execution.
            _request_timeout: Optional timeout for the request.
        """

        if not response and not request:
            raise ValueError("Either response or request must be provided")

        api_instance = AEvaluatorsApi(_client)
        evaluator_execution_request = AEvaluatorExecutionRequest(
            evaluator_version_id=evaluator_version_id,
            request=request,
            response=response,
            contexts=contexts,
            functions=functions,
            expected_output=expected_output,
            variables=variables,
            tags=tags,
        )
        return await api_instance.evaluators_execute_by_name_create(
            name=name,
            evaluator_execution_request=evaluator_execution_request,
            _request_timeout=_request_timeout,
        )

    EvaluatorName = Literal[
        "Faithfulness",
        "Relevance",
        "Clarity",
        "Non_toxicity",
        "Helpfulness",
        "Politeness",
        "Formality",
        "Harmlessness",
        "Confidentiality",
        "Persuasiveness",
        "JSON_Empty_Values_Ratio",
        "JSON_Property_Name_Accuracy",
        "JSON_Property_Type_Accuracy",
        "JSON_Property_Completeness",
        "JSON_Content_Accuracy",
        "Context_Recall",
        "Answer_Correctness",
        "Answer_Semantic_Similarity",
        "Sentiment_recognition",
        "Safety_for_Children",
        "Precision",
        "Originality",
        "Engagingness",
        "Conciseness",
        "Coherence",
        "Quality_of_Writing_Professional",
        "Quality_of_Writing_Creative",
        "Truthfulness",
        "Context_Precision",
        "Answer_Relevance",
    ]

    class Eval(Enum):
        # TODO: These eval names should be retrieved automatically from the API or a shared config file
        Faithfulness = "901794f9-634c-4852-9e41-7c558f1ff1ab"
        Relevance = "bd789257-f458-4e9e-8ce9-fa6e86dc3fb9"
        Clarity = "9976d9f3-7265-4732-b518-d61c2642b14e"
        Non_toxicity = "e296e374-7539-4eb2-a74a-47847dd26fb8"
        Helpfulness = "88bc92d5-bebf-45e4-9cd1-dfa33309c320"
        Politeness = "2856903a-e48c-4548-b3fe-520fd88c4f25"
        Formality = "8ab6cf1a-42b5-4a23-a15c-21372816483d"
        Harmlessness = "379fee0a-4fd1-4942-833b-7d78d78b334d"
        Confidentiality = "2eaa0a02-47a9-48f7-9b47-66ad257f93eb"
        Persuasiveness = "85bb6a74-f5dd-4130-8dcc-cffdf72327cc"
        JSON_Empty_Values_Ratio = "03829088-1799-438e-ae30-1db60832e52d"
        JSON_Property_Name_Accuracy = "740923aa-8ffd-49cc-a95d-14f831243b25"
        JSON_Property_Type_Accuracy = "eabc6924-1fec-4e96-82ce-c03bf415c885"
        JSON_Property_Completeness = "e5de37f7-d20c-420f-8072-f41dce96ecfc"
        JSON_Content_Accuracy = "b6a9aeff-c888-46d7-9e9c-7cf8cb461762"
        Context_Recall = "8bb60975-5062-4367-9fc6-a920044cba56"
        Answer_Correctness = "d4487568-4243-4da8-9c76-adbaf762dbe0"
        Answer_Semantic_Similarity = "ff350bce-4b07-4af7-9640-803c9d3c2ff9"
        Sentiment_recognition = "e3782c1e-eaf4-4b2d-8d26-53db2160f1fd"
        Safety_for_Children = "39a8b5ba-de77-4726-a6b0-621d40b3cdf5"
        Precision = "767bdd49-5f8c-48ca-8324-dfd6be7f8a79"
        Originality = "e72cb54f-548a-44f9-a6ca-4e14e5ade7f7"
        Engagingness = "64729487-d4a8-42d8-bd9e-72fd8390c134"
        Conciseness = "be828d33-158a-4e92-a2eb-f4d96c13f956"
        Coherence = "e599886c-c338-458f-91b3-5d7eba452618"
        Quality_of_Writing_Professional = "059affa9-2d1c-48de-8e97-f81dd3fc3cbe"
        Quality_of_Writing_Creative = "060abfb6-57c9-43b5-9a6d-8a1a9bb853b8"
        Truthfulness = "053df10f-b0c7-400b-892e-46ce3aa1e430"
        Context_Precision = "9d1e9a25-7e76-4771-b1e3-40825d7918c5"
        Answer_Relevance = "0907d422-e94f-4c9c-a63d-ec0eefd8a903"
        Compliance_Preview = "4613f248-b60e-403a-bcdc-157d1c44194a"
        Faithfulness_Swift = "a3a5e97b-7fcb-441e-92f2-6e59aa473b89"
        Truthfulness_Swift = "c8c65e61-2dc8-4f29-865a-a5e59127d208"
        Completeness = "f0832c32-6beb-4383-a1ea-cdeb883d9044"

    def __getattr__(self, name: Union[EvaluatorName, str]) -> Union["PresetEvaluatorRunner", "APresetEvaluatorRunner"]:
        if name in self.Eval.__members__:
            context = self.client_context()
            if isinstance(context, AbstractContextManager):
                return PresetEvaluatorRunner(self.client_context, self.Eval.__members__[name].value, name)
            else:
                return APresetEvaluatorRunner(self.client_context, self.Eval.__members__[name].value, name)
        raise AttributeError(f"{name} is not a valid attribute")
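
    # Preset access sketch: __getattr__ above maps attribute access for any Eval member
    # to a PresetEvaluatorRunner (or its async counterpart in async mode). Calling the
    # returned runner is assumed to execute that preset evaluator; the client variable
    # and response text are placeholders:
    #
    #     runner = client.evaluators.Clarity          # resolved via __getattr__ / Eval.Clarity
    #     result = runner(response="The meeting was moved to 3 pm on Thursday.")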