agenta 0.8.4__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agenta/client/backend/__init__.py +32 -14
- agenta/client/backend/client.py +1462 -654
- agenta/client/backend/types/__init__.py +32 -14
- agenta/client/backend/types/aggregated_result.py +39 -0
- agenta/client/backend/types/app_variant_output.py +0 -1
- agenta/client/backend/types/app_variant_output_extended.py +50 -0
- agenta/client/backend/types/app_variant_revision.py +40 -0
- agenta/client/backend/types/{custom_evaluation_output.py → config_db.py} +3 -5
- agenta/client/backend/types/{custom_evaluation_names.py → delete_evaluation.py} +2 -3
- agenta/client/backend/types/environment_output.py +2 -0
- agenta/client/backend/types/evaluation.py +4 -4
- agenta/client/backend/types/evaluation_scenario.py +2 -3
- agenta/client/backend/types/evaluation_scenario_input.py +3 -2
- agenta/client/backend/types/evaluation_scenario_output.py +2 -2
- agenta/client/backend/types/evaluation_scenario_result.py +38 -0
- agenta/client/backend/types/evaluation_status_enum.py +4 -4
- agenta/client/backend/types/evaluation_type.py +0 -28
- agenta/client/backend/types/evaluator.py +39 -0
- agenta/client/backend/types/{custom_evaluation_detail.py → evaluator_config.py} +4 -4
- agenta/client/backend/types/human_evaluation.py +49 -0
- agenta/client/backend/types/human_evaluation_scenario.py +48 -0
- agenta/client/backend/types/{create_custom_evaluation.py → human_evaluation_scenario_input.py} +3 -4
- agenta/client/backend/types/human_evaluation_scenario_output.py +37 -0
- agenta/client/backend/types/{evaluation_scenario_score.py → human_evaluation_scenario_score.py} +1 -1
- agenta/client/backend/types/{evaluation_scenario_update_score.py → human_evaluation_scenario_update_score.py} +1 -1
- agenta/client/backend/types/llm_run_rate_limit.py +39 -0
- agenta/client/backend/types/result.py +37 -0
- {agenta-0.8.4.dist-info → agenta-0.10.0.dist-info}/METADATA +1 -1
- {agenta-0.8.4.dist-info → agenta-0.10.0.dist-info}/RECORD +31 -22
- agenta/client/backend/types/evaluation_type_settings.py +0 -42
- {agenta-0.8.4.dist-info → agenta-0.10.0.dist-info}/WHEEL +0 -0
- {agenta-0.8.4.dist-info → agenta-0.10.0.dist-info}/entry_points.txt +0 -0
@@ -3,37 +3,46 @@
 from .add_variant_from_base_and_config_response import (
     AddVariantFromBaseAndConfigResponse,
 )
+from .aggregated_result import AggregatedResult
 from .app import App
 from .app_variant_output import AppVariantOutput
+from .app_variant_output_extended import AppVariantOutputExtended
+from .app_variant_revision import AppVariantRevision
 from .base_output import BaseOutput
 from .body_import_testset import BodyImportTestset
+from .config_db import ConfigDb
 from .container_templates_response import ContainerTemplatesResponse
 from .create_app_output import CreateAppOutput
-from .create_custom_evaluation import CreateCustomEvaluation
-from .custom_evaluation_detail import CustomEvaluationDetail
-from .custom_evaluation_names import CustomEvaluationNames
-from .custom_evaluation_output import CustomEvaluationOutput
+from .delete_evaluation import DeleteEvaluation
 from .docker_env_vars import DockerEnvVars
 from .environment_output import EnvironmentOutput
 from .evaluation import Evaluation
 from .evaluation_scenario import EvaluationScenario
 from .evaluation_scenario_input import EvaluationScenarioInput
 from .evaluation_scenario_output import EvaluationScenarioOutput
-from .evaluation_scenario_score import EvaluationScenarioScore
-from .evaluation_scenario_update_score import EvaluationScenarioUpdateScore
+from .evaluation_scenario_result import EvaluationScenarioResult
 from .evaluation_status_enum import EvaluationStatusEnum
 from .evaluation_type import EvaluationType
-from .evaluation_type_settings import EvaluationTypeSettings
 from .evaluation_webhook import EvaluationWebhook
+from .evaluator import Evaluator
+from .evaluator_config import EvaluatorConfig
 from .feedback import Feedback
 from .get_config_reponse import GetConfigReponse
 from .http_validation_error import HttpValidationError
+from .human_evaluation import HumanEvaluation
+from .human_evaluation_scenario import HumanEvaluationScenario
+from .human_evaluation_scenario_input import HumanEvaluationScenarioInput
+from .human_evaluation_scenario_output import HumanEvaluationScenarioOutput
+from .human_evaluation_scenario_score import HumanEvaluationScenarioScore
+from .human_evaluation_scenario_update_score import HumanEvaluationScenarioUpdateScore
 from .image import Image
 from .invite_request import InviteRequest
 from .list_api_keys_output import ListApiKeysOutput
+from .llm_run_rate_limit import LlmRunRateLimit
 from .new_testset import NewTestset
 from .organization import Organization
 from .organization_output import OrganizationOutput
+from .result import Result
 from .simple_evaluation_output import SimpleEvaluationOutput
 from .span import Span
 from .template import Template
@@ -49,37 +58,46 @@ from .variant_action_enum import VariantActionEnum
 
 __all__ = [
     "AddVariantFromBaseAndConfigResponse",
+    "AggregatedResult",
     "App",
     "AppVariantOutput",
+    "AppVariantOutputExtended",
+    "AppVariantRevision",
     "BaseOutput",
     "BodyImportTestset",
+    "ConfigDb",
     "ContainerTemplatesResponse",
     "CreateAppOutput",
-    "CreateCustomEvaluation",
-    "CustomEvaluationDetail",
-    "CustomEvaluationNames",
-    "CustomEvaluationOutput",
+    "DeleteEvaluation",
     "DockerEnvVars",
     "EnvironmentOutput",
     "Evaluation",
     "EvaluationScenario",
     "EvaluationScenarioInput",
     "EvaluationScenarioOutput",
-    "EvaluationScenarioScore",
-    "EvaluationScenarioUpdateScore",
+    "EvaluationScenarioResult",
    "EvaluationStatusEnum",
     "EvaluationType",
-    "EvaluationTypeSettings",
     "EvaluationWebhook",
+    "Evaluator",
+    "EvaluatorConfig",
     "Feedback",
     "GetConfigReponse",
     "HttpValidationError",
+    "HumanEvaluation",
+    "HumanEvaluationScenario",
+    "HumanEvaluationScenarioInput",
+    "HumanEvaluationScenarioOutput",
+    "HumanEvaluationScenarioScore",
+    "HumanEvaluationScenarioUpdateScore",
     "Image",
     "InviteRequest",
     "ListApiKeysOutput",
+    "LlmRunRateLimit",
     "NewTestset",
     "Organization",
     "OrganizationOutput",
+    "Result",
     "SimpleEvaluationOutput",
     "Span",
     "Template",
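Taken together, the two hunks above replace the custom-evaluation and score exports with the new evaluator, result, and human-evaluation types. A minimal import sketch (assuming agenta 0.10.0 is installed; the selection of names is illustrative):

from agenta.client.backend.types import (
    AggregatedResult,
    EvaluationScenarioResult,
    Evaluator,
    EvaluatorConfig,
    HumanEvaluation,
    Result,
)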
@@ -0,0 +1,39 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+from .evaluator_config import EvaluatorConfig
+from .result import Result
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class AggregatedResult(pydantic.BaseModel):
+    evaluator_config: EvaluatorConfig
+    result: Result
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
@@ -0,0 +1,50 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+from .app_variant_revision import AppVariantRevision
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class AppVariantOutputExtended(pydantic.BaseModel):
+    app_id: str
+    app_name: str
+    variant_id: str
+    variant_name: str
+    parameters: typing.Optional[typing.Dict[str, typing.Any]]
+    previous_variant_name: typing.Optional[str]
+    organization_id: str
+    user_id: str
+    base_name: str
+    base_id: str
+    config_name: str
+    uri: typing.Optional[str]
+    revision: int
+    revisions: typing.List[AppVariantRevision]
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
@@ -0,0 +1,40 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+from .config_db import ConfigDb
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class AppVariantRevision(pydantic.BaseModel):
+    revision: int
+    modified_by: str
+    config: ConfigDb
+    created_at: dt.datetime
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
@@ -11,11 +11,9 @@ except ImportError:
     import pydantic  # type: ignore
 
 
-class CustomEvaluationOutput(pydantic.BaseModel):
-    id: str
-    app_id: str
-    evaluation_name: str
-    created_at: dt.datetime
+class ConfigDb(pydantic.BaseModel):
+    config_name: str
+    parameters: typing.Optional[typing.Dict[str, typing.Any]]
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
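ConfigDb replaces CustomEvaluationOutput and is the configuration payload nested in the new AppVariantRevision model added earlier in this diff. A construction sketch with illustrative values (the email, parameters, and timestamp are made up), also showing the by_alias/exclude_unset defaults baked into the generated json():

import datetime as dt

from agenta.client.backend.types import AppVariantRevision, ConfigDb

# Illustrative values only.
config = ConfigDb(config_name="default", parameters={"temperature": 0.7})
revision = AppVariantRevision(
    revision=3,
    modified_by="user@example.com",  # placeholder
    config=config,
    created_at=dt.datetime(2023, 12, 1, 12, 0, 0),
)

# The generated json()/dict() default to by_alias=True and exclude_unset=True,
# so optional fields that were never set are omitted from the output.
print(revision.json())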
@@ -11,9 +11,8 @@ except ImportError:
     import pydantic  # type: ignore
 
 
-class CustomEvaluationNames(pydantic.BaseModel):
-    id: str
-    evaluation_name: str
+class DeleteEvaluation(pydantic.BaseModel):
+    evaluations_ids: typing.List[str]
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
@@ -16,6 +16,8 @@ class EnvironmentOutput(pydantic.BaseModel):
     app_id: str
     deployed_app_variant_id: typing.Optional[str]
     deployed_variant_name: typing.Optional[str]
+    deployed_app_variant_revision_id: typing.Optional[str]
+    revision: typing.Optional[str]
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
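EnvironmentOutput now also reports which app variant revision is deployed. A small hedged sketch; env is assumed to be an EnvironmentOutput returned by an environments call in the 0.10.0 client:

def describe_deployment(env) -> str:
    # Both new fields are optional and stay None while nothing is deployed.
    if env.deployed_app_variant_revision_id is None:
        return "nothing deployed"
    return (
        f"revision {env.revision} "
        f"(app variant revision {env.deployed_app_variant_revision_id})"
    )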
@@ -4,8 +4,7 @@ import datetime as dt
 import typing
 
 from ..core.datetime_utils import serialize_datetime
-from .evaluation_type import EvaluationType
-from .evaluation_type_settings import EvaluationTypeSettings
+from .aggregated_result import AggregatedResult
 
 try:
     import pydantic.v1 as pydantic  # type: ignore
@@ -18,13 +17,14 @@ class Evaluation(pydantic.BaseModel):
     app_id: str
     user_id: str
     user_username: str
-    evaluation_type: EvaluationType
-    evaluation_type_settings: typing.Optional[EvaluationTypeSettings]
     variant_ids: typing.List[str]
     variant_names: typing.List[str]
+    variant_revision_ids: typing.List[str]
+    revisions: typing.List[str]
     testset_id: str
     testset_name: str
     status: str
+    aggregated_results: typing.List[AggregatedResult]
     created_at: dt.datetime
     updated_at: dt.datetime
 
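Evaluation drops the old evaluation_type/evaluation_type_settings pair and instead reports one aggregate per evaluator configuration, plus the exact variant revisions that were evaluated. A reading sketch; evaluation is assumed to be an Evaluation fetched through the 0.10.0 client:

def summarize(evaluation) -> None:
    for aggregated in evaluation.aggregated_results:
        # evaluator_config is a full EvaluatorConfig model, result is a Result.
        print(aggregated.evaluator_config.evaluator_key, aggregated.result)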
@@ -6,7 +6,7 @@ import typing
 from ..core.datetime_utils import serialize_datetime
 from .evaluation_scenario_input import EvaluationScenarioInput
 from .evaluation_scenario_output import EvaluationScenarioOutput
-from .evaluation_scenario_score import EvaluationScenarioScore
+from .evaluation_scenario_result import EvaluationScenarioResult
 
 try:
     import pydantic.v1 as pydantic  # type: ignore
@@ -19,12 +19,11 @@ class EvaluationScenario(pydantic.BaseModel):
     evaluation_id: str
     inputs: typing.List[EvaluationScenarioInput]
     outputs: typing.List[EvaluationScenarioOutput]
-    vote: typing.Optional[str]
-    score: typing.Optional[EvaluationScenarioScore]
     evaluation: typing.Optional[str]
     correct_answer: typing.Optional[str]
     is_pinned: typing.Optional[bool]
     note: typing.Optional[str]
+    results: typing.List[EvaluationScenarioResult]
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
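Votes and scores move to the new human-evaluation scenario type, so an automatic-evaluation scenario now carries only per-evaluator results. Sketch; scenario is assumed to come from the 0.10.0 client:

def print_results(scenario) -> None:
    for scenario_result in scenario.results:
        # Here evaluator_config is the evaluator config id (a plain string),
        # unlike AggregatedResult, which embeds the full EvaluatorConfig model.
        print(scenario_result.evaluator_config, scenario_result.result)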
@@ -12,8 +12,9 @@ except ImportError:
 
 
 class EvaluationScenarioInput(pydantic.BaseModel):
-    input_name: str
-    input_value: str
+    name: str
+    type: str
+    value: typing.Optional[typing.Any]
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
@@ -12,8 +12,8 @@ except ImportError:
 
 
 class EvaluationScenarioOutput(pydantic.BaseModel):
-    variant_id: str
-    variant_output: str
+    type: str
+    value: typing.Optional[typing.Any]
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
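Scenario inputs and outputs are reshaped into generic name/type/value records. A construction sketch with illustrative values:

from agenta.client.backend.types import (
    EvaluationScenarioInput,
    EvaluationScenarioOutput,
)

# Illustrative payloads; value is free-form and may be omitted.
scenario_input = EvaluationScenarioInput(name="country", type="text", value="France")
scenario_output = EvaluationScenarioOutput(type="text", value="Paris")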
@@ -0,0 +1,38 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+from .result import Result
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class EvaluationScenarioResult(pydantic.BaseModel):
+    evaluator_config: str
+    result: Result
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
@@ -13,21 +13,21 @@ class EvaluationStatusEnum(str, enum.Enum):
 
     EVALUATION_INITIALIZED = "EVALUATION_INITIALIZED"
     EVALUATION_STARTED = "EVALUATION_STARTED"
-    COMPARISON_RUN_STARTED = "COMPARISON_RUN_STARTED"
     EVALUATION_FINISHED = "EVALUATION_FINISHED"
+    EVALUATION_FAILED = "EVALUATION_FAILED"
 
     def visit(
         self,
         evaluation_initialized: typing.Callable[[], T_Result],
         evaluation_started: typing.Callable[[], T_Result],
-        comparison_run_started: typing.Callable[[], T_Result],
         evaluation_finished: typing.Callable[[], T_Result],
+        evaluation_failed: typing.Callable[[], T_Result],
     ) -> T_Result:
         if self is EvaluationStatusEnum.EVALUATION_INITIALIZED:
             return evaluation_initialized()
         if self is EvaluationStatusEnum.EVALUATION_STARTED:
             return evaluation_started()
-        if self is EvaluationStatusEnum.COMPARISON_RUN_STARTED:
-            return comparison_run_started()
         if self is EvaluationStatusEnum.EVALUATION_FINISHED:
             return evaluation_finished()
+        if self is EvaluationStatusEnum.EVALUATION_FAILED:
+            return evaluation_failed()
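EvaluationStatusEnum swaps COMPARISON_RUN_STARTED for EVALUATION_FAILED, so every caller of visit() has to supply the new callback. Sketch:

from agenta.client.backend.types import EvaluationStatusEnum

label = EvaluationStatusEnum.EVALUATION_FAILED.visit(
    evaluation_initialized=lambda: "queued",
    evaluation_started=lambda: "running",
    evaluation_finished=lambda: "finished",
    evaluation_failed=lambda: "failed",  # new required callback
)
assert label == "failed"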
@@ -11,43 +11,15 @@ class EvaluationType(str, enum.Enum):
     An enumeration.
     """
 
-    AUTO_EXACT_MATCH = "auto_exact_match"
-    AUTO_SIMILARITY_MATCH = "auto_similarity_match"
-    AUTO_REGEX_TEST = "auto_regex_test"
-    AUTO_WEBHOOK_TEST = "auto_webhook_test"
-    AUTO_AI_CRITIQUE = "auto_ai_critique"
     HUMAN_A_B_TESTING = "human_a_b_testing"
-    HUMAN_SCORING = "human_scoring"
-    CUSTOM_CODE_RUN = "custom_code_run"
     SINGLE_MODEL_TEST = "single_model_test"
 
     def visit(
         self,
-        auto_exact_match: typing.Callable[[], T_Result],
-        auto_similarity_match: typing.Callable[[], T_Result],
-        auto_regex_test: typing.Callable[[], T_Result],
-        auto_webhook_test: typing.Callable[[], T_Result],
-        auto_ai_critique: typing.Callable[[], T_Result],
         human_a_b_testing: typing.Callable[[], T_Result],
-        human_scoring: typing.Callable[[], T_Result],
-        custom_code_run: typing.Callable[[], T_Result],
         single_model_test: typing.Callable[[], T_Result],
     ) -> T_Result:
-        if self is EvaluationType.AUTO_EXACT_MATCH:
-            return auto_exact_match()
-        if self is EvaluationType.AUTO_SIMILARITY_MATCH:
-            return auto_similarity_match()
-        if self is EvaluationType.AUTO_REGEX_TEST:
-            return auto_regex_test()
-        if self is EvaluationType.AUTO_WEBHOOK_TEST:
-            return auto_webhook_test()
-        if self is EvaluationType.AUTO_AI_CRITIQUE:
-            return auto_ai_critique()
         if self is EvaluationType.HUMAN_A_B_TESTING:
             return human_a_b_testing()
-        if self is EvaluationType.HUMAN_SCORING:
-            return human_scoring()
-        if self is EvaluationType.CUSTOM_CODE_RUN:
-            return custom_code_run()
         if self is EvaluationType.SINGLE_MODEL_TEST:
             return single_model_test()
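EvaluationType shrinks to the two human-evaluation modes; the removed automatic modes appear to be superseded by the evaluator keys carried by the new Evaluator/EvaluatorConfig types rather than enum members. visit() callers shrink accordingly:

from agenta.client.backend.types import EvaluationType

mode_label = EvaluationType.SINGLE_MODEL_TEST.visit(
    human_a_b_testing=lambda: "A/B test",
    single_model_test=lambda: "single model test",
)
assert mode_label == "single model test"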
@@ -0,0 +1,39 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class Evaluator(pydantic.BaseModel):
+    name: str
+    key: str
+    direct_use: bool
+    settings_template: typing.Dict[str, typing.Any]
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
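Evaluator describes an available evaluator and its settings schema. A hedged construction sketch; the field values are illustrative rather than taken from the API:

from agenta.client.backend.types import Evaluator

evaluator = Evaluator(
    name="Exact Match",      # illustrative display name
    key="auto_exact_match",  # illustrative evaluator key
    direct_use=True,
    settings_template={},    # schema describing configurable settings
)
print(evaluator.dict())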
@@ -11,11 +11,11 @@ except ImportError:
     import pydantic  # type: ignore
 
 
-class CustomEvaluationDetail(pydantic.BaseModel):
+class EvaluatorConfig(pydantic.BaseModel):
     id: str
-    app_id: str
-    evaluation_name: str
-    python_code: str
+    name: str
+    evaluator_key: str
+    settings_values: typing.Optional[typing.Dict[str, typing.Any]]
     created_at: dt.datetime
     updated_at: dt.datetime
 
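EvaluatorConfig replaces CustomEvaluationDetail as the stored, named configuration of an evaluator. A construction sketch; the id and timestamps are placeholders that the backend would normally assign:

import datetime as dt

from agenta.client.backend.types import EvaluatorConfig

now = dt.datetime(2023, 12, 1, tzinfo=dt.timezone.utc)
evaluator_config = EvaluatorConfig(
    id="placeholder-config-id",
    name="exact-match-default",        # illustrative
    evaluator_key="auto_exact_match",  # illustrative
    settings_values={},                # optional; defaults to None if omitted
    created_at=now,
    updated_at=now,
)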
@@ -0,0 +1,49 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class HumanEvaluation(pydantic.BaseModel):
+    id: str
+    app_id: str
+    user_id: str
+    user_username: str
+    evaluation_type: str
+    variant_ids: typing.List[str]
+    variant_names: typing.List[str]
+    variants_revision_ids: typing.List[str]
+    revisions: typing.List[str]
+    testset_id: str
+    testset_name: str
+    status: str
+    created_at: dt.datetime
+    updated_at: dt.datetime
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
@@ -0,0 +1,48 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ..core.datetime_utils import serialize_datetime
+from .human_evaluation_scenario_input import HumanEvaluationScenarioInput
+from .human_evaluation_scenario_output import HumanEvaluationScenarioOutput
+from .human_evaluation_scenario_score import HumanEvaluationScenarioScore
+
+try:
+    import pydantic.v1 as pydantic  # type: ignore
+except ImportError:
+    import pydantic  # type: ignore
+
+
+class HumanEvaluationScenario(pydantic.BaseModel):
+    id: typing.Optional[str]
+    evaluation_id: str
+    inputs: typing.List[HumanEvaluationScenarioInput]
+    outputs: typing.List[HumanEvaluationScenarioOutput]
+    vote: typing.Optional[str]
+    score: typing.Optional[HumanEvaluationScenarioScore]
+    evaluation: typing.Optional[str]
+    correct_answer: typing.Optional[str]
+    is_pinned: typing.Optional[bool]
+    note: typing.Optional[str]
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().dict(**kwargs_with_defaults)
+
+    class Config:
+        frozen = True
+        smart_union = True
+        json_encoders = {dt.datetime: serialize_datetime}
agenta/client/backend/types/{create_custom_evaluation.py → human_evaluation_scenario_input.py}
RENAMED
@@ -11,10 +11,9 @@ except ImportError:
     import pydantic  # type: ignore
 
 
-class CreateCustomEvaluation(pydantic.BaseModel):
-    evaluation_name: str
-    python_code: str
-    app_id: str
+class HumanEvaluationScenarioInput(pydantic.BaseModel):
+    input_name: str
+    input_value: str
 
     def json(self, **kwargs: typing.Any) -> str:
         kwargs_with_defaults: typing.Any = {
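The renamed module keeps the old input_name/input_value shape, now scoped to human evaluation scenarios only. A closing construction sketch with illustrative values; outputs may be an empty list and vote/score default to None while the scenario awaits annotation:

from agenta.client.backend.types import (
    HumanEvaluationScenario,
    HumanEvaluationScenarioInput,
)

scenario = HumanEvaluationScenario(
    evaluation_id="placeholder-evaluation-id",
    inputs=[
        HumanEvaluationScenarioInput(input_name="country", input_value="France"),
    ],
    outputs=[],
)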
|