uipath 2.1.16__py3-none-any.whl → 2.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/_evaluators/__init__.py +6 -4
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +46 -0
- uipath/_cli/_evals/_evaluators/_evaluator_factory.py +42 -22
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +40 -0
- uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py +168 -0
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +2 -0
- uipath/_cli/_evals/_models/_evaluators.py +12 -11
- uipath/_cli/_evals/evaluation_service.py +1 -1
- uipath/_cli/_evals/progress_reporter.py +44 -88
- uipath/models/exceptions.py +6 -0
- {uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/METADATA +1 -1
- {uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/RECORD +15 -14
- uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py +0 -48
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py +0 -41
- {uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/WHEEL +0 -0
- {uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/licenses/LICENSE +0 -0

uipath/_cli/_evals/_evaluators/__init__.py
CHANGED
@@ -3,18 +3,20 @@
 This package contains all evaluator types and the factory for creating them.
 """
 
-from ._agent_scorer_evaluator import AgentScorerEvaluator
-from ._deterministic_evaluator import DeterministicEvaluator
+from ._deterministic_evaluator_base import DeterministicEvaluatorBase
 from ._evaluator_base import EvaluatorBase
 from ._evaluator_factory import EvaluatorFactory
+from ._exact_match_evaluator import ExactMatchEvaluator
+from ._json_similarity_evaluator import JsonSimilarityEvaluator
 from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
 from ._trajectory_evaluator import TrajectoryEvaluator
 
 __all__ = [
     "EvaluatorBase",
+    "DeterministicEvaluatorBase",
     "EvaluatorFactory",
-    "DeterministicEvaluator",
+    "JsonSimilarityEvaluator",
+    "ExactMatchEvaluator",
     "LlmAsAJudgeEvaluator",
-    "AgentScorerEvaluator",
     "TrajectoryEvaluator",
 ]

uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py
@@ -0,0 +1,46 @@
+import copy
+import json
+from abc import ABC
+from typing import Any, Dict, Tuple
+
+from ._evaluator_base import EvaluatorBase
+
+
+class DeterministicEvaluatorBase(EvaluatorBase, ABC):
+    def __init__(self, target_output_key: str = "*"):
+        super().__init__()
+        self.target_output_key = target_output_key
+
+    def _select_targets(
+        self, expected_output: Dict[str, Any], actual_output: Dict[str, Any]
+    ) -> Tuple[Any, Any]:
+        actual_output_copy = copy.deepcopy(actual_output)
+        expected_output_copy = copy.deepcopy(expected_output)
+        if self.target_output_key != "*":
+            if (
+                self.target_output_key not in actual_output
+                or self.target_output_key not in expected_output
+            ):
+                raise ValueError(
+                    f"Field '{self.target_output_key}' missing from expected or actual output"
+                )
+            actual_output_copy = actual_output_copy[self.target_output_key]
+            expected_output_copy = expected_output[self.target_output_key]
+        return actual_output_copy, expected_output_copy
+
+    def _canonical_json(self, obj: Any) -> str:
+        return json.dumps(
+            self._normalize_numbers(obj),
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+        )
+
+    def _normalize_numbers(self, obj: Any) -> Any:
+        if isinstance(obj, dict):
+            return {k: self._normalize_numbers(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._normalize_numbers(v) for v in obj]
+        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
+            return float(obj)
+        return obj
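
Note on the helpers above: _canonical_json plus _normalize_numbers is what later lets ExactMatchEvaluator treat 1 and 1.0, or differently ordered keys, as the same output. A minimal standalone sketch of the same idea (illustrative only, not the packaged code; the function names here are ours):

    import json

    def normalize_numbers(obj):
        # Coerce ints to floats (bools excluded) so 1 and 1.0 canonicalize identically.
        if isinstance(obj, dict):
            return {k: normalize_numbers(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [normalize_numbers(v) for v in obj]
        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
            return float(obj)
        return obj

    def canonical_json(obj):
        # Sorted keys and fixed separators give an order-insensitive, compact string form.
        return json.dumps(normalize_numbers(obj), sort_keys=True, separators=(",", ":"), ensure_ascii=False)

    # Key order and int/float differences disappear after canonicalization.
    assert canonical_json({"a": 1, "b": [2, 3]}) == canonical_json({"b": [2.0, 3.0], "a": 1.0})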

uipath/_cli/_evals/_evaluators/_evaluator_factory.py
CHANGED
@@ -1,9 +1,9 @@
 from typing import Any, Dict
 
 from .._models import EvaluatorCategory, EvaluatorType
-from ._agent_scorer_evaluator import AgentScorerEvaluator
-from ._deterministic_evaluator import DeterministicEvaluator
 from ._evaluator_base import EvaluatorBase, EvaluatorBaseParams
+from ._exact_match_evaluator import ExactMatchEvaluator
+from ._json_similarity_evaluator import JsonSimilarityEvaluator
 from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
 from ._trajectory_evaluator import TrajectoryEvaluator
 
@@ -50,23 +50,50 @@ class EvaluatorFactory:
         )
 
         # Create evaluator based on category
-        …
+        match category:
+            case EvaluatorCategory.Deterministic:
+                if evaluator_type == evaluator_type.Equals:
+                    return EvaluatorFactory._create_exact_match_evaluator(
+                        base_params, data
+                    )
+                elif evaluator_type == evaluator_type.JsonSimilarity:
+                    return EvaluatorFactory._create_json_similarity_evaluator(
+                        base_params, data
+                    )
+                else:
+                    raise ValueError(
+                        f"Unknown evaluator type {evaluator_type} for category {category}"
+                    )
+            case EvaluatorCategory.LlmAsAJudge:
+                return EvaluatorFactory._create_llm_as_judge_evaluator(
+                    base_params, data
+                )
+            case EvaluatorCategory.AgentScorer:
+                raise NotImplementedError()
+            case EvaluatorCategory.Trajectory:
+                return EvaluatorFactory._create_trajectory_evaluator(base_params, data)
+            case _:
+                raise ValueError(f"Unknown evaluator category: {category}")
 
     @staticmethod
-    def …
+    def _create_exact_match_evaluator(
         base_params: EvaluatorBaseParams, data: Dict[str, Any]
-    ) -> …
+    ) -> ExactMatchEvaluator:
         """Create a deterministic evaluator."""
-        …
+        return ExactMatchEvaluator.from_params(
+            base_params,
+            target_output_key=data.get("targetOutputKey", "*"),
+        )
+
+    @staticmethod
+    def _create_json_similarity_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> JsonSimilarityEvaluator:
+        """Create a deterministic evaluator."""
+        return JsonSimilarityEvaluator.from_params(
+            base_params,
+            target_output_key=data.get("targetOutputKey", "*"),
+        )
 
     @staticmethod
     def _create_llm_as_judge_evaluator(
@@ -88,13 +115,6 @@ class EvaluatorFactory:
             target_output_key=data.get("targetOutputKey", "*"),
         )
 
-    @staticmethod
-    def _create_agent_scorer_evaluator(
-        base_params: EvaluatorBaseParams, data: Dict[str, Any]
-    ) -> AgentScorerEvaluator:
-        """Create an agent scorer evaluator."""
-        raise NotImplementedError()
-
     @staticmethod
     def _create_trajectory_evaluator(
         base_params: EvaluatorBaseParams, data: Dict[str, Any]

uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py
@@ -0,0 +1,40 @@
+import copy
+from typing import Any, Dict
+
+from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
+    DeterministicEvaluatorBase,
+)
+from uipath._cli._evals._models import EvaluationResult
+from uipath._cli._evals._models._evaluators import ScoreType
+
+
+class ExactMatchEvaluator(DeterministicEvaluatorBase):
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        actual_output_copy = copy.deepcopy(actual_output)
+        expected_output_copy = copy.deepcopy(expected_output)
+
+        actual_output, expected_output = self._select_targets(
+            expected_output, actual_output
+        )
+        are_equal = self._canonical_json(actual_output) == self._canonical_json(
+            expected_output
+        )
+
+        return EvaluationResult(
+            evaluation_id=evaluation_id,
+            evaluation_name=evaluation_name,
+            evaluator_id=self.id,
+            evaluator_name=self.name,
+            score=are_equal,
+            input=input_data,
+            expected_output=expected_output_copy,
+            actual_output=actual_output_copy,
+            score_type=ScoreType.BOOLEAN,
+        )

uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py
@@ -0,0 +1,168 @@
+import copy
+import math
+from typing import Any, Dict, Tuple
+
+from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
+    DeterministicEvaluatorBase,
+)
+from uipath._cli._evals._models import EvaluationResult
+from uipath._cli._evals._models._evaluators import ScoreType
+
+
+class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
+    """Deterministic evaluator that scores structural JSON similarity.
+
+    Compares expected versus actual JSON-like structures and returns a
+    numerical score in the range [0, 100]. The comparison is token-based
+    and tolerant for numbers and strings (via Levenshtein distance).
+    """
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate similarity between expected and actual JSON outputs.
+
+        Args:
+            evaluation_id: Unique identifier for this evaluation run.
+            evaluation_name: Human friendly evaluation name.
+            input_data: Input payload used to produce the outputs.
+            expected_output: Ground-truth JSON structure.
+            actual_output: Produced JSON structure to compare against the ground truth.
+
+        Returns:
+            EvaluationResult: Structured result with the numerical similarity score.
+        """
+        actual_output_copy = copy.deepcopy(actual_output)
+        expected_output_copy = copy.deepcopy(expected_output)
+
+        actual_output, expected_output = self._select_targets(
+            expected_output, actual_output
+        )
+        similarity = self._compare_json(expected_output, actual_output)
+
+        return EvaluationResult(
+            evaluation_id=evaluation_id,
+            evaluation_name=evaluation_name,
+            evaluator_id=self.id,
+            evaluator_name=self.name,
+            score=similarity,
+            input=input_data,
+            expected_output=expected_output_copy,
+            actual_output=actual_output_copy,
+            score_type=ScoreType.NUMERICAL,
+        )
+
+    def _compare_json(self, expected: Any, actual: Any) -> float:
+        matched_leaves, total_leaves = self._compare_tokens(expected, actual)
+        if total_leaves == 0:
+            return 100.0
+        sim = (matched_leaves / total_leaves) * 100.0
+        return max(0.0, min(100.0, sim))
+
+    def _compare_tokens(
+        self, expected_token: Any, actual_token: Any
+    ) -> Tuple[float, float]:
+        if self._is_number(expected_token) and self._is_number(actual_token):
+            return self._compare_numbers(float(expected_token), float(actual_token))
+
+        if type(expected_token) is not type(actual_token):
+            return 0.0, self._count_leaves(expected_token)
+
+        if isinstance(expected_token, dict):
+            matched_leaves = total_leaves = 0.0
+            # Only expected keys count
+            for expected_key, expected_value in expected_token.items():
+                if isinstance(actual_token, dict) and expected_key in actual_token:
+                    matched, total = self._compare_tokens(
+                        expected_value, actual_token[expected_key]
+                    )
+                else:
+                    matched, total = (0.0, self._count_leaves(expected_value))
+                matched_leaves += matched
+                total_leaves += total
+            return matched_leaves, total_leaves
+
+        if isinstance(expected_token, list):
+            matched_leaves = total_leaves = 0.0
+            common_length = min(len(expected_token), len(actual_token))
+            for index in range(common_length):
+                matched, total = self._compare_tokens(
+                    expected_token[index], actual_token[index]
+                )
+                matched_leaves += matched
+                total_leaves += total
+            for index in range(common_length, len(expected_token)):
+                total_leaves += self._count_leaves(expected_token[index])
+            return (matched_leaves, total_leaves)
+
+        if isinstance(expected_token, bool):
+            return (1.0, 1.0) if expected_token == actual_token else (0.0, 1.0)
+
+        if isinstance(expected_token, str):
+            return self._compare_strings(expected_token, actual_token)
+
+        return (1.0, 1.0) if str(expected_token) == str(actual_token) else (0.0, 1.0)
+
+    def _compare_numbers(
+        self, expected_number: float, actual_number: float
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if math.isclose(expected_number, 0.0, abs_tol=1e-12):
+            matched = 1.0 if math.isclose(actual_number, 0.0, abs_tol=1e-12) else 0.0
+        else:
+            ratio = abs(expected_number - actual_number) / abs(expected_number)
+            matched = max(0.0, min(1.0, 1.0 - ratio))
+        return matched, total
+
+    def _compare_strings(
+        self, expected_string: str, actual_string: str
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if not expected_string and not actual_string:
+            return 1.0, total
+        distance = self._levenshtein(expected_string, actual_string)
+        max_length = max(len(expected_string), len(actual_string))
+        similarity = 1.0 - (distance / max_length) if max_length else 1.0
+        similarity = max(0.0, min(1.0, similarity))
+        return similarity, total
+
+    def _count_leaves(self, token_node: Any) -> float:
+        if isinstance(token_node, dict):
+            return sum(
+                self._count_leaves(child_value) for child_value in token_node.values()
+            )
+        if isinstance(token_node, list):
+            return sum(self._count_leaves(child_value) for child_value in token_node)
+        return 1.0
+
+    def _levenshtein(self, source_text: str, target_text: str) -> int:
+        if not source_text:
+            return len(target_text)
+        if not target_text:
+            return len(source_text)
+        source_len, target_len = len(source_text), len(target_text)
+        distance_matrix = [[0] * (target_len + 1) for _ in range(source_len + 1)]
+        for row_idx in range(source_len + 1):
+            distance_matrix[row_idx][0] = row_idx
+        for col_idx in range(target_len + 1):
+            distance_matrix[0][col_idx] = col_idx
+        for row_idx in range(1, source_len + 1):
+            for col_idx in range(1, target_len + 1):
+                substitution_cost = (
+                    0 if source_text[row_idx - 1] == target_text[col_idx - 1] else 1
+                )
+                distance_matrix[row_idx][col_idx] = min(
+                    distance_matrix[row_idx - 1][col_idx] + 1,  # deletion
+                    distance_matrix[row_idx][col_idx - 1] + 1,  # insertion
+                    distance_matrix[row_idx - 1][col_idx - 1]
+                    + substitution_cost,  # substitution
+                )
+        return distance_matrix[source_len][target_len]
+
+    def _is_number(self, value: Any) -> bool:
+        return isinstance(value, (int, float)) and not isinstance(value, bool)
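
For orientation, a hand-worked example of this scoring (illustrative only, not taken from the package or its tests): comparing expected {"name": "Alice", "age": 30} against actual {"name": "Alicia", "age": 33, "extra": true} walks only the expected keys. The string leaf scores 1 - 2/6 ≈ 0.67 (Levenshtein distance 2 over a maximum length of 6), the number leaf scores 1 - |30 - 33|/30 = 0.9, and the extra key in the actual output is ignored, so the overall score is (0.67 + 0.9) / 2 × 100 ≈ 78.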

uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py
CHANGED
@@ -11,6 +11,7 @@ from ...._utils.constants import (
     COMMUNITY_agents_SUFFIX,
 )
 from .._models import EvaluationResult, LLMResponse
+from .._models._evaluators import ScoreType
 from ._evaluator_base import EvaluatorBase
 
 
@@ -86,6 +87,7 @@ class LlmAsAJudgeEvaluator(EvaluatorBase):
             expected_output=expected_output,
             actual_output=actual_output,
             details=llm_response.justification,
+            score_type=ScoreType.NUMERICAL,
         )
 
     def _extract_target_value(self, output: Dict[str, Any]) -> Any:

uipath/_cli/_evals/_models/_evaluators.py
CHANGED
@@ -1,8 +1,8 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from enum import IntEnum
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel
 
 
 class LLMResponse(BaseModel):
@@ -50,6 +50,12 @@ class EvaluatorType(IntEnum):
         raise ValueError(f"{value} is not a valid EvaluatorType value")
 
 
+class ScoreType(IntEnum):
+    BOOLEAN = 0
+    NUMERICAL = 1
+    ERROR = 2
+
+
 class EvaluationResult(BaseModel):
     """Result of a single evaluation."""
 
@@ -57,13 +63,14 @@ class EvaluationResult(BaseModel):
     evaluation_name: str
     evaluator_id: str
     evaluator_name: str
-    score: float
-    …
+    score: float | bool
+    score_type: ScoreType
+    # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator
     evaluation_time: Optional[float] = None
     input: Dict[str, Any]
     expected_output: Dict[str, Any]
     actual_output: Dict[str, Any]
-    timestamp: datetime = …
+    timestamp: datetime = datetime.now(timezone.utc)
     details: Optional[str] = None
 
 
@@ -76,12 +83,6 @@ class EvaluationSetResult(BaseModel):
     average_score: float
 
 
-class ScoreType(IntEnum):
-    BOOLEAN = 0
-    NUMERICAL = 1
-    ERROR = 2
-
-
 class EvalItemResult(BaseModel):
     """Result of a single evaluation item."""
 
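
A minimal sketch of constructing the updated result model, using only the fields visible in this hunk (the identifiers and values below are made up):

    from uipath._cli._evals._models._evaluators import EvaluationResult, ScoreType

    result = EvaluationResult(
        evaluation_id="eval-1",
        evaluation_name="refund amount matches",
        evaluator_id="exact-match",
        evaluator_name="Exact Match",
        score=True,                    # a bool is accepted now that score is float | bool
        score_type=ScoreType.BOOLEAN,
        input={"ticket": 42},
        expected_output={"refund": 10.0},
        actual_output={"refund": 10},
    )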

uipath/_cli/_evals/evaluation_service.py
CHANGED
@@ -337,7 +337,7 @@ class EvaluationService:
             try:
                 if self._progress_reporter:
                     await self._progress_reporter.update_eval_run(
-                        eval_results, eval_run_id, …
+                        eval_results, eval_run_id, execution_time
                     )
                 sw_progress_reporter_queue.task_done()
             except Exception as e:

uipath/_cli/_evals/progress_reporter.py
CHANGED
@@ -35,7 +35,9 @@ class ProgressReporter:
         self._eval_set_id = eval_set_id
         self.agent_snapshot = agent_snapshot
         self._no_of_evals = no_of_evals
-        self._evaluators = …
+        self._evaluators: dict[str, EvaluatorBase] = {
+            evaluator.id: evaluator for evaluator in evaluators
+        }
         self._evaluator_scores: dict[str, list[float]] = {
             evaluator.id: [] for evaluator in evaluators
         }
@@ -56,6 +58,18 @@ class ProgressReporter:
                 "Cannot report data to StudioWeb. Please set UIPATH_PROJECT_ID."
             )
 
+    async def create_eval_set_run(self):
+        """Create a new evaluation set run in StudioWeb."""
+        spec = self._create_eval_set_run_spec()
+        response = await self._client.request_async(
+            method=spec.method,
+            url=spec.endpoint,
+            params=spec.params,
+            content=spec.content,
+            headers=spec.headers,
+        )
+        self._eval_set_run_id = json.loads(response.content)["id"]
+
     async def create_eval_run(self, eval_item: dict[str, Any]):
         """Create a new evaluation run in StudioWeb.
 
@@ -72,7 +86,6 @@ class ProgressReporter:
             params=spec.params,
             content=spec.content,
             headers=spec.headers,
-            scoped="org",
         )
         return json.loads(response.content)["id"]
 
@@ -80,7 +93,6 @@ class ProgressReporter:
         self,
         eval_results: list[EvalItemResult],
         eval_run_id: str,
-        success: bool,
         execution_time: float,
     ):
         """Update an evaluation run with results.
@@ -88,7 +100,6 @@ class ProgressReporter:
         Args:
             eval_results: Dictionary mapping evaluator IDs to evaluation results
             eval_run_id: ID of the evaluation run to update
-            success: Whether the evaluation was successful
             execution_time: The agent execution time
         """
         assertion_runs, evaluator_scores, actual_output = self._collect_results(
@@ -107,21 +118,7 @@ class ProgressReporter:
             params=spec.params,
             content=spec.content,
             headers=spec.headers,
-            scoped="org",
-        )
-
-    async def create_eval_set_run(self):
-        """Create a new evaluation set run in StudioWeb."""
-        spec = self._create_eval_set_run_spec()
-        response = await self._client.request_async(
-            method=spec.method,
-            url=spec.endpoint,
-            params=spec.params,
-            content=spec.content,
-            headers=spec.headers,
-            scoped="org",
         )
-        self._eval_set_run_id = json.loads(response.content)["id"]
 
     async def update_eval_set_run(self):
         """Update the evaluation set run status to complete."""
@@ -132,7 +129,6 @@ class ProgressReporter:
             params=spec.params,
             content=spec.content,
             headers=spec.headers,
-            scoped="org",
         )
 
     def _collect_results(
@@ -143,12 +139,23 @@ class ProgressReporter:
         actual_output: dict[str, Any] = {}
         for eval_result in eval_results:
             # keep track of evaluator scores. this should be removed after this computation is done server-side
-            …
+
+            # check the evaluator score type
+            match eval_result.result.score_type:
+                case ScoreType.NUMERICAL:
+                    self._evaluator_scores[eval_result.evaluator_id].append(
+                        eval_result.result.score
+                    )
+                case ScoreType.BOOLEAN:
+                    self._evaluator_scores[eval_result.evaluator_id].append(
+                        100 if eval_result.result.score else 0
+                    )
+                case ScoreType.ERROR:
+                    self._evaluator_scores[eval_result.evaluator_id].append(0)
+
             evaluator_scores.append(
                 {
-                    "type": …
+                    "type": eval_result.result.score_type.value,
                     "value": eval_result.result.score,
                     "justification": eval_result.result.details,
                     "evaluatorId": eval_result.evaluator_id,
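
Taken on its own, the new match block normalizes every evaluator's score onto a single 0 to 100 scale before averaging: numerical scores pass through, boolean results become 100 or 0, and errors count as 0. An equivalent standalone restatement (the helper name is ours, not part of the SDK):

    from uipath._cli._evals._models._evaluators import ScoreType

    def to_percentage(score, score_type):
        # Mirrors the branch above: NUMERICAL passes through, BOOLEAN maps to 100/0, ERROR to 0.
        if score_type == ScoreType.NUMERICAL:
            return score
        if score_type == ScoreType.BOOLEAN:
            return 100 if score else 0
        return 0  # ScoreType.ERROR

    assert to_percentage(78.3, ScoreType.NUMERICAL) == 78.3
    assert to_percentage(True, ScoreType.BOOLEAN) == 100
    assert to_percentage(False, ScoreType.ERROR) == 0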
@@ -158,14 +165,6 @@ class ProgressReporter:
                 {
                     "status": EvaluationStatus.COMPLETED.value,
                     "evaluatorId": eval_result.evaluator_id,
-                    "result": {
-                        "output": {"content": {**eval_result.result.actual_output}},
-                        "score": {
-                            "type": ScoreType.NUMERICAL.value,
-                            "value": eval_result.result.score,
-                            "justification": eval_result.result.details,
-                        },
-                    },
                     "completionMetrics": {
                         "duration": eval_result.result.evaluation_time,
                         "cost": None,
@@ -173,6 +172,14 @@ class ProgressReporter:
                         "completionTokens": 0,
                         "promptTokens": 0,
                     },
+                    "assertionSnapshot": {
+                        "assertionType": self._evaluators[
+                            eval_result.evaluator_id
+                        ].type.name,
+                        "outputKey": self._evaluators[
+                            eval_result.evaluator_id
+                        ].target_output_key,
+                    },
                 }
             )
 
@@ -192,7 +199,7 @@ class ProgressReporter:
         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"…"
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
             ),
             content=json.dumps(
                 {
@@ -213,7 +220,7 @@ class ProgressReporter:
         return RequestSpec(
             method="POST",
             endpoint=Endpoint(
-                f"…"
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalRun"
             ),
             content=json.dumps(
                 {
@@ -221,41 +228,10 @@ class ProgressReporter:
                     "evalSnapshot": {
                         "id": eval_item["id"],
                         "name": eval_item["name"],
-                        "assertionType": "unknown",
-                        "assertionProperties": {},
                         "inputs": eval_item.get("inputs"),
-                        …
+                        "expectedOutput": eval_item.get("expectedOutput", {}),
                     },
                     "status": EvaluationStatus.IN_PROGRESS.value,
-                    "assertionRuns": [
-                        # TODO: replace default values
-                        {
-                            "assertionSnapshot": {
-                                "assertionProperties": {
-                                    "expectedOutput": eval_item.get(
-                                        "expectedOutput", {}
-                                    ),
-                                    "prompt": "No prompt for coded agents",
-                                    "simulationInstructions": "",
-                                    "expectedAgentBehavior": "",
-                                    "inputGenerationInstructions": "",
-                                    "simulateTools": False,
-                                    "simulateInput": False,
-                                    "toolsToSimulate": [],
-                                    **(
-                                        {"model": evaluator.model}
-                                        if hasattr(evaluator, "model")
-                                        else {}
-                                    ),
-                                },
-                                "assertionType": "Custom",
-                                "outputKey": "*",
-                            },
-                            "status": 1,
-                            "evaluatorId": evaluator.id,
-                        }
-                        for evaluator in self._evaluators
-                    ],
                 }
             ),
             headers=self._tenant_header(),
@@ -264,13 +240,12 @@ class ProgressReporter:
     def _create_eval_set_run_spec(
         self,
     ) -> RequestSpec:
-        self._add_defaults_to_agent_snapshot()
         agent_snapshot_dict = json.loads(self.agent_snapshot)
 
         return RequestSpec(
             method="POST",
             endpoint=Endpoint(
-                f"…"
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
            ),
             content=json.dumps(
                 {
@@ -288,7 +263,7 @@ class ProgressReporter:
         evaluator_scores = []
         evaluator_averages = []
 
-        for evaluator in self._evaluators:
+        for evaluator in self._evaluators.values():
             scores = self._evaluator_scores[evaluator.id]
             if scores:
                 avg_score = sum(scores) / len(scores)
@@ -316,14 +291,11 @@ class ProgressReporter:
         return RequestSpec(
             method="PUT",
             endpoint=Endpoint(
-                f"…"
+                f"agentsruntime_/api/execution/agents/{self._project_id}/evalSetRun"
             ),
             content=json.dumps(
                 {
-                    ## TODO: send the actual data here (do we need to send those again? isn't it redundant?)
                     "evalSetRunId": self._eval_set_run_id,
-                    ## this should be removed. not used but enforced by the API
-                    "score": overall_score,
                     "status": EvaluationStatus.COMPLETED.value,
                     "evaluatorScores": evaluator_scores,
                 }
@@ -331,22 +303,6 @@ class ProgressReporter:
             headers=self._tenant_header(),
         )
 
-    def _add_defaults_to_agent_snapshot(self):
-        ## TODO: remove this after properties are marked as optional at api level
-        agent_snapshot_dict = json.loads(self.agent_snapshot)
-        agent_snapshot_dict["tools"] = []
-        agent_snapshot_dict["contexts"] = []
-        agent_snapshot_dict["escalations"] = []
-        agent_snapshot_dict["systemPrompt"] = ""
-        agent_snapshot_dict["userPrompt"] = ""
-        agent_snapshot_dict["settings"] = {
-            "model": "",
-            "maxTokens": 0,
-            "temperature": 0,
-            "engine": "",
-        }
-        self.agent_snapshot = json.dumps(agent_snapshot_dict)
-
     def _tenant_header(self) -> dict[str, str]:
         tenant_id = os.getenv(ENV_TENANT_ID, None)
         if not tenant_id:
uipath/models/exceptions.py
CHANGED
@@ -20,6 +20,11 @@ class EnrichedException(Exception):
         # Extract the relevant details from the HTTPStatusError
         self.status_code = error.response.status_code if error.response else "Unknown"
         self.url = str(error.request.url) if error.request else "Unknown"
+        self.http_method = (
+            error.request.method
+            if error.request and error.request.method
+            else "Unknown"
+        )
         self.response_content = (
             error.response.content.decode("utf-8")
             if error.response and error.response.content
@@ -28,6 +33,7 @@ class EnrichedException(Exception):
 
         enriched_message = (
             f"\nRequest URL: {self.url}"
+            f"\nHTTP Method: {self.http_method}"
             f"\nStatus Code: {self.status_code}"
             f"\nResponse Content: {self.response_content}"
         )
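
A short sketch of the enriched error in use, assuming EnrichedException is constructed from an httpx.HTTPStatusError as the surrounding __init__ suggests (the full constructor signature is not shown in this hunk, and the URL is a placeholder):

    import httpx
    from uipath.models.exceptions import EnrichedException

    request = httpx.Request("POST", "https://example.com/api/evalRun")
    response = httpx.Response(500, request=request, content=b'{"message": "boom"}')
    error = httpx.HTTPStatusError("server error", request=request, response=response)

    enriched = EnrichedException(error)
    print(enriched.http_method)  # "POST", also surfaced in the message as "HTTP Method: POST"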

{uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: uipath
-Version: 2.1.16
+Version: 2.1.18
 Summary: Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools.
 Project-URL: Homepage, https://uipath.com
 Project-URL: Repository, https://github.com/UiPath/uipath-python

{uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/RECORD
CHANGED
@@ -29,18 +29,19 @@ uipath/_cli/_auth/auth_config.json,sha256=UnAhdum8phjuZaZKE5KLp0IcPCbIltDEU1M_G8
 uipath/_cli/_auth/index.html,sha256=_Q2OtqPfapG_6vumbQYqtb2PfFe0smk7TlGERKEBvB4,22518
 uipath/_cli/_auth/localhost.crt,sha256=oGl9oLLOiouHubAt39B4zEfylFvKEtbtr_43SIliXJc,1226
 uipath/_cli/_auth/localhost.key,sha256=X31VYXD8scZtmGA837dGX5l6G-LXHLo5ItWJhZXaz3c,1679
-uipath/_cli/_evals/evaluation_service.py,sha256=…
-uipath/_cli/_evals/progress_reporter.py,sha256=…
-uipath/_cli/_evals/_evaluators/__init__.py,sha256=…
-uipath/_cli/_evals/_evaluators/…
-uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py,sha256=P0du9KWz5MP5Pw70Ze7piqeBfFq7w0aU7DLeEiNC3k4,1398
+uipath/_cli/_evals/evaluation_service.py,sha256=VVxZxoCJoB2SUhej_c0DzC9AlnIlWMKnug7z5weNSoE,22077
+uipath/_cli/_evals/progress_reporter.py,sha256=m1Dio1vG-04nFTFz5ijM_j1dhudlgOzQukmTkkg6wS4,11490
+uipath/_cli/_evals/_evaluators/__init__.py,sha256=jD7KNLjbsUpsESFXX11eW2MEPXDNuPp2-t-IPB-inlM,734
+uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py,sha256=BTl0puBjp9iCsU3YFfYWqk4TOz4iE19O3q1-dK6qUOI,1723
 uipath/_cli/_evals/_evaluators/_evaluator_base.py,sha256=knHUwYFt0gMG1uJhq5TGEab6M_YevxX019yT3yYwZsw,3787
-uipath/_cli/_evals/_evaluators/_evaluator_factory.py,sha256=…
-uipath/_cli/_evals/_evaluators/…
+uipath/_cli/_evals/_evaluators/_evaluator_factory.py,sha256=RJtCuFREZ8Ijlldpa0521poZLmcR7vTU3WyYOmhJOkc,4688
+uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py,sha256=lvEtAitrZy9myoZLMXLqlBWBPX06Msu67kuFMGSbikM,1319
+uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py,sha256=HpmkvuwU4Az3IIqFVLUmDvzkqb21pFMxY0sg2biZOMM,7093
+uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py,sha256=nSLZ29xWqALEI53ifr79JPXjyx0T4sr7p-4NygwgAio,6594
 uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py,sha256=dnogQTOskpI4_cNF0Ge3hBceJJocvOgxBWAwaCWnzB0,1595
 uipath/_cli/_evals/_models/__init__.py,sha256=Ewjp3u2YeTH2MmzY9LWf7EIbAoIf_nW9fMYbj7pGlPs,420
 uipath/_cli/_evals/_models/_evaluation_set.py,sha256=UIapFwn_Ti9zHUIcL3xyHDcLZ4lq4sHJ3JXLvY5OYI0,1080
-uipath/_cli/_evals/_models/_evaluators.py,sha256=…
+uipath/_cli/_evals/_models/_evaluators.py,sha256=l57NEVyYmzSKuoIXuGkE94Br01hAMg35fiS2MlTkaQM,2115
 uipath/_cli/_push/sw_file_handler.py,sha256=tRE9n68xv0r20ulwOyALHtYwzbjGneiASwzNm8xtBN0,16372
 uipath/_cli/_runtime/_contracts.py,sha256=WlpaiQAMWCo-JFHjee35Klf49A3GsKjOU1Mf2IpUGHY,16033
 uipath/_cli/_runtime/_escalation.py,sha256=x3vI98qsfRA-fL_tNkRVTFXioM5Gv2w0GFcXJJ5eQtg,7981
@@ -99,7 +100,7 @@ uipath/models/connections.py,sha256=perIqW99YEg_0yWZPdpZlmNpZcwY_toR1wkqDUBdAN0,
 uipath/models/context_grounding.py,sha256=S9PeOlFlw7VxzzJVR_Fs28OObW3MLHUPCFqNgkEz24k,1315
 uipath/models/context_grounding_index.py,sha256=0ADlH8fC10qIbakgwU89pRVawzJ36TiSDKIqOhUdhuA,2580
 uipath/models/errors.py,sha256=gPyU4sKYn57v03aOVqm97mnU9Do2e7bwMQwiSQVp9qc,461
-uipath/models/exceptions.py,sha256=…
+uipath/models/exceptions.py,sha256=F0ITAhJsl6Agvmnv4nxvgY5oC_lrYIlxWTLs0yx859M,1636
 uipath/models/interrupt_models.py,sha256=UzuVTMVesI204YQ4qFQFaN-gN3kksddkrujofcaC7zQ,881
 uipath/models/job.py,sha256=f9L6_kg_VP0dAYvdcz1DWEWzy4NZPdlpHREod0uNK1E,3099
 uipath/models/llm_gateway.py,sha256=rUIus7BrUuuRriXqSJUE9FnjOyQ7pYpaX6hWEYvA6AA,1923
@@ -114,8 +115,8 @@ uipath/tracing/_traced.py,sha256=qeVDrds2OUnpdUIA0RhtF0kg2dlAZhyC1RRkI-qivTM,185
 uipath/tracing/_utils.py,sha256=ZeensQexnw69jVcsVrGyED7mPlAU-L1agDGm6_1A3oc,10388
 uipath/utils/__init__.py,sha256=VD-KXFpF_oWexFg6zyiWMkxl2HM4hYJMIUDZ1UEtGx0,105
 uipath/utils/_endpoints_manager.py,sha256=hiGEu6vyfQJoeiiql6w21TNiG6tADUfXlVBimxPU1-Q,4160
-uipath-2.1.…
-uipath-2.1.…
-uipath-2.1.…
-uipath-2.1.…
-uipath-2.1.…
+uipath-2.1.18.dist-info/METADATA,sha256=V5bxB_ENxsAgMRKGPz3Kx3gvmmgnrRxRDVAbILiBTtY,6367
+uipath-2.1.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+uipath-2.1.18.dist-info/entry_points.txt,sha256=9C2_29U6Oq1ExFu7usihR-dnfIVNSKc-0EFbh0rskB4,43
+uipath-2.1.18.dist-info/licenses/LICENSE,sha256=-KBavWXepyDjimmzH5fVAsi-6jNVpIKFc2kZs0Ri4ng,1058
+uipath-2.1.18.dist-info/RECORD,,

uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py
@@ -1,48 +0,0 @@
-from typing import Any, Dict
-
-from .._models import EvaluationResult
-from ._evaluator_base import EvaluatorBase
-
-
-class AgentScorerEvaluator(EvaluatorBase):
-    """Evaluator that uses an agent to score outputs."""
-
-    def __init__(
-        self,
-        agent_config: Dict[str, Any],
-        scoring_criteria: Dict[str, Any],
-        target_output_key: str = "*",
-    ):
-        """Initialize the agent scorer evaluator.
-
-        Args:
-            agent_config: Configuration for the scoring agent
-            scoring_criteria: Criteria used for scoring
-            target_output_key: Key in output to evaluate ("*" for entire output)
-        """
-        super().__init__()
-        self.agent_config = agent_config or {}
-        self.scoring_criteria = scoring_criteria or {}
-        self.target_output_key = target_output_key
-
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate using an agent scorer.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        raise NotImplementedError()

uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py
@@ -1,41 +0,0 @@
-from typing import Any, Dict
-
-from .._models import EvaluationResult
-from ._evaluator_base import EvaluatorBase
-
-
-class DeterministicEvaluator(EvaluatorBase):
-    """Evaluator for deterministic/rule-based evaluations."""
-
-    def __init__(self, rule_config: Dict[str, Any], target_output_key: str = "*"):
-        """Initialize the deterministic evaluator.
-
-        Args:
-            rule_config: Configuration for the rule (expected_value, regex_pattern, etc.)
-            target_output_key: Key in output to evaluate ("*" for entire output)
-        """
-        super().__init__()
-        self.rule_config = rule_config or {}
-        self.target_output_key = target_output_key
-
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate using deterministic rules.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        raise NotImplementedError()

{uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/WHEEL
File without changes

{uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/entry_points.txt
File without changes

{uipath-2.1.16.dist-info → uipath-2.1.18.dist-info}/licenses/LICENSE
File without changes