uipath-2.1.17-py3-none-any.whl → uipath-2.1.18-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/_evaluators/__init__.py +6 -4
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +46 -0
- uipath/_cli/_evals/_evaluators/_evaluator_factory.py +42 -22
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +40 -0
- uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py +168 -0
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +2 -0
- uipath/_cli/_evals/_models/_evaluators.py +11 -10
- uipath/_cli/_evals/progress_reporter.py +15 -4
- {uipath-2.1.17.dist-info → uipath-2.1.18.dist-info}/METADATA +1 -1
- {uipath-2.1.17.dist-info → uipath-2.1.18.dist-info}/RECORD +13 -12
- uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py +0 -48
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py +0 -41
- {uipath-2.1.17.dist-info → uipath-2.1.18.dist-info}/WHEEL +0 -0
- {uipath-2.1.17.dist-info → uipath-2.1.18.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.17.dist-info → uipath-2.1.18.dist-info}/licenses/LICENSE +0 -0

uipath/_cli/_evals/_evaluators/__init__.py
@@ -3,18 +3,20 @@
 This package contains all evaluator types and the factory for creating them.
 """
 
-from ._agent_scorer_evaluator import AgentScorerEvaluator
-from ._deterministic_evaluator import DeterministicEvaluator
+from ._deterministic_evaluator_base import DeterministicEvaluatorBase
 from ._evaluator_base import EvaluatorBase
 from ._evaluator_factory import EvaluatorFactory
+from ._exact_match_evaluator import ExactMatchEvaluator
+from ._json_similarity_evaluator import JsonSimilarityEvaluator
 from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
 from ._trajectory_evaluator import TrajectoryEvaluator
 
 __all__ = [
     "EvaluatorBase",
+    "DeterministicEvaluatorBase",
     "EvaluatorFactory",
-    "DeterministicEvaluator",
+    "JsonSimilarityEvaluator",
+    "ExactMatchEvaluator",
     "LlmAsAJudgeEvaluator",
-    "AgentScorerEvaluator",
     "TrajectoryEvaluator",
 ]

uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py (new file)
@@ -0,0 +1,46 @@
+import copy
+import json
+from abc import ABC
+from typing import Any, Dict, Tuple
+
+from ._evaluator_base import EvaluatorBase
+
+
+class DeterministicEvaluatorBase(EvaluatorBase, ABC):
+    def __init__(self, target_output_key: str = "*"):
+        super().__init__()
+        self.target_output_key = target_output_key
+
+    def _select_targets(
+        self, expected_output: Dict[str, Any], actual_output: Dict[str, Any]
+    ) -> Tuple[Any, Any]:
+        actual_output_copy = copy.deepcopy(actual_output)
+        expected_output_copy = copy.deepcopy(expected_output)
+        if self.target_output_key != "*":
+            if (
+                self.target_output_key not in actual_output
+                or self.target_output_key not in expected_output
+            ):
+                raise ValueError(
+                    f"Field '{self.target_output_key}' missing from expected or actual output"
+                )
+            actual_output_copy = actual_output_copy[self.target_output_key]
+            expected_output_copy = expected_output[self.target_output_key]
+        return actual_output_copy, expected_output_copy
+
+    def _canonical_json(self, obj: Any) -> str:
+        return json.dumps(
+            self._normalize_numbers(obj),
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+        )
+
+    def _normalize_numbers(self, obj: Any) -> Any:
+        if isinstance(obj, dict):
+            return {k: self._normalize_numbers(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._normalize_numbers(v) for v in obj]
+        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
+            return float(obj)
+        return obj
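
The two helpers above define the package's notion of canonical JSON: integers are coerced to floats (booleans are deliberately left alone) and keys are sorted before serialization, so semantically equal payloads serialize to the same string. The following standalone sketch re-implements that behaviour outside the class so it can be run directly; the function names here are illustrative, not part of the uipath API.

import json
from typing import Any


def normalize_numbers(obj: Any) -> Any:
    # Coerce ints to floats (bools excluded) so 1 and 1.0 canonicalise identically.
    if isinstance(obj, dict):
        return {k: normalize_numbers(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [normalize_numbers(v) for v in obj]
    if isinstance(obj, (int, float)) and not isinstance(obj, bool):
        return float(obj)
    return obj


def canonical_json(obj: Any) -> str:
    # Sorted keys and compact separators give an order-independent string form.
    return json.dumps(
        normalize_numbers(obj), sort_keys=True, separators=(",", ":"), ensure_ascii=False
    )


print(canonical_json({"count": 1, "name": "ok"}) == canonical_json({"name": "ok", "count": 1.0}))  # True
print(canonical_json({"flag": True}) == canonical_json({"flag": 1}))  # False: bools stay bools
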

uipath/_cli/_evals/_evaluators/_evaluator_factory.py
@@ -1,9 +1,9 @@
 from typing import Any, Dict
 
 from .._models import EvaluatorCategory, EvaluatorType
-from ._agent_scorer_evaluator import AgentScorerEvaluator
-from ._deterministic_evaluator import DeterministicEvaluator
 from ._evaluator_base import EvaluatorBase, EvaluatorBaseParams
+from ._exact_match_evaluator import ExactMatchEvaluator
+from ._json_similarity_evaluator import JsonSimilarityEvaluator
 from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
 from ._trajectory_evaluator import TrajectoryEvaluator
 
@@ -50,23 +50,50 @@ class EvaluatorFactory:
         )
 
         # Create evaluator based on category
-        [10 lines removed here; their content is not shown in the source diff]
+        match category:
+            case EvaluatorCategory.Deterministic:
+                if evaluator_type == evaluator_type.Equals:
+                    return EvaluatorFactory._create_exact_match_evaluator(
+                        base_params, data
+                    )
+                elif evaluator_type == evaluator_type.JsonSimilarity:
+                    return EvaluatorFactory._create_json_similarity_evaluator(
+                        base_params, data
+                    )
+                else:
+                    raise ValueError(
+                        f"Unknown evaluator type {evaluator_type} for category {category}"
+                    )
+            case EvaluatorCategory.LlmAsAJudge:
+                return EvaluatorFactory._create_llm_as_judge_evaluator(
+                    base_params, data
+                )
+            case EvaluatorCategory.AgentScorer:
+                raise NotImplementedError()
+            case EvaluatorCategory.Trajectory:
+                return EvaluatorFactory._create_trajectory_evaluator(base_params, data)
+            case _:
+                raise ValueError(f"Unknown evaluator category: {category}")
 
     @staticmethod
-    def … [truncated in the source diff]
+    def _create_exact_match_evaluator(
         base_params: EvaluatorBaseParams, data: Dict[str, Any]
-    ) -> … [truncated in the source diff]
+    ) -> ExactMatchEvaluator:
         """Create a deterministic evaluator."""
-        [1 line removed here; content not shown in the source diff]
+        return ExactMatchEvaluator.from_params(
+            base_params,
+            target_output_key=data.get("targetOutputKey", "*"),
+        )
+
+    @staticmethod
+    def _create_json_similarity_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> JsonSimilarityEvaluator:
+        """Create a deterministic evaluator."""
+        return JsonSimilarityEvaluator.from_params(
+            base_params,
+            target_output_key=data.get("targetOutputKey", "*"),
+        )
 
     @staticmethod
     def _create_llm_as_judge_evaluator(
@@ -88,13 +115,6 @@ class EvaluatorFactory:
             target_output_key=data.get("targetOutputKey", "*"),
         )
 
-    @staticmethod
-    def _create_agent_scorer_evaluator(
-        base_params: EvaluatorBaseParams, data: Dict[str, Any]
-    ) -> AgentScorerEvaluator:
-        """Create an agent scorer evaluator."""
-        raise NotImplementedError()
-
     @staticmethod
     def _create_trajectory_evaluator(
         base_params: EvaluatorBaseParams, data: Dict[str, Any]
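
The factory now dispatches on EvaluatorCategory first and, within the deterministic category, on EvaluatorType. The sketch below restates that dispatch as a standalone function; the EvaluatorCategory and EvaluatorType stand-in enums use assumed member values, since the real enums live in uipath's private _models module and are not shown in this diff.

from enum import IntEnum


class EvaluatorCategory(IntEnum):  # stand-in for the real enum; values assumed
    Deterministic = 0
    LlmAsAJudge = 1
    AgentScorer = 2
    Trajectory = 3


class EvaluatorType(IntEnum):  # stand-in for the real enum; values assumed
    Equals = 0
    JsonSimilarity = 1


def pick_evaluator(category: EvaluatorCategory, evaluator_type: EvaluatorType) -> str:
    # Mirrors the structure of EvaluatorFactory.create_evaluator in 2.1.18.
    match category:
        case EvaluatorCategory.Deterministic:
            if evaluator_type == EvaluatorType.Equals:
                return "ExactMatchEvaluator"
            if evaluator_type == EvaluatorType.JsonSimilarity:
                return "JsonSimilarityEvaluator"
            raise ValueError(f"Unknown evaluator type {evaluator_type} for category {category}")
        case EvaluatorCategory.LlmAsAJudge:
            return "LlmAsAJudgeEvaluator"
        case EvaluatorCategory.Trajectory:
            return "TrajectoryEvaluator"
        case EvaluatorCategory.AgentScorer:
            raise NotImplementedError("agent scorer evaluators are not implemented in 2.1.18")
        case _:
            raise ValueError(f"Unknown evaluator category: {category}")


print(pick_evaluator(EvaluatorCategory.Deterministic, EvaluatorType.JsonSimilarity))
# -> JsonSimilarityEvaluator
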

uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py (new file)
@@ -0,0 +1,40 @@
+import copy
+from typing import Any, Dict
+
+from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
+    DeterministicEvaluatorBase,
+)
+from uipath._cli._evals._models import EvaluationResult
+from uipath._cli._evals._models._evaluators import ScoreType
+
+
+class ExactMatchEvaluator(DeterministicEvaluatorBase):
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        actual_output_copy = copy.deepcopy(actual_output)
+        expected_output_copy = copy.deepcopy(expected_output)
+
+        actual_output, expected_output = self._select_targets(
+            expected_output, actual_output
+        )
+        are_equal = self._canonical_json(actual_output) == self._canonical_json(
+            expected_output
+        )
+
+        return EvaluationResult(
+            evaluation_id=evaluation_id,
+            evaluation_name=evaluation_name,
+            evaluator_id=self.id,
+            evaluator_name=self.name,
+            score=are_equal,
+            input=input_data,
+            expected_output=expected_output_copy,
+            actual_output=actual_output_copy,
+            score_type=ScoreType.BOOLEAN,
+        )
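
ExactMatchEvaluator reports a boolean score: it selects the configured target field (or the whole output when targetOutputKey is "*"), canonicalises both sides, and compares the resulting strings. Below is a condensed, self-contained sketch of that flow; the helper names are illustrative, not the uipath classes themselves.

import json
from typing import Any, Dict


def _canonical(obj: Any) -> str:
    # Same canonical form as the base class above: ints become floats, keys are sorted.
    def norm(o: Any) -> Any:
        if isinstance(o, dict):
            return {k: norm(v) for k, v in o.items()}
        if isinstance(o, (list, tuple)):
            return [norm(v) for v in o]
        if isinstance(o, (int, float)) and not isinstance(o, bool):
            return float(o)
        return o

    return json.dumps(norm(obj), sort_keys=True, separators=(",", ":"), ensure_ascii=False)


def exact_match(expected: Dict[str, Any], actual: Dict[str, Any], target_output_key: str = "*") -> bool:
    # "*" means compare the whole output; otherwise compare a single field only.
    if target_output_key != "*":
        if target_output_key not in expected or target_output_key not in actual:
            raise ValueError(f"Field '{target_output_key}' missing from expected or actual output")
        expected, actual = expected[target_output_key], actual[target_output_key]
    return _canonical(expected) == _canonical(actual)


expected = {"status": "done", "details": {"retries": 2}}
actual = {"details": {"retries": 2.0}, "status": "failed"}
print(exact_match(expected, actual))             # False: "status" differs
print(exact_match(expected, actual, "details"))  # True: 2 and 2.0 canonicalise the same
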

uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py (new file)
@@ -0,0 +1,168 @@
+import copy
+import math
+from typing import Any, Dict, Tuple
+
+from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
+    DeterministicEvaluatorBase,
+)
+from uipath._cli._evals._models import EvaluationResult
+from uipath._cli._evals._models._evaluators import ScoreType
+
+
+class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
+    """Deterministic evaluator that scores structural JSON similarity.
+
+    Compares expected versus actual JSON-like structures and returns a
+    numerical score in the range [0, 100]. The comparison is token-based
+    and tolerant for numbers and strings (via Levenshtein distance).
+    """
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate similarity between expected and actual JSON outputs.
+
+        Args:
+            evaluation_id: Unique identifier for this evaluation run.
+            evaluation_name: Human friendly evaluation name.
+            input_data: Input payload used to produce the outputs.
+            expected_output: Ground-truth JSON structure.
+            actual_output: Produced JSON structure to compare against the ground truth.
+
+        Returns:
+            EvaluationResult: Structured result with the numerical similarity score.
+        """
+        actual_output_copy = copy.deepcopy(actual_output)
+        expected_output_copy = copy.deepcopy(expected_output)
+
+        actual_output, expected_output = self._select_targets(
+            expected_output, actual_output
+        )
+        similarity = self._compare_json(expected_output, actual_output)
+
+        return EvaluationResult(
+            evaluation_id=evaluation_id,
+            evaluation_name=evaluation_name,
+            evaluator_id=self.id,
+            evaluator_name=self.name,
+            score=similarity,
+            input=input_data,
+            expected_output=expected_output_copy,
+            actual_output=actual_output_copy,
+            score_type=ScoreType.NUMERICAL,
+        )
+
+    def _compare_json(self, expected: Any, actual: Any) -> float:
+        matched_leaves, total_leaves = self._compare_tokens(expected, actual)
+        if total_leaves == 0:
+            return 100.0
+        sim = (matched_leaves / total_leaves) * 100.0
+        return max(0.0, min(100.0, sim))
+
+    def _compare_tokens(
+        self, expected_token: Any, actual_token: Any
+    ) -> Tuple[float, float]:
+        if self._is_number(expected_token) and self._is_number(actual_token):
+            return self._compare_numbers(float(expected_token), float(actual_token))
+
+        if type(expected_token) is not type(actual_token):
+            return 0.0, self._count_leaves(expected_token)
+
+        if isinstance(expected_token, dict):
+            matched_leaves = total_leaves = 0.0
+            # Only expected keys count
+            for expected_key, expected_value in expected_token.items():
+                if isinstance(actual_token, dict) and expected_key in actual_token:
+                    matched, total = self._compare_tokens(
+                        expected_value, actual_token[expected_key]
+                    )
+                else:
+                    matched, total = (0.0, self._count_leaves(expected_value))
+                matched_leaves += matched
+                total_leaves += total
+            return matched_leaves, total_leaves
+
+        if isinstance(expected_token, list):
+            matched_leaves = total_leaves = 0.0
+            common_length = min(len(expected_token), len(actual_token))
+            for index in range(common_length):
+                matched, total = self._compare_tokens(
+                    expected_token[index], actual_token[index]
+                )
+                matched_leaves += matched
+                total_leaves += total
+            for index in range(common_length, len(expected_token)):
+                total_leaves += self._count_leaves(expected_token[index])
+            return (matched_leaves, total_leaves)
+
+        if isinstance(expected_token, bool):
+            return (1.0, 1.0) if expected_token == actual_token else (0.0, 1.0)
+
+        if isinstance(expected_token, str):
+            return self._compare_strings(expected_token, actual_token)
+
+        return (1.0, 1.0) if str(expected_token) == str(actual_token) else (0.0, 1.0)
+
+    def _compare_numbers(
+        self, expected_number: float, actual_number: float
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if math.isclose(expected_number, 0.0, abs_tol=1e-12):
+            matched = 1.0 if math.isclose(actual_number, 0.0, abs_tol=1e-12) else 0.0
+        else:
+            ratio = abs(expected_number - actual_number) / abs(expected_number)
+            matched = max(0.0, min(1.0, 1.0 - ratio))
+        return matched, total
+
+    def _compare_strings(
+        self, expected_string: str, actual_string: str
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if not expected_string and not actual_string:
+            return 1.0, total
+        distance = self._levenshtein(expected_string, actual_string)
+        max_length = max(len(expected_string), len(actual_string))
+        similarity = 1.0 - (distance / max_length) if max_length else 1.0
+        similarity = max(0.0, min(1.0, similarity))
+        return similarity, total
+
+    def _count_leaves(self, token_node: Any) -> float:
+        if isinstance(token_node, dict):
+            return sum(
+                self._count_leaves(child_value) for child_value in token_node.values()
+            )
+        if isinstance(token_node, list):
+            return sum(self._count_leaves(child_value) for child_value in token_node)
+        return 1.0
+
+    def _levenshtein(self, source_text: str, target_text: str) -> int:
+        if not source_text:
+            return len(target_text)
+        if not target_text:
+            return len(source_text)
+        source_len, target_len = len(source_text), len(target_text)
+        distance_matrix = [[0] * (target_len + 1) for _ in range(source_len + 1)]
+        for row_idx in range(source_len + 1):
+            distance_matrix[row_idx][0] = row_idx
+        for col_idx in range(target_len + 1):
+            distance_matrix[0][col_idx] = col_idx
+        for row_idx in range(1, source_len + 1):
+            for col_idx in range(1, target_len + 1):
+                substitution_cost = (
+                    0 if source_text[row_idx - 1] == target_text[col_idx - 1] else 1
+                )
+                distance_matrix[row_idx][col_idx] = min(
+                    distance_matrix[row_idx - 1][col_idx] + 1,  # deletion
+                    distance_matrix[row_idx][col_idx - 1] + 1,  # insertion
+                    distance_matrix[row_idx - 1][col_idx - 1]
+                    + substitution_cost,  # substitution
+                )
+        return distance_matrix[source_len][target_len]
+
+    def _is_number(self, value: Any) -> bool:
+        return isinstance(value, (int, float)) and not isinstance(value, bool)
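
To make the scoring rules concrete, the sketch below applies the same per-leaf arithmetic to a flat dictionary: a number scores 1 - |expected - actual| / |expected|, a string scores 1 - levenshtein / max_length, and only the expected keys count toward the total before scaling to 0-100. It is a simplified re-implementation for illustration only (no nesting, lists, or booleans), not the uipath class itself.

from typing import Dict, Union

Leaf = Union[int, float, str]


def levenshtein(a: str, b: str) -> int:
    # Classic two-row dynamic-programming edit distance.
    if not a:
        return len(b)
    if not b:
        return len(a)
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost))
        prev = curr
    return prev[-1]


def leaf_similarity(expected: Leaf, actual: Leaf) -> float:
    if isinstance(expected, (int, float)) and isinstance(actual, (int, float)):
        if expected == 0:
            return 1.0 if actual == 0 else 0.0
        return max(0.0, min(1.0, 1.0 - abs(expected - actual) / abs(expected)))
    if isinstance(expected, str) and isinstance(actual, str):
        longest = max(len(expected), len(actual))
        return 1.0 - levenshtein(expected, actual) / longest if longest else 1.0
    return 0.0  # type mismatch scores zero


def json_similarity(expected: Dict[str, Leaf], actual: Dict[str, Leaf]) -> float:
    # Only keys present in the expected output count towards the score.
    if not expected:
        return 100.0
    matched = sum(leaf_similarity(v, actual[k]) if k in actual else 0.0 for k, v in expected.items())
    return matched / len(expected) * 100.0


expected = {"city": "Berlin", "population": 3_700_000}
actual = {"city": "Berln", "population": 3_600_000, "extra": "ignored"}
print(round(json_similarity(expected, actual), 1))  # ~90.3: 1 - 1/6 for the city, 1 - 0.027 for the population
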

uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py
@@ -11,6 +11,7 @@ from ...._utils.constants import (
     COMMUNITY_agents_SUFFIX,
 )
 from .._models import EvaluationResult, LLMResponse
+from .._models._evaluators import ScoreType
 from ._evaluator_base import EvaluatorBase
 
 
@@ -86,6 +87,7 @@ class LlmAsAJudgeEvaluator(EvaluatorBase):
             expected_output=expected_output,
             actual_output=actual_output,
             details=llm_response.justification,
+            score_type=ScoreType.NUMERICAL,
         )
 
     def _extract_target_value(self, output: Dict[str, Any]) -> Any:

uipath/_cli/_evals/_models/_evaluators.py
@@ -1,8 +1,8 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from enum import IntEnum
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel
 
 
 class LLMResponse(BaseModel):
@@ -50,6 +50,12 @@ class EvaluatorType(IntEnum):
         raise ValueError(f"{value} is not a valid EvaluatorType value")
 
 
+class ScoreType(IntEnum):
+    BOOLEAN = 0
+    NUMERICAL = 1
+    ERROR = 2
+
+
 class EvaluationResult(BaseModel):
     """Result of a single evaluation."""
 
@@ -57,13 +63,14 @@ class EvaluationResult(BaseModel):
     evaluation_name: str
     evaluator_id: str
     evaluator_name: str
-    score: float
+    score: float | bool
+    score_type: ScoreType
     # this is marked as optional, as it is populated inside the 'measure_execution_time' decorator
     evaluation_time: Optional[float] = None
     input: Dict[str, Any]
     expected_output: Dict[str, Any]
     actual_output: Dict[str, Any]
-    timestamp: datetime = … [truncated in the source diff]
+    timestamp: datetime = datetime.now(timezone.utc)
     details: Optional[str] = None
 
 
@@ -76,12 +83,6 @@ class EvaluationSetResult(BaseModel):
     average_score: float
 
 
-class ScoreType(IntEnum):
-    BOOLEAN = 0
-    NUMERICAL = 1
-    ERROR = 2
-
-
 class EvalItemResult(BaseModel):
     """Result of a single evaluation item."""
 

uipath/_cli/_evals/progress_reporter.py
@@ -139,12 +139,23 @@ class ProgressReporter:
         actual_output: dict[str, Any] = {}
         for eval_result in eval_results:
             # keep track of evaluator scores. this should be removed after this computation is done server-side
-            [3 lines removed here; their content is not shown in the source diff]
+
+            # check the evaluator score type
+            match eval_result.result.score_type:
+                case ScoreType.NUMERICAL:
+                    self._evaluator_scores[eval_result.evaluator_id].append(
+                        eval_result.result.score
+                    )
+                case ScoreType.BOOLEAN:
+                    self._evaluator_scores[eval_result.evaluator_id].append(
+                        100 if eval_result.result.score else 0
+                    )
+                case ScoreType.ERROR:
+                    self._evaluator_scores[eval_result.evaluator_id].append(0)
+
             evaluator_scores.append(
                 {
-                    "type": … [truncated in the source diff]
+                    "type": eval_result.result.score_type.value,
                     "value": eval_result.result.score,
                     "justification": eval_result.result.details,
                     "evaluatorId": eval_result.evaluator_id,
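
The progress reporter now folds every result onto a single 0-100 scale before aggregating: numerical scores pass through, boolean scores map to 100/0, and errored evaluations count as 0. A minimal standalone restatement of that mapping, assuming the same ScoreType values as in _models/_evaluators.py:

from enum import IntEnum
from typing import Union


class ScoreType(IntEnum):  # mirrors the enum added in _models/_evaluators.py
    BOOLEAN = 0
    NUMERICAL = 1
    ERROR = 2


def normalize_score(score: Union[float, bool], score_type: ScoreType) -> float:
    # Numerical scores already live on the 0-100 scale; booleans map to 100/0;
    # errored evaluations drag the average down as 0.
    match score_type:
        case ScoreType.NUMERICAL:
            return float(score)
        case ScoreType.BOOLEAN:
            return 100.0 if score else 0.0
        case ScoreType.ERROR:
            return 0.0
        case _:
            raise ValueError(f"Unknown score type: {score_type}")


print(normalize_score(True, ScoreType.BOOLEAN))    # 100.0
print(normalize_score(87.5, ScoreType.NUMERICAL))  # 87.5
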

uipath-2.1.17.dist-info/METADATA → uipath-2.1.18.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: uipath
-Version: 2.1.17
+Version: 2.1.18
 Summary: Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools.
 Project-URL: Homepage, https://uipath.com
 Project-URL: Repository, https://github.com/UiPath/uipath-python

uipath-2.1.17.dist-info/RECORD → uipath-2.1.18.dist-info/RECORD
@@ -30,17 +30,18 @@ uipath/_cli/_auth/index.html,sha256=_Q2OtqPfapG_6vumbQYqtb2PfFe0smk7TlGERKEBvB4,
 uipath/_cli/_auth/localhost.crt,sha256=oGl9oLLOiouHubAt39B4zEfylFvKEtbtr_43SIliXJc,1226
 uipath/_cli/_auth/localhost.key,sha256=X31VYXD8scZtmGA837dGX5l6G-LXHLo5ItWJhZXaz3c,1679
 uipath/_cli/_evals/evaluation_service.py,sha256=VVxZxoCJoB2SUhej_c0DzC9AlnIlWMKnug7z5weNSoE,22077
-uipath/_cli/_evals/progress_reporter.py,sha256=…
-uipath/_cli/_evals/_evaluators/__init__.py,sha256=…
-uipath/_cli/_evals/_evaluators/…
-uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py,sha256=P0du9KWz5MP5Pw70Ze7piqeBfFq7w0aU7DLeEiNC3k4,1398
+uipath/_cli/_evals/progress_reporter.py,sha256=m1Dio1vG-04nFTFz5ijM_j1dhudlgOzQukmTkkg6wS4,11490
+uipath/_cli/_evals/_evaluators/__init__.py,sha256=jD7KNLjbsUpsESFXX11eW2MEPXDNuPp2-t-IPB-inlM,734
+uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py,sha256=BTl0puBjp9iCsU3YFfYWqk4TOz4iE19O3q1-dK6qUOI,1723
 uipath/_cli/_evals/_evaluators/_evaluator_base.py,sha256=knHUwYFt0gMG1uJhq5TGEab6M_YevxX019yT3yYwZsw,3787
-uipath/_cli/_evals/_evaluators/_evaluator_factory.py,sha256=…
-uipath/_cli/_evals/_evaluators/…
+uipath/_cli/_evals/_evaluators/_evaluator_factory.py,sha256=RJtCuFREZ8Ijlldpa0521poZLmcR7vTU3WyYOmhJOkc,4688
+uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py,sha256=lvEtAitrZy9myoZLMXLqlBWBPX06Msu67kuFMGSbikM,1319
+uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py,sha256=HpmkvuwU4Az3IIqFVLUmDvzkqb21pFMxY0sg2biZOMM,7093
+uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py,sha256=nSLZ29xWqALEI53ifr79JPXjyx0T4sr7p-4NygwgAio,6594
 uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py,sha256=dnogQTOskpI4_cNF0Ge3hBceJJocvOgxBWAwaCWnzB0,1595
 uipath/_cli/_evals/_models/__init__.py,sha256=Ewjp3u2YeTH2MmzY9LWf7EIbAoIf_nW9fMYbj7pGlPs,420
 uipath/_cli/_evals/_models/_evaluation_set.py,sha256=UIapFwn_Ti9zHUIcL3xyHDcLZ4lq4sHJ3JXLvY5OYI0,1080
-uipath/_cli/_evals/_models/_evaluators.py,sha256=…
+uipath/_cli/_evals/_models/_evaluators.py,sha256=l57NEVyYmzSKuoIXuGkE94Br01hAMg35fiS2MlTkaQM,2115
 uipath/_cli/_push/sw_file_handler.py,sha256=tRE9n68xv0r20ulwOyALHtYwzbjGneiASwzNm8xtBN0,16372
 uipath/_cli/_runtime/_contracts.py,sha256=WlpaiQAMWCo-JFHjee35Klf49A3GsKjOU1Mf2IpUGHY,16033
 uipath/_cli/_runtime/_escalation.py,sha256=x3vI98qsfRA-fL_tNkRVTFXioM5Gv2w0GFcXJJ5eQtg,7981
@@ -114,8 +115,8 @@ uipath/tracing/_traced.py,sha256=qeVDrds2OUnpdUIA0RhtF0kg2dlAZhyC1RRkI-qivTM,185
 uipath/tracing/_utils.py,sha256=ZeensQexnw69jVcsVrGyED7mPlAU-L1agDGm6_1A3oc,10388
 uipath/utils/__init__.py,sha256=VD-KXFpF_oWexFg6zyiWMkxl2HM4hYJMIUDZ1UEtGx0,105
 uipath/utils/_endpoints_manager.py,sha256=hiGEu6vyfQJoeiiql6w21TNiG6tADUfXlVBimxPU1-Q,4160
-uipath-2.1.…
-uipath-2.1.…
-uipath-2.1.…
-uipath-2.1.…
-uipath-2.1.…
+uipath-2.1.18.dist-info/METADATA,sha256=V5bxB_ENxsAgMRKGPz3Kx3gvmmgnrRxRDVAbILiBTtY,6367
+uipath-2.1.18.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+uipath-2.1.18.dist-info/entry_points.txt,sha256=9C2_29U6Oq1ExFu7usihR-dnfIVNSKc-0EFbh0rskB4,43
+uipath-2.1.18.dist-info/licenses/LICENSE,sha256=-KBavWXepyDjimmzH5fVAsi-6jNVpIKFc2kZs0Ri4ng,1058
+uipath-2.1.18.dist-info/RECORD,,

uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py (removed file)
@@ -1,48 +0,0 @@
-from typing import Any, Dict
-
-from .._models import EvaluationResult
-from ._evaluator_base import EvaluatorBase
-
-
-class AgentScorerEvaluator(EvaluatorBase):
-    """Evaluator that uses an agent to score outputs."""
-
-    def __init__(
-        self,
-        agent_config: Dict[str, Any],
-        scoring_criteria: Dict[str, Any],
-        target_output_key: str = "*",
-    ):
-        """Initialize the agent scorer evaluator.
-
-        Args:
-            agent_config: Configuration for the scoring agent
-            scoring_criteria: Criteria used for scoring
-            target_output_key: Key in output to evaluate ("*" for entire output)
-        """
-        super().__init__()
-        self.agent_config = agent_config or {}
-        self.scoring_criteria = scoring_criteria or {}
-        self.target_output_key = target_output_key
-
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate using an agent scorer.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        raise NotImplementedError()

uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py (removed file)
@@ -1,41 +0,0 @@
-from typing import Any, Dict
-
-from .._models import EvaluationResult
-from ._evaluator_base import EvaluatorBase
-
-
-class DeterministicEvaluator(EvaluatorBase):
-    """Evaluator for deterministic/rule-based evaluations."""
-
-    def __init__(self, rule_config: Dict[str, Any], target_output_key: str = "*"):
-        """Initialize the deterministic evaluator.
-
-        Args:
-            rule_config: Configuration for the rule (expected_value, regex_pattern, etc.)
-            target_output_key: Key in output to evaluate ("*" for entire output)
-        """
-        super().__init__()
-        self.rule_config = rule_config or {}
-        self.target_output_key = target_output_key
-
-    async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
-    ) -> EvaluationResult:
-        """Evaluate using deterministic rules.
-
-        Args:
-            evaluation_id: The ID of the evaluation being processed
-            evaluation_name: The name of the evaluation
-            input_data: The input data for the evaluation
-            expected_output: The expected output
-            actual_output: The actual output from the agent
-
-        Returns:
-            EvaluationResult containing the score and details
-        """
-        raise NotImplementedError()

uipath-2.1.17.dist-info/WHEEL → uipath-2.1.18.dist-info/WHEEL: file without changes
uipath-2.1.17.dist-info/entry_points.txt → uipath-2.1.18.dist-info/entry_points.txt: file without changes
uipath-2.1.17.dist-info/licenses/LICENSE → uipath-2.1.18.dist-info/licenses/LICENSE: file without changes