uipath 2.1.108__py3-none-any.whl → 2.1.110__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of uipath might be problematic.
- uipath/_cli/__init__.py +4 -0
- uipath/_cli/_evals/_console_progress_reporter.py +2 -2
- uipath/_cli/_evals/_evaluator_factory.py +314 -29
- uipath/_cli/_evals/_helpers.py +194 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
- uipath/_cli/_evals/_models/_evaluator.py +183 -9
- uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
- uipath/_cli/_evals/_models/_output.py +87 -3
- uipath/_cli/_evals/_progress_reporter.py +288 -28
- uipath/_cli/_evals/_runtime.py +80 -26
- uipath/_cli/_evals/mocks/input_mocker.py +1 -3
- uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
- uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
- uipath/_cli/_evals/mocks/mocks.py +5 -3
- uipath/_cli/_push/models.py +17 -0
- uipath/_cli/_push/sw_file_handler.py +336 -3
- uipath/_cli/_runtime/_contracts.py +2 -4
- uipath/_cli/_runtime/_runtime.py +2 -5
- uipath/_cli/_templates/custom_evaluator.py.template +65 -0
- uipath/_cli/_utils/_eval_set.py +30 -9
- uipath/_cli/_utils/_resources.py +21 -0
- uipath/_cli/_utils/_studio_project.py +18 -0
- uipath/_cli/cli_add.py +114 -0
- uipath/_cli/cli_eval.py +5 -1
- uipath/_cli/cli_init.py +5 -4
- uipath/_cli/cli_pull.py +11 -26
- uipath/_cli/cli_push.py +2 -0
- uipath/_cli/cli_register.py +45 -0
- uipath/_events/_events.py +6 -5
- uipath/_utils/constants.py +4 -0
- uipath/eval/_helpers/evaluators_helpers.py +494 -0
- uipath/eval/_helpers/helpers.py +30 -2
- uipath/eval/evaluators/__init__.py +60 -5
- uipath/eval/evaluators/base_evaluator.py +546 -44
- uipath/eval/evaluators/contains_evaluator.py +80 -0
- uipath/eval/evaluators/exact_match_evaluator.py +43 -12
- uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
- uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
- uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
- uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
- uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
- uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
- uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
- uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
- uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
- uipath/eval/evaluators/output_evaluator.py +117 -0
- uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
- uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
- uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
- uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
- uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
- uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
- uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
- uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
- uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
- uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
- uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
- uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
- uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
- uipath/eval/evaluators_types/generate_types.py +31 -0
- uipath/eval/models/__init__.py +16 -1
- uipath/eval/models/llm_judge_types.py +196 -0
- uipath/eval/models/models.py +109 -7
- {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/METADATA +1 -1
- {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/RECORD +72 -40
- {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/WHEEL +0 -0
- {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.108.dist-info → uipath-2.1.110.dist-info}/licenses/LICENSE +0 -0
uipath/eval/evaluators/contains_evaluator.py
ADDED

```diff
@@ -0,0 +1,80 @@
+"""Contains evaluator for agent outputs."""
+
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .base_evaluator import BaseEvaluationCriteria
+from .output_evaluator import (
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
+
+
+class ContainsEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for the contains evaluator."""
+
+    search_text: str
+
+
+class ContainsEvaluatorConfig(OutputEvaluatorConfig[ContainsEvaluationCriteria]):
+    """Configuration for the contains evaluator."""
+
+    name: str = "ContainsEvaluator"
+    case_sensitive: bool = False
+    negated: bool = False
+
+
+class ContainsEvaluator(
+    OutputEvaluator[ContainsEvaluationCriteria, ContainsEvaluatorConfig, type(None)]  # type: ignore
+):
+    """Evaluator that checks if the actual output contains the expected output.
+
+    This evaluator returns True if the actual output contains the expected output,
+    and False otherwise. It supports case sensitivity and negation options.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.CONTAINS.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: ContainsEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate whether actual output contains the expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - agent_output: The actual output from the agent
+                - agent_trace: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating if output contains expected value (True/False)
+        """
+        actual_output = str(self._get_actual_output(agent_execution))
+        expected_output = str(self._get_expected_output(evaluation_criteria))
+
+        if not self.evaluator_config.case_sensitive:
+            actual_output = actual_output.lower()
+            expected_output = expected_output.lower()
+
+        is_contains = expected_output in actual_output
+
+        if self.evaluator_config.negated:
+            is_contains = not is_contains
+        return NumericEvaluationResult(
+            score=float(is_contains),
+        )
+
+    def _get_expected_output(
+        self, evaluation_criteria: ContainsEvaluationCriteria
+    ) -> str:
+        """Get the expected output from the evaluation criteria."""
+        return evaluation_criteria.search_text
```
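For orientation, the containment rule that `ContainsEvaluator.evaluate` applies can be reduced to a standalone function. This is an illustrative sketch, not part of the package; the helper name and sample strings are invented.

```python
# Illustrative sketch of ContainsEvaluator's comparison rule (helper and inputs are hypothetical).
def contains_score(
    actual: str, search_text: str, case_sensitive: bool = False, negated: bool = False
) -> float:
    if not case_sensitive:
        actual, search_text = actual.lower(), search_text.lower()
    is_contains = search_text in actual
    if negated:
        is_contains = not is_contains
    return float(is_contains)  # mirrors NumericEvaluationResult(score=float(is_contains))


assert contains_score("Order #123 confirmed", "confirmed") == 1.0
assert contains_score("Order #123 confirmed", "cancelled", negated=True) == 1.0
```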
uipath/eval/evaluators/exact_match_evaluator.py
MODIFIED

```diff
@@ -1,14 +1,29 @@
-"""Exact match evaluator for
+"""Exact match evaluator for agent outputs."""
 
-from
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .output_evaluator import (
+    OutputEvaluationCriteria,
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
 
-from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
 
-
-
+class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
+    """Configuration for the exact match evaluator."""
 
+    name: str = "ExactMatchEvaluator"
+    case_sensitive: bool = False
+    negated: bool = False
 
-
+
+class ExactMatchEvaluator(
+    OutputEvaluator[OutputEvaluationCriteria, ExactMatchEvaluatorConfig, type(None)]  # type: ignore
+):
     """Evaluator that performs exact structural matching between expected and actual outputs.
 
     This evaluator returns True if the actual output exactly matches the expected output
@@ -16,22 +31,38 @@ class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
     to floats for consistent comparison.
     """
 
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.EXACT_MATCH.value
+
     async def evaluate(
-        self,
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: OutputEvaluationCriteria,
     ) -> EvaluationResult:
         """Evaluate whether actual output exactly matches expected output.
 
         Args:
             agent_execution: The execution details containing:
                 - agent_input: The input received by the agent
-                -
-                -
+                - agent_output: The actual output from the agent
+                - agent_trace: The execution spans to use for the evaluation
             evaluation_criteria: The criteria to evaluate
 
         Returns:
             EvaluationResult: Boolean result indicating exact match (True/False)
         """
-
-
-
+        actual_output = str(self._get_actual_output(agent_execution))
+        expected_output = str(self._get_expected_output(evaluation_criteria))
+        if not self.evaluator_config.case_sensitive:
+            actual_output = actual_output.lower()
+            expected_output = expected_output.lower()
+
+        is_exact_match = actual_output == expected_output
+        if self.evaluator_config.negated:
+            is_exact_match = not is_exact_match
+
+        return NumericEvaluationResult(
+            score=float(is_exact_match),
         )
```
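The reworked ExactMatchEvaluator compares the string forms of the outputs (lowercased unless `case_sensitive`), whereas the legacy evaluator further below compares canonical JSON. A minimal illustrative sketch of the new rule (helper name and inputs are hypothetical):

```python
# Illustrative sketch of ExactMatchEvaluator's comparison rule (not part of the package).
def exact_match_score(
    actual: str, expected: str, case_sensitive: bool = False, negated: bool = False
) -> float:
    if not case_sensitive:
        actual, expected = actual.lower(), expected.lower()
    matched = actual == expected
    return float(not matched if negated else matched)


assert exact_match_score("Approved", "approved") == 1.0
assert exact_match_score("Approved", "approved", case_sensitive=True) == 0.0
```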
uipath/eval/evaluators/json_similarity_evaluator.py
MODIFIED

```diff
@@ -1,17 +1,30 @@
 """JSON similarity evaluator for flexible structural comparison of outputs."""
 
 import math
-from typing import Any, Tuple
+from typing import Any, Tuple
 
-from
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .output_evaluator import (
+    OutputEvaluationCriteria,
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
 
-from ..models.models import AgentExecution
-from .deterministic_evaluator_base import DeterministicEvaluatorBase
 
-
+class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
+    """Configuration for the json similarity evaluator."""
 
+    name: str = "JsonSimilarityEvaluator"
 
-
+
+class JsonSimilarityEvaluator(
+    OutputEvaluator[OutputEvaluationCriteria, JsonSimilarityEvaluatorConfig, str]
+):
     """Deterministic evaluator that scores structural JSON similarity between expected and actual output.
 
     Compares expected versus actual JSON-like structures and returns a
@@ -19,8 +32,15 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
     and tolerant for numbers and strings (via Levenshtein distance).
     """
 
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.JSON_SIMILARITY.value
+
     async def evaluate(
-        self,
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: OutputEvaluationCriteria,
     ) -> EvaluationResult:
         """Evaluate similarity between expected and actual JSON outputs.
 
@@ -36,16 +56,25 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
         Returns:
             EvaluationResult: Numerical score between 0-100 indicating similarity
         """
+        score, justification = self._compare_json(
+            self._get_expected_output(evaluation_criteria),
+            self._get_actual_output(agent_execution),
+        )
+        validated_justification = self.validate_justification(justification)
         return NumericEvaluationResult(
-            score=
+            score=score,
+            details=validated_justification,
         )
 
-    def _compare_json(self, expected: Any, actual: Any) -> float:
+    def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]:
         matched_leaves, total_leaves = self._compare_tokens(expected, actual)
         if total_leaves == 0:
-            return
-        sim =
-        return
+            return 1.0, "Total leaves are 0"
+        sim = matched_leaves / total_leaves
+        return (
+            max(0.0, min(1.0, sim)),
+            f"Matched leaves: {matched_leaves}, Total leaves: {total_leaves}",
+        )
 
     def _compare_tokens(
         self, expected_token: Any, actual_token: Any
```
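A hand-worked illustration of the leaf-based scoring, which now returns a score in [0, 1] plus a justification string. The inputs below are invented for illustration, not taken from the package:

```python
# Hand-worked illustration of JsonSimilarityEvaluator's leaf-based score (assumed inputs).
expected = {"a": 1, "b": {"c": "x", "d": 2}}  # 3 leaves: a, c, d
actual = {"a": 1, "b": {"c": "y"}}            # a matches (1.0); c scores 0.0 because the
                                              # Levenshtein similarity of "x" vs "y" is 0;
                                              # d is missing, so it contributes 0 of 1 leaf
matched_leaves, total_leaves = 1.0, 3.0
score = max(0.0, min(1.0, matched_leaves / total_leaves))  # ~0.33
details = f"Matched leaves: {matched_leaves}, Total leaves: {total_leaves}"
```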
uipath/eval/evaluators/legacy_base_evaluator.py
ADDED

```diff
@@ -0,0 +1,89 @@
+"""Base evaluator abstract class for agent evaluation."""
+
+import functools
+import time
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from typing import Any, Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict
+
+from uipath.eval.models import EvaluationResult
+from uipath.eval.models.models import (
+    AgentExecution,
+    ErrorEvaluationResult,
+    LegacyEvaluatorCategory,
+    LegacyEvaluatorType,
+)
+
+
+def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]:
+    """Decorator to track evaluation metrics and handle errors gracefully."""
+
+    @functools.wraps(func)
+    async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult:
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as e:
+            result = ErrorEvaluationResult(
+                details="Exception thrown by evaluator: {}".format(e),
+                evaluation_time=time.time() - start_time,
+            )
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+T = TypeVar("T")
+
+
+class LegacyBaseEvaluator(BaseModel, Generic[T], ABC):
+    """Abstract base class for all evaluators."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    id: str
+    name: str
+    description: str
+    target_output_key: str = "*"
+    created_at: str
+    updated_at: str
+    category: LegacyEvaluatorCategory
+    evaluator_type: LegacyEvaluatorType
+
+    def __init_subclass__(cls, **kwargs: Any):
+        """Hook for subclass creation - automatically applies evaluation metrics tracking."""
+        super().__init_subclass__(**kwargs)
+
+        if hasattr(cls, "evaluate") and not getattr(
+            cls.evaluate, "_has_metrics_decorator", False
+        ):
+            cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
+            cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]
+
+    def model_post_init(self, __context: Any):
+        """Post-initialization hook for Pydantic models."""
+        pass
+
+    @abstractmethod
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
```
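The `__init_subclass__` hook wraps every subclass's `evaluate` with `track_evaluation_metrics`, so evaluator exceptions become an `ErrorEvaluationResult` and every result carries its evaluation time. A simplified standalone sketch of that mechanism, using plain dicts instead of the package's result models:

```python
# Simplified, standalone illustration of the wrapping mechanism; the real decorator
# returns an ErrorEvaluationResult model and stamps result.evaluation_time on it.
import asyncio
import functools
import time


def track(func):
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start = time.time()
        try:
            result = {"details": await func(*args, **kwargs)}
        except Exception as e:  # errors become a result instead of propagating
            result = {"details": f"Exception thrown by evaluator: {e}"}
        result["evaluation_time"] = time.time() - start
        return result

    return wrapper


@track
async def flaky_evaluate():
    raise RuntimeError("boom")


print(asyncio.run(flaky_evaluate()))
# {'details': 'Exception thrown by evaluator: boom', 'evaluation_time': ...}
```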
uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py}
RENAMED
```diff
@@ -4,12 +4,12 @@ import json
 from abc import ABC
 from typing import Any, TypeVar
 
-from .
+from .legacy_base_evaluator import LegacyBaseEvaluator
 
 T = TypeVar("T")
 
 
-class DeterministicEvaluatorBase(
+class DeterministicEvaluatorBase(LegacyBaseEvaluator[T], ABC):
     """Base class for evaluators that produce deterministic, reproducible results.
 
     This class provides utility methods for canonical JSON comparison and number normalization
```
uipath/eval/evaluators/legacy_exact_match_evaluator.py
ADDED

```diff
@@ -0,0 +1,37 @@
+"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
+
+from typing import Any
+
+from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
+
+from ..models.models import AgentExecution
+from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase
+
+
+class LegacyExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Evaluator that performs exact structural matching between expected and actual outputs.
+
+    This evaluator returns True if the actual output exactly matches the expected output
+    after canonical JSON normalization, and False otherwise. Numbers are normalized
+    to floats for consistent comparison.
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate whether actual output exactly matches expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating exact match (True/False)
+        """
+        return BooleanEvaluationResult(
+            score=self._canonical_json(agent_execution.agent_output)
+            == self._canonical_json(evaluation_criteria)
+        )
```
uipath/eval/evaluators/legacy_json_similarity_evaluator.py
ADDED

```diff
@@ -0,0 +1,151 @@
+"""JSON similarity evaluator for flexible structural comparison of outputs."""
+
+import math
+from typing import Any, Tuple, TypeVar
+
+from uipath.eval.models import EvaluationResult, NumericEvaluationResult
+
+from ..models.models import AgentExecution
+from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase
+
+T = TypeVar("T")
+
+
+class LegacyJsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Legacy deterministic evaluator that scores structural JSON similarity between expected and actual output.
+
+    Compares expected versus actual JSON-like structures and returns a
+    numerical score in the range [0, 100]. The comparison is token-based
+    and tolerant for numbers and strings (via Levenshtein distance).
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate similarity between expected and actual JSON outputs.
+
+        Uses token-based comparison with tolerance for numeric differences
+        and Levenshtein distance for string similarity.
+
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score between 0-100 indicating similarity
+        """
+        return NumericEvaluationResult(
+            score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
+        )
+
+    def _compare_json(self, expected: Any, actual: Any) -> float:
+        matched_leaves, total_leaves = self._compare_tokens(expected, actual)
+        if total_leaves == 0:
+            return 100.0
+        sim = (matched_leaves / total_leaves) * 100.0
+        return max(0.0, min(100.0, sim))
+
+    def _compare_tokens(
+        self, expected_token: Any, actual_token: Any
+    ) -> Tuple[float, float]:
+        if self._is_number(expected_token) and self._is_number(actual_token):
+            return self._compare_numbers(float(expected_token), float(actual_token))
+
+        if type(expected_token) is not type(actual_token):
+            return 0.0, self._count_leaves(expected_token)
+
+        if isinstance(expected_token, dict):
+            matched_leaves = total_leaves = 0.0
+            # Only expected keys count
+            for expected_key, expected_value in expected_token.items():
+                if isinstance(actual_token, dict) and expected_key in actual_token:
+                    matched, total = self._compare_tokens(
+                        expected_value, actual_token[expected_key]
+                    )
+                else:
+                    matched, total = (0.0, self._count_leaves(expected_value))
+                matched_leaves += matched
+                total_leaves += total
+            return matched_leaves, total_leaves
+
+        if isinstance(expected_token, list):
+            matched_leaves = total_leaves = 0.0
+            common_length = min(len(expected_token), len(actual_token))
+            for index in range(common_length):
+                matched, total = self._compare_tokens(
+                    expected_token[index], actual_token[index]
+                )
+                matched_leaves += matched
+                total_leaves += total
+            for index in range(common_length, len(expected_token)):
+                total_leaves += self._count_leaves(expected_token[index])
+            return (matched_leaves, total_leaves)
+
+        if isinstance(expected_token, bool):
+            return (1.0, 1.0) if expected_token == actual_token else (0.0, 1.0)
+
+        if isinstance(expected_token, str):
+            return self._compare_strings(expected_token, actual_token)
+
+        return (1.0, 1.0) if str(expected_token) == str(actual_token) else (0.0, 1.0)
+
+    def _compare_numbers(
+        self, expected_number: float, actual_number: float
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if math.isclose(expected_number, 0.0, abs_tol=1e-12):
+            matched = 1.0 if math.isclose(actual_number, 0.0, abs_tol=1e-12) else 0.0
+        else:
+            ratio = abs(expected_number - actual_number) / abs(expected_number)
+            matched = max(0.0, min(1.0, 1.0 - ratio))
+        return matched, total
+
+    def _compare_strings(
+        self, expected_string: str, actual_string: str
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if not expected_string and not actual_string:
+            return 1.0, total
+        distance = self._levenshtein(expected_string, actual_string)
+        max_length = max(len(expected_string), len(actual_string))
+        similarity = 1.0 - (distance / max_length) if max_length else 1.0
+        similarity = max(0.0, min(1.0, similarity))
+        return similarity, total
+
+    def _count_leaves(self, token_node: Any) -> float:
+        if isinstance(token_node, dict):
+            return sum(
+                self._count_leaves(child_value) for child_value in token_node.values()
+            )
+        if isinstance(token_node, list):
+            return sum(self._count_leaves(child_value) for child_value in token_node)
+        return 1.0
+
+    def _levenshtein(self, source_text: str, target_text: str) -> int:
+        if not source_text:
+            return len(target_text)
+        if not target_text:
+            return len(source_text)
+        source_len, target_len = len(source_text), len(target_text)
+        distance_matrix = [[0] * (target_len + 1) for _ in range(source_len + 1)]
+        for row_idx in range(source_len + 1):
+            distance_matrix[row_idx][0] = row_idx
+        for col_idx in range(target_len + 1):
+            distance_matrix[0][col_idx] = col_idx
+        for row_idx in range(1, source_len + 1):
+            for col_idx in range(1, target_len + 1):
+                substitution_cost = (
+                    0 if source_text[row_idx - 1] == target_text[col_idx - 1] else 1
+                )
+                distance_matrix[row_idx][col_idx] = min(
+                    distance_matrix[row_idx - 1][col_idx] + 1,  # deletion
+                    distance_matrix[row_idx][col_idx - 1] + 1,  # insertion
+                    distance_matrix[row_idx - 1][col_idx - 1]
+                    + substitution_cost,  # substitution
+                )
+        return distance_matrix[source_len][target_len]
+
+    def _is_number(self, value: Any) -> bool:
+        return isinstance(value, (int, float)) and not isinstance(value, bool)
```
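Hand-worked checks of the per-leaf tolerances used above (illustrative values only, not taken from the package's tests):

```python
# Numbers: relative error is forgiven proportionally (see _compare_numbers).
ratio = abs(100.0 - 90.0) / abs(100.0)     # 0.1
matched = max(0.0, min(1.0, 1.0 - ratio))  # 0.9 of 1 leaf

# Strings: Levenshtein distance scaled by the longer string (see _compare_strings).
distance, max_length = 3, 7                # "kitten" vs "sitting"
similarity = 1.0 - distance / max_length   # ~0.571 of 1 leaf
```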
uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py
ADDED

```diff
@@ -0,0 +1,137 @@
+"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
+
+import json
+from typing import Any, Optional
+
+from pydantic import field_validator
+
+from uipath.eval.models import NumericEvaluationResult
+
+from ..._services import UiPathLlmChatService
+from ..._utils.constants import COMMUNITY_agents_SUFFIX
+from ..models.models import AgentExecution, EvaluationResult, LLMResponse
+from .legacy_base_evaluator import LegacyBaseEvaluator
+
+
+class LegacyLlmAsAJudgeEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
+    """Legacy evaluator that uses an LLM to judge the quality of agent output."""
+
+    prompt: str
+    model: str
+    actual_output_placeholder: str = "{{ActualOutput}}"
+    expected_output_placeholder: str = "{{ExpectedOutput}}"
+    llm: Optional[UiPathLlmChatService] = None
+
+    @field_validator("prompt")
+    @classmethod
+    def validate_prompt_placeholders(cls, v: str) -> str:
+        """Validate that prompt contains required placeholders."""
+        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
+            raise ValueError(
+                "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders"
+            )
+        return v
+
+    def model_post_init(self, __context: Any):
+        """Initialize the LLM service after model creation."""
+        super().model_post_init(__context)
+        self._initialize_llm()
+
+    def _initialize_llm(self):
+        """Initialize the LLM used for evaluation."""
+        from uipath import UiPath
+
+        uipath = UiPath()
+        self.llm = uipath.llm
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge.
+
+        Sends the formatted prompt to the configured LLM and expects a JSON response
+        with a numerical score (0-100) and justification.
+
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score with LLM justification as details
+        """
+        # Create the evaluation prompt
+        evaluation_prompt = self._create_evaluation_prompt(
+            expected_output=evaluation_criteria,
+            actual_output=agent_execution.agent_output,
+        )
+
+        llm_response = await self._get_llm_response(evaluation_prompt)
+
+        return NumericEvaluationResult(
+            score=llm_response.score,
+            details=llm_response.justification,
+        )
+
+    def _create_evaluation_prompt(
+        self, expected_output: Any, actual_output: Any
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = self.prompt.replace(
+            self.actual_output_placeholder,
+            str(actual_output),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.expected_output_placeholder,
+            str(expected_output),
+        )
+
+        return formatted_prompt
+
+    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
+        """Get response from the LLM.
+
+        Args:
+            evaluation_prompt: The formatted prompt to send to the LLM
+
+        Returns:
+            LLMResponse with score and justification
+        """
+        # remove community-agents suffix from llm model name
+        model = self.model
+        if model.endswith(COMMUNITY_agents_SUFFIX):
+            model = model.replace(COMMUNITY_agents_SUFFIX, "")
+
+        # Prepare the request
+        request_data = {
+            "model": model,
+            "messages": [{"role": "user", "content": evaluation_prompt}],
+            "response_format": {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "evaluation_response",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "score": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 100,
+                                "description": "Score between 0 and 100",
+                            },
+                            "justification": {
+                                "type": "string",
+                                "description": "Explanation for the score",
+                            },
+                        },
+                        "required": ["score", "justification"],
+                    },
+                },
+            },
+        }
+
+        response = await self.llm.chat_completions(**request_data)  # type: ignore
+        return LLMResponse(**json.loads(response.choices[-1].message.content or "{}"))
```