uipath 2.1.52__py3-none-any.whl → 2.1.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
- uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
- uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
- uipath/_cli/_evals/_models/_output.py +85 -0
- uipath/_cli/_evals/_runtime.py +102 -10
- uipath/_cli/_runtime/_contracts.py +11 -2
- uipath/_cli/_utils/_eval_set.py +1 -1
- uipath/_cli/_utils/_studio_project.py +30 -29
- uipath/_cli/cli_eval.py +46 -61
- uipath/eval/evaluators/__init__.py +15 -0
- uipath/eval/evaluators/base_evaluator.py +88 -0
- uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
- uipath/eval/evaluators/exact_match_evaluator.py +37 -0
- uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
- uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/trajectory_evaluator.py +36 -0
- uipath/eval/models/__init__.py +19 -0
- uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/METADATA +1 -1
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/RECORD +23 -23
- uipath/_cli/_evals/_evaluators/__init__.py +0 -22
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
- uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
- uipath/_cli/_evals/_models/__init__.py +0 -18
- uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
- uipath/_cli/_evals/progress_reporter.py +0 -304
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/WHEEL +0 -0
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/cli_eval.py
CHANGED
@@ -2,7 +2,6 @@
 import ast
 import asyncio
 import os
-from datetime import datetime, timezone
 from typing import List, Optional

 import click
@@ -13,7 +12,7 @@ from uipath._cli._runtime._contracts import (
     UiPathRuntimeFactory,
 )
 from uipath._cli._runtime._runtime import UiPathScriptRuntime
-from uipath._cli.middlewares import
+from uipath._cli.middlewares import Middlewares
 from uipath.eval._helpers import auto_discover_entrypoint

 from .._utils.constants import ENV_JOB_ID
@@ -32,55 +31,6 @@ class LiteralOption(click.Option):
             raise click.BadParameter(value) from e


-def eval_agent_middleware(
-    entrypoint: Optional[str] = None,
-    eval_set: Optional[str] = None,
-    eval_ids: Optional[List[str]] = None,
-    workers: int = 8,
-    no_report: bool = False,
-    **kwargs,
-) -> MiddlewareResult:
-    """Middleware to run an evaluation set against the agent."""
-    timestamp = datetime.now(timezone.utc).strftime("%M-%H-%d-%m-%Y")
-
-    eval_context = UiPathEvalContext.with_defaults()
-    eval_context.no_report = no_report
-    eval_context.workers = workers
-    eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
-    eval_context.eval_ids = eval_ids
-    eval_context.execution_output_file = (
-        f"evals/results/{timestamp}.json" if not os.getenv("UIPATH_JOB_KEY") else None
-    )
-
-    runtime_entrypoint = entrypoint or auto_discover_entrypoint()
-
-    def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext:
-        runtime_context = UiPathRuntimeContext.with_defaults(**context_kwargs)
-        runtime_context.entrypoint = runtime_entrypoint
-        return runtime_context
-
-    try:
-        runtime_factory = UiPathRuntimeFactory(
-            UiPathScriptRuntime,
-            UiPathRuntimeContext,
-            context_generator=generate_runtime_context,
-        )
-
-        async def execute():
-            async with UiPathEvalRuntime.from_eval_context(
-                factory=runtime_factory, context=eval_context
-            ) as eval_runtime:
-                await eval_runtime.execute()
-
-        asyncio.run(execute())
-        return MiddlewareResult(should_continue=False)
-
-    except Exception as e:
-        return MiddlewareResult(
-            should_continue=False, error_message=f"Error running evaluation: {str(e)}"
-        )
-
-
 @click.command()
 @click.argument("entrypoint", required=False)
 @click.argument("eval_set", required=False)
@@ -97,6 +47,12 @@ def eval_agent_middleware(
     default=8,
     help="Number of parallel workers for running evaluations (default: 8)",
 )
+@click.option(
+    "--output-file",
+    required=False,
+    type=click.Path(exists=False),
+    help="File path where the output will be written",
+)
 @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None)
 def eval(
     entrypoint: Optional[str],
@@ -104,6 +60,7 @@ def eval(
     eval_ids: List[str],
     no_report: bool,
     workers: int,
+    output_file: Optional[str],
 ) -> None:
     """Run an evaluation set against the agent.

@@ -121,21 +78,49 @@ def eval(
         eval_ids,
         no_report=no_report,
         workers=workers,
+        execution_output_file=output_file,
     )

-    if result.should_continue:
-        result = eval_agent_middleware(
-            entrypoint=entrypoint,
-            eval_set=eval_set,
-            eval_ids=eval_ids,
-            workers=workers,
-            no_report=no_report,
-        )
-    if result.should_continue:
-        console.error("Could not process the request with any available handler.")
     if result.error_message:
         console.error(result.error_message)

+    if result.should_continue:
+
+        def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext:
+            runtime_context = UiPathRuntimeContext.with_defaults(**context_kwargs)
+            runtime_context.entrypoint = runtime_entrypoint
+            return runtime_context
+
+        eval_context = UiPathEvalContext.with_defaults(
+            execution_output_file=output_file
+        )
+
+        eval_context.no_report = no_report
+        eval_context.workers = workers
+        eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
+        eval_context.eval_ids = eval_ids
+
+        runtime_entrypoint = entrypoint or auto_discover_entrypoint()
+
+        try:
+            runtime_factory = UiPathRuntimeFactory(
+                UiPathScriptRuntime,
+                UiPathRuntimeContext,
+                context_generator=generate_runtime_context,
+            )
+
+            async def execute():
+                async with UiPathEvalRuntime.from_eval_context(
+                    factory=runtime_factory, context=eval_context
+                ) as eval_runtime:
+                    await eval_runtime.execute()
+
+            asyncio.run(execute())
+        except Exception as e:
+            console.error(
+                f"Error: Unexpected error occurred - {str(e)}", include_traceback=True
+            )
+
     console.success("Evaluation completed successfully")

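Note: the new `--output-file` option replaces the previous behavior of always writing local results to an auto-generated `evals/results/{timestamp}.json` path. As a rough sketch of how the reworked command might be invoked, assuming the package's `uipath eval` entry point and relying on the auto-discovery of entrypoint and eval set shown in the hunks above (the exact invocation may differ):

    uipath eval --workers 4 --output-file evals/results/latest.json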
uipath/eval/evaluators/__init__.py
ADDED
@@ -0,0 +1,15 @@
+"""UiPath evaluator implementations for agent performance evaluation."""
+
+from .base_evaluator import BaseEvaluator
+from .exact_match_evaluator import ExactMatchEvaluator
+from .json_similarity_evaluator import JsonSimilarityEvaluator
+from .llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from .trajectory_evaluator import TrajectoryEvaluator
+
+__all__ = [
+    "BaseEvaluator",
+    "ExactMatchEvaluator",
+    "JsonSimilarityEvaluator",
+    "LlmAsAJudgeEvaluator",
+    "TrajectoryEvaluator",
+]
uipath/eval/evaluators/base_evaluator.py
ADDED
@@ -0,0 +1,88 @@
+"""Base evaluator abstract class for agent evaluation."""
+
+import functools
+import time
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict
+
+from uipath.eval.models import EvaluationResult
+from uipath.eval.models.models import (
+    AgentExecution,
+    ErrorEvaluationResult,
+    EvaluatorCategory,
+    EvaluatorType,
+)
+
+
+def track_evaluation_metrics(func):
+    """Decorator to track evaluation metrics and handle errors gracefully."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs) -> EvaluationResult:
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as e:
+            result = ErrorEvaluationResult(
+                details="Exception thrown by evaluator: {}".format(e),
+                evaluation_time=time.time() - start_time,
+            )
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+T = TypeVar("T")
+
+
+class BaseEvaluator(BaseModel, Generic[T], ABC):
+    """Abstract base class for all evaluators."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    id: str
+    name: str
+    description: str
+    target_output_key: str = "*"
+    created_at: str
+    updated_at: str
+    category: EvaluatorCategory
+    evaluator_type: EvaluatorType
+
+    def __init_subclass__(cls, **kwargs):
+        """Hook for subclass creation - automatically applies evaluation metrics tracking."""
+        super().__init_subclass__(**kwargs)
+
+        if hasattr(cls, "evaluate") and not getattr(
+            cls.evaluate, "_has_metrics_decorator", False
+        ):
+            cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
+            cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]
+
+    def model_post_init(self, __context):
+        """Post-initialization hook for Pydantic models."""
+        pass
+
+    @abstractmethod
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
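With the `__init_subclass__` hook above, a concrete evaluator only has to implement `evaluate`; timing and error capture are added automatically through `track_evaluation_metrics`. A minimal sketch of a custom evaluator against this contract (the class name and scoring rule are invented for illustration; metadata fields such as `id`, `name`, `category`, and `evaluator_type` would still need to be supplied when instantiating):

from typing import Any

from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.models import EvaluationResult, NumericEvaluationResult
from uipath.eval.models.models import AgentExecution


class OutputLengthEvaluator(BaseEvaluator[dict[str, Any]]):
    """Toy evaluator: scores how close the output length is to the expected length."""

    async def evaluate(
        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
    ) -> EvaluationResult:
        # __init_subclass__ wraps this method with track_evaluation_metrics, so any
        # exception becomes an ErrorEvaluationResult and evaluation_time is populated.
        expected = len(str(evaluation_criteria))
        actual = len(str(agent_execution.agent_output))
        return NumericEvaluationResult(
            score=100.0 * min(expected, actual) / max(expected, actual, 1)
        )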
uipath/eval/evaluators/deterministic_evaluator_base.py
ADDED
@@ -0,0 +1,53 @@
+"""Base class for deterministic evaluators that provide consistent outputs."""
+
+import json
+from abc import ABC
+from typing import Any, TypeVar
+
+from .base_evaluator import BaseEvaluator
+
+T = TypeVar("T")
+
+
+class DeterministicEvaluatorBase(BaseEvaluator[T], ABC):
+    """Base class for evaluators that produce deterministic, reproducible results.
+
+    This class provides utility methods for canonical JSON comparison and number normalization
+    to ensure consistent evaluation results across runs.
+    """
+
+    def _canonical_json(self, obj: Any) -> str:
+        """Convert an object to canonical JSON string for consistent comparison.
+
+        Args:
+            obj: The object to convert to canonical JSON
+
+        Returns:
+            str: Canonical JSON string with normalized numbers and sorted keys
+        """
+        return json.dumps(
+            self._normalize_numbers(obj),
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+        )
+
+    def _normalize_numbers(self, obj: Any) -> Any:
+        """Recursively normalize numbers in nested data structures.
+
+        Converts all numeric values (int, float) to float for consistent comparison,
+        while preserving booleans and other data types.
+
+        Args:
+            obj: The object to normalize
+
+        Returns:
+            Any: Object with normalized numbers
+        """
+        if isinstance(obj, dict):
+            return {k: self._normalize_numbers(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._normalize_numbers(v) for v in obj]
+        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
+            return float(obj)
+        return obj
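To make the canonicalization rule concrete, here is a standalone illustration (plain Python, not package code) of what the two helpers above guarantee: key order and int-versus-float differences do not affect the comparison, while booleans are preserved as booleans.

import json


def normalize(obj):
    # Mirrors _normalize_numbers: ints/floats become float, bools stay bools.
    if isinstance(obj, dict):
        return {k: normalize(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [normalize(v) for v in obj]
    if isinstance(obj, (int, float)) and not isinstance(obj, bool):
        return float(obj)
    return obj


def canonical(obj):
    # Mirrors _canonical_json: sorted keys, compact separators.
    return json.dumps(normalize(obj), sort_keys=True, separators=(",", ":"), ensure_ascii=False)


assert canonical({"count": 1, "ok": True}) == canonical({"ok": True, "count": 1.0})
assert canonical({"ok": True}) != canonical({"ok": 1})  # bool is not coerced to float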
uipath/eval/evaluators/exact_match_evaluator.py
ADDED
@@ -0,0 +1,37 @@
+"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
+
+from typing import Any
+
+from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
+
+from ..models.models import AgentExecution
+from .deterministic_evaluator_base import DeterministicEvaluatorBase
+
+
+class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Evaluator that performs exact structural matching between expected and actual outputs.
+
+    This evaluator returns True if the actual output exactly matches the expected output
+    after canonical JSON normalization, and False otherwise. Numbers are normalized
+    to floats for consistent comparison.
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate whether actual output exactly matches expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating exact match (True/False)
+        """
+        return BooleanEvaluationResult(
+            score=self._canonical_json(agent_execution.agent_output)
+            == self._canonical_json(evaluation_criteria)
+        )
uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py}
RENAMED
@@ -1,16 +1,18 @@
-
+"""JSON similarity evaluator for flexible structural comparison of outputs."""
+
 import math
-from typing import Any,
+from typing import Any, Tuple, TypeVar
+
+from uipath.eval.models import EvaluationResult, NumericEvaluationResult
+
+from ..models.models import AgentExecution
+from .deterministic_evaluator_base import DeterministicEvaluatorBase

-
-    DeterministicEvaluatorBase,
-)
-from uipath._cli._evals._models import EvaluationResult
-from uipath._cli._evals._models._evaluators import ScoreType
+T = TypeVar("T")


-class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
-    """Deterministic evaluator that scores structural JSON similarity.
+class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Deterministic evaluator that scores structural JSON similarity between expected and actual output.

     Compares expected versus actual JSON-like structures and returns a
     numerical score in the range [0, 100]. The comparison is token-based
@@ -18,43 +20,24 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
     """

     async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
     ) -> EvaluationResult:
         """Evaluate similarity between expected and actual JSON outputs.

-
-
-
-
-
-
+        Uses token-based comparison with tolerance for numeric differences
+        and Levenshtein distance for string similarity.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate

         Returns:
-            EvaluationResult:
+            EvaluationResult: Numerical score between 0-100 indicating similarity
         """
-
-
-
-        actual_output, expected_output = self._select_targets(
-            expected_output, actual_output
-        )
-        similarity = self._compare_json(expected_output, actual_output)
-
-        return EvaluationResult(
-            evaluation_id=evaluation_id,
-            evaluation_name=evaluation_name,
-            evaluator_id=self.id,
-            evaluator_name=self.name,
-            score=similarity,
-            input=input_data,
-            expected_output=expected_output_copy,
-            actual_output=actual_output_copy,
-            score_type=ScoreType.NUMERICAL,
+        return NumericEvaluationResult(
+            score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
         )

     def _compare_json(self, expected: Any, actual: Any) -> float:
uipath/eval/evaluators/llm_as_judge_evaluator.py
ADDED
@@ -0,0 +1,137 @@
+"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
+
+import json
+from typing import Any, Optional
+
+from pydantic import field_validator
+
+from uipath.eval.models import NumericEvaluationResult
+
+from ..._services import UiPathLlmChatService
+from ..._utils.constants import COMMUNITY_agents_SUFFIX
+from ..models.models import AgentExecution, EvaluationResult, LLMResponse
+from .base_evaluator import BaseEvaluator
+
+
+class LlmAsAJudgeEvaluator(BaseEvaluator[dict[str, Any]]):
+    """Evaluator that uses an LLM to judge the quality of agent output."""
+
+    prompt: str
+    model: str
+    actual_output_placeholder: str = "{{ActualOutput}}"
+    expected_output_placeholder: str = "{{ExpectedOutput}}"
+    llm: Optional[UiPathLlmChatService] = None
+
+    @field_validator("prompt")
+    @classmethod
+    def validate_prompt_placeholders(cls, v: str) -> str:
+        """Validate that prompt contains required placeholders."""
+        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
+            raise ValueError(
+                "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders"
+            )
+        return v
+
+    def model_post_init(self, __context):
+        """Initialize the LLM service after model creation."""
+        super().model_post_init(__context)
+        self._initialize_llm()
+
+    def _initialize_llm(self):
+        """Initialize the LLM used for evaluation."""
+        from uipath import UiPath
+
+        uipath = UiPath()
+        self.llm = uipath.llm
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge.
+
+        Sends the formatted prompt to the configured LLM and expects a JSON response
+        with a numerical score (0-100) and justification.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score with LLM justification as details
+        """
+        # Create the evaluation prompt
+        evaluation_prompt = self._create_evaluation_prompt(
+            expected_output=evaluation_criteria,
+            actual_output=agent_execution.agent_output,
+        )
+
+        llm_response = await self._get_llm_response(evaluation_prompt)
+
+        return NumericEvaluationResult(
+            score=llm_response.score,
+            details=llm_response.justification,
+        )
+
+    def _create_evaluation_prompt(
+        self, expected_output: Any, actual_output: Any
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = self.prompt.replace(
+            self.actual_output_placeholder,
+            str(actual_output),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.expected_output_placeholder,
+            str(expected_output),
+        )
+
+        return formatted_prompt
+
+    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
+        """Get response from the LLM.
+
+        Args:
+            evaluation_prompt: The formatted prompt to send to the LLM
+
+        Returns:
+            LLMResponse with score and justification
+        """
+        # remove community-agents suffix from llm model name
+        model = self.model
+        if model.endswith(COMMUNITY_agents_SUFFIX):
+            model = model.replace(COMMUNITY_agents_SUFFIX, "")
+
+        # Prepare the request
+        request_data = {
+            "model": model,
+            "messages": [{"role": "user", "content": evaluation_prompt}],
+            "response_format": {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "evaluation_response",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "score": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 100,
+                                "description": "Score between 0 and 100",
+                            },
+                            "justification": {
+                                "type": "string",
+                                "description": "Explanation for the score",
+                            },
+                        },
+                        "required": ["score", "justification"],
+                    },
+                },
+            },
+        }
+
+        response = await self.llm.chat_completions(**request_data)  # type: ignore
+        return LLMResponse(**json.loads(response.choices[-1].message.content))
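Because the validator above rejects prompts that omit either placeholder, a prompt wired into this evaluator needs to contain both `{{ActualOutput}}` and `{{ExpectedOutput}}`; something along these lines (illustrative text only; the 0-100 JSON response shape is enforced separately through the `response_format` schema):

judge_prompt = (
    "You are grading an agent's output against the expected output.\n"
    "Expected output: {{ExpectedOutput}}\n"
    "Actual output: {{ActualOutput}}\n"
    "Return a score from 0 to 100 and a short justification."
)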
uipath/eval/evaluators/trajectory_evaluator.py
ADDED
@@ -0,0 +1,36 @@
+"""Trajectory evaluator for analyzing execution paths and decision sequences."""
+
+from typing import TypeVar
+
+from uipath.eval.models import EvaluationResult
+
+from ..models.models import AgentExecution
+from .base_evaluator import BaseEvaluator
+
+T = TypeVar("T")
+
+
+class TrajectoryEvaluator(BaseEvaluator[T]):
+    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis.
+
+        Analyzes the execution path and decision sequence taken by the agent
+        to assess the quality of the reasoning process.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Score based on trajectory analysis
+
+        Raises:
+            NotImplementedError: This evaluator is not yet implemented
+        """
+        raise NotImplementedError()
uipath/eval/models/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""UiPath evaluation module for agent performance assessment."""
+
+from uipath.eval.models.models import (
+    BooleanEvaluationResult,
+    ErrorEvaluationResult,
+    EvalItemResult,
+    EvaluationResult,
+    NumericEvaluationResult,
+    ScoreType,
+)
+
+__all__ = [
+    "EvaluationResult",
+    "ScoreType",
+    "EvalItemResult",
+    "BooleanEvaluationResult",
+    "NumericEvaluationResult",
+    "ErrorEvaluationResult",
+]