uipath 2.1.51__py3-none-any.whl → 2.1.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
- uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
- uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
- uipath/_cli/_evals/_models/_output.py +85 -0
- uipath/_cli/_evals/_runtime.py +102 -10
- uipath/_cli/_runtime/_contracts.py +12 -3
- uipath/_cli/_utils/_eval_set.py +1 -1
- uipath/_cli/_utils/_project_files.py +1 -0
- uipath/_cli/cli_eval.py +46 -61
- uipath/eval/evaluators/__init__.py +15 -0
- uipath/eval/evaluators/base_evaluator.py +88 -0
- uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
- uipath/eval/evaluators/exact_match_evaluator.py +37 -0
- uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
- uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/trajectory_evaluator.py +36 -0
- uipath/eval/models/__init__.py +19 -0
- uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/METADATA +1 -1
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/RECORD +23 -23
- uipath/_cli/_evals/_evaluators/__init__.py +0 -22
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
- uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
- uipath/_cli/_evals/_models/__init__.py +0 -18
- uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
- uipath/_cli/_evals/progress_reporter.py +0 -304
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/WHEEL +0 -0
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.51.dist-info → uipath-2.1.53.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,88 @@
|
|
1
|
+
"""Base evaluator abstract class for agent evaluation."""
|
2
|
+
|
3
|
+
import functools
|
4
|
+
import time
|
5
|
+
from abc import ABC, abstractmethod
|
6
|
+
from typing import Generic, TypeVar
|
7
|
+
|
8
|
+
from pydantic import BaseModel, ConfigDict
|
9
|
+
|
10
|
+
from uipath.eval.models import EvaluationResult
|
11
|
+
from uipath.eval.models.models import (
|
12
|
+
AgentExecution,
|
13
|
+
ErrorEvaluationResult,
|
14
|
+
EvaluatorCategory,
|
15
|
+
EvaluatorType,
|
16
|
+
)
|
17
|
+
|
18
|
+
|
19
|
+
def track_evaluation_metrics(func):
    """Wrap an async evaluator callable to time it and absorb its failures.

    The returned coroutine awaits ``func`` with the original arguments. Any
    ``Exception`` raised by ``func`` is replaced by an ``ErrorEvaluationResult``
    whose ``details`` contains the exception text, so the caller always gets a
    result object back. In both the success and the failure case the elapsed
    time in seconds is stored on the result's ``evaluation_time`` attribute.

    Args:
        func: The async callable to wrap.

    Returns:
        An async wrapper carrying the metadata of ``func`` (``functools.wraps``).
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs) -> "EvaluationResult":
        # perf_counter() is monotonic: the measured duration cannot go negative
        # or jump when the system wall clock is adjusted mid-evaluation.
        started = time.perf_counter()
        try:
            result = await func(*args, **kwargs)
        except Exception as e:
            # Deliberate catch-all boundary: a failing evaluator is reported
            # as an error result instead of propagating to the caller.
            result = ErrorEvaluationResult(
                details=f"Exception thrown by evaluator: {e}",
            )

        # Single place where the duration is recorded, for both outcomes.
        result.evaluation_time = time.perf_counter() - started
        return result

    return wrapper
|
39
|
+
|
40
|
+
|
41
|
+
T = TypeVar("T")
|
42
|
+
|
43
|
+
|
44
|
+
class BaseEvaluator(BaseModel, Generic[T], ABC):
    """Pydantic-backed abstract contract shared by every evaluator.

    ``T`` is the type of the evaluation criteria a concrete evaluator accepts.
    Subclasses implement :meth:`evaluate`; when a subclass is created its
    ``evaluate`` attribute is wrapped with ``track_evaluation_metrics``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Declarative evaluator metadata (validated by pydantic).
    id: str
    name: str
    description: str
    target_output_key: str = "*"
    created_at: str
    updated_at: str
    category: EvaluatorCategory
    evaluator_type: EvaluatorType

    def __init_subclass__(cls, **kwargs):
        """Wrap the subclass's ``evaluate`` with metrics tracking, at most once."""
        super().__init_subclass__(**kwargs)

        if not hasattr(cls, "evaluate"):
            return
        # An inherited, already-wrapped ``evaluate`` carries the marker; skip it.
        if getattr(cls.evaluate, "_has_metrics_decorator", False):
            return

        cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
        cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]

    def model_post_init(self, __context):
        """Pydantic post-initialization hook; intentionally a no-op here."""

    @abstractmethod
    async def evaluate(
        self, agent_execution: AgentExecution, evaluation_criteria: T
    ) -> EvaluationResult:
        """Score one agent execution against the given criteria.

        Args:
            agent_execution: The execution under evaluation, exposing:
                - agent_input: The input received by the agent
                - agent_output: The output produced by the agent
                - agent_trace: The spans recorded during the execution
            evaluation_criteria: The criteria to evaluate against

        Returns:
            EvaluationResult containing the score and details
        """
        ...
|
@@ -0,0 +1,53 @@
|
|
1
|
+
"""Base class for deterministic evaluators that provide consistent outputs."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
from abc import ABC
|
5
|
+
from typing import Any, TypeVar
|
6
|
+
|
7
|
+
from .base_evaluator import BaseEvaluator
|
8
|
+
|
9
|
+
T = TypeVar("T")
|
10
|
+
|
11
|
+
|
12
|
+
class DeterministicEvaluatorBase(BaseEvaluator[T], ABC):
    """Shared helpers for evaluators whose result depends only on their inputs.

    Provides a canonical JSON rendering (sorted keys, compact separators,
    numbers normalized to float) so two structures can be compared as strings.
    """

    def _canonical_json(self, obj: Any) -> str:
        """Render ``obj`` as a canonical JSON string.

        Args:
            obj: The object to serialize

        Returns:
            str: Compact JSON with sorted keys and normalized numbers
        """
        normalized = self._normalize_numbers(obj)
        return json.dumps(
            normalized,
            sort_keys=True,
            separators=(",", ":"),
            ensure_ascii=False,
        )

    def _normalize_numbers(self, obj: Any) -> Any:
        """Return a copy of ``obj`` in which every int/float is a float.

        Dicts are rebuilt key by key, lists and tuples both become lists, and
        booleans and all other values are returned unchanged.

        Args:
            obj: The (possibly nested) object to normalize

        Returns:
            Any: The normalized structure
        """
        # bool is a subclass of int, so it must be excluded before the numeric check.
        if isinstance(obj, bool):
            return obj
        if isinstance(obj, (int, float)):
            return float(obj)
        if isinstance(obj, dict):
            return {
                key: self._normalize_numbers(value) for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple)):
            return list(map(self._normalize_numbers, obj))
        return obj
|
@@ -0,0 +1,37 @@
|
|
1
|
+
"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
|
2
|
+
|
3
|
+
from typing import Any
|
4
|
+
|
5
|
+
from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
|
6
|
+
|
7
|
+
from ..models.models import AgentExecution
|
8
|
+
from .deterministic_evaluator_base import DeterministicEvaluatorBase
|
9
|
+
|
10
|
+
|
11
|
+
class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
    """Binary evaluator: passes only when output and criteria are structurally equal.

    Both sides are rendered with ``_canonical_json`` and the resulting strings
    are compared, so key order and int-versus-float representation of numbers
    do not affect the outcome.
    """

    async def evaluate(
        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
    ) -> EvaluationResult:
        """Compare the agent output with the expected output.

        Args:
            agent_execution: The execution under evaluation; only its
                ``agent_output`` is read here
            evaluation_criteria: The expected output

        Returns:
            EvaluationResult: Boolean result, True when the canonical JSON of
            both sides is identical
        """
        actual_json = self._canonical_json(agent_execution.agent_output)
        expected_json = self._canonical_json(evaluation_criteria)
        return BooleanEvaluationResult(score=actual_json == expected_json)
|
@@ -1,16 +1,18 @@
|
|
1
|
-
|
1
|
+
"""JSON similarity evaluator for flexible structural comparison of outputs."""
|
2
|
+
|
2
3
|
import math
|
3
|
-
from typing import Any,
|
4
|
+
from typing import Any, Tuple, TypeVar
|
5
|
+
|
6
|
+
from uipath.eval.models import EvaluationResult, NumericEvaluationResult
|
7
|
+
|
8
|
+
from ..models.models import AgentExecution
|
9
|
+
from .deterministic_evaluator_base import DeterministicEvaluatorBase
|
4
10
|
|
5
|
-
|
6
|
-
DeterministicEvaluatorBase,
|
7
|
-
)
|
8
|
-
from uipath._cli._evals._models import EvaluationResult
|
9
|
-
from uipath._cli._evals._models._evaluators import ScoreType
|
11
|
+
T = TypeVar("T")
|
10
12
|
|
11
13
|
|
12
|
-
class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
|
13
|
-
"""Deterministic evaluator that scores structural JSON similarity.
|
14
|
+
class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
|
15
|
+
"""Deterministic evaluator that scores structural JSON similarity between expected and actual output.
|
14
16
|
|
15
17
|
Compares expected versus actual JSON-like structures and returns a
|
16
18
|
numerical score in the range [0, 100]. The comparison is token-based
|
@@ -18,43 +20,24 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
|
|
18
20
|
"""
|
19
21
|
|
20
22
|
async def evaluate(
|
21
|
-
self,
|
22
|
-
evaluation_id: str,
|
23
|
-
evaluation_name: str,
|
24
|
-
input_data: Dict[str, Any],
|
25
|
-
expected_output: Dict[str, Any],
|
26
|
-
actual_output: Dict[str, Any],
|
23
|
+
self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
|
27
24
|
) -> EvaluationResult:
|
28
25
|
"""Evaluate similarity between expected and actual JSON outputs.
|
29
26
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
27
|
+
Uses token-based comparison with tolerance for numeric differences
|
28
|
+
and Levenshtein distance for string similarity.
|
29
|
+
|
30
|
+
agent_execution: The execution details containing:
|
31
|
+
- agent_input: The input received by the agent
|
32
|
+
- actual_output: The actual output from the agent
|
33
|
+
- spans: The execution spans to use for the evaluation
|
34
|
+
evaluation_criteria: The criteria to evaluate
|
36
35
|
|
37
36
|
Returns:
|
38
|
-
EvaluationResult:
|
37
|
+
EvaluationResult: Numerical score between 0-100 indicating similarity
|
39
38
|
"""
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
actual_output, expected_output = self._select_targets(
|
44
|
-
expected_output, actual_output
|
45
|
-
)
|
46
|
-
similarity = self._compare_json(expected_output, actual_output)
|
47
|
-
|
48
|
-
return EvaluationResult(
|
49
|
-
evaluation_id=evaluation_id,
|
50
|
-
evaluation_name=evaluation_name,
|
51
|
-
evaluator_id=self.id,
|
52
|
-
evaluator_name=self.name,
|
53
|
-
score=similarity,
|
54
|
-
input=input_data,
|
55
|
-
expected_output=expected_output_copy,
|
56
|
-
actual_output=actual_output_copy,
|
57
|
-
score_type=ScoreType.NUMERICAL,
|
39
|
+
return NumericEvaluationResult(
|
40
|
+
score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
|
58
41
|
)
|
59
42
|
|
60
43
|
def _compare_json(self, expected: Any, actual: Any) -> float:
|
@@ -0,0 +1,137 @@
|
|
1
|
+
"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
|
2
|
+
|
3
|
+
import json
|
4
|
+
from typing import Any, Optional
|
5
|
+
|
6
|
+
from pydantic import field_validator
|
7
|
+
|
8
|
+
from uipath.eval.models import NumericEvaluationResult
|
9
|
+
|
10
|
+
from ..._services import UiPathLlmChatService
|
11
|
+
from ..._utils.constants import COMMUNITY_agents_SUFFIX
|
12
|
+
from ..models.models import AgentExecution, EvaluationResult, LLMResponse
|
13
|
+
from .base_evaluator import BaseEvaluator
|
14
|
+
|
15
|
+
|
16
|
+
class LlmAsAJudgeEvaluator(BaseEvaluator[dict[str, Any]]):
    """Evaluator that uses an LLM to judge the quality of agent output."""

    prompt: str
    model: str
    actual_output_placeholder: str = "{{ActualOutput}}"
    expected_output_placeholder: str = "{{ExpectedOutput}}"
    llm: Optional[UiPathLlmChatService] = None

    @field_validator("prompt")
    @classmethod
    def validate_prompt_placeholders(cls, v: str) -> str:
        """Validate that prompt contains required placeholders.

        Raises:
            ValueError: If either ``{{ActualOutput}}`` or ``{{ExpectedOutput}}``
                is missing from the prompt.
        """
        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
            # The message spells the placeholders exactly as they must appear.
            raise ValueError(
                "Prompt must contain both {{ActualOutput}} and {{ExpectedOutput}} placeholders"
            )
        return v

    def model_post_init(self, __context):
        """Initialize the LLM service after model creation."""
        super().model_post_init(__context)
        self._initialize_llm()

    def _initialize_llm(self):
        """Create the default LLM service unless one was supplied by the caller."""
        # Respect an explicitly provided ``llm`` instead of overwriting it.
        if self.llm is not None:
            return

        from uipath import UiPath

        uipath = UiPath()
        self.llm = uipath.llm

    async def evaluate(
        self,
        agent_execution: AgentExecution,
        evaluation_criteria: dict[str, Any],
    ) -> EvaluationResult:
        """Evaluate using an LLM as a judge.

        Sends the formatted prompt to the configured LLM and expects a JSON response
        with a numerical score (0-100) and justification.

        Args:
            agent_execution: The execution under evaluation; only its
                ``agent_output`` is read here
            evaluation_criteria: The expected output substituted into the prompt

        Returns:
            EvaluationResult: Numerical score with LLM justification as details
        """
        evaluation_prompt = self._create_evaluation_prompt(
            expected_output=evaluation_criteria,
            actual_output=agent_execution.agent_output,
        )

        llm_response = await self._get_llm_response(evaluation_prompt)

        return NumericEvaluationResult(
            score=llm_response.score,
            details=llm_response.justification,
        )

    def _create_evaluation_prompt(
        self, expected_output: Any, actual_output: Any
    ) -> str:
        """Fill the prompt template with ``str()`` of the actual and expected outputs."""
        # NOTE(review): substitution is sequential, so placeholder text that
        # occurs inside the actual output is itself replaced by the second call.
        formatted_prompt = self.prompt.replace(
            self.actual_output_placeholder,
            str(actual_output),
        )
        formatted_prompt = formatted_prompt.replace(
            self.expected_output_placeholder,
            str(expected_output),
        )

        return formatted_prompt

    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
        """Get response from the LLM.

        Args:
            evaluation_prompt: The formatted prompt to send to the LLM

        Returns:
            LLMResponse with score and justification
        """
        # Strip only a trailing community-agents suffix; occurrences elsewhere
        # in the model name are left intact.
        model = self.model.removesuffix(COMMUNITY_agents_SUFFIX)

        # Ask for a structured JSON answer matching the LLMResponse fields.
        request_data = {
            "model": model,
            "messages": [{"role": "user", "content": evaluation_prompt}],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "evaluation_response",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "score": {
                                "type": "number",
                                "minimum": 0,
                                "maximum": 100,
                                "description": "Score between 0 and 100",
                            },
                            "justification": {
                                "type": "string",
                                "description": "Explanation for the score",
                            },
                        },
                        "required": ["score", "justification"],
                    },
                },
            },
        }

        response = await self.llm.chat_completions(**request_data)  # type: ignore
        # The last choice's message content is parsed as the JSON payload.
        return LLMResponse(**json.loads(response.choices[-1].message.content))
|
@@ -0,0 +1,36 @@
|
|
1
|
+
"""Trajectory evaluator for analyzing execution paths and decision sequences."""
|
2
|
+
|
3
|
+
from typing import TypeVar
|
4
|
+
|
5
|
+
from uipath.eval.models import EvaluationResult
|
6
|
+
|
7
|
+
from ..models.models import AgentExecution
|
8
|
+
from .base_evaluator import BaseEvaluator
|
9
|
+
|
10
|
+
T = TypeVar("T")
|
11
|
+
|
12
|
+
|
13
|
+
class TrajectoryEvaluator(BaseEvaluator[T]):
    """Evaluator that analyzes the trajectory/path taken to reach outputs."""

    async def evaluate(
        self, agent_execution: AgentExecution, evaluation_criteria: T
    ) -> EvaluationResult:
        """Evaluate using trajectory analysis.

        Intended to analyze the execution path and decision sequence taken by
        the agent; no analysis is implemented yet.

        Args:
            agent_execution: The execution details containing:
                - agent_input: The input received by the agent
                - agent_output: The output produced by the agent
                - agent_trace: The spans recorded during the execution
            evaluation_criteria: The criteria to evaluate

        Returns:
            EvaluationResult: Score based on trajectory analysis

        Raises:
            NotImplementedError: This evaluator is not yet implemented
        """
        # A non-empty message keeps any error report built from this exception
        # informative instead of ending in a blank string.
        raise NotImplementedError(
            "TrajectoryEvaluator.evaluate is not implemented yet"
        )
|
@@ -0,0 +1,19 @@
|
|
1
|
+
"""UiPath evaluation module for agent performance assessment."""
|
2
|
+
|
3
|
+
from uipath.eval.models.models import (
|
4
|
+
BooleanEvaluationResult,
|
5
|
+
ErrorEvaluationResult,
|
6
|
+
EvalItemResult,
|
7
|
+
EvaluationResult,
|
8
|
+
NumericEvaluationResult,
|
9
|
+
ScoreType,
|
10
|
+
)
|
11
|
+
|
12
|
+
__all__ = [
|
13
|
+
"EvaluationResult",
|
14
|
+
"ScoreType",
|
15
|
+
"EvalItemResult",
|
16
|
+
"BooleanEvaluationResult",
|
17
|
+
"NumericEvaluationResult",
|
18
|
+
"ErrorEvaluationResult",
|
19
|
+
]
|
@@ -1,15 +1,79 @@
|
|
1
|
-
|
1
|
+
"""Models for evaluation framework including execution data and evaluation results."""
|
2
|
+
|
2
3
|
from enum import IntEnum
|
3
|
-
from typing import Any, Dict,
|
4
|
+
from typing import Annotated, Any, Dict, Literal, Optional, Union
|
5
|
+
|
6
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field
|
8
|
+
|
9
|
+
|
10
|
+
class AgentExecution(BaseModel):
|
11
|
+
"""Represents the execution data of an agent for evaluation purposes."""
|
4
12
|
|
5
|
-
|
13
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
14
|
+
|
15
|
+
agent_input: Optional[Dict[str, Any]]
|
16
|
+
agent_output: Dict[str, Any]
|
17
|
+
agent_trace: list[ReadableSpan]
|
6
18
|
|
7
19
|
|
8
20
|
class LLMResponse(BaseModel):
|
21
|
+
"""Response from an LLM evaluator."""
|
22
|
+
|
9
23
|
score: float
|
10
24
|
justification: str
|
11
25
|
|
12
26
|
|
27
|
+
class ScoreType(IntEnum):
|
28
|
+
"""Types of evaluation scores."""
|
29
|
+
|
30
|
+
BOOLEAN = 0
|
31
|
+
NUMERICAL = 1
|
32
|
+
ERROR = 2
|
33
|
+
|
34
|
+
|
35
|
+
class BaseEvaluationResult(BaseModel):
|
36
|
+
"""Base class for evaluation results."""
|
37
|
+
|
38
|
+
details: Optional[str] = None
|
39
|
+
# this is marked as optional, as it is populated inside the 'track_evaluation_metrics' decorator
|
40
|
+
evaluation_time: Optional[float] = None
|
41
|
+
|
42
|
+
|
43
|
+
class BooleanEvaluationResult(BaseEvaluationResult):
|
44
|
+
"""Result of a boolean evaluation."""
|
45
|
+
|
46
|
+
score: bool
|
47
|
+
score_type: Literal[ScoreType.BOOLEAN] = ScoreType.BOOLEAN
|
48
|
+
|
49
|
+
|
50
|
+
class NumericEvaluationResult(BaseEvaluationResult):
|
51
|
+
"""Result of a numerical evaluation."""
|
52
|
+
|
53
|
+
score: float
|
54
|
+
score_type: Literal[ScoreType.NUMERICAL] = ScoreType.NUMERICAL
|
55
|
+
|
56
|
+
|
57
|
+
class ErrorEvaluationResult(BaseEvaluationResult):
|
58
|
+
"""Result of an error evaluation."""
|
59
|
+
|
60
|
+
score: float = 0.0
|
61
|
+
score_type: Literal[ScoreType.ERROR] = ScoreType.ERROR
|
62
|
+
|
63
|
+
|
64
|
+
EvaluationResult = Annotated[
|
65
|
+
Union[BooleanEvaluationResult, NumericEvaluationResult, ErrorEvaluationResult],
|
66
|
+
Field(discriminator="score_type"),
|
67
|
+
]
|
68
|
+
|
69
|
+
|
70
|
+
class EvalItemResult(BaseModel):
|
71
|
+
"""Result of a single evaluation item."""
|
72
|
+
|
73
|
+
evaluator_name: str
|
74
|
+
result: EvaluationResult
|
75
|
+
|
76
|
+
|
13
77
|
class EvaluatorCategory(IntEnum):
|
14
78
|
"""Types of evaluators."""
|
15
79
|
|
@@ -48,43 +112,3 @@ class EvaluatorType(IntEnum):
|
|
48
112
|
return cls(value)
|
49
113
|
else:
|
50
114
|
raise ValueError(f"{value} is not a valid EvaluatorType value")
|
51
|
-
|
52
|
-
|
53
|
-
class ScoreType(IntEnum):
|
54
|
-
BOOLEAN = 0
|
55
|
-
NUMERICAL = 1
|
56
|
-
ERROR = 2
|
57
|
-
|
58
|
-
|
59
|
-
class EvaluationResult(BaseModel):
|
60
|
-
"""Result of a single evaluation."""
|
61
|
-
|
62
|
-
evaluation_id: str
|
63
|
-
evaluation_name: str
|
64
|
-
evaluator_id: str
|
65
|
-
evaluator_name: str
|
66
|
-
score: float | bool
|
67
|
-
score_type: ScoreType
|
68
|
-
# this is marked as optional, as it is populated inside the 'measure_execution_time' decorator
|
69
|
-
evaluation_time: Optional[float] = None
|
70
|
-
input: Dict[str, Any]
|
71
|
-
expected_output: Dict[str, Any]
|
72
|
-
actual_output: Dict[str, Any]
|
73
|
-
timestamp: datetime = datetime.now(timezone.utc)
|
74
|
-
details: Optional[str] = None
|
75
|
-
|
76
|
-
|
77
|
-
class EvaluationSetResult(BaseModel):
|
78
|
-
"""Result of a complete evaluation set."""
|
79
|
-
|
80
|
-
eval_set_id: str
|
81
|
-
eval_set_name: str
|
82
|
-
results: List[EvaluationResult]
|
83
|
-
average_score: float
|
84
|
-
|
85
|
-
|
86
|
-
class EvalItemResult(BaseModel):
|
87
|
-
"""Result of a single evaluation item."""
|
88
|
-
|
89
|
-
evaluator_id: str
|
90
|
-
result: EvaluationResult
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: uipath
|
3
|
-
Version: 2.1.
|
3
|
+
Version: 2.1.53
|
4
4
|
Summary: Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools.
|
5
5
|
Project-URL: Homepage, https://uipath.com
|
6
6
|
Project-URL: Repository, https://github.com/UiPath/uipath-python
|