uipath-2.1.7-py3-none-any.whl → uipath-2.1.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/__init__.py +2 -0
- uipath/_cli/_auth/auth_config.json +1 -1
- uipath/_cli/_evals/_evaluators/__init__.py +20 -0
- uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py +48 -0
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py +41 -0
- uipath/_cli/_evals/_evaluators/_evaluator_base.py +124 -0
- uipath/_cli/_evals/_evaluators/_evaluator_factory.py +103 -0
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +181 -0
- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +48 -0
- uipath/_cli/_evals/_models/__init__.py +18 -0
- uipath/_cli/_evals/_models/_evaluation_set.py +43 -0
- uipath/_cli/_evals/_models/_evaluators.py +89 -0
- uipath/_cli/_evals/evaluation_service.py +583 -0
- uipath/_cli/_evals/progress_reporter.py +356 -0
- uipath/_cli/_runtime/_contracts.py +25 -10
- uipath/_cli/_runtime/_logging.py +8 -6
- uipath/_cli/_utils/_console.py +105 -1
- uipath/_cli/cli_eval.py +95 -0
- uipath/_cli/cli_run.py +74 -32
- uipath/_services/api_client.py +5 -3
- uipath/_services/llm_gateway_service.py +4 -4
- uipath/_utils/constants.py +4 -0
- uipath/telemetry/_constants.py +3 -3
- {uipath-2.1.7.dist-info → uipath-2.1.9.dist-info}/METADATA +1 -1
- {uipath-2.1.7.dist-info → uipath-2.1.9.dist-info}/RECORD +28 -15
- {uipath-2.1.7.dist-info → uipath-2.1.9.dist-info}/WHEEL +0 -0
- {uipath-2.1.7.dist-info → uipath-2.1.9.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.7.dist-info → uipath-2.1.9.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/__init__.py
CHANGED
@@ -5,6 +5,7 @@ import click
 
 from .cli_auth import auth as auth  # type: ignore
 from .cli_deploy import deploy as deploy  # type: ignore
+from .cli_eval import eval as eval  # type: ignore
 from .cli_init import init as init  # type: ignore
 from .cli_invoke import invoke as invoke  # type: ignore
 from .cli_new import new as new  # type: ignore
@@ -67,3 +68,4 @@ cli.add_command(auth)
 cli.add_command(invoke)
 cli.add_command(push)
 cli.add_command(pull)
+cli.add_command(eval)
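
The two hunks above simply register the new eval subcommand with the existing click group. A minimal, standalone sketch of that registration pattern follows; the real command and its options live in uipath/_cli/cli_eval.py, which is not shown in this diff, so the command body and script name below are illustrative placeholders.

# Illustrative sketch only: mirrors cli.add_command(eval) from the diff.
import click


@click.group()
def cli() -> None:
    """Example command group mirroring the uipath CLI layout."""


@click.command(name="eval")
def eval_cmd() -> None:
    """Placeholder standing in for the command exported by cli_eval.py."""
    click.echo("running evaluations...")


cli.add_command(eval_cmd)

if __name__ == "__main__":
    cli()  # e.g. `python demo_cli.py eval`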
uipath/_cli/_auth/auth_config.json
CHANGED
@@ -1,6 +1,6 @@
 {
     "client_id": "36dea5b8-e8bb-423d-8e7b-c808df8f1c00",
     "redirect_uri": "http://localhost:__PY_REPLACE_PORT__/oidc/login",
-    "scope": "offline_access OrchestratorApiUserAccess StudioWebBackend IdentityServerApi ConnectionService DataService DocumentUnderstanding EnterpriseContextService Directory JamJamApi LLMGateway LLMOps OMS RCS.FolderAuthorization TM.Projects TM.TestCases TM.Requirements TM.TestSets",
+    "scope": "offline_access ProcessMining OrchestratorApiUserAccess StudioWebBackend IdentityServerApi ConnectionService DataService DocumentUnderstanding EnterpriseContextService Directory JamJamApi LLMGateway LLMOps OMS RCS.FolderAuthorization TM.Projects TM.TestCases TM.Requirements TM.TestSets",
     "port": 8104
 }
uipath/_cli/_evals/_evaluators/__init__.py
ADDED
@@ -0,0 +1,20 @@
+"""Evaluators package for the evaluation system.
+
+This package contains all evaluator types and the factory for creating them.
+"""
+
+from ._agent_scorer_evaluator import AgentScorerEvaluator
+from ._deterministic_evaluator import DeterministicEvaluator
+from ._evaluator_base import EvaluatorBase
+from ._evaluator_factory import EvaluatorFactory
+from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from ._trajectory_evaluator import TrajectoryEvaluator
+
+__all__ = [
+    "EvaluatorBase",
+    "EvaluatorFactory",
+    "DeterministicEvaluator",
+    "LlmAsAJudgeEvaluator",
+    "AgentScorerEvaluator",
+    "TrajectoryEvaluator",
+]
uipath/_cli/_evals/_evaluators/_agent_scorer_evaluator.py
ADDED
@@ -0,0 +1,48 @@
+from typing import Any, Dict
+
+from .._models import EvaluationResult
+from ._evaluator_base import EvaluatorBase
+
+
+class AgentScorerEvaluator(EvaluatorBase):
+    """Evaluator that uses an agent to score outputs."""
+
+    def __init__(
+        self,
+        agent_config: Dict[str, Any],
+        scoring_criteria: Dict[str, Any],
+        target_output_key: str = "*",
+    ):
+        """Initialize the agent scorer evaluator.
+
+        Args:
+            agent_config: Configuration for the scoring agent
+            scoring_criteria: Criteria used for scoring
+            target_output_key: Key in output to evaluate ("*" for entire output)
+        """
+        super().__init__()
+        self.agent_config = agent_config or {}
+        self.scoring_criteria = scoring_criteria or {}
+        self.target_output_key = target_output_key
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an agent scorer.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        raise NotImplementedError()
uipath/_cli/_evals/_evaluators/_deterministic_evaluator.py
ADDED
@@ -0,0 +1,41 @@
+from typing import Any, Dict
+
+from .._models import EvaluationResult
+from ._evaluator_base import EvaluatorBase
+
+
+class DeterministicEvaluator(EvaluatorBase):
+    """Evaluator for deterministic/rule-based evaluations."""
+
+    def __init__(self, rule_config: Dict[str, Any], target_output_key: str = "*"):
+        """Initialize the deterministic evaluator.
+
+        Args:
+            rule_config: Configuration for the rule (expected_value, regex_pattern, etc.)
+            target_output_key: Key in output to evaluate ("*" for entire output)
+        """
+        super().__init__()
+        self.rule_config = rule_config or {}
+        self.target_output_key = target_output_key
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using deterministic rules.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        raise NotImplementedError()
uipath/_cli/_evals/_evaluators/_evaluator_base.py
ADDED
@@ -0,0 +1,124 @@
+import functools
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict
+
+from uipath._cli._evals._models import (
+    EvaluationResult,
+    EvaluatorCategory,
+    EvaluatorType,
+)
+
+
+def measure_execution_time(func):
+    """Decorator to measure execution time and update EvaluationResult.evaluation_time."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs) -> EvaluationResult:
+        start_time = time.time()
+        result = await func(*args, **kwargs)
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+@dataclass
+class EvaluatorBaseParams:
+    """Parameters for initializing the base evaluator."""
+
+    evaluator_id: str
+    category: EvaluatorCategory
+    evaluator_type: EvaluatorType
+    name: str
+    description: str
+    created_at: str
+    updated_at: str
+    target_output_key: str
+
+
+class EvaluatorBase(ABC):
+    """Abstract base class for all evaluators."""
+
+    def __init__(self):
+        # initialization done via 'from_params' function
+        self.id: str
+        self.name: str
+        self.description: str
+        self.created_at: str
+        self.updated_at: str
+        self.category: EvaluatorCategory
+        self.type: EvaluatorType
+        self.target_output_key: str
+        pass
+
+    @classmethod
+    def from_params(cls, params: EvaluatorBaseParams, **kwargs):
+        """Initialize the base evaluator from parameters.
+
+        Args:
+            params: EvaluatorBaseParams containing base configuration
+            **kwargs: Additional specific parameters for concrete evaluators
+
+        Returns:
+            Initialized evaluator instance
+        """
+        instance = cls(**kwargs)
+        instance.id = params.evaluator_id
+        instance.category = params.category
+        instance.type = params.evaluator_type
+        instance.name = params.name
+        instance.description = params.description
+        instance.created_at = params.created_at
+        instance.updated_at = params.updated_at
+        instance.target_output_key = params.target_output_key
+        return instance
+
+    @measure_execution_time
+    @abstractmethod
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the evaluator instance to a dictionary representation.
+
+        Returns:
+            Dict[str, Any]: Dictionary containing all evaluator properties
+        """
+        return {
+            "id": self.id,
+            "name": self.name,
+            "description": self.description,
+            "created_at": self.created_at,
+            "updated_at": self.updated_at,
+            "category": self.category.name if self.category else None,
+            "type": self.type.name if self.type else None,
+            "target_output_key": self.target_output_key,
+        }
+
+    def __repr__(self) -> str:
+        """String representation of the evaluator."""
+        return f"{self.__class__.__name__}(id='{self.id}', name='{self.name}', category={self.category.name})"
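
To make the pattern above concrete, here is a hedged, self-contained sketch of a toy evaluator wired through the same timing decorator. SimpleResult and ExactMatchEvaluator are invented stand-ins: the real EvaluationResult model lives in _models/_evaluators.py, which is not part of this diff, so only the decorator behaviour and the evaluate signature are taken from the code above.

# Minimal sketch, assuming only what the diff shows (signature + timing decorator).
import asyncio
import functools
import time
from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class SimpleResult:
    """Stand-in for uipath's EvaluationResult; real fields are not shown here."""
    score: float
    details: str = ""
    evaluation_time: float = 0.0  # filled in by the decorator, as in the diff


def measure_execution_time(func):
    """Same shape as the decorator in _evaluator_base.py: time the coroutine."""

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start = time.time()
        result = await func(*args, **kwargs)
        result.evaluation_time = time.time() - start
        return result

    return wrapper


class ExactMatchEvaluator:
    """Toy evaluator: scores 100 when actual equals expected, otherwise 0."""

    @measure_execution_time
    async def evaluate(
        self,
        evaluation_id: str,
        evaluation_name: str,
        input_data: Dict[str, Any],
        expected_output: Dict[str, Any],
        actual_output: Dict[str, Any],
    ) -> SimpleResult:
        matched = expected_output == actual_output
        return SimpleResult(
            score=100.0 if matched else 0.0,
            details="exact match" if matched else "outputs differ",
        )


if __name__ == "__main__":
    result = asyncio.run(
        ExactMatchEvaluator().evaluate("eval-1", "demo", {}, {"a": 1}, {"a": 1})
    )
    print(result.score, result.evaluation_time)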
uipath/_cli/_evals/_evaluators/_evaluator_factory.py
ADDED
@@ -0,0 +1,103 @@
+from typing import Any, Dict
+
+from .._models import EvaluatorCategory, EvaluatorType
+from ._agent_scorer_evaluator import AgentScorerEvaluator
+from ._deterministic_evaluator import DeterministicEvaluator
+from ._evaluator_base import EvaluatorBase, EvaluatorBaseParams
+from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from ._trajectory_evaluator import TrajectoryEvaluator
+
+
+class EvaluatorFactory:
+    """Factory class for creating evaluator instances based on configuration."""
+
+    @staticmethod
+    def create_evaluator(data: Dict[str, Any]) -> EvaluatorBase:
+        """Create an evaluator instance from configuration data.
+
+        Args:
+            data: Dictionary containing evaluator configuration from JSON file
+
+        Returns:
+            Appropriate evaluator instance based on category
+
+        Raises:
+            ValueError: If category is unknown or required fields are missing
+        """
+        # Extract common fields
+        evaluator_id = data.get("id")
+        if not evaluator_id:
+            raise ValueError("Evaluator configuration must include 'id' field")
+
+        category = EvaluatorCategory.from_int(data.get("category"))
+        evaluator_type = EvaluatorType.from_int(data.get("type", EvaluatorType.Unknown))
+        name = data.get("name", "")
+        description = data.get("description", "")
+        created_at = data.get("createdAt", "")
+        updated_at = data.get("updatedAt", "")
+        target_output_key = data.get("targetOutputKey", "")
+
+        # Create base parameters
+        base_params = EvaluatorBaseParams(
+            evaluator_id=evaluator_id,
+            category=category,
+            evaluator_type=evaluator_type,
+            name=name,
+            description=description,
+            created_at=created_at,
+            updated_at=updated_at,
+            target_output_key=target_output_key,
+        )
+
+        # Create evaluator based on category
+        if category == EvaluatorCategory.Deterministic:
+            return EvaluatorFactory._create_deterministic_evaluator(base_params, data)
+        elif category == EvaluatorCategory.LlmAsAJudge:
+            return EvaluatorFactory._create_llm_as_judge_evaluator(base_params, data)
+        elif category == EvaluatorCategory.AgentScorer:
+            return EvaluatorFactory._create_agent_scorer_evaluator(base_params, data)
+        elif category == EvaluatorCategory.Trajectory:
+            return EvaluatorFactory._create_trajectory_evaluator(base_params, data)
+        else:
+            raise ValueError(f"Unknown evaluator category: {category}")
+
+    @staticmethod
+    def _create_deterministic_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> DeterministicEvaluator:
+        """Create a deterministic evaluator."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def _create_llm_as_judge_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> LlmAsAJudgeEvaluator:
+        """Create an LLM-as-a-judge evaluator."""
+        prompt = data.get("prompt", "")
+        if not prompt:
+            raise ValueError("LLM evaluator must include 'prompt' field")
+
+        model = data.get("model", "")
+        if not model:
+            raise ValueError("LLM evaluator must include 'model' field")
+
+        return LlmAsAJudgeEvaluator.from_params(
+            base_params,
+            prompt=prompt,
+            model=model,
+            target_output_key=data.get("targetOutputKey", "*"),
+        )
+
+    @staticmethod
+    def _create_agent_scorer_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> AgentScorerEvaluator:
+        """Create an agent scorer evaluator."""
+        raise NotImplementedError()
+
+    @staticmethod
+    def _create_trajectory_evaluator(
+        base_params: EvaluatorBaseParams, data: Dict[str, Any]
+    ) -> TrajectoryEvaluator:
+        """Create a trajectory evaluator."""
+        raise NotImplementedError()
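
For orientation, a hypothetical configuration dict shaped after the keys that create_evaluator reads. The integer values accepted by EvaluatorCategory.from_int and EvaluatorType.from_int are defined in _models/_evaluators.py, which is not shown in this diff, so the numbers, the model name, and the id below are placeholders rather than documented values.

# Hedged sketch of an evaluator config; only the key names come from the diff.
llm_judge_config = {
    "id": "llm-judge-1",
    "category": 2,  # placeholder for whichever integer maps to LlmAsAJudge
    "type": 0,      # placeholder EvaluatorType value
    "name": "LLM judge",
    "description": "Scores agent output against the expected output",
    "createdAt": "2025-01-01T00:00:00Z",
    "updatedAt": "2025-01-01T00:00:00Z",
    "targetOutputKey": "*",
    # required by the LlmAsAJudge branch of the factory:
    "prompt": "Compare {{ActualOutput}} with {{ExpectedOutput}} and score 0-100.",
    "model": "gpt-4o-mini",  # placeholder model name
}

# evaluator = EvaluatorFactory.create_evaluator(llm_judge_config)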
uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py
ADDED
@@ -0,0 +1,181 @@
+import json
+from typing import Any, Dict
+
+from ...._config import Config
+from ...._execution_context import ExecutionContext
+from ...._services.llm_gateway_service import UiPathLlmChatService
+from ...._utils.constants import (
+    ENV_BASE_URL,
+    ENV_UIPATH_ACCESS_TOKEN,
+    ENV_UNATTENDED_USER_ACCESS_TOKEN,
+    COMMUNITY_agents_SUFFIX,
+)
+from .._models import EvaluationResult, LLMResponse
+from ._evaluator_base import EvaluatorBase
+
+
+class LlmAsAJudgeEvaluator(EvaluatorBase):
+    """Evaluator that uses an LLM to judge the quality of outputs."""
+
+    def __init__(self, prompt: str = "", model: str = "", target_output_key: str = "*"):
+        """Initialize the LLM-as-a-judge evaluator.
+
+        Args:
+            prompt: The prompt template for the LLM
+            model: The model to use for evaluation
+            target_output_key: Key in output to evaluate ("*" for entire output)
+        """
+        super().__init__()
+        self.actual_output_placeholder = "{{ActualOutput}}"
+        self.expected_output_placeholder = "{{ExpectedOutput}}"
+        self._initialize_llm()
+        self.prompt = prompt
+        self.model = model
+        self.target_output_key: str = target_output_key
+
+    def _initialize_llm(self):
+        """Initialize the LLM used for evaluation."""
+        import os
+
+        base_url_value: str = os.getenv(ENV_BASE_URL)  # type: ignore
+        secret_value: str = os.getenv(ENV_UNATTENDED_USER_ACCESS_TOKEN) or os.getenv(
+            ENV_UIPATH_ACCESS_TOKEN
+        )  # type: ignore
+        config = Config(
+            base_url=base_url_value,
+            secret=secret_value,
+        )
+        self.llm = UiPathLlmChatService(config, ExecutionContext())
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        # Extract the target value to evaluate
+        target_value = self._extract_target_value(actual_output)
+        expected_value = self._extract_target_value(expected_output)
+
+        # Create the evaluation prompt
+        evaluation_prompt = self._create_evaluation_prompt(expected_value, target_value)
+
+        llm_response = await self._get_llm_response(evaluation_prompt)
+
+        return EvaluationResult(
+            evaluation_id=evaluation_id,
+            evaluation_name=evaluation_name,
+            evaluator_id=self.id,
+            evaluator_name=self.name,
+            score=llm_response.score,
+            input=input_data,
+            expected_output=expected_output,
+            actual_output=actual_output,
+            details=llm_response.justification,
+        )
+
+    def _extract_target_value(self, output: Dict[str, Any]) -> Any:
+        """Extract the target value from output based on target_output_key."""
+        if self.target_output_key == "*":
+            return output
+
+        # Handle nested keys
+        keys = self.target_output_key.split(".")
+        value = output
+
+        try:
+            for key in keys:
+                if isinstance(value, dict):
+                    value = value[key]
+                else:
+                    return None
+            return value
+        except (KeyError, TypeError):
+            return None
+
+    def _create_evaluation_prompt(
+        self, expected_output: Any, actual_output: Any
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = self.prompt.replace(
+            self.actual_output_placeholder,
+            str(actual_output),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.expected_output_placeholder,
+            str(expected_output),
+        )
+
+        return formatted_prompt
+
+    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
+        """Get response from the LLM.
+
+        Args:
+            evaluation_prompt: The formatted prompt to send to the LLM
+
+        Returns:
+            LLMResponse with score and justification
+        """
+        try:
+            # remove community-agents suffix from llm model name
+            model = self.model
+            if model.endswith(COMMUNITY_agents_SUFFIX):
+                model = model.replace(COMMUNITY_agents_SUFFIX, "")
+
+            # Prepare the request
+            request_data = {
+                "model": model,
+                "messages": [{"role": "user", "content": evaluation_prompt}],
+                "response_format": {
+                    "type": "json_schema",
+                    "json_schema": {
+                        "name": "evaluation_response",
+                        "schema": {
+                            "type": "object",
+                            "properties": {
+                                "score": {
+                                    "type": "number",
+                                    "minimum": 0,
+                                    "maximum": 100,
+                                    "description": "Score between 0 and 100",
+                                },
+                                "justification": {
+                                    "type": "string",
+                                    "description": "Explanation for the score",
+                                },
+                            },
+                            "required": ["score", "justification"],
+                        },
+                    },
+                },
+            }
+
+            response = await self.llm.chat_completions(**request_data)
+
+            try:
+                return LLMResponse(**json.loads(response.choices[-1].message.content))
+            except (json.JSONDecodeError, ValueError) as e:
+                return LLMResponse(
+                    score=0.0, justification=f"Error parsing LLM response: {str(e)}"
+                )
+
+        except Exception as e:
+            # Fallback in case of any errors
+            return LLMResponse(
+                score=0.0, justification=f"Error during LLM evaluation: {str(e)}"
+            )
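
Worth noting from the code above: the prompt template is filled by plain string replacement of {{ActualOutput}} and {{ExpectedOutput}}, and the LLM is asked for a JSON object with a 0-100 score plus a justification. A tiny sketch of the substitution step, with an invented template and invented outputs:

# Mirrors _create_evaluation_prompt: literal placeholder replacement.
prompt = "Expected: {{ExpectedOutput}}\nActual: {{ActualOutput}}\nScore 0-100 with a justification."
filled = prompt.replace("{{ActualOutput}}", str({"answer": 42}))
filled = filled.replace("{{ExpectedOutput}}", str({"answer": 41}))
print(filled)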
uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py
ADDED
@@ -0,0 +1,48 @@
+from typing import Any, Dict
+
+from .._models import EvaluationResult
+from ._evaluator_base import EvaluatorBase
+
+
+class TrajectoryEvaluator(EvaluatorBase):
+    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
+
+    def __init__(
+        self,
+        trajectory_config: Dict[str, Any],
+        step_weights: Dict[str, float],
+        target_output_key: str = "*",
+    ):
+        """Initialize the trajectory evaluator.
+
+        Args:
+            trajectory_config: Configuration for trajectory analysis
+            step_weights: Weights for different steps in the trajectory
+            target_output_key: Key in output to evaluate ("*" for entire output)
+        """
+        super().__init__()
+        self.trajectory_config = trajectory_config or {}
+        self.step_weights = step_weights or {}
+        self.target_output_key = target_output_key
+
+    async def evaluate(
+        self,
+        evaluation_id: str,
+        evaluation_name: str,
+        input_data: Dict[str, Any],
+        expected_output: Dict[str, Any],
+        actual_output: Dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis.
+
+        Args:
+            evaluation_id: The ID of the evaluation being processed
+            evaluation_name: The name of the evaluation
+            input_data: The input data for the evaluation
+            expected_output: The expected output
+            actual_output: The actual output from the agent
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        raise NotImplementedError()
uipath/_cli/_evals/_models/__init__.py
ADDED
@@ -0,0 +1,18 @@
+from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationSet
+from uipath._cli._evals._models._evaluators import (
+    EvaluationResult,
+    EvaluationSetResult,
+    EvaluatorCategory,
+    EvaluatorType,
+    LLMResponse,
+)
+
+__all__ = [
+    "LLMResponse",
+    "EvaluatorCategory",
+    "EvaluatorType",
+    "EvaluationResult",
+    "EvaluationSetResult",
+    "EvaluationItem",
+    "EvaluationSet",
+]
uipath/_cli/_evals/_models/_evaluation_set.py
ADDED
@@ -0,0 +1,43 @@
+from enum import IntEnum
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Field
+
+
+class EvaluationItem(BaseModel):
+    """Individual evaluation item within an evaluation set."""
+
+    id: str
+    name: str
+    inputs: Dict[str, Any]
+    expectedOutput: Dict[str, Any]
+    expectedAgentBehavior: str = ""
+    simulationInstructions: str = ""
+    simulateInput: bool = False
+    inputGenerationInstructions: str = ""
+    simulateTools: bool = False
+    toolsToSimulate: List[str] = Field(default_factory=list)
+    evalSetId: str
+    createdAt: str
+    updatedAt: str
+
+
+class EvaluationSet(BaseModel):
+    """Complete evaluation set model."""
+
+    id: str
+    fileName: str
+    evaluatorRefs: List[str] = Field(default_factory=list)
+    evaluations: List[EvaluationItem] = Field(default_factory=list)
+    name: str
+    batchSize: int = 10
+    timeoutMinutes: int = 20
+    modelSettings: List[Dict[str, Any]] = Field(default_factory=list)
+    createdAt: str
+    updatedAt: str
+
+
+class EvaluationStatus(IntEnum):
+    PENDING = 0
+    IN_PROGRESS = 1
+    COMPLETED = 2