uipath 2.1.71__py3-none-any.whl → 2.1.73__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- uipath/_cli/_dev/_terminal/_components/_history.py +25 -5
- uipath/_cli/_evals/_evaluator_factory.py +35 -73
- uipath/_cli/_evals/_models/_evaluation_set.py +127 -18
- uipath/_cli/_evals/_models/_evaluator.py +106 -0
- uipath/_cli/_evals/_runtime.py +2 -0
- uipath/_cli/_evals/mocks/__init__.py +1 -0
- uipath/_cli/_evals/mocks/llm_mocker.py +153 -0
- uipath/_cli/_evals/mocks/mocker.py +29 -0
- uipath/_cli/_evals/mocks/mocker_factory.py +25 -0
- uipath/_cli/_evals/mocks/mockito_mocker.py +62 -0
- uipath/_cli/_evals/mocks/mocks.py +136 -0
- uipath/_cli/_runtime/_contracts.py +1 -0
- uipath/_cli/_runtime/_logging.py +112 -31
- uipath/_cli/cli_pull.py +1 -1
- uipath/_services/llm_gateway_service.py +24 -27
- uipath/agent/_utils.py +72 -3
- uipath/agent/models/agent.py +11 -0
- {uipath-2.1.71.dist-info → uipath-2.1.73.dist-info}/METADATA +4 -1
- {uipath-2.1.71.dist-info → uipath-2.1.73.dist-info}/RECORD +22 -15
- {uipath-2.1.71.dist-info → uipath-2.1.73.dist-info}/WHEEL +0 -0
- {uipath-2.1.71.dist-info → uipath-2.1.73.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.71.dist-info → uipath-2.1.73.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/_dev/_terminal/_components/_history.py CHANGED
@@ -73,13 +73,33 @@ class RunHistoryPanel(Container):
         self.refresh_list()
 
     def _refresh_running_items(self) -> None:
+        """Refresh display names for running items only."""
         if not any(run.status == "running" for run in self.runs):
-            return None
+            return None
 
-
+        try:
+            run_list = self.query_one("#run-list", ListView)
+        except Exception:
+            return None
+
+        # Take a snapshot of items to avoid mid-iteration changes
+        items_snapshot = list(run_list.children)
+
+        for item in items_snapshot:
+            if not hasattr(item, "run_id"):
+                continue
+
+            run = self.get_run_by_id(item.run_id)
+            if not run or run.status != "running":
+                continue
+
+            # Check if item still exists in the list (wasn't removed)
+            if item not in run_list.children:
+                continue
 
-
-            run = self.get_run_by_id(item.run_id)  # type: ignore[attr-defined]
-            if run and run.status == "running":
+            try:
                 static = item.query_one(Static)
                 static.update(run.display_name)
+            except Exception:
+                # Item structure changed or was removed
+                continue
uipath/_cli/_evals/_evaluator_factory.py CHANGED
@@ -1,5 +1,14 @@
 from typing import Any, Dict
 
+from pydantic import TypeAdapter
+
+from uipath._cli._evals._models._evaluator import (
+    EqualsEvaluatorParams,
+    Evaluator,
+    JsonSimilarityEvaluatorParams,
+    LLMEvaluatorParams,
+    TrajectoryEvaluatorParams,
+)
 from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams
 from uipath.eval.evaluators import (
     BaseEvaluator,
@@ -8,7 +17,6 @@ from uipath.eval.evaluators import (
     LlmAsAJudgeEvaluator,
     TrajectoryEvaluator,
 )
-from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
 
 
 class EvaluatorFactory:
@@ -35,110 +43,64 @@ class EvaluatorFactory:
         if not id:
             raise ValueError("Evaluator configuration must include 'id' field")
 
-
-
-
-
-
-
-
-
-
-
-
-            evaluator_type=evaluator_type,
-            name=name,
-            description=description,
-            created_at=created_at,
-            updated_at=updated_at,
-            target_output_key=target_output_key,
-        )
-
-        match category:
-            case EvaluatorCategory.Deterministic:
-                if evaluator_type == evaluator_type.Equals:
-                    return EvaluatorFactory._create_exact_match_evaluator(
-                        base_params, data
-                    )
-                elif evaluator_type == evaluator_type.JsonSimilarity:
-                    return EvaluatorFactory._create_json_similarity_evaluator(
-                        base_params, data
-                    )
-                else:
-                    raise ValueError(
-                        f"Unknown evaluator type {evaluator_type} for category {category}"
-                    )
-            case EvaluatorCategory.LlmAsAJudge:
-                return EvaluatorFactory._create_llm_as_judge_evaluator(
-                    base_params, data
-                )
-            case EvaluatorCategory.AgentScorer:
-                raise NotImplementedError()
-            case EvaluatorCategory.Trajectory:
-                return EvaluatorFactory._create_trajectory_evaluator(base_params, data)
+        params: EvaluatorBaseParams = TypeAdapter(Evaluator).validate_python(data)
+
+        match params:
+            case EqualsEvaluatorParams():
+                return EvaluatorFactory._create_exact_match_evaluator(params)
+            case JsonSimilarityEvaluatorParams():
+                return EvaluatorFactory._create_json_similarity_evaluator(params)
+            case LLMEvaluatorParams():
+                return EvaluatorFactory._create_llm_as_judge_evaluator(params)
+            case TrajectoryEvaluatorParams():
+                return EvaluatorFactory._create_trajectory_evaluator(params)
             case _:
-                raise ValueError(f"Unknown evaluator category: {
+                raise ValueError(f"Unknown evaluator category: {params}")
 
     @staticmethod
     def _create_exact_match_evaluator(
-
+        params: EqualsEvaluatorParams,
     ) -> ExactMatchEvaluator:
         """Create a deterministic evaluator."""
-        return ExactMatchEvaluator(
-            **base_params.model_dump(),
-        )
+        return ExactMatchEvaluator(**params.model_dump())
 
     @staticmethod
     def _create_json_similarity_evaluator(
-
+        params: JsonSimilarityEvaluatorParams,
     ) -> JsonSimilarityEvaluator:
         """Create a deterministic evaluator."""
-        return JsonSimilarityEvaluator(
-            **base_params.model_dump(),
-        )
+        return JsonSimilarityEvaluator(**params.model_dump())
 
     @staticmethod
     def _create_llm_as_judge_evaluator(
-
+        params: LLMEvaluatorParams,
    ) -> LlmAsAJudgeEvaluator:
         """Create an LLM-as-a-judge evaluator."""
-
-        if not prompt:
+        if not params.prompt:
             raise ValueError("LLM evaluator must include 'prompt' field")
 
-
-        if not model:
+        if not params.model:
            raise ValueError("LLM evaluator must include 'model' field")
-        if model == "same-as-agent":
+        if params.model == "same-as-agent":
             raise ValueError(
                 "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
             )
 
-        return LlmAsAJudgeEvaluator(
-            **base_params.model_dump(),
-            prompt=prompt,
-            model=model,
-        )
+        return LlmAsAJudgeEvaluator(**params.model_dump())
 
     @staticmethod
     def _create_trajectory_evaluator(
-
+        params: TrajectoryEvaluatorParams,
     ) -> TrajectoryEvaluator:
         """Create a trajectory evaluator."""
-
-        if not prompt:
+        if not params.prompt:
             raise ValueError("Trajectory evaluator must include 'prompt' field")
 
-
-        if not model:
+        if not params.model:
             raise ValueError("LLM evaluator must include 'model' field")
-        if model == "same-as-agent":
+        if params.model == "same-as-agent":
             raise ValueError(
                 "'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
             )
 
-        return TrajectoryEvaluator(
-            **base_params.model_dump(),
-            prompt=prompt,
-            model=model,
-        )
+        return TrajectoryEvaluator(**params.model_dump())
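
Note: the factory now validates the raw evaluator dict once through TypeAdapter(Evaluator) and then dispatches on the concrete params class, forwarding the already-validated fields with model_dump(). A minimal, self-contained sketch of that dispatch-and-forward pattern (the class names below are illustrative, not the package's own):

from pydantic import BaseModel


class EqualsParams(BaseModel):
    name: str
    target_output_key: str


class ExactMatch(BaseModel):
    name: str
    target_output_key: str


def build(params: BaseModel) -> ExactMatch:
    # Branch on the concrete params type; no re-reading of the raw dict.
    match params:
        case EqualsParams():
            return ExactMatch(**params.model_dump())
        case _:
            raise ValueError(f"Unknown evaluator category: {params}")


print(build(EqualsParams(name="equals", target_output_key="answer")))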
uipath/_cli/_evals/_models/_evaluation_set.py CHANGED
@@ -1,28 +1,135 @@
-from enum import IntEnum
-from typing import Any, Dict, List
+from enum import Enum, IntEnum
+from typing import Annotated, Any, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 from pydantic.alias_generators import to_camel
 
 
+class EvaluationSimulationTool(BaseModel):
+    name: str = Field(..., alias="name")
+
+
+class MockingStrategyType(str, Enum):
+    LLM = "llm"
+    MOCKITO = "mockito"
+    UNKNOWN = "unknown"
+
+
+class BaseMockingStrategy(BaseModel):
+    pass
+
+
+class LLMMockingStrategy(BaseMockingStrategy):
+    type: Literal[MockingStrategyType.LLM] = MockingStrategyType.LLM
+    prompt: str = Field(..., alias="prompt")
+    tools_to_simulate: list[EvaluationSimulationTool] = Field(
+        ..., alias="toolsToSimulate"
+    )
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+"""
+{
+    "function": "postprocess",
+    "arguments": {
+        "args": [],
+        "kwargs": {"x": 3}
+    },
+    "then": [
+        {
+            "return": 3
+        },
+        {
+            "raise": {
+                "__target__": "NotImplementedError"
+            }
+        }
+    ]
+}
+"""
+
+
+class MockingArgument(BaseModel):
+    args: List[Any] = Field(default_factory=lambda: [], alias="args")
+    kwargs: Dict[str, Any] = Field(default_factory=lambda: {}, alias="kwargs")
+
+
+class MockingAnswerType(str, Enum):
+    RETURN = "return"
+    RAISE = "raise"
+
+
+class MockingAnswer(BaseModel):
+    type: MockingAnswerType
+    value: Any = Field(..., alias="value")
+
+
+class MockingBehavior(BaseModel):
+    function: str = Field(..., alias="function")
+    arguments: MockingArgument = Field(..., alias="arguments")
+    then: List[MockingAnswer] = Field(..., alias="then")
+
+
+class MockitoMockingStrategy(BaseMockingStrategy):
+    type: Literal[MockingStrategyType.MOCKITO] = MockingStrategyType.MOCKITO
+    behaviors: List[MockingBehavior] = Field(..., alias="config")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+KnownMockingStrategy = Annotated[
+    Union[LLMMockingStrategy, MockitoMockingStrategy],
+    Field(discriminator="type"),
+]
+
+
+class UnknownMockingStrategy(BaseMockingStrategy):
+    type: str = Field(..., alias="type")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+MockingStrategy = Union[KnownMockingStrategy, UnknownMockingStrategy]
+
+
+def migrate_mocking_strategy(data) -> MockingStrategy:
+    if data.get("simulate_tools") and "tools_to_simulate" in data:
+        return LLMMockingStrategy(
+            **{
+                "prompt": data["simulation_instructions"],
+                "toolsToSimulate": data["tools_to_simulate"],
+            }
+        )
+    else:
+        return UnknownMockingStrategy(type=MockingStrategyType.UNKNOWN)
+
+
 class EvaluationItem(BaseModel):
     """Individual evaluation item within an evaluation set."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(
+        alias_generator=to_camel, populate_by_name=True, extra="allow"
+    )
 
     id: str
     name: str
     inputs: Dict[str, Any]
     expected_output: Dict[str, Any]
-    expected_agent_behavior: str = ""
-
-
-
-
-
-
-
-    updated_at: str
+    expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
+    eval_set_id: str = Field(alias="evalSetId")
+    created_at: str = Field(alias="createdAt")
+    updated_at: str = Field(alias="updatedAt")
+    mocking_strategy: Optional[MockingStrategy] = Field(
+        default=None,
+        alias="mockingStrategy",
+    )
 
 
 class EvaluationSet(BaseModel):
@@ -31,15 +138,17 @@ class EvaluationSet(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     id: str
-    file_name: str
+    file_name: str = Field(..., alias="fileName")
     evaluator_refs: List[str] = Field(default_factory=list)
     evaluations: List[EvaluationItem] = Field(default_factory=list)
     name: str
-    batch_size: int = 10
-    timeout_minutes: int = 20
-    model_settings: List[Dict[str, Any]] = Field(
-
-
+    batch_size: int = Field(10, alias="batchSize")
+    timeout_minutes: int = Field(default=20, alias="timeoutMinutes")
+    model_settings: List[Dict[str, Any]] = Field(
+        default_factory=list, alias="modelSettings"
+    )
+    created_at: str = Field(alias="createdAt")
+    updated_at: str = Field(alias="updatedAt")
 
     def extract_selected_evals(self, eval_ids) -> None:
         selected_evals: list[EvaluationItem] = []
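
Note: KnownMockingStrategy is a discriminated union keyed on the type field, with UnknownMockingStrategy acting as a catch-all for strategy types this SDK version does not recognize. A small usage sketch, assuming uipath 2.1.73 is installed; the payload values are made up:

from pydantic import TypeAdapter

from uipath._cli._evals._models._evaluation_set import (
    KnownMockingStrategy,
    LLMMockingStrategy,
    MockingStrategy,
    UnknownMockingStrategy,
)

# "type": "llm" selects LLMMockingStrategy via the discriminator.
strategy = TypeAdapter(KnownMockingStrategy).validate_python(
    {
        "type": "llm",
        "prompt": "Answer as the orders tool would.",
        "toolsToSimulate": [{"name": "lookup_order"}],
    }
)
assert isinstance(strategy, LLMMockingStrategy)

# An unrecognized strategy type still parses, falling back to UnknownMockingStrategy.
fallback = TypeAdapter(MockingStrategy).validate_python({"type": "replay"})
assert isinstance(fallback, UnknownMockingStrategy)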
uipath/_cli/_evals/_models/_evaluator.py ADDED
@@ -0,0 +1,106 @@
+from typing import Annotated, Any, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
+
+from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
+
+
+class EvaluatorBaseParams(BaseModel):
+    """Parameters for initializing the base evaluator."""
+
+    id: str
+    name: str
+    description: str
+    evaluator_type: EvaluatorType = Field(..., alias="type")
+    created_at: str = Field(..., alias="createdAt")
+    updated_at: str = Field(..., alias="updatedAt")
+    target_output_key: str = Field(..., alias="targetOutputKey")
+    file_name: str = Field(..., alias="fileName")
+
+
+class LLMEvaluatorParams(EvaluatorBaseParams):
+    category: Literal[EvaluatorCategory.LlmAsAJudge] = Field(..., alias="category")
+    prompt: str = Field(..., alias="prompt")
+    model: str = Field(..., alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class TrajectoryEvaluatorParams(EvaluatorBaseParams):
+    category: Literal[EvaluatorCategory.Trajectory] = Field(..., alias="category")
+    prompt: str = Field(..., alias="prompt")
+    model: str = Field(..., alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class EqualsEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class JsonSimilarityEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class UnknownEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+def evaluator_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        category = data.get("category")
+        evaluator_type = data.get("type")
+        match category:
+            case EvaluatorCategory.LlmAsAJudge:
+                return "LLMEvaluatorParams"
+            case EvaluatorCategory.Trajectory:
+                return "TrajectoryEvaluatorParams"
+            case EvaluatorCategory.Deterministic:
+                match evaluator_type:
+                    case EvaluatorType.Equals:
+                        return "EqualsEvaluatorParams"
+                    case EvaluatorType.JsonSimilarity:
+                        return "JsonSimilarityEvaluatorParams"
+                    case _:
+                        return "UnknownEvaluatorParams"
+            case _:
+                return "UnknownEvaluatorParams"
+    else:
+        return "UnknownEvaluatorParams"
+
+
+Evaluator = Annotated[
+    Union[
+        Annotated[
+            LLMEvaluatorParams,
+            Tag("LLMEvaluatorParams"),
+        ],
+        Annotated[
+            TrajectoryEvaluatorParams,
+            Tag("TrajectoryEvaluatorParams"),
+        ],
+        Annotated[
+            EqualsEvaluatorParams,
+            Tag("EqualsEvaluatorParams"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorParams,
+            Tag("JsonSimilarityEvaluatorParams"),
+        ],
+        Annotated[
+            UnknownEvaluatorParams,
+            Tag("UnknownEvaluatorParams"),
+        ],
+    ],
+    Field(discriminator=Discriminator(evaluator_discriminator)),
+]
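
Note: the callable discriminator routes a raw evaluator definition to a params model by category, then by type for deterministic evaluators, with UnknownEvaluatorParams as the fallback. A small usage sketch against the function above (assuming uipath 2.1.73 is installed; the second payload is made up):

from uipath._cli._evals._models._evaluator import evaluator_discriminator
from uipath.eval.models.models import EvaluatorCategory, EvaluatorType

# Deterministic + Equals resolves to the exact-match params model.
print(evaluator_discriminator(
    {"category": EvaluatorCategory.Deterministic, "type": EvaluatorType.Equals}
))  # EqualsEvaluatorParams

# Anything unrecognized falls back to UnknownEvaluatorParams.
print(evaluator_discriminator({"category": "agent-scorer-v2"}))  # UnknownEvaluatorParams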
uipath/_cli/_evals/_runtime.py CHANGED
@@ -35,6 +35,7 @@ from ._models._output import (
     UiPathEvalOutput,
     UiPathEvalRunExecutionOutput,
 )
+from .mocks.mocks import set_evaluation_item
 
 T = TypeVar("T", bound=UiPathBaseRuntime)
 C = TypeVar("C", bound=UiPathRuntimeContext)
@@ -137,6 +138,7 @@ class UiPathEvalRuntime(UiPathBaseRuntime, Generic[T, C]):
             evaluation_set_name=evaluation_set.name, score=0, evaluation_set_results=[]
         )
         for eval_item in evaluation_set.evaluations:
+            set_evaluation_item(eval_item)
             await event_bus.publish(
                 EvaluationEvents.CREATE_EVAL_RUN,
                 EvalRunCreatedEvent(
uipath/_cli/_evals/mocks/__init__.py ADDED
@@ -0,0 +1 @@
+"""UiPath mocking framework."""
uipath/_cli/_evals/mocks/llm_mocker.py ADDED
@@ -0,0 +1,153 @@
+"""LLM mocker implementation."""
+
+import json
+import logging
+from typing import Any, Callable
+
+from pydantic import BaseModel
+
+from uipath._cli._evals._models._evaluation_set import (
+    EvaluationItem,
+    LLMMockingStrategy,
+)
+from uipath._cli._evals.mocks.mocker import Mocker, R, T, UiPathMockingNoMatcherError
+
+PROMPT = """You are simulating a tool call for automated testing purposes of an Agent.
+Your task is to provide a realistic response for the tool based on its schema, examples, and context.
+
+SELECTED TOOL INFORMATION:
+{toolInfo}
+
+SELECTED TOOL SCHEMA:
+{toolSchema}
+
+SELECTED TOOL EXAMPLES:
+{toolRunExamples}
+
+CHOSEN TOOL INPUT:
+{currentToolInput}
+
+CURRENT AGENT RUN SO FAR:
+{testRunHistory}
+
+HERE IS SOME INFORMATION ABOUT THE AGENT: DO NOT USE THIS INFORMATION TO MAKE THE RESPONSE, BUT RATHER TO UNDERSTAND THE CONTEXT IN WHICH THE TOOL IS BEING USED.
+{agentInfo}
+
+TEST RUN PROCTOR INSTRUCTIONS:
+You will need to simulate a real user's interaction with the tool. This may require following some run specific instructions. If run instructions are provided, follow them exactly.
+Here are the instructions for this run:
+{testRunProctorInstructions}
+
+Based on the above information, provide a realistic response for this tool call. Your response should:
+1. Match the expected output format according to the tool schema
+2. Be very consistent with how the tool has responded in previous examples. Do no omit fields or properties.
+3. Always include the entire output regardless of token length.
+3. Consider the context of the current test run and the agent being tested. If the agent is acting on a property, make sure the output includes that property.
+
+Respond ONLY with valid JSON that would be a realistic and completetool response. Do not include any explanations or markdown.
+"""
+
+logger = logging.getLogger(__name__)
+
+
+def pydantic_to_dict_safe(obj: Any) -> Any:
+    """Serialize nested pydantic models to a dict."""
+    if isinstance(obj, BaseModel):
+        # Convert Pydantic model to dict recursively
+        return obj.model_dump(mode="json")
+    elif isinstance(obj, dict):
+        # Recursively convert dict entries
+        return {k: pydantic_to_dict_safe(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        # Recursively convert items in lists
+        return [pydantic_to_dict_safe(item) for item in obj]
+    return obj  # Return other types as is
+
+
+class LLMMocker(Mocker):
+    """LLM Based Mocker."""
+
+    def __init__(self, evaluation_item: EvaluationItem):
+        """LLM Mocker constructor."""
+        self.evaluation_item = evaluation_item
+        assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)
+
+    async def response(
+        self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
+    ) -> R:
+        """Respond with mocked response generated by an LLM."""
+        assert isinstance(self.evaluation_item.mocking_strategy, LLMMockingStrategy)
+
+        function_name = params.get("name") or func.__name__
+        if function_name in [
+            x.name for x in self.evaluation_item.mocking_strategy.tools_to_simulate
+        ]:
+            from uipath import UiPath
+            from uipath._services.llm_gateway_service import _cleanup_schema
+
+            llm = UiPath().llm
+            return_type: Any = func.__annotations__.get("return", None)
+            if return_type is None:
+                return_type = Any
+
+            class OutputSchema(BaseModel):
+                response: return_type
+
+            response_format = {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": OutputSchema.__name__.lower(),
+                    "strict": True,
+                    "schema": _cleanup_schema(OutputSchema),
+                },
+            }
+            try:
+                prompt_input: dict[str, Any] = {
+                    "toolRunExamples": [],  # Taken from history. Contains id, input json, output json
+                    "testRunHistory": [],  # This should contain ordered spans.
+                    "toolInfo": {
+                        "name": function_name,
+                        "description": params.get("description"),
+                        "arguments": params.get(
+                            "arguments"
+                        ),  # arguments could be passed into tool
+                        "settings": params.get(
+                            "settings"
+                        ),  # settings could be passed into tool
+                        "inputSchema": params.get("input_schema"),
+                    },
+                    "toolSchema": params.get("input_schema"),
+                    "currentToolInput": {
+                        "args": args,
+                        "kwargs": kwargs,
+                    },
+                    "agentInfo": {  # This is incomplete
+                        # "agentName": self.evaluation_item.name, # to be obtained.
+                        "actionName": self.evaluation_item.name,  # Not sure if this is right?
+                        "userInput": self.evaluation_item.inputs,
+                    },
+                    "testRunProctorInstructions": self.evaluation_item.mocking_strategy.prompt,
+                }
+                prompt_input = {
+                    k: json.dumps(pydantic_to_dict_safe(v))
+                    for k, v in prompt_input.items()
+                }
+                response = await llm.chat_completions(
+                    [
+                        {
+                            "role": "user",
+                            "content": PROMPT.format(**prompt_input),
+                        },
+                    ],
+                    response_format=response_format,
+                )
+                mocked_response = OutputSchema(
+                    **json.loads(response.choices[0].message.content)
+                )
+                return mocked_response.response
+            except Exception:
+                raise
+        else:
+            raise UiPathMockingNoMatcherError(
+                f"Method '{function_name}' is not simulated."
+            )
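
Note: the mocker wraps the mocked function's return annotation in a single-field Pydantic model so the LLM's JSON reply can be validated and unwrapped back to the expected type. The trick in isolation, with no LLM call (the function name and the canned reply below are made up):

import json
from typing import Any

from pydantic import BaseModel


def lookup_order(order_id: str) -> dict[str, Any]:
    """Stand-in for a tool that would normally hit a real system."""
    raise NotImplementedError


# Pull the declared return type and make it the type of a "response" field.
return_type: Any = lookup_order.__annotations__.get("return", Any)


class OutputSchema(BaseModel):
    response: return_type  # dynamically typed from the function signature


canned_reply = '{"response": {"order_id": "42", "status": "shipped"}}'
mocked = OutputSchema(**json.loads(canned_reply))
print(mocked.response)  # {'order_id': '42', 'status': 'shipped'}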
uipath/_cli/_evals/mocks/mocker.py ADDED
@@ -0,0 +1,29 @@
+"""Mocker definitions and implementations."""
+
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from typing import Any, TypeVar
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+class Mocker(ABC):
+    """Mocker interface."""
+
+    @abstractmethod
+    async def response(
+        self,
+        func: Callable[[T], R],
+        params: dict[str, Any],
+        *args: T,
+        **kwargs,
+    ) -> R:
+        """Respond with mocked response."""
+        raise NotImplementedError()
+
+
+class UiPathMockingNoMatcherError(Exception):
+    """Exception when a mocker is unable to find a match with the invocation."""
+
+    pass
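
Note: Mocker is the abstract contract that the concrete mockers (LLM- and mockito-based) implement. A hedged sketch of a custom implementation that returns a fixed value for one simulated function, assuming uipath 2.1.73 is installed; CannedMocker and lookup_order are made-up names:

import asyncio
from typing import Any, Callable

from uipath._cli._evals.mocks.mocker import Mocker, R, T, UiPathMockingNoMatcherError


class CannedMocker(Mocker):
    """Return a fixed value for one simulated function, else signal no match."""

    def __init__(self, function_name: str, value: Any):
        self.function_name = function_name
        self.value = value

    async def response(
        self, func: Callable[[T], R], params: dict[str, Any], *args: T, **kwargs
    ) -> R:
        if (params.get("name") or func.__name__) != self.function_name:
            raise UiPathMockingNoMatcherError(
                f"Method '{func.__name__}' is not simulated."
            )
        return self.value


def lookup_order(order_id: str) -> dict:
    raise RuntimeError("the real tool should not run while mocked")


mocker = CannedMocker("lookup_order", {"status": "shipped"})
print(asyncio.run(mocker.response(lookup_order, {"name": "lookup_order"}, "42")))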