uipath 2.1.52__py3-none-any.whl → 2.1.53__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
Files changed (31)
  1. uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
  2. uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
  3. uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
  4. uipath/_cli/_evals/_models/_output.py +85 -0
  5. uipath/_cli/_evals/_runtime.py +102 -10
  6. uipath/_cli/_runtime/_contracts.py +12 -3
  7. uipath/_cli/_utils/_eval_set.py +1 -1
  8. uipath/_cli/cli_eval.py +46 -61
  9. uipath/eval/evaluators/__init__.py +15 -0
  10. uipath/eval/evaluators/base_evaluator.py +88 -0
  11. uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
  12. uipath/eval/evaluators/exact_match_evaluator.py +37 -0
  13. uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
  14. uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
  15. uipath/eval/evaluators/trajectory_evaluator.py +36 -0
  16. uipath/eval/models/__init__.py +19 -0
  17. uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
  18. {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/METADATA +1 -1
  19. {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/RECORD +22 -22
  20. uipath/_cli/_evals/_evaluators/__init__.py +0 -22
  21. uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
  22. uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
  23. uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
  24. uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
  25. uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
  26. uipath/_cli/_evals/_models/__init__.py +0 -18
  27. uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
  28. uipath/_cli/_evals/progress_reporter.py +0 -304
  29. {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/WHEEL +0 -0
  30. {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/entry_points.txt +0 -0
  31. {uipath-2.1.52.dist-info → uipath-2.1.53.dist-info}/licenses/LICENSE +0 -0
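The bulk of this release is a refactor: the evaluator implementations and their models move out of the private uipath._cli._evals CLI package into a new public uipath.eval package (files 9-17 above), and the old private modules are deleted (files 20-28, shown in full below). A minimal sketch of what that means for import paths, assuming the new uipath/eval/evaluators/__init__.py re-exports the evaluator classes under their old names (the new module contents are not shown in this diff):

# Old location (2.1.52, private CLI package, deleted in 2.1.53):
#   from uipath._cli._evals._evaluators import ExactMatchEvaluator
# New location (2.1.53, public package; the exported names are an assumption):
#   from uipath.eval.evaluators import ExactMatchEvaluator

import importlib.util

# Both parent packages exist per the file list above, so find_spec() will not
# raise; on 2.1.53 the first module should be gone and the second present.
for module in ("uipath._cli._evals._evaluators", "uipath.eval.evaluators"):
    spec = importlib.util.find_spec(module)
    print(module, "present" if spec else "absent")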
uipath/_cli/_evals/_evaluators/__init__.py (deleted)
@@ -1,22 +0,0 @@
- """Evaluators package for the evaluation system.
-
- This package contains all evaluator types and the factory for creating them.
- """
-
- from ._deterministic_evaluator_base import DeterministicEvaluatorBase
- from ._evaluator_base import EvaluatorBase
- from ._evaluator_factory import EvaluatorFactory
- from ._exact_match_evaluator import ExactMatchEvaluator
- from ._json_similarity_evaluator import JsonSimilarityEvaluator
- from ._llm_as_judge_evaluator import LlmAsAJudgeEvaluator
- from ._trajectory_evaluator import TrajectoryEvaluator
-
- __all__ = [
-     "EvaluatorBase",
-     "DeterministicEvaluatorBase",
-     "EvaluatorFactory",
-     "JsonSimilarityEvaluator",
-     "ExactMatchEvaluator",
-     "LlmAsAJudgeEvaluator",
-     "TrajectoryEvaluator",
- ]
uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py (deleted)
@@ -1,46 +0,0 @@
- import copy
- import json
- from abc import ABC
- from typing import Any, Dict, Tuple
-
- from ._evaluator_base import EvaluatorBase
-
-
- class DeterministicEvaluatorBase(EvaluatorBase, ABC):
-     def __init__(self, target_output_key: str = "*"):
-         super().__init__()
-         self.target_output_key = target_output_key
-
-     def _select_targets(
-         self, expected_output: Dict[str, Any], actual_output: Dict[str, Any]
-     ) -> Tuple[Any, Any]:
-         actual_output_copy = copy.deepcopy(actual_output)
-         expected_output_copy = copy.deepcopy(expected_output)
-         if self.target_output_key != "*":
-             if (
-                 self.target_output_key not in actual_output
-                 or self.target_output_key not in expected_output
-             ):
-                 raise ValueError(
-                     f"Field '{self.target_output_key}' missing from expected or actual output"
-                 )
-             actual_output_copy = actual_output_copy[self.target_output_key]
-             expected_output_copy = expected_output[self.target_output_key]
-         return actual_output_copy, expected_output_copy
-
-     def _canonical_json(self, obj: Any) -> str:
-         return json.dumps(
-             self._normalize_numbers(obj),
-             sort_keys=True,
-             separators=(",", ":"),
-             ensure_ascii=False,
-         )
-
-     def _normalize_numbers(self, obj: Any) -> Any:
-         if isinstance(obj, dict):
-             return {k: self._normalize_numbers(v) for k, v in obj.items()}
-         if isinstance(obj, (list, tuple)):
-             return [self._normalize_numbers(v) for v in obj]
-         if isinstance(obj, (int, float)) and not isinstance(obj, bool):
-             return float(obj)
-         return obj
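For reference, the comparison semantics the deleted base class provided (and which the new uipath/eval/evaluators/deterministic_evaluator_base.py presumably carries forward) are: sort keys, drop whitespace, and coerce every non-bool number to float before serializing. A self-contained sketch mirroring the removed methods:

import json

def normalize_numbers(obj):
    # Mirrors the removed _normalize_numbers: every non-bool int/float becomes
    # a float, so 1 and 1.0 canonicalize to the same value.
    if isinstance(obj, dict):
        return {k: normalize_numbers(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [normalize_numbers(v) for v in obj]
    if isinstance(obj, (int, float)) and not isinstance(obj, bool):
        return float(obj)
    return obj

def canonical_json(obj):
    # Mirrors the removed _canonical_json: stable key order, compact separators.
    return json.dumps(normalize_numbers(obj), sort_keys=True,
                      separators=(",", ":"), ensure_ascii=False)

# Key order and int-vs-float differences do not affect the comparison:
assert canonical_json({"count": 1, "items": [2]}) == canonical_json({"items": [2.0], "count": 1.0})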
uipath/_cli/_evals/_evaluators/_evaluator_base.py (deleted)
@@ -1,124 +0,0 @@
- import functools
- import time
- from abc import ABC, abstractmethod
- from dataclasses import dataclass
- from typing import Any, Dict
-
- from uipath._cli._evals._models import (
-     EvaluationResult,
-     EvaluatorCategory,
-     EvaluatorType,
- )
-
-
- def measure_execution_time(func):
-     """Decorator to measure execution time and update EvaluationResult.evaluation_time."""
-
-     @functools.wraps(func)
-     async def wrapper(*args, **kwargs) -> EvaluationResult:
-         start_time = time.time()
-         result = await func(*args, **kwargs)
-         end_time = time.time()
-         execution_time = end_time - start_time
-
-         result.evaluation_time = execution_time
-         return result
-
-     return wrapper
-
-
- @dataclass
- class EvaluatorBaseParams:
-     """Parameters for initializing the base evaluator."""
-
-     evaluator_id: str
-     category: EvaluatorCategory
-     evaluator_type: EvaluatorType
-     name: str
-     description: str
-     created_at: str
-     updated_at: str
-     target_output_key: str
-
-
- class EvaluatorBase(ABC):
-     """Abstract base class for all evaluators."""
-
-     def __init__(self):
-         # initialization done via 'from_params' function
-         self.id: str
-         self.name: str
-         self.description: str
-         self.created_at: str
-         self.updated_at: str
-         self.category: EvaluatorCategory
-         self.type: EvaluatorType
-         self.target_output_key: str
-         pass
-
-     @classmethod
-     def from_params(cls, params: EvaluatorBaseParams, **kwargs):
-         """Initialize the base evaluator from parameters.
-
-         Args:
-             params: EvaluatorBaseParams containing base configuration
-             **kwargs: Additional specific parameters for concrete evaluators
-
-         Returns:
-             Initialized evaluator instance
-         """
-         instance = cls(**kwargs)
-         instance.id = params.evaluator_id
-         instance.category = params.category
-         instance.type = params.evaluator_type
-         instance.name = params.name
-         instance.description = params.description
-         instance.created_at = params.created_at
-         instance.updated_at = params.updated_at
-         instance.target_output_key = params.target_output_key
-         return instance
-
-     @measure_execution_time
-     @abstractmethod
-     async def evaluate(
-         self,
-         evaluation_id: str,
-         evaluation_name: str,
-         input_data: Dict[str, Any],
-         expected_output: Dict[str, Any],
-         actual_output: Dict[str, Any],
-     ) -> EvaluationResult:
-         """Evaluate the given data and return a result.
-
-         Args:
-             evaluation_id: The ID of the evaluation being processed
-             evaluation_name: The name of the evaluation
-             input_data: The input data for the evaluation
-             expected_output: The expected output
-             actual_output: The actual output from the agent
-
-         Returns:
-             EvaluationResult containing the score and details
-         """
-         pass
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Convert the evaluator instance to a dictionary representation.
-
-         Returns:
-             Dict[str, Any]: Dictionary containing all evaluator properties
-         """
-         return {
-             "id": self.id,
-             "name": self.name,
-             "description": self.description,
-             "created_at": self.created_at,
-             "updated_at": self.updated_at,
-             "category": self.category.name if self.category else None,
-             "type": self.type.name if self.type else None,
-             "target_output_key": self.target_output_key,
-         }
-
-     def __repr__(self) -> str:
-         """String representation of the evaluator."""
-         return f"{self.__class__.__name__}(id='{self.id}', name='{self.name}', category={self.category.name})"
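Two pieces of the deleted base module are worth calling out: measure_execution_time stamps the elapsed wall-clock time onto the result returned by the async evaluate call, and from_params copies the shared metadata onto a freshly constructed subclass instance. A self-contained illustration of the decorator pattern, using a stand-in result type instead of the real EvaluationResult:

import asyncio
import functools
import time
from dataclasses import dataclass

@dataclass
class FakeResult:
    # Stand-in for EvaluationResult, for illustration only.
    score: float
    evaluation_time: float = 0.0

def measure_execution_time(func):
    # Same shape as the removed decorator: await the evaluator, then record
    # the elapsed time on the result it returned.
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start = time.time()
        result = await func(*args, **kwargs)
        result.evaluation_time = time.time() - start
        return result
    return wrapper

@measure_execution_time
async def evaluate() -> FakeResult:
    await asyncio.sleep(0.01)  # simulate evaluator work
    return FakeResult(score=1.0)

print(asyncio.run(evaluate()).evaluation_time > 0)  # True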
uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py (deleted)
@@ -1,40 +0,0 @@
- import copy
- from typing import Any, Dict
-
- from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
-     DeterministicEvaluatorBase,
- )
- from uipath._cli._evals._models import EvaluationResult
- from uipath._cli._evals._models._evaluators import ScoreType
-
-
- class ExactMatchEvaluator(DeterministicEvaluatorBase):
-     async def evaluate(
-         self,
-         evaluation_id: str,
-         evaluation_name: str,
-         input_data: Dict[str, Any],
-         expected_output: Dict[str, Any],
-         actual_output: Dict[str, Any],
-     ) -> EvaluationResult:
-         actual_output_copy = copy.deepcopy(actual_output)
-         expected_output_copy = copy.deepcopy(expected_output)
-
-         actual_output, expected_output = self._select_targets(
-             expected_output, actual_output
-         )
-         are_equal = self._canonical_json(actual_output) == self._canonical_json(
-             expected_output
-         )
-
-         return EvaluationResult(
-             evaluation_id=evaluation_id,
-             evaluation_name=evaluation_name,
-             evaluator_id=self.id,
-             evaluator_name=self.name,
-             score=are_equal,
-             input=input_data,
-             expected_output=expected_output_copy,
-             actual_output=actual_output_copy,
-             score_type=ScoreType.BOOLEAN,
-         )
uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py (deleted)
@@ -1,183 +0,0 @@
- import json
- from typing import Any, Dict
-
- from ...._config import Config
- from ...._execution_context import ExecutionContext
- from ...._services.llm_gateway_service import UiPathLlmChatService
- from ...._utils.constants import (
-     ENV_BASE_URL,
-     ENV_UIPATH_ACCESS_TOKEN,
-     ENV_UNATTENDED_USER_ACCESS_TOKEN,
-     COMMUNITY_agents_SUFFIX,
- )
- from .._models import EvaluationResult, LLMResponse
- from .._models._evaluators import ScoreType
- from ._evaluator_base import EvaluatorBase
-
-
- class LlmAsAJudgeEvaluator(EvaluatorBase):
-     """Evaluator that uses an LLM to judge the quality of outputs."""
-
-     def __init__(self, prompt: str = "", model: str = "", target_output_key: str = "*"):
-         """Initialize the LLM-as-a-judge evaluator.
-
-         Args:
-             prompt: The prompt template for the LLM
-             model: The model to use for evaluation
-             target_output_key: Key in output to evaluate ("*" for entire output)
-         """
-         super().__init__()
-         self.actual_output_placeholder = "{{ActualOutput}}"
-         self.expected_output_placeholder = "{{ExpectedOutput}}"
-         self._initialize_llm()
-         self.prompt = prompt
-         self.model = model
-         self.target_output_key: str = target_output_key
-
-     def _initialize_llm(self):
-         """Initialize the LLM used for evaluation."""
-         import os
-
-         base_url_value: str = os.getenv(ENV_BASE_URL)  # type: ignore
-         secret_value: str = os.getenv(ENV_UNATTENDED_USER_ACCESS_TOKEN) or os.getenv(
-             ENV_UIPATH_ACCESS_TOKEN
-         )  # type: ignore
-         config = Config(
-             base_url=base_url_value,
-             secret=secret_value,
-         )
-         self.llm = UiPathLlmChatService(config, ExecutionContext())
-
-     async def evaluate(
-         self,
-         evaluation_id: str,
-         evaluation_name: str,
-         input_data: Dict[str, Any],
-         expected_output: Dict[str, Any],
-         actual_output: Dict[str, Any],
-     ) -> EvaluationResult:
-         """Evaluate using an LLM as a judge.
-
-         Args:
-             evaluation_id: The ID of the evaluation being processed
-             evaluation_name: The name of the evaluation
-             input_data: The input data for the evaluation
-             expected_output: The expected output
-             actual_output: The actual output from the agent
-
-         Returns:
-             EvaluationResult containing the score and details
-         """
-         # Extract the target value to evaluate
-         target_value = self._extract_target_value(actual_output)
-         expected_value = self._extract_target_value(expected_output)
-
-         # Create the evaluation prompt
-         evaluation_prompt = self._create_evaluation_prompt(expected_value, target_value)
-
-         llm_response = await self._get_llm_response(evaluation_prompt)
-
-         return EvaluationResult(
-             evaluation_id=evaluation_id,
-             evaluation_name=evaluation_name,
-             evaluator_id=self.id,
-             evaluator_name=self.name,
-             score=llm_response.score,
-             input=input_data,
-             expected_output=expected_output,
-             actual_output=actual_output,
-             details=llm_response.justification,
-             score_type=ScoreType.NUMERICAL,
-         )
-
-     def _extract_target_value(self, output: Dict[str, Any]) -> Any:
-         """Extract the target value from output based on target_output_key."""
-         if self.target_output_key == "*":
-             return output
-
-         # Handle nested keys
-         keys = self.target_output_key.split(".")
-         value = output
-
-         try:
-             for key in keys:
-                 if isinstance(value, dict):
-                     value = value[key]
-                 else:
-                     return None
-             return value
-         except (KeyError, TypeError):
-             return None
-
-     def _create_evaluation_prompt(
-         self, expected_output: Any, actual_output: Any
-     ) -> str:
-         """Create the evaluation prompt for the LLM."""
-         formatted_prompt = self.prompt.replace(
-             self.actual_output_placeholder,
-             str(actual_output),
-         )
-         formatted_prompt = formatted_prompt.replace(
-             self.expected_output_placeholder,
-             str(expected_output),
-         )
-
-         return formatted_prompt
-
-     async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
-         """Get response from the LLM.
-
-         Args:
-             evaluation_prompt: The formatted prompt to send to the LLM
-
-         Returns:
-             LLMResponse with score and justification
-         """
-         try:
-             # remove community-agents suffix from llm model name
-             model = self.model
-             if model.endswith(COMMUNITY_agents_SUFFIX):
-                 model = model.replace(COMMUNITY_agents_SUFFIX, "")
-
-             # Prepare the request
-             request_data = {
-                 "model": model,
-                 "messages": [{"role": "user", "content": evaluation_prompt}],
-                 "response_format": {
-                     "type": "json_schema",
-                     "json_schema": {
-                         "name": "evaluation_response",
-                         "schema": {
-                             "type": "object",
-                             "properties": {
-                                 "score": {
-                                     "type": "number",
-                                     "minimum": 0,
-                                     "maximum": 100,
-                                     "description": "Score between 0 and 100",
-                                 },
-                                 "justification": {
-                                     "type": "string",
-                                     "description": "Explanation for the score",
-                                 },
-                             },
-                             "required": ["score", "justification"],
-                         },
-                     },
-                 },
-             }
-
-             response = await self.llm.chat_completions(**request_data)
-
-             try:
-                 return LLMResponse(**json.loads(response.choices[-1].message.content))
-             except (json.JSONDecodeError, ValueError) as e:
-                 return LLMResponse(
-                     score=0.0, justification=f"Error parsing LLM response: {str(e)}"
-                 )
-
-         except Exception as e:
-             # Fallback in case of any errors
-             return LLMResponse(
-                 score=0.0, justification=f"Error during LLM evaluation: {str(e)}"
-             )
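The removed LLM-as-a-judge evaluator builds its prompt by plain string substitution of the {{ActualOutput}} and {{ExpectedOutput}} placeholders and constrains the response to a JSON object with a 0-100 score and a justification via response_format. A standalone sketch of just the prompt-formatting step (no LLM call):

ACTUAL_PLACEHOLDER = "{{ActualOutput}}"
EXPECTED_PLACEHOLDER = "{{ExpectedOutput}}"

def create_evaluation_prompt(template: str, expected_output, actual_output) -> str:
    # Mirrors the removed _create_evaluation_prompt: naive str() substitution,
    # no JSON serialization or escaping of the substituted values.
    return (template
            .replace(ACTUAL_PLACEHOLDER, str(actual_output))
            .replace(EXPECTED_PLACEHOLDER, str(expected_output)))

template = (
    "Rate how well the actual output " + ACTUAL_PLACEHOLDER
    + " matches the expected output " + EXPECTED_PLACEHOLDER + "."
)
print(create_evaluation_prompt(template, {"answer": 41}, {"answer": 42}))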
uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py (deleted)
@@ -1,48 +0,0 @@
- from typing import Any, Dict
-
- from .._models import EvaluationResult
- from ._evaluator_base import EvaluatorBase
-
-
- class TrajectoryEvaluator(EvaluatorBase):
-     """Evaluator that analyzes the trajectory/path taken to reach outputs."""
-
-     def __init__(
-         self,
-         trajectory_config: Dict[str, Any],
-         step_weights: Dict[str, float],
-         target_output_key: str = "*",
-     ):
-         """Initialize the trajectory evaluator.
-
-         Args:
-             trajectory_config: Configuration for trajectory analysis
-             step_weights: Weights for different steps in the trajectory
-             target_output_key: Key in output to evaluate ("*" for entire output)
-         """
-         super().__init__()
-         self.trajectory_config = trajectory_config or {}
-         self.step_weights = step_weights or {}
-         self.target_output_key = target_output_key
-
-     async def evaluate(
-         self,
-         evaluation_id: str,
-         evaluation_name: str,
-         input_data: Dict[str, Any],
-         expected_output: Dict[str, Any],
-         actual_output: Dict[str, Any],
-     ) -> EvaluationResult:
-         """Evaluate using trajectory analysis.
-
-         Args:
-             evaluation_id: The ID of the evaluation being processed
-             evaluation_name: The name of the evaluation
-             input_data: The input data for the evaluation
-             expected_output: The expected output
-             actual_output: The actual output from the agent
-
-         Returns:
-             EvaluationResult containing the score and details
-         """
-         raise NotImplementedError()
uipath/_cli/_evals/_models/__init__.py (deleted)
@@ -1,18 +0,0 @@
- from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationSet
- from uipath._cli._evals._models._evaluators import (
-     EvaluationResult,
-     EvaluationSetResult,
-     EvaluatorCategory,
-     EvaluatorType,
-     LLMResponse,
- )
-
- __all__ = [
-     "LLMResponse",
-     "EvaluatorCategory",
-     "EvaluatorType",
-     "EvaluationResult",
-     "EvaluationSetResult",
-     "EvaluationItem",
-     "EvaluationSet",
- ]
uipath/_cli/_evals/_models/_agent_execution_output.py (deleted)
@@ -1,14 +0,0 @@
- from opentelemetry.sdk.trace import ReadableSpan
- from pydantic import BaseModel, ConfigDict
-
- from uipath._cli._runtime._contracts import UiPathRuntimeResult
-
-
- class UiPathEvalRunExecutionOutput(BaseModel):
-     """Result of a single agent response."""
-
-     model_config = ConfigDict(arbitrary_types_allowed=True)
-
-     execution_time: float
-     spans: list[ReadableSpan]
-     result: UiPathRuntimeResult
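The deleted model carries raw OpenTelemetry ReadableSpan objects, which is why it sets arbitrary_types_allowed=True; without it, pydantic v2 rejects the annotation at class-definition time. Its replacement appears to be uipath/_cli/_evals/_models/_output.py (file 4 above). A minimal sketch of the same pattern with a hypothetical Span stand-in (to avoid depending on opentelemetry here):

from pydantic import BaseModel, ConfigDict

class Span:
    # Hypothetical stand-in for opentelemetry.sdk.trace.ReadableSpan.
    pass

class ExecutionOutput(BaseModel):
    # Without arbitrary_types_allowed=True, pydantic cannot build a schema
    # for the plain Span class and raises at class definition.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    execution_time: float
    spans: list[Span]

print(ExecutionOutput(execution_time=0.42, spans=[Span()]).execution_time)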