uipath 2.1.52__py3-none-any.whl → 2.1.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
  2. uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
  3. uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
  4. uipath/_cli/_evals/_models/_output.py +85 -0
  5. uipath/_cli/_evals/_runtime.py +102 -10
  6. uipath/_cli/_runtime/_contracts.py +11 -2
  7. uipath/_cli/_utils/_eval_set.py +1 -1
  8. uipath/_cli/_utils/_studio_project.py +30 -29
  9. uipath/_cli/cli_eval.py +46 -61
  10. uipath/eval/evaluators/__init__.py +15 -0
  11. uipath/eval/evaluators/base_evaluator.py +88 -0
  12. uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
  13. uipath/eval/evaluators/exact_match_evaluator.py +37 -0
  14. uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
  15. uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
  16. uipath/eval/evaluators/trajectory_evaluator.py +36 -0
  17. uipath/eval/models/__init__.py +19 -0
  18. uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
  19. {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/METADATA +1 -1
  20. {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/RECORD +23 -23
  21. uipath/_cli/_evals/_evaluators/__init__.py +0 -22
  22. uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
  23. uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
  24. uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
  25. uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
  26. uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
  27. uipath/_cli/_evals/_models/__init__.py +0 -18
  28. uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
  29. uipath/_cli/_evals/progress_reporter.py +0 -304
  30. {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/WHEEL +0 -0
  31. {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/entry_points.txt +0 -0
  32. {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/cli_eval.py CHANGED
@@ -2,7 +2,6 @@
 import ast
 import asyncio
 import os
-from datetime import datetime, timezone
 from typing import List, Optional
 
 import click
@@ -13,7 +12,7 @@ from uipath._cli._runtime._contracts import (
     UiPathRuntimeFactory,
 )
 from uipath._cli._runtime._runtime import UiPathScriptRuntime
-from uipath._cli.middlewares import MiddlewareResult, Middlewares
+from uipath._cli.middlewares import Middlewares
 from uipath.eval._helpers import auto_discover_entrypoint
 
 from .._utils.constants import ENV_JOB_ID
@@ -32,55 +31,6 @@ class LiteralOption(click.Option):
             raise click.BadParameter(value) from e
 
 
-def eval_agent_middleware(
-    entrypoint: Optional[str] = None,
-    eval_set: Optional[str] = None,
-    eval_ids: Optional[List[str]] = None,
-    workers: int = 8,
-    no_report: bool = False,
-    **kwargs,
-) -> MiddlewareResult:
-    """Middleware to run an evaluation set against the agent."""
-    timestamp = datetime.now(timezone.utc).strftime("%M-%H-%d-%m-%Y")
-
-    eval_context = UiPathEvalContext.with_defaults()
-    eval_context.no_report = no_report
-    eval_context.workers = workers
-    eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
-    eval_context.eval_ids = eval_ids
-    eval_context.execution_output_file = (
-        f"evals/results/{timestamp}.json" if not os.getenv("UIPATH_JOB_KEY") else None
-    )
-
-    runtime_entrypoint = entrypoint or auto_discover_entrypoint()
-
-    def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext:
-        runtime_context = UiPathRuntimeContext.with_defaults(**context_kwargs)
-        runtime_context.entrypoint = runtime_entrypoint
-        return runtime_context
-
-    try:
-        runtime_factory = UiPathRuntimeFactory(
-            UiPathScriptRuntime,
-            UiPathRuntimeContext,
-            context_generator=generate_runtime_context,
-        )
-
-        async def execute():
-            async with UiPathEvalRuntime.from_eval_context(
-                factory=runtime_factory, context=eval_context
-            ) as eval_runtime:
-                await eval_runtime.execute()
-
-        asyncio.run(execute())
-        return MiddlewareResult(should_continue=False)
-
-    except Exception as e:
-        return MiddlewareResult(
-            should_continue=False, error_message=f"Error running evaluation: {str(e)}"
-        )
-
-
 @click.command()
 @click.argument("entrypoint", required=False)
 @click.argument("eval_set", required=False)
@@ -97,6 +47,12 @@ def eval_agent_middleware(
     default=8,
     help="Number of parallel workers for running evaluations (default: 8)",
 )
+@click.option(
+    "--output-file",
+    required=False,
+    type=click.Path(exists=False),
+    help="File path where the output will be written",
+)
 @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None)
 def eval(
     entrypoint: Optional[str],
@@ -104,6 +60,7 @@ def eval(
     eval_ids: List[str],
     no_report: bool,
     workers: int,
+    output_file: Optional[str],
 ) -> None:
     """Run an evaluation set against the agent.
 
@@ -121,21 +78,49 @@ def eval(
         eval_ids,
         no_report=no_report,
         workers=workers,
+        execution_output_file=output_file,
     )
 
-    if result.should_continue:
-        result = eval_agent_middleware(
-            entrypoint=entrypoint,
-            eval_set=eval_set,
-            eval_ids=eval_ids,
-            workers=workers,
-            no_report=no_report,
-        )
-    if result.should_continue:
-        console.error("Could not process the request with any available handler.")
     if result.error_message:
         console.error(result.error_message)
 
+    if result.should_continue:
+
+        def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext:
+            runtime_context = UiPathRuntimeContext.with_defaults(**context_kwargs)
+            runtime_context.entrypoint = runtime_entrypoint
+            return runtime_context
+
+        eval_context = UiPathEvalContext.with_defaults(
+            execution_output_file=output_file
+        )
+
+        eval_context.no_report = no_report
+        eval_context.workers = workers
+        eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
+        eval_context.eval_ids = eval_ids
+
+        runtime_entrypoint = entrypoint or auto_discover_entrypoint()
+
+        try:
+            runtime_factory = UiPathRuntimeFactory(
+                UiPathScriptRuntime,
+                UiPathRuntimeContext,
+                context_generator=generate_runtime_context,
+            )
+
+            async def execute():
+                async with UiPathEvalRuntime.from_eval_context(
+                    factory=runtime_factory, context=eval_context
+                ) as eval_runtime:
+                    await eval_runtime.execute()
+
+            asyncio.run(execute())
+        except Exception as e:
+            console.error(
+                f"Error: Unexpected error occurred - {str(e)}", include_traceback=True
+            )
+
     console.success("Evaluation completed successfully")
 
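The `eval_agent_middleware` fallback is removed: the command now builds the eval context and runtime factory inline, and the new `--output-file` option replaces the old timestamped `evals/results/{timestamp}.json` default. A quick way to exercise the reworked command in-process, as a minimal sketch (assumes it runs from a project directory where entrypoint and eval-set auto-discovery succeed):

# Hedged smoke test for the reworked command; the project layout is assumed,
# not part of this diff.
from click.testing import CliRunner

from uipath._cli.cli_eval import eval as eval_cmd

runner = CliRunner()
result = runner.invoke(eval_cmd, ["--workers", "4", "--output-file", "results.json"])
print(result.exit_code, result.output)
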
uipath/eval/evaluators/__init__.py ADDED
@@ -0,0 +1,15 @@
+"""UiPath evaluator implementations for agent performance evaluation."""
+
+from .base_evaluator import BaseEvaluator
+from .exact_match_evaluator import ExactMatchEvaluator
+from .json_similarity_evaluator import JsonSimilarityEvaluator
+from .llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from .trajectory_evaluator import TrajectoryEvaluator
+
+__all__ = [
+    "BaseEvaluator",
+    "ExactMatchEvaluator",
+    "JsonSimilarityEvaluator",
+    "LlmAsAJudgeEvaluator",
+    "TrajectoryEvaluator",
+]
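Per the RECORD changes, this package replaces the private `uipath._cli._evals._evaluators` modules, so the same classes now sit on a public import path:

# The evaluators previously under uipath._cli._evals._evaluators are now
# importable from the public package path exported above.
from uipath.eval.evaluators import (
    ExactMatchEvaluator,
    JsonSimilarityEvaluator,
    LlmAsAJudgeEvaluator,
)
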
uipath/eval/evaluators/base_evaluator.py ADDED
@@ -0,0 +1,88 @@
+"""Base evaluator abstract class for agent evaluation."""
+
+import functools
+import time
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict
+
+from uipath.eval.models import EvaluationResult
+from uipath.eval.models.models import (
+    AgentExecution,
+    ErrorEvaluationResult,
+    EvaluatorCategory,
+    EvaluatorType,
+)
+
+
+def track_evaluation_metrics(func):
+    """Decorator to track evaluation metrics and handle errors gracefully."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs) -> EvaluationResult:
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as e:
+            result = ErrorEvaluationResult(
+                details="Exception thrown by evaluator: {}".format(e),
+                evaluation_time=time.time() - start_time,
+            )
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+T = TypeVar("T")
+
+
+class BaseEvaluator(BaseModel, Generic[T], ABC):
+    """Abstract base class for all evaluators."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    id: str
+    name: str
+    description: str
+    target_output_key: str = "*"
+    created_at: str
+    updated_at: str
+    category: EvaluatorCategory
+    evaluator_type: EvaluatorType
+
+    def __init_subclass__(cls, **kwargs):
+        """Hook for subclass creation - automatically applies evaluation metrics tracking."""
+        super().__init_subclass__(**kwargs)
+
+        if hasattr(cls, "evaluate") and not getattr(
+            cls.evaluate, "_has_metrics_decorator", False
+        ):
+            cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
+            cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]
+
+    def model_post_init(self, __context):
+        """Post-initialization hook for Pydantic models."""
+        pass
+
+    @abstractmethod
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
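The `__init_subclass__` hook gives subclass authors timing and error capture for free: each concrete `evaluate` is wrapped exactly once, and the `_has_metrics_decorator` flag keeps an inherited, already-wrapped `evaluate` from being wrapped again. A self-contained sketch of the pattern (stdlib only, with simplified metrics in place of the result types above):

# Standalone illustration of the auto-decoration pattern used by BaseEvaluator.
import asyncio
import functools
import time


def with_timing(func):
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start = time.time()
        result = await func(*args, **kwargs)
        print(f"{func.__qualname__} took {time.time() - start:.4f}s")
        return result

    wrapper._has_metrics_decorator = True
    return wrapper


class Base:
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Wrap evaluate once per class; inherited wrapped methods carry the flag.
        if hasattr(cls, "evaluate") and not getattr(
            cls.evaluate, "_has_metrics_decorator", False
        ):
            cls.evaluate = with_timing(cls.evaluate)


class Demo(Base):
    async def evaluate(self):
        return 42


print(asyncio.run(Demo().evaluate()))  # prints the timing line, then 42
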
uipath/eval/evaluators/deterministic_evaluator_base.py ADDED
@@ -0,0 +1,53 @@
+"""Base class for deterministic evaluators that provide consistent outputs."""
+
+import json
+from abc import ABC
+from typing import Any, TypeVar
+
+from .base_evaluator import BaseEvaluator
+
+T = TypeVar("T")
+
+
+class DeterministicEvaluatorBase(BaseEvaluator[T], ABC):
+    """Base class for evaluators that produce deterministic, reproducible results.
+
+    This class provides utility methods for canonical JSON comparison and number normalization
+    to ensure consistent evaluation results across runs.
+    """
+
+    def _canonical_json(self, obj: Any) -> str:
+        """Convert an object to canonical JSON string for consistent comparison.
+
+        Args:
+            obj: The object to convert to canonical JSON
+
+        Returns:
+            str: Canonical JSON string with normalized numbers and sorted keys
+        """
+        return json.dumps(
+            self._normalize_numbers(obj),
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+        )
+
+    def _normalize_numbers(self, obj: Any) -> Any:
+        """Recursively normalize numbers in nested data structures.
+
+        Converts all numeric values (int, float) to float for consistent comparison,
+        while preserving booleans and other data types.
+
+        Args:
+            obj: The object to normalize
+
+        Returns:
+            Any: Object with normalized numbers
+        """
+        if isinstance(obj, dict):
+            return {k: self._normalize_numbers(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._normalize_numbers(v) for v in obj]
+        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
+            return float(obj)
+        return obj
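What these two helpers guarantee, shown standalone with the same logic lifted out of the Pydantic model: key order, int/float width, and list vs. tuple all canonicalize away, while booleans survive untouched.

# Free-function copy of the normalization above, for illustration only.
import json


def normalize(obj):
    if isinstance(obj, dict):
        return {k: normalize(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [normalize(v) for v in obj]
    if isinstance(obj, (int, float)) and not isinstance(obj, bool):
        return float(obj)
    return obj


def canonical(obj):
    return json.dumps(normalize(obj), sort_keys=True, separators=(",", ":"), ensure_ascii=False)


# Key order, 1 vs 1.0, and tuple vs list all produce the same canonical string.
assert canonical({"a": 1, "b": [1, 2]}) == canonical({"b": (1, 2.0), "a": 1.0})
# Booleans are preserved, not coerced to 1.0.
assert canonical(True) != canonical(1.0)
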
uipath/eval/evaluators/exact_match_evaluator.py ADDED
@@ -0,0 +1,37 @@
+"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
+
+from typing import Any
+
+from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
+
+from ..models.models import AgentExecution
+from .deterministic_evaluator_base import DeterministicEvaluatorBase
+
+
+class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Evaluator that performs exact structural matching between expected and actual outputs.
+
+    This evaluator returns True if the actual output exactly matches the expected output
+    after canonical JSON normalization, and False otherwise. Numbers are normalized
+    to floats for consistent comparison.
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate whether actual output exactly matches expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating exact match (True/False)
+        """
+        return BooleanEvaluationResult(
+            score=self._canonical_json(agent_execution.agent_output)
+            == self._canonical_json(evaluation_criteria)
+        )
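The match rule reduces to string equality of the canonical JSON, so it tolerates representation differences but nothing semantic. A self-contained sketch of the same rule (normalization copied from DeterministicEvaluatorBase; not the shipped class):

# Hedged reduction of ExactMatchEvaluator.evaluate to a free function.
import json


def canon(obj):
    if isinstance(obj, dict):
        return {k: canon(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [canon(v) for v in obj]
    if isinstance(obj, (int, float)) and not isinstance(obj, bool):
        return float(obj)
    return obj


def exact_match(actual, expected) -> bool:
    dump = lambda o: json.dumps(canon(o), sort_keys=True, separators=(",", ":"))
    return dump(actual) == dump(expected)


print(exact_match({"total": 10}, {"total": 10.0}))  # True: numbers normalize
print(exact_match({"total": "10"}, {"total": 10}))  # False: strings stay strings
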
uipath/_cli/_evals/_evaluators/_json_similarity_evaluator.py → uipath/eval/evaluators/json_similarity_evaluator.py RENAMED
@@ -1,16 +1,18 @@
-import copy
+"""JSON similarity evaluator for flexible structural comparison of outputs."""
+
 import math
-from typing import Any, Dict, Tuple
+from typing import Any, Tuple, TypeVar
+
+from uipath.eval.models import EvaluationResult, NumericEvaluationResult
+
+from ..models.models import AgentExecution
+from .deterministic_evaluator_base import DeterministicEvaluatorBase
 
-from uipath._cli._evals._evaluators._deterministic_evaluator_base import (
-    DeterministicEvaluatorBase,
-)
-from uipath._cli._evals._models import EvaluationResult
-from uipath._cli._evals._models._evaluators import ScoreType
+T = TypeVar("T")
 
 
-class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
-    """Deterministic evaluator that scores structural JSON similarity.
+class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Deterministic evaluator that scores structural JSON similarity between expected and actual output.
 
     Compares expected versus actual JSON-like structures and returns a
     numerical score in the range [0, 100]. The comparison is token-based
@@ -18,43 +20,24 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
     """
 
     async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
     ) -> EvaluationResult:
         """Evaluate similarity between expected and actual JSON outputs.
 
-        Args:
-            evaluation_id: Unique identifier for this evaluation run.
-            evaluation_name: Human friendly evaluation name.
-            input_data: Input payload used to produce the outputs.
-            expected_output: Ground-truth JSON structure.
-            actual_output: Produced JSON structure to compare against the ground truth.
+        Uses token-based comparison with tolerance for numeric differences
+        and Levenshtein distance for string similarity.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate
 
         Returns:
-            EvaluationResult: Structured result with the numerical similarity score.
+            EvaluationResult: Numerical score between 0-100 indicating similarity
         """
-        actual_output_copy = copy.deepcopy(actual_output)
-        expected_output_copy = copy.deepcopy(expected_output)
-
-        actual_output, expected_output = self._select_targets(
-            expected_output, actual_output
-        )
-        similarity = self._compare_json(expected_output, actual_output)
-
-        return EvaluationResult(
-            evaluation_id=evaluation_id,
-            evaluation_name=evaluation_name,
-            evaluator_id=self.id,
-            evaluator_name=self.name,
-            score=similarity,
-            input=input_data,
-            expected_output=expected_output_copy,
-            actual_output=actual_output_copy,
-            score_type=ScoreType.NUMERICAL,
+        return NumericEvaluationResult(
+            score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
         )
 
     def _compare_json(self, expected: Any, actual: Any) -> float:
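The body of `_compare_json` is unchanged and truncated in this diff; its docstring names token-based comparison with numeric tolerance plus Levenshtein distance for strings. As an illustration of the string half only (a sketch, not the shipped implementation), a plain Levenshtein ratio scaled to the evaluator's 0-100 range:

# Classic dynamic-programming edit distance, kept to two rows of state.
def levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]


def string_similarity(a: str, b: str) -> float:
    if not a and not b:
        return 100.0
    return 100.0 * (1 - levenshtein(a, b) / max(len(a), len(b)))


print(string_similarity("invoice-2024", "invoice-2025"))  # one edit apart, ~91.7
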
uipath/eval/evaluators/llm_as_judge_evaluator.py ADDED
@@ -0,0 +1,137 @@
+"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
+
+import json
+from typing import Any, Optional
+
+from pydantic import field_validator
+
+from uipath.eval.models import NumericEvaluationResult
+
+from ..._services import UiPathLlmChatService
+from ..._utils.constants import COMMUNITY_agents_SUFFIX
+from ..models.models import AgentExecution, EvaluationResult, LLMResponse
+from .base_evaluator import BaseEvaluator
+
+
+class LlmAsAJudgeEvaluator(BaseEvaluator[dict[str, Any]]):
+    """Evaluator that uses an LLM to judge the quality of agent output."""
+
+    prompt: str
+    model: str
+    actual_output_placeholder: str = "{{ActualOutput}}"
+    expected_output_placeholder: str = "{{ExpectedOutput}}"
+    llm: Optional[UiPathLlmChatService] = None
+
+    @field_validator("prompt")
+    @classmethod
+    def validate_prompt_placeholders(cls, v: str) -> str:
+        """Validate that prompt contains required placeholders."""
+        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
+            raise ValueError(
+                "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders"
+            )
+        return v
+
+    def model_post_init(self, __context):
+        """Initialize the LLM service after model creation."""
+        super().model_post_init(__context)
+        self._initialize_llm()
+
+    def _initialize_llm(self):
+        """Initialize the LLM used for evaluation."""
+        from uipath import UiPath
+
+        uipath = UiPath()
+        self.llm = uipath.llm
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge.
+
+        Sends the formatted prompt to the configured LLM and expects a JSON response
+        with a numerical score (0-100) and justification.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score with LLM justification as details
+        """
+        # Create the evaluation prompt
+        evaluation_prompt = self._create_evaluation_prompt(
+            expected_output=evaluation_criteria,
+            actual_output=agent_execution.agent_output,
+        )
+
+        llm_response = await self._get_llm_response(evaluation_prompt)
+
+        return NumericEvaluationResult(
+            score=llm_response.score,
+            details=llm_response.justification,
+        )
+
+    def _create_evaluation_prompt(
+        self, expected_output: Any, actual_output: Any
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = self.prompt.replace(
+            self.actual_output_placeholder,
+            str(actual_output),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.expected_output_placeholder,
+            str(expected_output),
+        )
+
+        return formatted_prompt
+
+    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
+        """Get response from the LLM.
+
+        Args:
+            evaluation_prompt: The formatted prompt to send to the LLM
+
+        Returns:
+            LLMResponse with score and justification
+        """
+        # remove community-agents suffix from llm model name
+        model = self.model
+        if model.endswith(COMMUNITY_agents_SUFFIX):
+            model = model.replace(COMMUNITY_agents_SUFFIX, "")
+
+        # Prepare the request
+        request_data = {
+            "model": model,
+            "messages": [{"role": "user", "content": evaluation_prompt}],
+            "response_format": {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "evaluation_response",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "score": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 100,
+                                "description": "Score between 0 and 100",
+                            },
+                            "justification": {
+                                "type": "string",
+                                "description": "Explanation for the score",
+                            },
+                        },
+                        "required": ["score", "justification"],
+                    },
+                },
+            },
+        }
+
+        response = await self.llm.chat_completions(**request_data)  # type: ignore
+        return LLMResponse(**json.loads(response.choices[-1].message.content))
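The validator requires both default placeholders verbatim, and `_create_evaluation_prompt` fills them with plain `str.replace`, so a prompt only needs the two markers somewhere in its text. A sketch of the substitution with an illustrative prompt (the prompt text below is not part of the package):

# Hypothetical judge prompt using the default placeholder markers.
prompt = (
    "Compare the agent output to the reference answer and score it 0-100.\n"
    "Expected: {{ExpectedOutput}}\n"
    "Actual: {{ActualOutput}}"
)

filled = prompt.replace("{{ActualOutput}}", str({"status": "done"}))
filled = filled.replace("{{ExpectedOutput}}", str({"status": "done", "retries": 0}))
print(filled)

The `response_format` JSON schema then constrains the judge's reply to a `{score, justification}` object, which is parsed straight into `LLMResponse`.
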
uipath/eval/evaluators/trajectory_evaluator.py ADDED
@@ -0,0 +1,36 @@
+"""Trajectory evaluator for analyzing execution paths and decision sequences."""
+
+from typing import TypeVar
+
+from uipath.eval.models import EvaluationResult
+
+from ..models.models import AgentExecution
+from .base_evaluator import BaseEvaluator
+
+T = TypeVar("T")
+
+
+class TrajectoryEvaluator(BaseEvaluator[T]):
+    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis.
+
+        Analyzes the execution path and decision sequence taken by the agent
+        to assess the quality of the reasoning process.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Score based on trajectory analysis
+
+        Raises:
+            NotImplementedError: This evaluator is not yet implemented
+        """
+        raise NotImplementedError()
uipath/eval/models/__init__.py ADDED
@@ -0,0 +1,19 @@
+"""UiPath evaluation module for agent performance assessment."""
+
+from uipath.eval.models.models import (
+    BooleanEvaluationResult,
+    ErrorEvaluationResult,
+    EvalItemResult,
+    EvaluationResult,
+    NumericEvaluationResult,
+    ScoreType,
+)
+
+__all__ = [
+    "EvaluationResult",
+    "ScoreType",
+    "EvalItemResult",
+    "BooleanEvaluationResult",
+    "NumericEvaluationResult",
+    "ErrorEvaluationResult",
+]
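These re-exports mirror how the evaluators above construct their results. A minimal sketch, assuming `score` plus an optional `details` are the only required fields (the only constructions visible in this diff):

# Field set assumed from the evaluator code above, not from the models module.
from uipath.eval.models import BooleanEvaluationResult, NumericEvaluationResult

exact = BooleanEvaluationResult(score=True)
judged = NumericEvaluationResult(score=87.5, details="Close to the reference answer.")
print(exact.score, judged.score, judged.details)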