uipath 2.1.108-py3-none-any.whl → 2.1.109-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of uipath might be problematic.

Files changed (69)
  1. uipath/_cli/__init__.py +4 -0
  2. uipath/_cli/_evals/_console_progress_reporter.py +2 -2
  3. uipath/_cli/_evals/_evaluator_factory.py +314 -29
  4. uipath/_cli/_evals/_helpers.py +194 -0
  5. uipath/_cli/_evals/_models/_evaluation_set.py +73 -7
  6. uipath/_cli/_evals/_models/_evaluator.py +183 -9
  7. uipath/_cli/_evals/_models/_evaluator_base_params.py +3 -3
  8. uipath/_cli/_evals/_models/_output.py +87 -3
  9. uipath/_cli/_evals/_progress_reporter.py +288 -28
  10. uipath/_cli/_evals/_runtime.py +80 -26
  11. uipath/_cli/_evals/mocks/input_mocker.py +1 -3
  12. uipath/_cli/_evals/mocks/llm_mocker.py +2 -2
  13. uipath/_cli/_evals/mocks/mocker_factory.py +2 -2
  14. uipath/_cli/_evals/mocks/mockito_mocker.py +2 -2
  15. uipath/_cli/_evals/mocks/mocks.py +5 -3
  16. uipath/_cli/_push/models.py +17 -0
  17. uipath/_cli/_push/sw_file_handler.py +336 -3
  18. uipath/_cli/_templates/custom_evaluator.py.template +65 -0
  19. uipath/_cli/_utils/_eval_set.py +30 -9
  20. uipath/_cli/_utils/_resources.py +21 -0
  21. uipath/_cli/_utils/_studio_project.py +18 -0
  22. uipath/_cli/cli_add.py +114 -0
  23. uipath/_cli/cli_eval.py +5 -1
  24. uipath/_cli/cli_pull.py +11 -26
  25. uipath/_cli/cli_push.py +2 -0
  26. uipath/_cli/cli_register.py +45 -0
  27. uipath/_events/_events.py +6 -5
  28. uipath/_utils/constants.py +4 -0
  29. uipath/eval/_helpers/evaluators_helpers.py +494 -0
  30. uipath/eval/_helpers/helpers.py +30 -2
  31. uipath/eval/evaluators/__init__.py +60 -5
  32. uipath/eval/evaluators/base_evaluator.py +546 -44
  33. uipath/eval/evaluators/contains_evaluator.py +80 -0
  34. uipath/eval/evaluators/exact_match_evaluator.py +43 -12
  35. uipath/eval/evaluators/json_similarity_evaluator.py +41 -12
  36. uipath/eval/evaluators/legacy_base_evaluator.py +89 -0
  37. uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py} +2 -2
  38. uipath/eval/evaluators/legacy_exact_match_evaluator.py +37 -0
  39. uipath/eval/evaluators/legacy_json_similarity_evaluator.py +151 -0
  40. uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +137 -0
  41. uipath/eval/evaluators/{trajectory_evaluator.py → legacy_trajectory_evaluator.py} +5 -6
  42. uipath/eval/evaluators/llm_as_judge_evaluator.py +143 -78
  43. uipath/eval/evaluators/llm_judge_output_evaluator.py +112 -0
  44. uipath/eval/evaluators/llm_judge_trajectory_evaluator.py +142 -0
  45. uipath/eval/evaluators/output_evaluator.py +117 -0
  46. uipath/eval/evaluators/tool_call_args_evaluator.py +82 -0
  47. uipath/eval/evaluators/tool_call_count_evaluator.py +87 -0
  48. uipath/eval/evaluators/tool_call_order_evaluator.py +84 -0
  49. uipath/eval/evaluators/tool_call_output_evaluator.py +87 -0
  50. uipath/eval/evaluators_types/ContainsEvaluator.json +73 -0
  51. uipath/eval/evaluators_types/ExactMatchEvaluator.json +89 -0
  52. uipath/eval/evaluators_types/JsonSimilarityEvaluator.json +81 -0
  53. uipath/eval/evaluators_types/LLMJudgeOutputEvaluator.json +110 -0
  54. uipath/eval/evaluators_types/LLMJudgeSimulationTrajectoryEvaluator.json +88 -0
  55. uipath/eval/evaluators_types/LLMJudgeStrictJSONSimilarityOutputEvaluator.json +110 -0
  56. uipath/eval/evaluators_types/LLMJudgeTrajectoryEvaluator.json +88 -0
  57. uipath/eval/evaluators_types/ToolCallArgsEvaluator.json +131 -0
  58. uipath/eval/evaluators_types/ToolCallCountEvaluator.json +104 -0
  59. uipath/eval/evaluators_types/ToolCallOrderEvaluator.json +100 -0
  60. uipath/eval/evaluators_types/ToolCallOutputEvaluator.json +124 -0
  61. uipath/eval/evaluators_types/generate_types.py +31 -0
  62. uipath/eval/models/__init__.py +16 -1
  63. uipath/eval/models/llm_judge_types.py +196 -0
  64. uipath/eval/models/models.py +109 -7
  65. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/METADATA +1 -1
  66. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/RECORD +69 -37
  67. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/WHEEL +0 -0
  68. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/entry_points.txt +0 -0
  69. {uipath-2.1.108.dist-info → uipath-2.1.109.dist-info}/licenses/LICENSE +0 -0
uipath/eval/evaluators/contains_evaluator.py
@@ -0,0 +1,80 @@
+"""Contains evaluator for agent outputs."""
+
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .base_evaluator import BaseEvaluationCriteria
+from .output_evaluator import (
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
+
+
+class ContainsEvaluationCriteria(BaseEvaluationCriteria):
+    """Evaluation criteria for the contains evaluator."""
+
+    search_text: str
+
+
+class ContainsEvaluatorConfig(OutputEvaluatorConfig[ContainsEvaluationCriteria]):
+    """Configuration for the contains evaluator."""
+
+    name: str = "ContainsEvaluator"
+    case_sensitive: bool = False
+    negated: bool = False
+
+
+class ContainsEvaluator(
+    OutputEvaluator[ContainsEvaluationCriteria, ContainsEvaluatorConfig, type(None)]  # type: ignore
+):
+    """Evaluator that checks if the actual output contains the expected output.
+
+    This evaluator returns True if the actual output contains the expected output,
+    and False otherwise. It supports case sensitivity and negation options.
+    """
+
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.CONTAINS.value
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: ContainsEvaluationCriteria,
+    ) -> EvaluationResult:
+        """Evaluate whether actual output contains the expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - agent_output: The actual output from the agent
+                - agent_trace: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating if output contains expected value (True/False)
+        """
+        actual_output = str(self._get_actual_output(agent_execution))
+        expected_output = str(self._get_expected_output(evaluation_criteria))
+
+        if not self.evaluator_config.case_sensitive:
+            actual_output = actual_output.lower()
+            expected_output = expected_output.lower()
+
+        is_contains = expected_output in actual_output
+
+        if self.evaluator_config.negated:
+            is_contains = not is_contains
+        return NumericEvaluationResult(
+            score=float(is_contains),
+        )
+
+    def _get_expected_output(
+        self, evaluation_criteria: ContainsEvaluationCriteria
+    ) -> str:
+        """Get the expected output from the evaluation criteria."""
+        return evaluation_criteria.search_text
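
The new ContainsEvaluator reduces to a few string operations once the OutputEvaluator plumbing (_get_actual_output, evaluator_config) is stripped away. A minimal standalone sketch of those scoring rules, using illustrative inputs rather than the package's own classes:

# Sketch of the ContainsEvaluator scoring rules from the diff above;
# search_text, case_sensitive and negated mirror the criteria/config fields.
def contains_score(actual_output: str, search_text: str,
                   case_sensitive: bool = False, negated: bool = False) -> float:
    if not case_sensitive:
        actual_output = actual_output.lower()
        search_text = search_text.lower()
    is_contains = search_text in actual_output
    if negated:
        is_contains = not is_contains
    return float(is_contains)  # becomes NumericEvaluationResult.score in the evaluator

assert contains_score("Order 1234 was shipped", "order 1234") == 1.0
assert contains_score("Order 1234 was shipped", "order 1234", case_sensitive=True) == 0.0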
uipath/eval/evaluators/exact_match_evaluator.py
@@ -1,14 +1,29 @@
-"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
+"""Exact match evaluator for agent outputs."""
 
-from typing import Any
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .output_evaluator import (
+    OutputEvaluationCriteria,
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
 
-from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
 
-from ..models.models import AgentExecution
-from .deterministic_evaluator_base import DeterministicEvaluatorBase
+class ExactMatchEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
+    """Configuration for the exact match evaluator."""
 
+    name: str = "ExactMatchEvaluator"
+    case_sensitive: bool = False
+    negated: bool = False
 
-class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+
+class ExactMatchEvaluator(
+    OutputEvaluator[OutputEvaluationCriteria, ExactMatchEvaluatorConfig, type(None)]  # type: ignore
+):
     """Evaluator that performs exact structural matching between expected and actual outputs.
 
     This evaluator returns True if the actual output exactly matches the expected output
@@ -16,22 +31,38 @@ class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
     to floats for consistent comparison.
     """
 
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.EXACT_MATCH.value
+
     async def evaluate(
-        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: OutputEvaluationCriteria,
     ) -> EvaluationResult:
         """Evaluate whether actual output exactly matches expected output.
 
         Args:
             agent_execution: The execution details containing:
                 - agent_input: The input received by the agent
-                - actual_output: The actual output from the agent
-                - spans: The execution spans to use for the evaluation
+                - agent_output: The actual output from the agent
+                - agent_trace: The execution spans to use for the evaluation
             evaluation_criteria: The criteria to evaluate
 
         Returns:
             EvaluationResult: Boolean result indicating exact match (True/False)
         """
-        return BooleanEvaluationResult(
-            score=self._canonical_json(agent_execution.agent_output)
-            == self._canonical_json(evaluation_criteria)
+        actual_output = str(self._get_actual_output(agent_execution))
+        expected_output = str(self._get_expected_output(evaluation_criteria))
+        if not self.evaluator_config.case_sensitive:
+            actual_output = actual_output.lower()
+            expected_output = expected_output.lower()
+
+        is_exact_match = actual_output == expected_output
+        if self.evaluator_config.negated:
+            is_exact_match = not is_exact_match
+
+        return NumericEvaluationResult(
+            score=float(is_exact_match),
         )
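
The behavior change here is worth noting: the old evaluator compared canonical JSON structures and returned a BooleanEvaluationResult, while the new one compares stringified outputs with optional case folding and negation and returns a NumericEvaluationResult of 0.0 or 1.0. A hedged before/after sketch of just the comparison step (the canon helper is a rough stand-in for _canonical_json, whose implementation is not part of this diff):

import json
from typing import Any

def legacy_style_match(actual: Any, expected: Any) -> bool:
    # Rough stand-in for _canonical_json: key order is ignored; the real helper
    # also normalizes numbers to floats per the class docstring.
    def canon(value: Any) -> str:
        return json.dumps(value, sort_keys=True)
    return canon(actual) == canon(expected)

def new_style_match(actual: str, expected: str,
                    case_sensitive: bool = False, negated: bool = False) -> float:
    if not case_sensitive:
        actual, expected = actual.lower(), expected.lower()
    matched = actual == expected
    return float(not matched if negated else matched)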
uipath/eval/evaluators/json_similarity_evaluator.py
@@ -1,17 +1,30 @@
 """JSON similarity evaluator for flexible structural comparison of outputs."""
 
 import math
-from typing import Any, Tuple, TypeVar
+from typing import Any, Tuple
 
-from uipath.eval.models import EvaluationResult, NumericEvaluationResult
+from ..models import (
+    AgentExecution,
+    EvaluationResult,
+    EvaluatorType,
+    NumericEvaluationResult,
+)
+from .output_evaluator import (
+    OutputEvaluationCriteria,
+    OutputEvaluator,
+    OutputEvaluatorConfig,
+)
 
-from ..models.models import AgentExecution
-from .deterministic_evaluator_base import DeterministicEvaluatorBase
 
-T = TypeVar("T")
+class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig[OutputEvaluationCriteria]):
+    """Configuration for the json similarity evaluator."""
 
+    name: str = "JsonSimilarityEvaluator"
 
-class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+
+class JsonSimilarityEvaluator(
+    OutputEvaluator[OutputEvaluationCriteria, JsonSimilarityEvaluatorConfig, str]
+):
     """Deterministic evaluator that scores structural JSON similarity between expected and actual output.
 
     Compares expected versus actual JSON-like structures and returns a
@@ -19,8 +32,15 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
     and tolerant for numbers and strings (via Levenshtein distance).
     """
 
+    @classmethod
+    def get_evaluator_id(cls) -> str:
+        """Get the evaluator id."""
+        return EvaluatorType.JSON_SIMILARITY.value
+
     async def evaluate(
-        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: OutputEvaluationCriteria,
     ) -> EvaluationResult:
         """Evaluate similarity between expected and actual JSON outputs.
 
@@ -36,16 +56,25 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
         Returns:
             EvaluationResult: Numerical score between 0-100 indicating similarity
         """
+        score, justification = self._compare_json(
+            self._get_expected_output(evaluation_criteria),
+            self._get_actual_output(agent_execution),
+        )
+        validated_justification = self.validate_justification(justification)
         return NumericEvaluationResult(
-            score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
+            score=score,
+            details=validated_justification,
         )
 
-    def _compare_json(self, expected: Any, actual: Any) -> float:
+    def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]:
         matched_leaves, total_leaves = self._compare_tokens(expected, actual)
         if total_leaves == 0:
-            return 100.0
-        sim = (matched_leaves / total_leaves) * 100.0
-        return max(0.0, min(100.0, sim))
+            return 1.0, "Total leaves are 0"
+        sim = matched_leaves / total_leaves
+        return (
+            max(0.0, min(1.0, sim)),
+            f"Matched leaves: {matched_leaves}, Total leaves: {total_leaves}",
+        )
 
     def _compare_tokens(
         self, expected_token: Any, actual_token: Any
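
Two things change in _compare_json: the score is now clamped to [0, 1] instead of [0, 100] (the docstring still says 0-100), and a justification string travels alongside the score into details via validate_justification. A worked example of the new return contract, with illustrative leaf counts:

# Example values plugged into the formula above: 3 of 4 expected leaves matched.
matched_leaves, total_leaves = 3.0, 4.0
score = max(0.0, min(1.0, matched_leaves / total_leaves))  # 0.75 (the legacy code returned 75.0)
justification = f"Matched leaves: {matched_leaves}, Total leaves: {total_leaves}"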
uipath/eval/evaluators/legacy_base_evaluator.py
@@ -0,0 +1,89 @@
+"""Base evaluator abstract class for agent evaluation."""
+
+import functools
+import time
+from abc import ABC, abstractmethod
+from collections.abc import Callable
+from typing import Any, Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict
+
+from uipath.eval.models import EvaluationResult
+from uipath.eval.models.models import (
+    AgentExecution,
+    ErrorEvaluationResult,
+    LegacyEvaluatorCategory,
+    LegacyEvaluatorType,
+)
+
+
+def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]:
+    """Decorator to track evaluation metrics and handle errors gracefully."""
+
+    @functools.wraps(func)
+    async def wrapper(*args: Any, **kwargs: Any) -> EvaluationResult:
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as e:
+            result = ErrorEvaluationResult(
+                details="Exception thrown by evaluator: {}".format(e),
+                evaluation_time=time.time() - start_time,
+            )
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+T = TypeVar("T")
+
+
+class LegacyBaseEvaluator(BaseModel, Generic[T], ABC):
+    """Abstract base class for all evaluators."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    id: str
+    name: str
+    description: str
+    target_output_key: str = "*"
+    created_at: str
+    updated_at: str
+    category: LegacyEvaluatorCategory
+    evaluator_type: LegacyEvaluatorType
+
+    def __init_subclass__(cls, **kwargs: Any):
+        """Hook for subclass creation - automatically applies evaluation metrics tracking."""
+        super().__init_subclass__(**kwargs)
+
+        if hasattr(cls, "evaluate") and not getattr(
+            cls.evaluate, "_has_metrics_decorator", False
+        ):
+            cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
+            cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]
+
+    def model_post_init(self, __context: Any):
+        """Post-initialization hook for Pydantic models."""
+        pass
+
+    @abstractmethod
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
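
The __init_subclass__ hook means every concrete evaluator gets its evaluate method wrapped exactly once: exceptions are converted into an ErrorEvaluationResult instead of propagating, and wall-clock time is stamped onto whatever result comes back. A self-contained sketch of the same pattern with simplified stand-in types (not the real uipath models):

import asyncio
import functools
import time
from dataclasses import dataclass
from typing import Any, Callable

@dataclass
class StubResult:
    score: float = 0.0
    details: str = ""
    evaluation_time: float = 0.0

def track_metrics(func: Callable[..., Any]) -> Callable[..., Any]:
    @functools.wraps(func)
    async def wrapper(*args: Any, **kwargs: Any) -> StubResult:
        start = time.time()
        try:
            result = await func(*args, **kwargs)
        except Exception as exc:  # errors become a result, not a crash
            result = StubResult(details=f"Exception thrown by evaluator: {exc}")
        result.evaluation_time = time.time() - start
        return result
    return wrapper

class StubBaseEvaluator:
    def __init_subclass__(cls, **kwargs: Any) -> None:
        super().__init_subclass__(**kwargs)
        if "evaluate" in cls.__dict__ and not getattr(cls.evaluate, "_wrapped", False):
            cls.evaluate = track_metrics(cls.evaluate)
            cls.evaluate._wrapped = True  # avoid double wrapping

class BrokenEvaluator(StubBaseEvaluator):
    async def evaluate(self) -> StubResult:
        raise RuntimeError("boom")

print(asyncio.run(BrokenEvaluator().evaluate()).details)
# Exception thrown by evaluator: boom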
uipath/eval/evaluators/{deterministic_evaluator_base.py → legacy_deterministic_evaluator_base.py}
@@ -4,12 +4,12 @@ import json
 from abc import ABC
 from typing import Any, TypeVar
 
-from .base_evaluator import BaseEvaluator
+from .legacy_base_evaluator import LegacyBaseEvaluator
 
 T = TypeVar("T")
 
 
-class DeterministicEvaluatorBase(BaseEvaluator[T], ABC):
+class DeterministicEvaluatorBase(LegacyBaseEvaluator[T], ABC):
     """Base class for evaluators that produce deterministic, reproducible results.
 
     This class provides utility methods for canonical JSON comparison and number normalization
uipath/eval/evaluators/legacy_exact_match_evaluator.py
@@ -0,0 +1,37 @@
+"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
+
+from typing import Any
+
+from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
+
+from ..models.models import AgentExecution
+from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase
+
+
+class LegacyExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Evaluator that performs exact structural matching between expected and actual outputs.
+
+    This evaluator returns True if the actual output exactly matches the expected output
+    after canonical JSON normalization, and False otherwise. Numbers are normalized
+    to floats for consistent comparison.
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate whether actual output exactly matches expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating exact match (True/False)
+        """
+        return BooleanEvaluationResult(
+            score=self._canonical_json(agent_execution.agent_output)
+            == self._canonical_json(evaluation_criteria)
+        )
uipath/eval/evaluators/legacy_json_similarity_evaluator.py
@@ -0,0 +1,151 @@
+"""JSON similarity evaluator for flexible structural comparison of outputs."""
+
+import math
+from typing import Any, Tuple, TypeVar
+
+from uipath.eval.models import EvaluationResult, NumericEvaluationResult
+
+from ..models.models import AgentExecution
+from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase
+
+T = TypeVar("T")
+
+
+class LegacyJsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Legacy deterministic evaluator that scores structural JSON similarity between expected and actual output.
+
+    Compares expected versus actual JSON-like structures and returns a
+    numerical score in the range [0, 100]. The comparison is token-based
+    and tolerant for numbers and strings (via Levenshtein distance).
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate similarity between expected and actual JSON outputs.
+
+        Uses token-based comparison with tolerance for numeric differences
+        and Levenshtein distance for string similarity.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score between 0-100 indicating similarity
+        """
+        return NumericEvaluationResult(
+            score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
+        )
+
+    def _compare_json(self, expected: Any, actual: Any) -> float:
+        matched_leaves, total_leaves = self._compare_tokens(expected, actual)
+        if total_leaves == 0:
+            return 100.0
+        sim = (matched_leaves / total_leaves) * 100.0
+        return max(0.0, min(100.0, sim))
+
+    def _compare_tokens(
+        self, expected_token: Any, actual_token: Any
+    ) -> Tuple[float, float]:
+        if self._is_number(expected_token) and self._is_number(actual_token):
+            return self._compare_numbers(float(expected_token), float(actual_token))
+
+        if type(expected_token) is not type(actual_token):
+            return 0.0, self._count_leaves(expected_token)
+
+        if isinstance(expected_token, dict):
+            matched_leaves = total_leaves = 0.0
+            # Only expected keys count
+            for expected_key, expected_value in expected_token.items():
+                if isinstance(actual_token, dict) and expected_key in actual_token:
+                    matched, total = self._compare_tokens(
+                        expected_value, actual_token[expected_key]
+                    )
+                else:
+                    matched, total = (0.0, self._count_leaves(expected_value))
+                matched_leaves += matched
+                total_leaves += total
+            return matched_leaves, total_leaves
+
+        if isinstance(expected_token, list):
+            matched_leaves = total_leaves = 0.0
+            common_length = min(len(expected_token), len(actual_token))
+            for index in range(common_length):
+                matched, total = self._compare_tokens(
+                    expected_token[index], actual_token[index]
+                )
+                matched_leaves += matched
+                total_leaves += total
+            for index in range(common_length, len(expected_token)):
+                total_leaves += self._count_leaves(expected_token[index])
+            return (matched_leaves, total_leaves)
+
+        if isinstance(expected_token, bool):
+            return (1.0, 1.0) if expected_token == actual_token else (0.0, 1.0)
+
+        if isinstance(expected_token, str):
+            return self._compare_strings(expected_token, actual_token)
+
+        return (1.0, 1.0) if str(expected_token) == str(actual_token) else (0.0, 1.0)
+
+    def _compare_numbers(
+        self, expected_number: float, actual_number: float
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if math.isclose(expected_number, 0.0, abs_tol=1e-12):
+            matched = 1.0 if math.isclose(actual_number, 0.0, abs_tol=1e-12) else 0.0
+        else:
+            ratio = abs(expected_number - actual_number) / abs(expected_number)
+            matched = max(0.0, min(1.0, 1.0 - ratio))
+        return matched, total
+
+    def _compare_strings(
+        self, expected_string: str, actual_string: str
+    ) -> Tuple[float, float]:
+        total = 1.0
+        if not expected_string and not actual_string:
+            return 1.0, total
+        distance = self._levenshtein(expected_string, actual_string)
+        max_length = max(len(expected_string), len(actual_string))
+        similarity = 1.0 - (distance / max_length) if max_length else 1.0
+        similarity = max(0.0, min(1.0, similarity))
+        return similarity, total
+
+    def _count_leaves(self, token_node: Any) -> float:
+        if isinstance(token_node, dict):
+            return sum(
+                self._count_leaves(child_value) for child_value in token_node.values()
+            )
+        if isinstance(token_node, list):
+            return sum(self._count_leaves(child_value) for child_value in token_node)
+        return 1.0
+
+    def _levenshtein(self, source_text: str, target_text: str) -> int:
+        if not source_text:
+            return len(target_text)
+        if not target_text:
+            return len(source_text)
+        source_len, target_len = len(source_text), len(target_text)
+        distance_matrix = [[0] * (target_len + 1) for _ in range(source_len + 1)]
+        for row_idx in range(source_len + 1):
+            distance_matrix[row_idx][0] = row_idx
+        for col_idx in range(target_len + 1):
+            distance_matrix[0][col_idx] = col_idx
+        for row_idx in range(1, source_len + 1):
+            for col_idx in range(1, target_len + 1):
+                substitution_cost = (
+                    0 if source_text[row_idx - 1] == target_text[col_idx - 1] else 1
+                )
+                distance_matrix[row_idx][col_idx] = min(
+                    distance_matrix[row_idx - 1][col_idx] + 1,  # deletion
+                    distance_matrix[row_idx][col_idx - 1] + 1,  # insertion
+                    distance_matrix[row_idx - 1][col_idx - 1]
+                    + substitution_cost,  # substitution
+                )
+        return distance_matrix[source_len][target_len]
+
+    def _is_number(self, value: Any) -> bool:
+        return isinstance(value, (int, float)) and not isinstance(value, bool)
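
As a sanity check on the scoring rules above: every expected leaf carries a weight of 1.0, numbers earn partial credit 1 - |expected - actual| / |expected|, strings earn 1 - levenshtein / max_len, and keys missing from the actual output earn nothing. A hand-worked example with illustrative data:

# Hand-computed from the legacy rules above (values are illustrative).
expected = {"total": 100, "status": "shipped", "items": 2}
actual = {"total": 90, "status": "shiped"}  # "items" key missing

# total:  1 - |100 - 90| / 100                  -> 0.9 of 1 leaf
# status: levenshtein("shipped", "shiped") = 1  -> 1 - 1/7 ~= 0.857 of 1 leaf
# items:  missing in actual                     -> 0.0 of 1 leaf
# score = (0.9 + 0.857 + 0.0) / 3 * 100 ~= 58.6 on the legacy 0-100 scale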
uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py
@@ -0,0 +1,137 @@
+"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
+
+import json
+from typing import Any, Optional
+
+from pydantic import field_validator
+
+from uipath.eval.models import NumericEvaluationResult
+
+from ..._services import UiPathLlmChatService
+from ..._utils.constants import COMMUNITY_agents_SUFFIX
+from ..models.models import AgentExecution, EvaluationResult, LLMResponse
+from .legacy_base_evaluator import LegacyBaseEvaluator
+
+
+class LegacyLlmAsAJudgeEvaluator(LegacyBaseEvaluator[dict[str, Any]]):
+    """Legacy evaluator that uses an LLM to judge the quality of agent output."""
+
+    prompt: str
+    model: str
+    actual_output_placeholder: str = "{{ActualOutput}}"
+    expected_output_placeholder: str = "{{ExpectedOutput}}"
+    llm: Optional[UiPathLlmChatService] = None
+
+    @field_validator("prompt")
+    @classmethod
+    def validate_prompt_placeholders(cls, v: str) -> str:
+        """Validate that prompt contains required placeholders."""
+        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
+            raise ValueError(
+                "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders"
+            )
+        return v
+
+    def model_post_init(self, __context: Any):
+        """Initialize the LLM service after model creation."""
+        super().model_post_init(__context)
+        self._initialize_llm()
+
+    def _initialize_llm(self):
+        """Initialize the LLM used for evaluation."""
+        from uipath import UiPath
+
+        uipath = UiPath()
+        self.llm = uipath.llm
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge.
+
+        Sends the formatted prompt to the configured LLM and expects a JSON response
+        with a numerical score (0-100) and justification.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score with LLM justification as details
+        """
+        # Create the evaluation prompt
+        evaluation_prompt = self._create_evaluation_prompt(
+            expected_output=evaluation_criteria,
+            actual_output=agent_execution.agent_output,
+        )
+
+        llm_response = await self._get_llm_response(evaluation_prompt)
+
+        return NumericEvaluationResult(
+            score=llm_response.score,
+            details=llm_response.justification,
+        )
+
+    def _create_evaluation_prompt(
+        self, expected_output: Any, actual_output: Any
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = self.prompt.replace(
+            self.actual_output_placeholder,
+            str(actual_output),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.expected_output_placeholder,
+            str(expected_output),
+        )
+
+        return formatted_prompt
+
+    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
+        """Get response from the LLM.
+
+        Args:
+            evaluation_prompt: The formatted prompt to send to the LLM
+
+        Returns:
+            LLMResponse with score and justification
+        """
+        # remove community-agents suffix from llm model name
+        model = self.model
+        if model.endswith(COMMUNITY_agents_SUFFIX):
+            model = model.replace(COMMUNITY_agents_SUFFIX, "")
+
+        # Prepare the request
+        request_data = {
+            "model": model,
+            "messages": [{"role": "user", "content": evaluation_prompt}],
+            "response_format": {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "evaluation_response",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "score": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 100,
+                                "description": "Score between 0 and 100",
+                            },
+                            "justification": {
+                                "type": "string",
+                                "description": "Explanation for the score",
+                            },
+                        },
+                        "required": ["score", "justification"],
+                    },
+                },
+            },
+        }
+
+        response = await self.llm.chat_completions(**request_data)  # type: ignore
+        return LLMResponse(**json.loads(response.choices[-1].message.content or "{}"))
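
The coupling between this judge and its callers is the prompt contract: the template must contain both {{ActualOutput}} and {{ExpectedOutput}} (enforced by the field validator), and the response_format schema forces the model to answer with a 0-100 score plus a justification that becomes the result's details. A small sketch of the substitution step with an illustrative prompt and reply (not taken from the package):

# Mirrors _create_evaluation_prompt: plain string replacement of the two placeholders.
prompt_template = (
    "Compare the agent output to the reference answer.\n"
    "Actual: {{ActualOutput}}\n"
    "Expected: {{ExpectedOutput}}\n"
    "Score the match from 0 to 100 and justify it."
)
formatted = prompt_template.replace("{{ActualOutput}}", str({"answer": 41}))
formatted = formatted.replace("{{ExpectedOutput}}", str({"answer": 42}))

# Shape of the JSON the schema constrains the model to return,
# which is then parsed into LLMResponse(score=..., justification=...).
example_reply = {"score": 85, "justification": "Close, but the answer is off by one."}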