uipath 2.1.52__py3-none-any.whl → 2.1.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uipath/_cli/_evals/{_evaluators/_evaluator_factory.py → _evaluator_factory.py} +24 -23
- uipath/_cli/_evals/_models/_evaluation_set.py +23 -18
- uipath/_cli/_evals/_models/_evaluator_base_params.py +16 -0
- uipath/_cli/_evals/_models/_output.py +85 -0
- uipath/_cli/_evals/_runtime.py +102 -10
- uipath/_cli/_runtime/_contracts.py +11 -2
- uipath/_cli/_utils/_eval_set.py +1 -1
- uipath/_cli/_utils/_studio_project.py +30 -29
- uipath/_cli/cli_eval.py +46 -61
- uipath/eval/evaluators/__init__.py +15 -0
- uipath/eval/evaluators/base_evaluator.py +88 -0
- uipath/eval/evaluators/deterministic_evaluator_base.py +53 -0
- uipath/eval/evaluators/exact_match_evaluator.py +37 -0
- uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py} +23 -40
- uipath/eval/evaluators/llm_as_judge_evaluator.py +137 -0
- uipath/eval/evaluators/trajectory_evaluator.py +36 -0
- uipath/eval/models/__init__.py +19 -0
- uipath/{_cli/_evals/_models/_evaluators.py → eval/models/models.py} +67 -43
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/METADATA +1 -1
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/RECORD +23 -23
- uipath/_cli/_evals/_evaluators/__init__.py +0 -22
- uipath/_cli/_evals/_evaluators/_deterministic_evaluator_base.py +0 -46
- uipath/_cli/_evals/_evaluators/_evaluator_base.py +0 -124
- uipath/_cli/_evals/_evaluators/_exact_match_evaluator.py +0 -40
- uipath/_cli/_evals/_evaluators/_llm_as_judge_evaluator.py +0 -183
- uipath/_cli/_evals/_evaluators/_trajectory_evaluator.py +0 -48
- uipath/_cli/_evals/_models/__init__.py +0 -18
- uipath/_cli/_evals/_models/_agent_execution_output.py +0 -14
- uipath/_cli/_evals/progress_reporter.py +0 -304
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/WHEEL +0 -0
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/entry_points.txt +0 -0
- {uipath-2.1.52.dist-info → uipath-2.1.54.dist-info}/licenses/LICENSE +0 -0
uipath/_cli/cli_eval.py
CHANGED
@@ -2,7 +2,6 @@
 import ast
 import asyncio
 import os
-from datetime import datetime, timezone
 from typing import List, Optional

 import click
@@ -13,7 +12,7 @@ from uipath._cli._runtime._contracts import (
     UiPathRuntimeFactory,
 )
 from uipath._cli._runtime._runtime import UiPathScriptRuntime
-from uipath._cli.middlewares import
+from uipath._cli.middlewares import Middlewares
 from uipath.eval._helpers import auto_discover_entrypoint

 from .._utils.constants import ENV_JOB_ID
@@ -32,55 +31,6 @@ class LiteralOption(click.Option):
             raise click.BadParameter(value) from e


-def eval_agent_middleware(
-    entrypoint: Optional[str] = None,
-    eval_set: Optional[str] = None,
-    eval_ids: Optional[List[str]] = None,
-    workers: int = 8,
-    no_report: bool = False,
-    **kwargs,
-) -> MiddlewareResult:
-    """Middleware to run an evaluation set against the agent."""
-    timestamp = datetime.now(timezone.utc).strftime("%M-%H-%d-%m-%Y")
-
-    eval_context = UiPathEvalContext.with_defaults()
-    eval_context.no_report = no_report
-    eval_context.workers = workers
-    eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
-    eval_context.eval_ids = eval_ids
-    eval_context.execution_output_file = (
-        f"evals/results/{timestamp}.json" if not os.getenv("UIPATH_JOB_KEY") else None
-    )
-
-    runtime_entrypoint = entrypoint or auto_discover_entrypoint()
-
-    def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext:
-        runtime_context = UiPathRuntimeContext.with_defaults(**context_kwargs)
-        runtime_context.entrypoint = runtime_entrypoint
-        return runtime_context
-
-    try:
-        runtime_factory = UiPathRuntimeFactory(
-            UiPathScriptRuntime,
-            UiPathRuntimeContext,
-            context_generator=generate_runtime_context,
-        )
-
-        async def execute():
-            async with UiPathEvalRuntime.from_eval_context(
-                factory=runtime_factory, context=eval_context
-            ) as eval_runtime:
-                await eval_runtime.execute()
-
-        asyncio.run(execute())
-        return MiddlewareResult(should_continue=False)
-
-    except Exception as e:
-        return MiddlewareResult(
-            should_continue=False, error_message=f"Error running evaluation: {str(e)}"
-        )
-
-
 @click.command()
 @click.argument("entrypoint", required=False)
 @click.argument("eval_set", required=False)
@@ -97,6 +47,12 @@ def eval_agent_middleware(
     default=8,
     help="Number of parallel workers for running evaluations (default: 8)",
 )
+@click.option(
+    "--output-file",
+    required=False,
+    type=click.Path(exists=False),
+    help="File path where the output will be written",
+)
 @track(when=lambda *_a, **_kw: os.getenv(ENV_JOB_ID) is None)
 def eval(
     entrypoint: Optional[str],
@@ -104,6 +60,7 @@ def eval(
     eval_ids: List[str],
     no_report: bool,
     workers: int,
+    output_file: Optional[str],
 ) -> None:
     """Run an evaluation set against the agent.

@@ -121,21 +78,49 @@ def eval(
         eval_ids,
         no_report=no_report,
         workers=workers,
+        execution_output_file=output_file,
     )

-    if result.should_continue:
-        result = eval_agent_middleware(
-            entrypoint=entrypoint,
-            eval_set=eval_set,
-            eval_ids=eval_ids,
-            workers=workers,
-            no_report=no_report,
-        )
-    if result.should_continue:
-        console.error("Could not process the request with any available handler.")
     if result.error_message:
         console.error(result.error_message)

+    if result.should_continue:
+
+        def generate_runtime_context(**context_kwargs) -> UiPathRuntimeContext:
+            runtime_context = UiPathRuntimeContext.with_defaults(**context_kwargs)
+            runtime_context.entrypoint = runtime_entrypoint
+            return runtime_context
+
+        eval_context = UiPathEvalContext.with_defaults(
+            execution_output_file=output_file
+        )
+
+        eval_context.no_report = no_report
+        eval_context.workers = workers
+        eval_context.eval_set = eval_set or EvalHelpers.auto_discover_eval_set()
+        eval_context.eval_ids = eval_ids
+
+        runtime_entrypoint = entrypoint or auto_discover_entrypoint()
+
+        try:
+            runtime_factory = UiPathRuntimeFactory(
+                UiPathScriptRuntime,
+                UiPathRuntimeContext,
+                context_generator=generate_runtime_context,
+            )
+
+            async def execute():
+                async with UiPathEvalRuntime.from_eval_context(
+                    factory=runtime_factory, context=eval_context
+                ) as eval_runtime:
+                    await eval_runtime.execute()
+
+            asyncio.run(execute())
+        except Exception as e:
+            console.error(
+                f"Error: Unexpected error occurred - {str(e)}", include_traceback=True
+            )
+
     console.success("Evaluation completed successfully")

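Note: the new `--output-file` option replaces the previous behavior of always writing local results to an auto-generated `evals/results/{timestamp}.json` path. As a rough sketch of how the reworked command might be invoked, assuming the package's `uipath eval` entry point and relying on the auto-discovery of entrypoint and eval set shown in the hunks above (the exact invocation may differ):

    uipath eval --workers 4 --output-file evals/results/latest.json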
uipath/eval/evaluators/__init__.py
ADDED
@@ -0,0 +1,15 @@
+"""UiPath evaluator implementations for agent performance evaluation."""
+
+from .base_evaluator import BaseEvaluator
+from .exact_match_evaluator import ExactMatchEvaluator
+from .json_similarity_evaluator import JsonSimilarityEvaluator
+from .llm_as_judge_evaluator import LlmAsAJudgeEvaluator
+from .trajectory_evaluator import TrajectoryEvaluator
+
+__all__ = [
+    "BaseEvaluator",
+    "ExactMatchEvaluator",
+    "JsonSimilarityEvaluator",
+    "LlmAsAJudgeEvaluator",
+    "TrajectoryEvaluator",
+]
uipath/eval/evaluators/base_evaluator.py
ADDED
@@ -0,0 +1,88 @@
+"""Base evaluator abstract class for agent evaluation."""
+
+import functools
+import time
+from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel, ConfigDict
+
+from uipath.eval.models import EvaluationResult
+from uipath.eval.models.models import (
+    AgentExecution,
+    ErrorEvaluationResult,
+    EvaluatorCategory,
+    EvaluatorType,
+)
+
+
+def track_evaluation_metrics(func):
+    """Decorator to track evaluation metrics and handle errors gracefully."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs) -> EvaluationResult:
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+        except Exception as e:
+            result = ErrorEvaluationResult(
+                details="Exception thrown by evaluator: {}".format(e),
+                evaluation_time=time.time() - start_time,
+            )
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        result.evaluation_time = execution_time
+        return result
+
+    return wrapper
+
+
+T = TypeVar("T")
+
+
+class BaseEvaluator(BaseModel, Generic[T], ABC):
+    """Abstract base class for all evaluators."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    id: str
+    name: str
+    description: str
+    target_output_key: str = "*"
+    created_at: str
+    updated_at: str
+    category: EvaluatorCategory
+    evaluator_type: EvaluatorType
+
+    def __init_subclass__(cls, **kwargs):
+        """Hook for subclass creation - automatically applies evaluation metrics tracking."""
+        super().__init_subclass__(**kwargs)
+
+        if hasattr(cls, "evaluate") and not getattr(
+            cls.evaluate, "_has_metrics_decorator", False
+        ):
+            cls.evaluate = track_evaluation_metrics(cls.evaluate)  # type: ignore[method-assign]
+            cls.evaluate._has_metrics_decorator = True  # type: ignore[attr-defined]
+
+    def model_post_init(self, __context):
+        """Post-initialization hook for Pydantic models."""
+        pass
+
+    @abstractmethod
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate the given data and return a result.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult containing the score and details
+        """
+        pass
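With the `__init_subclass__` hook above, a concrete evaluator only has to implement `evaluate`; timing and error capture are added automatically through `track_evaluation_metrics`. A minimal sketch of a custom evaluator against this contract (the class name and scoring rule are invented for illustration; metadata fields such as `id`, `name`, `category`, and `evaluator_type` would still need to be supplied when instantiating):

from typing import Any

from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.models import EvaluationResult, NumericEvaluationResult
from uipath.eval.models.models import AgentExecution


class OutputLengthEvaluator(BaseEvaluator[dict[str, Any]]):
    """Toy evaluator: scores how close the output length is to the expected length."""

    async def evaluate(
        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
    ) -> EvaluationResult:
        # __init_subclass__ wraps this method with track_evaluation_metrics, so any
        # exception becomes an ErrorEvaluationResult and evaluation_time is populated.
        expected = len(str(evaluation_criteria))
        actual = len(str(agent_execution.agent_output))
        return NumericEvaluationResult(
            score=100.0 * min(expected, actual) / max(expected, actual, 1)
        )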
uipath/eval/evaluators/deterministic_evaluator_base.py
ADDED
@@ -0,0 +1,53 @@
+"""Base class for deterministic evaluators that provide consistent outputs."""
+
+import json
+from abc import ABC
+from typing import Any, TypeVar
+
+from .base_evaluator import BaseEvaluator
+
+T = TypeVar("T")
+
+
+class DeterministicEvaluatorBase(BaseEvaluator[T], ABC):
+    """Base class for evaluators that produce deterministic, reproducible results.
+
+    This class provides utility methods for canonical JSON comparison and number normalization
+    to ensure consistent evaluation results across runs.
+    """
+
+    def _canonical_json(self, obj: Any) -> str:
+        """Convert an object to canonical JSON string for consistent comparison.
+
+        Args:
+            obj: The object to convert to canonical JSON
+
+        Returns:
+            str: Canonical JSON string with normalized numbers and sorted keys
+        """
+        return json.dumps(
+            self._normalize_numbers(obj),
+            sort_keys=True,
+            separators=(",", ":"),
+            ensure_ascii=False,
+        )
+
+    def _normalize_numbers(self, obj: Any) -> Any:
+        """Recursively normalize numbers in nested data structures.
+
+        Converts all numeric values (int, float) to float for consistent comparison,
+        while preserving booleans and other data types.
+
+        Args:
+            obj: The object to normalize
+
+        Returns:
+            Any: Object with normalized numbers
+        """
+        if isinstance(obj, dict):
+            return {k: self._normalize_numbers(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._normalize_numbers(v) for v in obj]
+        if isinstance(obj, (int, float)) and not isinstance(obj, bool):
+            return float(obj)
+        return obj
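To make the canonicalization rule concrete, here is a standalone illustration (plain Python, not package code) of what the two helpers above guarantee: key order and int-versus-float differences do not affect the comparison, while booleans are preserved as booleans.

import json


def normalize(obj):
    # Mirrors _normalize_numbers: ints/floats become float, bools stay bools.
    if isinstance(obj, dict):
        return {k: normalize(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [normalize(v) for v in obj]
    if isinstance(obj, (int, float)) and not isinstance(obj, bool):
        return float(obj)
    return obj


def canonical(obj):
    # Mirrors _canonical_json: sorted keys, compact separators.
    return json.dumps(normalize(obj), sort_keys=True, separators=(",", ":"), ensure_ascii=False)


assert canonical({"count": 1, "ok": True}) == canonical({"ok": True, "count": 1.0})
assert canonical({"ok": True}) != canonical({"ok": 1})  # bool is not coerced to float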
uipath/eval/evaluators/exact_match_evaluator.py
ADDED
@@ -0,0 +1,37 @@
+"""Exact match evaluator for binary pass/fail evaluation of agent outputs."""
+
+from typing import Any
+
+from uipath.eval.models import BooleanEvaluationResult, EvaluationResult
+
+from ..models.models import AgentExecution
+from .deterministic_evaluator_base import DeterministicEvaluatorBase
+
+
+class ExactMatchEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Evaluator that performs exact structural matching between expected and actual outputs.
+
+    This evaluator returns True if the actual output exactly matches the expected output
+    after canonical JSON normalization, and False otherwise. Numbers are normalized
+    to floats for consistent comparison.
+    """
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
+    ) -> EvaluationResult:
+        """Evaluate whether actual output exactly matches expected output.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Boolean result indicating exact match (True/False)
+        """
+        return BooleanEvaluationResult(
+            score=self._canonical_json(agent_execution.agent_output)
+            == self._canonical_json(evaluation_criteria)
+        )
uipath/{_cli/_evals/_evaluators/_json_similarity_evaluator.py → eval/evaluators/json_similarity_evaluator.py}
RENAMED
@@ -1,16 +1,18 @@
-
+"""JSON similarity evaluator for flexible structural comparison of outputs."""
+
 import math
-from typing import Any,
+from typing import Any, Tuple, TypeVar
+
+from uipath.eval.models import EvaluationResult, NumericEvaluationResult
+
+from ..models.models import AgentExecution
+from .deterministic_evaluator_base import DeterministicEvaluatorBase

-
-    DeterministicEvaluatorBase,
-)
-from uipath._cli._evals._models import EvaluationResult
-from uipath._cli._evals._models._evaluators import ScoreType
+T = TypeVar("T")


-class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
-    """Deterministic evaluator that scores structural JSON similarity.
+class JsonSimilarityEvaluator(DeterministicEvaluatorBase[dict[str, Any]]):
+    """Deterministic evaluator that scores structural JSON similarity between expected and actual output.

     Compares expected versus actual JSON-like structures and returns a
     numerical score in the range [0, 100]. The comparison is token-based
@@ -18,43 +20,24 @@ class JsonSimilarityEvaluator(DeterministicEvaluatorBase):
     """

     async def evaluate(
-        self,
-        evaluation_id: str,
-        evaluation_name: str,
-        input_data: Dict[str, Any],
-        expected_output: Dict[str, Any],
-        actual_output: Dict[str, Any],
+        self, agent_execution: AgentExecution, evaluation_criteria: dict[str, Any]
     ) -> EvaluationResult:
         """Evaluate similarity between expected and actual JSON outputs.

-
-
-
-
-
-
+        Uses token-based comparison with tolerance for numeric differences
+        and Levenshtein distance for string similarity.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate

         Returns:
-            EvaluationResult:
+            EvaluationResult: Numerical score between 0-100 indicating similarity
         """
-
-
-
-        actual_output, expected_output = self._select_targets(
-            expected_output, actual_output
-        )
-        similarity = self._compare_json(expected_output, actual_output)
-
-        return EvaluationResult(
-            evaluation_id=evaluation_id,
-            evaluation_name=evaluation_name,
-            evaluator_id=self.id,
-            evaluator_name=self.name,
-            score=similarity,
-            input=input_data,
-            expected_output=expected_output_copy,
-            actual_output=actual_output_copy,
-            score_type=ScoreType.NUMERICAL,
+        return NumericEvaluationResult(
+            score=self._compare_json(evaluation_criteria, agent_execution.agent_output)
         )

     def _compare_json(self, expected: Any, actual: Any) -> float:
uipath/eval/evaluators/llm_as_judge_evaluator.py
ADDED
@@ -0,0 +1,137 @@
+"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
+
+import json
+from typing import Any, Optional
+
+from pydantic import field_validator
+
+from uipath.eval.models import NumericEvaluationResult
+
+from ..._services import UiPathLlmChatService
+from ..._utils.constants import COMMUNITY_agents_SUFFIX
+from ..models.models import AgentExecution, EvaluationResult, LLMResponse
+from .base_evaluator import BaseEvaluator
+
+
+class LlmAsAJudgeEvaluator(BaseEvaluator[dict[str, Any]]):
+    """Evaluator that uses an LLM to judge the quality of agent output."""
+
+    prompt: str
+    model: str
+    actual_output_placeholder: str = "{{ActualOutput}}"
+    expected_output_placeholder: str = "{{ExpectedOutput}}"
+    llm: Optional[UiPathLlmChatService] = None
+
+    @field_validator("prompt")
+    @classmethod
+    def validate_prompt_placeholders(cls, v: str) -> str:
+        """Validate that prompt contains required placeholders."""
+        if "{{ActualOutput}}" not in v or "{{ExpectedOutput}}" not in v:
+            raise ValueError(
+                "Prompt must contain both {ActualOutput} and {ExpectedOutput} placeholders"
+            )
+        return v
+
+    def model_post_init(self, __context):
+        """Initialize the LLM service after model creation."""
+        super().model_post_init(__context)
+        self._initialize_llm()
+
+    def _initialize_llm(self):
+        """Initialize the LLM used for evaluation."""
+        from uipath import UiPath
+
+        uipath = UiPath()
+        self.llm = uipath.llm
+
+    async def evaluate(
+        self,
+        agent_execution: AgentExecution,
+        evaluation_criteria: dict[str, Any],
+    ) -> EvaluationResult:
+        """Evaluate using an LLM as a judge.
+
+        Sends the formatted prompt to the configured LLM and expects a JSON response
+        with a numerical score (0-100) and justification.
+
+        agent_execution: The execution details containing:
+            - agent_input: The input received by the agent
+            - actual_output: The actual output from the agent
+            - spans: The execution spans to use for the evaluation
+        evaluation_criteria: The criteria to evaluate
+
+        Returns:
+            EvaluationResult: Numerical score with LLM justification as details
+        """
+        # Create the evaluation prompt
+        evaluation_prompt = self._create_evaluation_prompt(
+            expected_output=evaluation_criteria,
+            actual_output=agent_execution.agent_output,
+        )
+
+        llm_response = await self._get_llm_response(evaluation_prompt)
+
+        return NumericEvaluationResult(
+            score=llm_response.score,
+            details=llm_response.justification,
+        )
+
+    def _create_evaluation_prompt(
+        self, expected_output: Any, actual_output: Any
+    ) -> str:
+        """Create the evaluation prompt for the LLM."""
+        formatted_prompt = self.prompt.replace(
+            self.actual_output_placeholder,
+            str(actual_output),
+        )
+        formatted_prompt = formatted_prompt.replace(
+            self.expected_output_placeholder,
+            str(expected_output),
+        )
+
+        return formatted_prompt
+
+    async def _get_llm_response(self, evaluation_prompt: str) -> LLMResponse:
+        """Get response from the LLM.
+
+        Args:
+            evaluation_prompt: The formatted prompt to send to the LLM
+
+        Returns:
+            LLMResponse with score and justification
+        """
+        # remove community-agents suffix from llm model name
+        model = self.model
+        if model.endswith(COMMUNITY_agents_SUFFIX):
+            model = model.replace(COMMUNITY_agents_SUFFIX, "")
+
+        # Prepare the request
+        request_data = {
+            "model": model,
+            "messages": [{"role": "user", "content": evaluation_prompt}],
+            "response_format": {
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "evaluation_response",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "score": {
+                                "type": "number",
+                                "minimum": 0,
+                                "maximum": 100,
+                                "description": "Score between 0 and 100",
+                            },
+                            "justification": {
+                                "type": "string",
+                                "description": "Explanation for the score",
+                            },
+                        },
+                        "required": ["score", "justification"],
+                    },
+                },
+            },
+        }
+
+        response = await self.llm.chat_completions(**request_data)  # type: ignore
+        return LLMResponse(**json.loads(response.choices[-1].message.content))
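Because the validator above rejects prompts that omit either placeholder, a prompt wired into this evaluator needs to contain both `{{ActualOutput}}` and `{{ExpectedOutput}}`; something along these lines (illustrative text only; the 0-100 JSON response shape is enforced separately through the `response_format` schema):

judge_prompt = (
    "You are grading an agent's output against the expected output.\n"
    "Expected output: {{ExpectedOutput}}\n"
    "Actual output: {{ActualOutput}}\n"
    "Return a score from 0 to 100 and a short justification."
)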
uipath/eval/evaluators/trajectory_evaluator.py
ADDED
@@ -0,0 +1,36 @@
+"""Trajectory evaluator for analyzing execution paths and decision sequences."""
+
+from typing import TypeVar
+
+from uipath.eval.models import EvaluationResult
+
+from ..models.models import AgentExecution
+from .base_evaluator import BaseEvaluator
+
+T = TypeVar("T")
+
+
+class TrajectoryEvaluator(BaseEvaluator[T]):
+    """Evaluator that analyzes the trajectory/path taken to reach outputs."""
+
+    async def evaluate(
+        self, agent_execution: AgentExecution, evaluation_criteria: T
+    ) -> EvaluationResult:
+        """Evaluate using trajectory analysis.
+
+        Analyzes the execution path and decision sequence taken by the agent
+        to assess the quality of the reasoning process.
+
+        Args:
+            agent_execution: The execution details containing:
+                - agent_input: The input received by the agent
+                - actual_output: The actual output from the agent
+                - spans: The execution spans to use for the evaluation
+            evaluation_criteria: The criteria to evaluate
+        Returns:
+            EvaluationResult: Score based on trajectory analysis
+
+        Raises:
+            NotImplementedError: This evaluator is not yet implemented
+        """
+        raise NotImplementedError()
uipath/eval/models/__init__.py
ADDED
@@ -0,0 +1,19 @@
+"""UiPath evaluation module for agent performance assessment."""
+
+from uipath.eval.models.models import (
+    BooleanEvaluationResult,
+    ErrorEvaluationResult,
+    EvalItemResult,
+    EvaluationResult,
+    NumericEvaluationResult,
+    ScoreType,
+)
+
+__all__ = [
+    "EvaluationResult",
+    "ScoreType",
+    "EvalItemResult",
+    "BooleanEvaluationResult",
+    "NumericEvaluationResult",
+    "ErrorEvaluationResult",
+]