strands-agents-evals 0.1.0 (strands_agents_evals-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
strands_evals/evaluators/trajectory_evaluator.py
@@ -0,0 +1,100 @@
+ from strands import Agent
+ from strands.models.model import Model
+ from typing_extensions import Any, TypeVar, Union
+
+ from ..tools.evaluation_tools import any_order_match_scorer, exact_match_scorer, in_order_match_scorer
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from .evaluator import Evaluator
+ from .prompt_templates.case_prompt_template import compose_test_prompt
+ from .prompt_templates.prompt_templates import judge_trajectory_template_tools as SYSTEM_PROMPT
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class TrajectoryEvaluator(Evaluator[InputT, OutputT]):
+     """
+     An evaluator that judges a test case's trajectory against a user-specified rubric.
+
+     Attributes:
+         rubric: The user-specified criteria for evaluating a collection of test cases.
+         trajectory_description: A description of the available trajectory types, e.g. tool descriptions.
+         model: A string representing the Bedrock model id to use, or a Model instance.
+             Defaults to strands.models.BedrockModel if None.
+         system_prompt: System prompt to guide model behavior.
+             If None, the evaluator will use one of the default templates.
+         include_inputs: Whether to include the task's inputs in the evaluation.
+     """
+
+     def __init__(
+         self,
+         rubric: str,
+         trajectory_description: dict | None = None,
+         model: Union[Model, str, None] = None,
+         system_prompt: str = SYSTEM_PROMPT,
+         include_inputs: bool = True,
+     ):
+         super().__init__()
+         self.rubric = rubric
+         self.trajectory_description = trajectory_description
+         self.model = model
+         self.include_inputs = include_inputs
+         self._tools: list[Union[str, dict[str, str], Any]] | None = [
+             exact_match_scorer,
+             in_order_match_scorer,
+             any_order_match_scorer,
+         ]
+         self.system_prompt = system_prompt
+
+     def update_trajectory_description(self, new_description: dict) -> None:
+         """
+         Update the description of the available trajectories.
+
+         Args:
+             new_description: The new description of the available trajectories.
+         """
+         self.trajectory_description = new_description
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         """
+         Evaluate the performance of the task on the given test case.
+
+         Args:
+             evaluation_case: The test case with all of the necessary context to be evaluated.
+
+         Returns:
+             The results of the evaluation as EvaluationOutput.
+         """
+         evaluator_agent = Agent(
+             model=self.model, system_prompt=self.system_prompt, tools=self._tools, callback_handler=None
+         )
+         evaluation_prompt = compose_test_prompt(
+             evaluation_case=evaluation_case,
+             rubric=self.rubric,
+             include_inputs=self.include_inputs,
+             uses_trajectory=True,
+         )
+         result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
+         return [result]
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         """
+         Evaluate the performance of the task on the given test case asynchronously.
+
+         Args:
+             evaluation_case: The test case with all of the necessary context to be evaluated.
+
+         Returns:
+             The results of the evaluation as EvaluationOutput.
+         """
+         evaluator_agent = Agent(
+             model=self.model, system_prompt=self.system_prompt, tools=self._tools, callback_handler=None
+         )
+         evaluation_prompt = compose_test_prompt(
+             evaluation_case=evaluation_case,
+             rubric=self.rubric,
+             include_inputs=self.include_inputs,
+             uses_trajectory=True,
+         )
+         result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
+         return [result]
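
Below is a minimal usage sketch for the evaluator added in this diff. The constructor arguments and the evaluate / evaluate_async calls follow the signatures shown above; how an EvaluationData test case is built is not shown in this diff (it lives in strands_evals/types/evaluation.py), so the `case` variable below is a placeholder and the rubric and tool descriptions are illustrative values only.

# Usage sketch, not part of the package. TrajectoryEvaluator's constructor and
# evaluate()/evaluate_async() come from the diff above; building an
# EvaluationData case is NOT covered here, so `case` is left as a placeholder
# to be constructed per strands_evals/types/evaluation.py.
from strands_evals.evaluators.trajectory_evaluator import TrajectoryEvaluator

evaluator = TrajectoryEvaluator(
    rubric="The agent should look up the order before issuing a refund.",
    trajectory_description={  # optional: describe the tools the agent can call
        "lookup_order": "Fetches an order by id.",
        "issue_refund": "Refunds a previously fetched order.",
    },
    include_inputs=True,  # include the task inputs in the judge prompt
)

# case: EvaluationData[InputT, OutputT] = ...   # built elsewhere (hypothetical)
# results = evaluator.evaluate(case)                # synchronous judging
# results = await evaluator.evaluate_async(case)    # from async code
# Each element of `results` is an EvaluationOutput produced by the judge agent.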