strands-agents-evals 0.1.0 (strands_agents_evals-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
strands_evals/evaluators/trajectory_evaluator.py
@@ -0,0 +1,100 @@
+ from strands import Agent
+ from strands.models.model import Model
+ from typing_extensions import Any, TypeVar, Union
+
+ from ..tools.evaluation_tools import any_order_match_scorer, exact_match_scorer, in_order_match_scorer
+ from ..types.evaluation import EvaluationData, EvaluationOutput
+ from .evaluator import Evaluator
+ from .prompt_templates.case_prompt_template import compose_test_prompt
+ from .prompt_templates.prompt_templates import judge_trajectory_template_tools as SYSTEM_PROMPT
+
+ InputT = TypeVar("InputT")
+ OutputT = TypeVar("OutputT")
+
+
+ class TrajectoryEvaluator(Evaluator[InputT, OutputT]):
+     """
+     An evaluator that judges a test case's trajectory against a user-specified rubric.
+
+     Attributes:
+         rubric: The user-specified criteria for evaluating a collection of test cases.
+         trajectory_description: A description of the available trajectory types, e.g. tool descriptions.
+         model: A string representing the Bedrock model id to use, or a Model instance.
+             Defaults to strands.models.BedrockModel if None.
+         system_prompt: System prompt to guide model behavior.
+             If None, the evaluator will use one of the default templates.
+         include_inputs: Whether to include the task's inputs in the evaluation.
+     """
+
+     def __init__(
+         self,
+         rubric: str,
+         trajectory_description: dict | None = None,
+         model: Union[Model, str, None] = None,
+         system_prompt: str = SYSTEM_PROMPT,
+         include_inputs: bool = True,
+     ):
+         super().__init__()
+         self.rubric = rubric
+         self.trajectory_description = trajectory_description
+         self.model = model
+         self.include_inputs = include_inputs
+         self._tools: list[Union[str, dict[str, str], Any]] | None = [
+             exact_match_scorer,
+             in_order_match_scorer,
+             any_order_match_scorer,
+         ]
+         self.system_prompt = system_prompt
+
+     def update_trajectory_description(self, new_description: dict) -> None:
+         """
+         Update the description of the available trajectories.
+
+         Args:
+             new_description: The new description of the available trajectories.
+         """
+         self.trajectory_description = new_description
+
+     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         """
+         Evaluate the performance of the task on the given test case.
+
+         Args:
+             evaluation_case: The test case with all of the necessary context to be evaluated.
+
+         Returns:
+             The results of the evaluation as EvaluationOutput.
+         """
+         evaluator_agent = Agent(
+             model=self.model, system_prompt=self.system_prompt, tools=self._tools, callback_handler=None
+         )
+         evaluation_prompt = compose_test_prompt(
+             evaluation_case=evaluation_case,
+             rubric=self.rubric,
+             include_inputs=self.include_inputs,
+             uses_trajectory=True,
+         )
+         result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
+         return [result]
+
+     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+         """
+         Evaluate the performance of the task on the given test case asynchronously.
+
+         Args:
+             evaluation_case: The test case with all of the necessary context to be evaluated.
+
+         Returns:
+             The results of the evaluation as EvaluationOutput.
+         """
+         evaluator_agent = Agent(
+             model=self.model, system_prompt=self.system_prompt, tools=self._tools, callback_handler=None
+         )
+         evaluation_prompt = compose_test_prompt(
+             evaluation_case=evaluation_case,
+             rubric=self.rubric,
+             include_inputs=self.include_inputs,
+             uses_trajectory=True,
+         )
+         result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
+         return [result]
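
Below is a minimal usage sketch for the evaluator added in this diff. The constructor arguments and the evaluate / evaluate_async calls follow the signatures shown above; how an EvaluationData test case is built is not shown in this diff (it lives in strands_evals/types/evaluation.py), so the `case` variable below is a placeholder and the rubric and tool descriptions are illustrative values only.

# Usage sketch, not part of the package. TrajectoryEvaluator's constructor and
# evaluate()/evaluate_async() come from the diff above; building an
# EvaluationData case is NOT covered here, so `case` is left as a placeholder
# to be constructed per strands_evals/types/evaluation.py.
from strands_evals.evaluators.trajectory_evaluator import TrajectoryEvaluator

evaluator = TrajectoryEvaluator(
    rubric="The agent should look up the order before issuing a refund.",
    trajectory_description={  # optional: describe the tools the agent can call
        "lookup_order": "Fetches an order by id.",
        "issue_refund": "Refunds a previously fetched order.",
    },
    include_inputs=True,  # include the task inputs in the judge prompt
)

# case: EvaluationData[InputT, OutputT] = ...   # built elsewhere (hypothetical)
# results = evaluator.evaluate(case)                # synchronous judging
# results = await evaluator.evaluate_async(case)    # from async code
# Each element of `results` is an EvaluationOutput produced by the judge agent.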