strands-agents-evals 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/integration-test.yml +1 -1
  2. strands_agents_evals-0.1.5/.github/workflows/strands-command.yml +92 -0
  3. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/PKG-INFO +2 -1
  4. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/pyproject.toml +1 -0
  5. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/__init__.py +4 -0
  6. strands_agents_evals-0.1.5/src/strands_evals/evaluators/conciseness_evaluator.py +139 -0
  7. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/evaluator.py +4 -0
  8. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/faithfulness_evaluator.py +21 -16
  9. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +21 -16
  10. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/harmfulness_evaluator.py +21 -16
  11. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/helpfulness_evaluator.py +21 -16
  12. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/interactions_evaluator.py +6 -4
  13. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/output_evaluator.py +6 -4
  14. strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py +11 -0
  15. strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py +9 -0
  16. strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py +11 -0
  17. strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py +29 -0
  18. strands_agents_evals-0.1.5/src/strands_evals/evaluators/response_relevance_evaluator.py +144 -0
  19. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +19 -8
  20. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +19 -8
  21. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/trajectory_evaluator.py +6 -4
  22. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/experiment.py +281 -90
  23. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/trace_extractor.py +13 -1
  24. strands_agents_evals-0.1.5/src/strands_evals/utils.py +37 -0
  25. strands_agents_evals-0.1.5/tests/strands_evals/evaluators/test_conciseness_evaluator.py +119 -0
  26. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +13 -5
  27. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +11 -7
  28. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +13 -5
  29. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +13 -5
  30. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_interactions_evaluator.py +19 -16
  31. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_output_evaluator.py +20 -14
  32. strands_agents_evals-0.1.5/tests/strands_evals/evaluators/test_response_relevance_evaluator.py +132 -0
  33. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +11 -5
  34. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +13 -5
  35. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +22 -19
  36. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/test_experiment.py +353 -0
  37. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/test_integration.py +17 -7
  38. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  39. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  40. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  41. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  42. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/dependabot.yml +0 -0
  43. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/pr-and-push.yml +0 -0
  44. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/pypi-publish-on-release.yml +0 -0
  45. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/test-lint.yml +0 -0
  46. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.gitignore +0 -0
  47. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.pre-commit-config.yaml +0 -0
  48. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/CODE_OF_CONDUCT.md +0 -0
  49. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/CONTRIBUTING.md +0 -0
  50. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/LICENSE +0 -0
  51. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/NOTICE +0 -0
  52. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/README.md +0 -0
  53. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/STYLE_GUIDE.md +0 -0
  54. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/__init__.py +0 -0
  55. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/__init__.py +0 -0
  56. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/case.py +0 -0
  57. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/display/display_console.py +0 -0
  58. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
  59. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
  60. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
  61. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
  62. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
  63. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
  64. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
  65. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
  66. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
  67. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
  68. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
  69. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
  70. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
  71. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
  72. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/__init__.py +0 -0
  73. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/graph_extractor.py +0 -0
  74. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/swarm_extractor.py +0 -0
  75. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/tools_use_extractor.py +0 -0
  76. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/__init__.py +0 -0
  77. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/experiment_generator.py +0 -0
  78. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
  79. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/topic_planner.py +0 -0
  80. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/mappers/__init__.py +0 -0
  81. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/mappers/session_mapper.py +0 -0
  82. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
  83. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/README.md +0 -0
  84. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/__init__.py +0 -0
  85. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/actor_simulator.py +0 -0
  86. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/profiles/__init__.py +0 -0
  87. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
  88. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
  89. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
  90. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
  91. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
  92. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/tools/__init__.py +0 -0
  93. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
  94. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/__init__.py +0 -0
  95. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
  96. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/config.py +0 -0
  97. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/tracer.py +0 -0
  98. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/tools/evaluation_tools.py +0 -0
  99. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/__init__.py +0 -0
  100. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/evaluation.py +0 -0
  101. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/evaluation_report.py +0 -0
  102. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/simulation/__init__.py +0 -0
  103. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/simulation/actor.py +0 -0
  104. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/trace.py +0 -0
  105. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/__init__.py +0 -0
  106. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
  107. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
  108. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
  109. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_tools_use_extractor.py +0 -0
  110. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
  111. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
  112. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/generators/test_topic_planner.py +0 -0
  113. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/mappers/__init__.py +0 -0
  114. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
  115. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/simulation/__init__.py +0 -0
  116. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
  117. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
  118. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/telemetry/test_config.py +0 -0
  119. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/telemetry/test_tracer.py +0 -0
  120. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/test_cases.py +0 -0
  121. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
  122. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/types/test_trace.py +0 -0
  123. {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests_integ/test_output_evaluator.py +0 -0
.github/workflows/integration-test.yml

@@ -46,7 +46,7 @@ jobs:
       contents: read
     steps:
       - name: Configure Credentials
-        uses: aws-actions/configure-aws-credentials@v5
+        uses: aws-actions/configure-aws-credentials@v6
         with:
          role-to-assume: ${{ secrets.STRANDS_INTEG_TEST_ROLE }}
          aws-region: us-east-1
.github/workflows/strands-command.yml (new file)

@@ -0,0 +1,92 @@
+name: Strands Command Handler
+
+on:
+  issue_comment:
+    types: [created]
+  workflow_dispatch:
+    inputs:
+      issue_id:
+        description: 'Issue ID to process (can be issue or PR number)'
+        required: true
+        type: string
+      command:
+        description: 'Strands command to execute'
+        required: false
+        type: string
+        default: ''
+      session_id:
+        description: 'Optional session ID to use'
+        required: false
+        type: string
+        default: ''
+
+jobs:
+  authorization-check:
+    if: startsWith(github.event.comment.body, '/strands') || github.event_name == 'workflow_dispatch'
+    name: Check access
+    permissions: read-all
+    runs-on: ubuntu-latest
+    outputs:
+      approval-env: ${{ steps.auth.outputs.result }}
+    steps:
+      - name: Check Authorization
+        id: auth
+        uses: strands-agents/devtools/authorization-check@main
+        with:
+          skip-check: ${{ github.event_name == 'workflow_dispatch' }}
+          username: ${{ github.event.comment.user.login || 'invalid' }}
+          allowed-roles: 'triage,write,admin'
+
+  setup-and-process:
+    needs: [authorization-check]
+    environment: ${{ needs.authorization-check.outputs.approval-env }}
+    permissions:
+      # Needed to create a branch for the Implementer Agent
+      contents: write
+      # These both are needed to add the `strands-running` label to issues and prs
+      issues: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: Parse input
+        id: parse
+        uses: strands-agents/devtools/strands-command/actions/strands-input-parser@main
+        with:
+          issue_id: ${{ inputs.issue_id }}
+          command: ${{ inputs.command }}
+          session_id: ${{ inputs.session_id }}
+
+  execute-readonly-agent:
+    needs: [setup-and-process]
+    permissions:
+      contents: read
+      issues: read
+      pull-requests: read
+      id-token: write # Required for OIDC
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+
+      # Add any steps here to set up the environment for the Agent in your repo
+      # setup node, setup python, or any other dependencies
+
+      - name: Run Strands Agent
+        id: agent-runner
+        uses: strands-agents/devtools/strands-command/actions/strands-agent-runner@main
+        with:
+          aws_role_arn: ${{ secrets.AWS_ROLE_ARN }}
+          sessions_bucket: ${{ secrets.AGENT_SESSIONS_BUCKET }}
+          write_permission: 'false'
+
+  finalize:
+    if: always()
+    needs: [setup-and-process, execute-readonly-agent]
+    permissions:
+      contents: write
+      issues: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Execute write operations
+        uses: strands-agents/devtools/strands-command/actions/strands-finalize@main
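This workflow is started either by a `/strands` issue comment or manually through `workflow_dispatch`. As a rough sketch of the manual path, the snippet below calls GitHub's workflow-dispatch REST endpoint with the inputs defined above; the repository slug, token variable, branch, and command value are placeholders, not values taken from this package.

# Hypothetical trigger for the workflow_dispatch path of strands-command.yml.
# OWNER/REPO, GITHUB_TOKEN, the "main" ref, and the command string are illustrative assumptions.
import os
import requests

resp = requests.post(
    "https://api.github.com/repos/OWNER/REPO/actions/workflows/strands-command.yml/dispatches",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    json={"ref": "main", "inputs": {"issue_id": "123", "command": "/strands"}},
    timeout=30,
)
resp.raise_for_status()  # GitHub returns 204 No Content on success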
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: strands-agents-evals
-Version: 0.1.3
+Version: 0.1.5
 Summary: Evaluation framework for Strands
 Author-email: AWS <opensource@amazon.com>
 License: Apache-2.0
@@ -15,6 +15,7 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: rich<15.0.0,>=14.0.0
 Requires-Dist: strands-agents-tools<1.0.0,>=0.1.0
 Requires-Dist: strands-agents>=1.0.0
+Requires-Dist: tenacity<10.0.0,>=8.0.0
 Requires-Dist: typing-extensions>=4.0
 Provides-Extra: dev
 Requires-Dist: hatch<2.0.0,>=1.0.0; extra == 'dev'
pyproject.toml

@@ -23,6 +23,7 @@ dependencies = [
     "opentelemetry-sdk>=1.20.0",
     "opentelemetry-instrumentation-threading>=0.51b0,<1.00b0",
     "boto3>=1.26.0",
+    "tenacity>=8.0.0,<10.0.0",
 ]

 [tool.hatch.build.targets.wheel]
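Both metadata files add tenacity as a runtime dependency, presumably supporting the reworked retry behavior in experiment.py (whose changes are listed above but not shown in this excerpt). The snippet below is only a generic sketch of the retry pattern that dependency enables; the decorated function and policy are illustrative, not the package's actual configuration.

# Generic tenacity retry sketch; call site and policy are assumptions for illustration.
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
def call_model(prompt: str) -> str:
    # Placeholder for a flaky model/API call that may raise transient errors.
    ...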
src/strands_evals/evaluators/__init__.py

@@ -1,3 +1,4 @@
+from .conciseness_evaluator import ConcisenessEvaluator
 from .evaluator import Evaluator
 from .faithfulness_evaluator import FaithfulnessEvaluator
 from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -5,6 +6,7 @@ from .harmfulness_evaluator import HarmfulnessEvaluator
 from .helpfulness_evaluator import HelpfulnessEvaluator
 from .interactions_evaluator import InteractionsEvaluator
 from .output_evaluator import OutputEvaluator
+from .response_relevance_evaluator import ResponseRelevanceEvaluator
 from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
 from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator
@@ -18,6 +20,8 @@ __all__ = [
     "HarmfulnessEvaluator",
     "GoalSuccessRateEvaluator",
     "FaithfulnessEvaluator",
+    "ResponseRelevanceEvaluator",
     "ToolSelectionAccuracyEvaluator",
     "ToolParameterAccuracyEvaluator",
+    "ConcisenessEvaluator",
 ]
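With these exports in place, both new evaluators are importable from the package namespace. A minimal instantiation sketch follows; the imports are confirmed by the hunk above, but only ConcisenessEvaluator's constructor is visible in this diff, so the ResponseRelevanceEvaluator defaults and the model ID are assumptions.

# Imports confirmed by the __init__.py hunk; model ID is a placeholder and
# ResponseRelevanceEvaluator's arguments are assumed to mirror ConcisenessEvaluator's.
from strands_evals.evaluators import ConcisenessEvaluator, ResponseRelevanceEvaluator

conciseness = ConcisenessEvaluator(version="v0", model="<bedrock-model-id>")
relevance = ResponseRelevanceEvaluator()  # defaults assumed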
src/strands_evals/evaluators/conciseness_evaluator.py (new file)

@@ -0,0 +1,139 @@
+from enum import Enum
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.conciseness import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class ConcisenessScore(str, Enum):
+    """Categorical conciseness ratings."""
+
+    NOT_CONCISE = "Not Concise"
+    PARTIALLY_CONCISE = "Partially Concise"
+    PERFECTLY_CONCISE = "Perfectly Concise"
+
+
+class ConcisenessRating(BaseModel):
+    """Structured output for conciseness evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: ConcisenessScore = Field(description="Categorical conciseness rating")
+
+
+class ConcisenessEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates how concise the assistant's response is."""
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    _score_mapping = {
+        ConcisenessScore.NOT_CONCISE: 0.0,
+        ConcisenessScore.PARTIALLY_CONCISE: 0.5,
+        ConcisenessScore.PERFECTLY_CONCISE: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+        include_inputs: bool = True,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+        self.include_inputs = include_inputs
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
+        return self._create_evaluation_output(result)
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
+        return self._create_evaluation_output(result)
+
+    def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
+        rating = cast(ConcisenessRating, result.structured_output)
+        normalized_score = self._score_mapping[rating.score]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
+
+    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+        """Extract the most recent turn from the conversation for evaluation."""
+        parsed_inputs = self._parse_trajectory(evaluation_case)
+        if not parsed_inputs:
+            raise ValueError(
+                "No turn-level inputs could be parsed from the trajectory. "
+                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+            )
+        return parsed_inputs[-1]
+
+    def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Extract user prompt from last message in session history.
+
+        Args:
+            parsed_input: Trace-level input containing session history
+
+        Returns:
+            User prompt text, or empty string if not available
+        """
+        if not parsed_input.session_history:
+            return ""
+
+        last_msg = parsed_input.session_history[-1]
+        if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+            first_content = last_msg.content[0]
+            if isinstance(first_content, TextContent):
+                return first_content.text
+
+        return ""
+
+    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed trace data.
+
+        Args:
+            parsed_input: Trace-level input containing agent response and session history
+
+        Returns:
+            Formatted prompt string with conversation history and target turn
+        """
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                    continue  # Skip tool execution lists
+                if not isinstance(msg, list) and self._has_text_content(msg):
+                    first_content = msg.content[0]
+                    if isinstance(first_content, TextContent):
+                        history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Previous turns:\n{history_str}")
+
+        user_prompt = self._extract_user_prompt(parsed_input)
+        parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
src/strands_evals/evaluators/evaluator.py

@@ -63,6 +63,10 @@ class Evaluator(Generic[InputT, OutputT]):

     @staticmethod
     def _default_aggregator(outputs: list[EvaluationOutput]) -> tuple[float, bool, str]:
+        # Handle empty outputs list to avoid division by zero
+        if not outputs:
+            return (0.0, False, "No evaluation outputs produced")
+
         avg_score = sum(o.score for o in outputs) / len(outputs)
         all_pass = all(o.test_pass for o in outputs)
         combined_reason = " | ".join(o.reason for o in outputs if o.reason)
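The new guard makes the default aggregator total: an empty result list now yields (0.0, False, "No evaluation outputs produced") instead of raising ZeroDivisionError. A standalone sketch of the aggregation rule shown above, using plain tuples in place of the real EvaluationOutput model:

# Sketch of the aggregation logic from the hunk above, with (score, test_pass, reason) tuples.
def aggregate(outputs: list[tuple[float, bool, str]]) -> tuple[float, bool, str]:
    if not outputs:  # new guard: avoid dividing by len([]) == 0
        return (0.0, False, "No evaluation outputs produced")
    avg_score = sum(score for score, _, _ in outputs) / len(outputs)
    all_pass = all(passed for _, passed, _ in outputs)
    combined_reason = " | ".join(reason for _, _, reason in outputs if reason)
    return (avg_score, all_pass, combined_reason)

assert aggregate([]) == (0.0, False, "No evaluation outputs produced")
assert aggregate([(1.0, True, "ok"), (0.5, True, "")]) == (0.75, True, "ok")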
src/strands_evals/evaluators/faithfulness_evaluator.py

@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast

 from pydantic import BaseModel, Field
 from strands import Agent
@@ -59,29 +60,33 @@ class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
+        rating = cast(FaithfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
+        rating = cast(FaithfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
src/strands_evals/evaluators/goal_success_rate_evaluator.py

@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast

 from pydantic import BaseModel, Field
 from strands import Agent
@@ -53,29 +54,33 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
         session_input = self._parse_trajectory(evaluation_case)
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
+        rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         session_input = self._parse_trajectory(evaluation_case)
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
+        rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     def _format_prompt(self, session_input: SessionLevelInput) -> str:
         """Format evaluation prompt from session-level input."""
src/strands_evals/evaluators/harmfulness_evaluator.py

@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast

 from pydantic import BaseModel, Field
 from strands import Agent
@@ -52,29 +53,33 @@ class HarmfulnessEvaluator(Evaluator[InputT, OutputT]):
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(HarmfulnessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
+        rating = cast(HarmfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score == 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score == 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(HarmfulnessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
+        rating = cast(HarmfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score == 1.0,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score == 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
src/strands_evals/evaluators/helpfulness_evaluator.py

@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast

 from pydantic import BaseModel, Field
 from strands import Agent
@@ -65,29 +66,33 @@ class HelpfulnessEvaluator(Evaluator[InputT, OutputT]):
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = evaluator_agent.structured_output(HelpfulnessRating, prompt)
+        result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating)
+        rating = cast(HelpfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-        rating = await evaluator_agent.structured_output_async(HelpfulnessRating, prompt)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating)
+        rating = cast(HelpfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-        result = EvaluationOutput(
-            score=normalized_score,
-            test_pass=normalized_score >= 0.5,
-            reason=rating.reasoning,
-            label=rating.score,
-        )
-        return [result]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]

     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
src/strands_evals/evaluators/interactions_evaluator.py

@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.agent.conversation_manager import SlidingWindowConversationManager
 from strands.models.model import Model
@@ -198,8 +200,8 @@ class InteractionsEvaluator(Evaluator[InputT, OutputT]):
         for i in range(num_interactions):
             is_last = i == num_interactions - 1
             evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-            result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-            results.append(result)
+            result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+            results.append(cast(EvaluationOutput, result.structured_output))

         return results

@@ -238,7 +240,7 @@ class InteractionsEvaluator(Evaluator[InputT, OutputT]):
         for i in range(num_interactions):
             is_last = i == num_interactions - 1
             evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-            result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-            results.append(result)
+            result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+            results.append(cast(EvaluationOutput, result.structured_output))

         return results
src/strands_evals/evaluators/output_evaluator.py

@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.models.model import Model
 from typing_extensions import TypeVar, Union
@@ -51,8 +53,8 @@ class OutputEvaluator(Evaluator[InputT, OutputT]):
         evaluation_prompt = compose_test_prompt(
             evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
         )
-        result = evaluator_agent.structured_output(EvaluationOutput, evaluation_prompt)
-        return [result]
+        result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]

     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         """
@@ -68,5 +70,5 @@ class OutputEvaluator(Evaluator[InputT, OutputT]):
         evaluation_prompt = compose_test_prompt(
             evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
         )
-        result = await evaluator_agent.structured_output_async(EvaluationOutput, evaluation_prompt)
-        return [result]
+        result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py (new file)

@@ -0,0 +1,11 @@
+from . import conciseness_v0
+
+VERSIONS = {
+    "v0": conciseness_v0,
+}
+
+DEFAULT_VERSION = "v0"
+
+
+def get_template(version: str = DEFAULT_VERSION):
+    return VERSIONS[version]
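This registry mirrors the other prompt-template packages: get_template("v0") returns the version module, and the evaluator reads its SYSTEM_PROMPT constant, exactly as ConcisenessEvaluator.__init__ does above. For example:

# Grounded in the hunks above: get_template returns the conciseness_v0 module,
# whose SYSTEM_PROMPT constant is defined in the next hunk.
from strands_evals.evaluators.prompt_templates.conciseness import get_template

print(get_template("v0").SYSTEM_PROMPT)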
src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py (new file)

@@ -0,0 +1,9 @@
+SYSTEM_PROMPT = """You are evaluating how concise the Assistant's response is.
+A concise response provides exactly what was requested using the minimum necessary words, without extra explanations, pleasantries, or repetition unless specifically asked for.
+
+## Scoring
+- Perfectly Concise: delivers exactly what was asked with no unnecessary content
+- Partially Concise: minor extra wording but still focused
+- Not Concise: verbose, repetitive, or includes substantial unnecessary content
+
+**IMPORTANT**: The agent prompt and tools ALWAYS takes priority over your own knowledge."""
src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py (new file)

@@ -0,0 +1,11 @@
+from . import response_relevance_v0
+
+VERSIONS = {
+    "v0": response_relevance_v0,
+}
+
+DEFAULT_VERSION = "v0"
+
+
+def get_template(version: str = DEFAULT_VERSION):
+    return VERSIONS[version]
src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py (new file)

@@ -0,0 +1,29 @@
+SYSTEM_PROMPT = """You are an objective judge evaluating the relevance of an AI assistant's response to the user's question. Your task is to assess how focused the response is on addressing the given question.
+
+# Evaluation Guidelines:
+
+When evaluating the relevance of the response, consider the following rubrics:
+
+- If everything in the response can be understood to directly address the input, the response is perfectly relevant.
+- If anything in the response is unrelated to the input, the response is less relevant.
+- Relevance only evaluates whether the response is on topic. Content that indicates that the assistant understood the question, but was unable to answer it truthfully, faithfully, coherently or correctly still counts as a relevant response. Only content that is extraneous to answering the question should be penalized.
+- Duplicate information does not penalize relevance. The response could say the same thing multiple times. If that thing is a relevant answer to the user's query, relevance is not penalized.
+
+# Rating Scale:
+
+1. Not At All
+   - No part of the response is relevant to the question
+
+2. Not Generally
+   - An overwhelming amount of the response is irrelevant or the relevant information is not a direct answer
+
+3. Neutral/Mixed
+   - Roughly half of the response is relevant to the question
+
+4. Generally Yes
+   - An overwhelming amount of the response is relevant to the question
+
+5. Completely Yes
+   - Every piece of the response is relevant to the question
+
+IMPORTANT: The tool output ALWAYS takes priority over your own knowledge. Focus on whether the response addresses the user's question, not on factual accuracy."""