strands-agents-evals 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/integration-test.yml +1 -1
- strands_agents_evals-0.1.5/.github/workflows/strands-command.yml +92 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/PKG-INFO +2 -1
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/pyproject.toml +1 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/__init__.py +4 -0
- strands_agents_evals-0.1.5/src/strands_evals/evaluators/conciseness_evaluator.py +139 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/evaluator.py +4 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/faithfulness_evaluator.py +21 -16
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +21 -16
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/harmfulness_evaluator.py +21 -16
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/helpfulness_evaluator.py +21 -16
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/interactions_evaluator.py +6 -4
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/output_evaluator.py +6 -4
- strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/conciseness/__init__.py +11 -0
- strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py +9 -0
- strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py +11 -0
- strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py +29 -0
- strands_agents_evals-0.1.5/src/strands_evals/evaluators/response_relevance_evaluator.py +144 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +19 -8
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +19 -8
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/trajectory_evaluator.py +6 -4
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/experiment.py +281 -90
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/trace_extractor.py +13 -1
- strands_agents_evals-0.1.5/src/strands_evals/utils.py +37 -0
- strands_agents_evals-0.1.5/tests/strands_evals/evaluators/test_conciseness_evaluator.py +119 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +13 -5
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +11 -7
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +13 -5
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +13 -5
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_interactions_evaluator.py +19 -16
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_output_evaluator.py +20 -14
- strands_agents_evals-0.1.5/tests/strands_evals/evaluators/test_response_relevance_evaluator.py +132 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +11 -5
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +13 -5
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +22 -19
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/test_experiment.py +353 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/test_integration.py +17 -7
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/dependabot.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/pr-and-push.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/pypi-publish-on-release.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.github/workflows/test-lint.yml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.gitignore +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/.pre-commit-config.yaml +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/CODE_OF_CONDUCT.md +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/CONTRIBUTING.md +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/LICENSE +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/NOTICE +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/README.md +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/STYLE_GUIDE.md +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/case.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/display/display_console.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/graph_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/swarm_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/extractors/tools_use_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/experiment_generator.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/generators/topic_planner.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/mappers/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/mappers/session_mapper.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/README.md +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/actor_simulator.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/profiles/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/tools/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/config.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/telemetry/tracer.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/tools/evaluation_tools.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/evaluation.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/evaluation_report.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/simulation/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/simulation/actor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/types/trace.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_tools_use_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/generators/test_topic_planner.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/mappers/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/simulation/__init__.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/telemetry/test_config.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/telemetry/test_tracer.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/test_cases.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests/strands_evals/types/test_trace.py +0 -0
- {strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/tests_integ/test_output_evaluator.py +0 -0
strands_agents_evals-0.1.5/.github/workflows/strands-command.yml
@@ -0,0 +1,92 @@
+name: Strands Command Handler
+
+on:
+  issue_comment:
+    types: [created]
+  workflow_dispatch:
+    inputs:
+      issue_id:
+        description: 'Issue ID to process (can be issue or PR number)'
+        required: true
+        type: string
+      command:
+        description: 'Strands command to execute'
+        required: false
+        type: string
+        default: ''
+      session_id:
+        description: 'Optional session ID to use'
+        required: false
+        type: string
+        default: ''
+
+jobs:
+  authorization-check:
+    if: startsWith(github.event.comment.body, '/strands') || github.event_name == 'workflow_dispatch'
+    name: Check access
+    permissions: read-all
+    runs-on: ubuntu-latest
+    outputs:
+      approval-env: ${{ steps.auth.outputs.result }}
+    steps:
+      - name: Check Authorization
+        id: auth
+        uses: strands-agents/devtools/authorization-check@main
+        with:
+          skip-check: ${{ github.event_name == 'workflow_dispatch' }}
+          username: ${{ github.event.comment.user.login || 'invalid' }}
+          allowed-roles: 'triage,write,admin'
+
+  setup-and-process:
+    needs: [authorization-check]
+    environment: ${{ needs.authorization-check.outputs.approval-env }}
+    permissions:
+      # Needed to create a branch for the Implementer Agent
+      contents: write
+      # These both are needed to add the `strands-running` label to issues and prs
+      issues: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: Parse input
+        id: parse
+        uses: strands-agents/devtools/strands-command/actions/strands-input-parser@main
+        with:
+          issue_id: ${{ inputs.issue_id }}
+          command: ${{ inputs.command }}
+          session_id: ${{ inputs.session_id }}
+
+  execute-readonly-agent:
+    needs: [setup-and-process]
+    permissions:
+      contents: read
+      issues: read
+      pull-requests: read
+      id-token: write # Required for OIDC
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+
+      # Add any steps here to set up the environment for the Agent in your repo
+      # setup node, setup python, or any other dependencies
+
+      - name: Run Strands Agent
+        id: agent-runner
+        uses: strands-agents/devtools/strands-command/actions/strands-agent-runner@main
+        with:
+          aws_role_arn: ${{ secrets.AWS_ROLE_ARN }}
+          sessions_bucket: ${{ secrets.AGENT_SESSIONS_BUCKET }}
+          write_permission: 'false'
+
+  finalize:
+    if: always()
+    needs: [setup-and-process, execute-readonly-agent]
+    permissions:
+      contents: write
+      issues: write
+      pull-requests: write
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: Execute write operations
+        uses: strands-agents/devtools/strands-command/actions/strands-finalize@main
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: strands-agents-evals
-Version: 0.1.3
+Version: 0.1.5
 Summary: Evaluation framework for Strands
 Author-email: AWS <opensource@amazon.com>
 License: Apache-2.0
@@ -15,6 +15,7 @@ Requires-Dist: pydantic<3.0.0,>=2.0.0
 Requires-Dist: rich<15.0.0,>=14.0.0
 Requires-Dist: strands-agents-tools<1.0.0,>=0.1.0
 Requires-Dist: strands-agents>=1.0.0
+Requires-Dist: tenacity<10.0.0,>=8.0.0
 Requires-Dist: typing-extensions>=4.0
 Provides-Extra: dev
 Requires-Dist: hatch<2.0.0,>=1.0.0; extra == 'dev'
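The new `tenacity` requirement is not explained by any hunk shown here (the retry wiring most likely lands in the large `experiment.py` change, which is not expanded in this diff). For orientation only, a minimal, hypothetical retry sketch using tenacity's public API; the function name and policy are illustrative, not taken from the package:

```python
# Hypothetical sketch: not from the package source. Retries a transiently failing
# call up to 3 times with exponential backoff.
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, max=10))
def invoke_judge(prompt: str) -> str:
    """Stand-in for a call that can fail transiently (e.g. a throttled model invocation)."""
    ...
```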
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/__init__.py
RENAMED
@@ -1,3 +1,4 @@
+from .conciseness_evaluator import ConcisenessEvaluator
 from .evaluator import Evaluator
 from .faithfulness_evaluator import FaithfulnessEvaluator
 from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -5,6 +6,7 @@ from .harmfulness_evaluator import HarmfulnessEvaluator
 from .helpfulness_evaluator import HelpfulnessEvaluator
 from .interactions_evaluator import InteractionsEvaluator
 from .output_evaluator import OutputEvaluator
+from .response_relevance_evaluator import ResponseRelevanceEvaluator
 from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
 from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator
@@ -18,6 +20,8 @@ __all__ = [
     "HarmfulnessEvaluator",
     "GoalSuccessRateEvaluator",
     "FaithfulnessEvaluator",
+    "ResponseRelevanceEvaluator",
     "ToolSelectionAccuracyEvaluator",
     "ToolParameterAccuracyEvaluator",
+    "ConcisenessEvaluator",
 ]
strands_agents_evals-0.1.5/src/strands_evals/evaluators/conciseness_evaluator.py
@@ -0,0 +1,139 @@
+from enum import Enum
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.conciseness import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class ConcisenessScore(str, Enum):
+    """Categorical conciseness ratings."""
+
+    NOT_CONCISE = "Not Concise"
+    PARTIALLY_CONCISE = "Partially Concise"
+    PERFECTLY_CONCISE = "Perfectly Concise"
+
+
+class ConcisenessRating(BaseModel):
+    """Structured output for conciseness evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: ConcisenessScore = Field(description="Categorical conciseness rating")
+
+
+class ConcisenessEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates how concise the assistant's response is."""
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    _score_mapping = {
+        ConcisenessScore.NOT_CONCISE: 0.0,
+        ConcisenessScore.PARTIALLY_CONCISE: 0.5,
+        ConcisenessScore.PERFECTLY_CONCISE: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+        include_inputs: bool = True,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+        self.include_inputs = include_inputs
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
+        return self._create_evaluation_output(result)
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
+        return self._create_evaluation_output(result)
+
+    def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
+        rating = cast(ConcisenessRating, result.structured_output)
+        normalized_score = self._score_mapping[rating.score]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
+
+    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+        """Extract the most recent turn from the conversation for evaluation."""
+        parsed_inputs = self._parse_trajectory(evaluation_case)
+        if not parsed_inputs:
+            raise ValueError(
+                "No turn-level inputs could be parsed from the trajectory. "
+                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+            )
+        return parsed_inputs[-1]
+
+    def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Extract user prompt from last message in session history.
+
+        Args:
+            parsed_input: Trace-level input containing session history
+
+        Returns:
+            User prompt text, or empty string if not available
+        """
+        if not parsed_input.session_history:
+            return ""
+
+        last_msg = parsed_input.session_history[-1]
+        if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+            first_content = last_msg.content[0]
+            if isinstance(first_content, TextContent):
+                return first_content.text
+
+        return ""
+
+    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed trace data.
+
+        Args:
+            parsed_input: Trace-level input containing agent response and session history
+
+        Returns:
+            Formatted prompt string with conversation history and target turn
+        """
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                    continue  # Skip tool execution lists
+                if not isinstance(msg, list) and self._has_text_content(msg):
+                    first_content = msg.content[0]
+                    if isinstance(first_content, TextContent):
+                        history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Previous turns:\n{history_str}")
+
+        user_prompt = self._extract_user_prompt(parsed_input)
+        parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
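A hedged usage sketch for the new evaluator, based only on the constructor and `evaluate` signature shown above; building the `EvaluationData` case is out of scope here and left as a parameter:

```python
# Illustrative only: `case` must carry an actual_trajectory that is a Session with at
# least one AgentInvocationSpan, or _get_last_turn raises the ValueError shown above.
from strands_evals.evaluators import ConcisenessEvaluator
from strands_evals.types.evaluation import EvaluationData, EvaluationOutput


def score_conciseness(case: EvaluationData) -> list[EvaluationOutput]:
    evaluator = ConcisenessEvaluator(version="v0")  # default v0 template and model
    outputs = evaluator.evaluate(case)
    # outputs[0].label is one of "Not Concise" / "Partially Concise" / "Perfectly Concise",
    # mapped to scores 0.0 / 0.5 / 1.0, with test_pass at >= 0.5.
    return outputs
```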
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/evaluator.py
RENAMED
@@ -63,6 +63,10 @@ class Evaluator(Generic[InputT, OutputT]):
 
     @staticmethod
     def _default_aggregator(outputs: list[EvaluationOutput]) -> tuple[float, bool, str]:
+        # Handle empty outputs list to avoid division by zero
+        if not outputs:
+            return (0.0, False, "No evaluation outputs produced")
+
         avg_score = sum(o.score for o in outputs) / len(outputs)
         all_pass = all(o.test_pass for o in outputs)
         combined_reason = " | ".join(o.reason for o in outputs if o.reason)
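To make the new guard concrete, here is a standalone sketch of the aggregation logic from the hunk above, using a stand-in dataclass instead of the package's `EvaluationOutput`; the final return tuple is inferred from the annotated return type:

```python
from dataclasses import dataclass


@dataclass
class FakeOutput:  # stand-in for strands_evals' EvaluationOutput
    score: float
    test_pass: bool
    reason: str = ""


def default_aggregator(outputs: list[FakeOutput]) -> tuple[float, bool, str]:
    if not outputs:  # new in 0.1.5: an empty list no longer divides by zero
        return (0.0, False, "No evaluation outputs produced")
    avg_score = sum(o.score for o in outputs) / len(outputs)
    all_pass = all(o.test_pass for o in outputs)
    combined_reason = " | ".join(o.reason for o in outputs if o.reason)
    return (avg_score, all_pass, combined_reason)


print(default_aggregator([]))                                  # (0.0, False, 'No evaluation outputs produced')
print(default_aggregator([FakeOutput(1.0, True, "concise")]))  # (1.0, True, 'concise')
```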
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/faithfulness_evaluator.py
RENAMED
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -59,29 +60,33 @@ class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
+        rating = cast(FaithfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
+        rating = cast(FaithfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
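The same migration repeats in the goal-success-rate, harmfulness, helpfulness, interactions, and output evaluator hunks below: the judge `Agent` is now called with a `structured_output_model` and the parsed Pydantic object is read from `result.structured_output`. A minimal sketch of that call pattern, with a placeholder rating model and prompt:

```python
# Sketch of the call pattern these hunks migrate to; Rating, the prompt, and the default
# model selection are placeholders rather than code from the package.
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent


class Rating(BaseModel):
    reasoning: str = Field(description="Step by step reasoning")
    score: str = Field(description="Categorical rating")


judge = Agent(system_prompt="You are an impartial judge.", callback_handler=None)
result = judge("Rate the assistant's last answer.", structured_output_model=Rating)
rating = cast(Rating, result.structured_output)  # typed access to the parsed rating
```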
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/goal_success_rate_evaluator.py
RENAMED
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -53,29 +54,33 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
         session_input = self._parse_trajectory(evaluation_case)
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
+        rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         session_input = self._parse_trajectory(evaluation_case)
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
+        rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _format_prompt(self, session_input: SessionLevelInput) -> str:
         """Format evaluation prompt from session-level input."""
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/harmfulness_evaluator.py
RENAMED
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -52,29 +53,33 @@ class HarmfulnessEvaluator(Evaluator[InputT, OutputT]):
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
+        rating = cast(HarmfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score == 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
+        rating = cast(HarmfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score == 1.0,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/helpfulness_evaluator.py
RENAMED
@@ -1,4 +1,5 @@
 from enum import Enum
+from typing import cast
 
 from pydantic import BaseModel, Field
 from strands import Agent
@@ -65,29 +66,33 @@ class HelpfulnessEvaluator(Evaluator[InputT, OutputT]):
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating)
+        rating = cast(HelpfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
         prompt = self._format_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
-
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating)
+        rating = cast(HelpfulnessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
-
-
-
-
-
-
-
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
 
     def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
         """Extract the most recent turn from the conversation for evaluation."""
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/interactions_evaluator.py
RENAMED
@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.agent.conversation_manager import SlidingWindowConversationManager
 from strands.models.model import Model
@@ -198,8 +200,8 @@ class InteractionsEvaluator(Evaluator[InputT, OutputT]):
         for i in range(num_interactions):
             is_last = i == num_interactions - 1
             evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-            result = evaluator_agent
-            results.append(result)
+            result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+            results.append(cast(EvaluationOutput, result.structured_output))
 
         return results
 
@@ -238,7 +240,7 @@
         for i in range(num_interactions):
             is_last = i == num_interactions - 1
             evaluation_prompt = self._compose_prompt(evaluation_case, i, is_last)
-            result = await evaluator_agent.
-            results.append(result)
+            result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+            results.append(cast(EvaluationOutput, result.structured_output))
 
         return results
{strands_agents_evals-0.1.3 → strands_agents_evals-0.1.5}/src/strands_evals/evaluators/output_evaluator.py
RENAMED
@@ -1,3 +1,5 @@
+from typing import cast
+
 from strands import Agent
 from strands.models.model import Model
 from typing_extensions import TypeVar, Union
@@ -51,8 +53,8 @@ class OutputEvaluator(Evaluator[InputT, OutputT]):
         evaluation_prompt = compose_test_prompt(
             evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
         )
-        result = evaluator_agent
-        return [result]
+        result = evaluator_agent(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         """
@@ -68,5 +70,5 @@
         evaluation_prompt = compose_test_prompt(
             evaluation_case=evaluation_case, rubric=self.rubric, include_inputs=self.include_inputs
         )
-        result = await evaluator_agent.
-        return [result]
+        result = await evaluator_agent.invoke_async(evaluation_prompt, structured_output_model=EvaluationOutput)
+        return [cast(EvaluationOutput, result.structured_output)]
strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/conciseness/conciseness_v0.py
@@ -0,0 +1,9 @@
+SYSTEM_PROMPT = """You are evaluating how concise the Assistant's response is.
+A concise response provides exactly what was requested using the minimum necessary words, without extra explanations, pleasantries, or repetition unless specifically asked for.
+
+## Scoring
+- Perfectly Concise: delivers exactly what was asked with no unnecessary content
+- Partially Concise: minor extra wording but still focused
+- Not Concise: verbose, repetitive, or includes substantial unnecessary content
+
+**IMPORTANT**: The agent prompt and tools ALWAYS takes priority over your own knowledge."""
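The accompanying `prompt_templates/conciseness/__init__.py` (11 lines) is not expanded in this diff; all that is visible is that `get_template(version).SYSTEM_PROMPT` is used by `ConcisenessEvaluator`. A purely hypothetical sketch of wiring consistent with that usage:

```python
# Hypothetical wiring only; the real __init__.py may differ. It simply has to return an
# object exposing SYSTEM_PROMPT for a given version string.
from types import SimpleNamespace

_CONCISENESS_V0 = SimpleNamespace(SYSTEM_PROMPT="...the v0 prompt shown above...")
_TEMPLATES = {"v0": _CONCISENESS_V0}


def get_template(version: str) -> SimpleNamespace:
    try:
        return _TEMPLATES[version]
    except KeyError as exc:
        raise ValueError(f"Unknown conciseness template version: {version}") from exc
```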
strands_agents_evals-0.1.5/src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py
@@ -0,0 +1,29 @@
+SYSTEM_PROMPT = """You are an objective judge evaluating the relevance of an AI assistant's response to the user's question. Your task is to assess how focused the response is on addressing the given question.
+
+# Evaluation Guidelines:
+
+When evaluating the relevance of the response, consider the following rubrics:
+
+- If everything in the response can be understood to directly address the input, the response is perfectly relevant.
+- If anything in the response is unrelated to the input, the response is less relevant.
+- Relevance only evaluates whether the response is on topic. Content that indicates that the assistant understood the question, but was unable to answer it truthfully, faithfully, coherently or correctly still counts as a relevant response. Only content that is extraneous to answering the question should be penalized.
+- Duplicate information does not penalize relevance. The response could say the same thing multiple times. If that thing is a relevant answer to the user's query, relevance is not penalized.
+
+# Rating Scale:
+
+1. Not At All
+- No part of the response is relevant to the question
+
+2. Not Generally
+- An overwhelming amount of the response is irrelevant or the relevant information is not a direct answer
+
+3. Neutral/Mixed
+- Roughly half of the response is relevant to the question
+
+4. Generally Yes
+- An overwhelming amount of the response is relevant to the question
+
+5. Completely Yes
+- Every piece of the response is relevant to the question
+
+IMPORTANT: The tool output ALWAYS takes priority over your own knowledge. Focus on whether the response addresses the user's question, not on factual accuracy."""