strands-agents-evals 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/pypi-publish-on-release.yml +2 -2
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/PKG-INFO +72 -3
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/README.md +71 -2
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/pyproject.toml +2 -1
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/__init__.py +0 -2
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/experiment.py +21 -14
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/tools_use_extractor.py +6 -2
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_tools_use_extractor.py +69 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/test_experiment.py +36 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/test_integration.py +42 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/dependabot.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/integration-test.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/pr-and-push.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/test-lint.yml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.gitignore +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.pre-commit-config.yaml +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/CODE_OF_CONDUCT.md +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/CONTRIBUTING.md +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/LICENSE +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/NOTICE +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/STYLE_GUIDE.md +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/case.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/display/display_console.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/faithfulness_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/goal_success_rate_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/harmfulness_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/helpfulness_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/interactions_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/output_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/case_prompt_template.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/prompt_templates.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/tool_selection_accuracy_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/evaluators/trajectory_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/graph_extractor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/swarm_extractor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/trace_extractor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/experiment_generator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/prompt_template/prompt_templates.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/generators/topic_planner.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/session_mapper.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/mappers/strands_in_memory_session_mapper.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/README.md +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/actor_simulator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/profiles/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/profiles/actor_profile.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/prompt_templates/goal_completion.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/tools/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/simulation/tools/goal_completion.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/_cloudwatch_logger.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/config.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/telemetry/tracer.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/tools/evaluation_tools.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/evaluation.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/evaluation_report.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/simulation/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/simulation/actor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/types/trace.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_faithfulness_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_harmfulness_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_interactions_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_output_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_tool_parameter_accuracy_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_tool_selection_accuracy_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/evaluators/test_trajectory_evaluator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_graph_extractor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_swarm_extractor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_trace_extractor.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/generators/test_experiment_generator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/generators/test_topic_planner.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/mappers/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/mappers/test_strands_in_memory_mapper.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/__init__.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/test_actor_simulator.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/simulation/test_goal_completion.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/telemetry/test_config.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/telemetry/test_tracer.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/test_cases.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/tools/test_evaluation_tools.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/types/test_trace.py +0 -0
- {strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests_integ/test_output_evaluator.py +0 -0
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/.github/workflows/pypi-publish-on-release.yml

@@ -52,7 +52,7 @@ jobs:
           hatch build
 
       - name: Store the distribution packages
-        uses: actions/upload-artifact@
+        uses: actions/upload-artifact@v6
         with:
          name: python-package-distributions
          path: dist/
@@ -74,7 +74,7 @@ jobs:
 
    steps:
      - name: Download all the dists
-        uses: actions/download-artifact@
+        uses: actions/download-artifact@v7
        with:
          name: python-package-distributions
          path: dist/
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: strands-agents-evals
-Version: 0.1.
+Version: 0.1.2
 Summary: Evaluation framework for Strands
 Author-email: AWS <opensource@amazon.com>
 License: Apache-2.0
@@ -68,6 +68,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -226,6 +227,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -388,8 +456,9 @@ reports[0].run_display()  # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/README.md

@@ -36,6 +36,7 @@ Strands Evaluation is a powerful framework for evaluating AI agents and LLM appl
 ## Feature Overview
 
 - **Multiple Evaluation Types**: Output evaluation, trajectory analysis, tool usage assessment, and interaction evaluation
+- **Dynamic Simulators**: Multi-turn conversation simulation with realistic user behavior and goal-oriented interactions
 - **LLM-as-a-Judge**: Built-in evaluators using language models for sophisticated assessment with structured scoring
 - **Trace-based Evaluation**: Analyze agent behavior through OpenTelemetry execution traces
 - **Automated Experiment Generation**: Generate comprehensive test suites from context descriptions
@@ -194,6 +195,73 @@ reports = experiment.run_evaluations(user_task_function)
 reports[0].run_display()
 ```
 
+### Multi-turn Conversation Simulation
+
+Simulate realistic user interactions with dynamic, goal-oriented conversations using ActorSimulator:
+
+```python
+from strands import Agent
+from strands_evals import Case, Experiment, ActorSimulator
+from strands_evals.evaluators import HelpfulnessEvaluator, GoalSuccessRateEvaluator
+from strands_evals.mappers import StrandsInMemorySessionMapper
+from strands_evals.telemetry import StrandsEvalsTelemetry
+
+# Setup telemetry
+telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
+memory_exporter = telemetry.in_memory_exporter
+
+def task_function(case: Case) -> dict:
+    # Create simulator to drive conversation
+    simulator = ActorSimulator.from_case_for_user_simulator(
+        case=case,
+        max_turns=10
+    )
+
+    # Create agent to evaluate
+    agent = Agent(
+        trace_attributes={
+            "gen_ai.conversation.id": case.session_id,
+            "session.id": case.session_id
+        },
+        callback_handler=None
+    )
+
+    # Run multi-turn conversation
+    all_spans = []
+    user_message = case.input
+
+    while simulator.has_next():
+        memory_exporter.clear()
+        agent_response = agent(user_message)
+        turn_spans = list(memory_exporter.get_finished_spans())
+        all_spans.extend(turn_spans)
+
+        user_result = simulator.act(str(agent_response))
+        user_message = str(user_result.structured_output.message)
+
+    # Map to session for evaluation
+    mapper = StrandsInMemorySessionMapper()
+    session = mapper.map_to_session(all_spans, session_id=case.session_id)
+
+    return {"output": str(agent_response), "trajectory": session}
+
+# Use evaluators to assess simulated conversations
+evaluators = [
+    HelpfulnessEvaluator(),
+    GoalSuccessRateEvaluator()
+]
+
+experiment = Experiment(cases=test_cases, evaluators=evaluators)
+reports = experiment.run_evaluations(task_function)
+```
+
+**Key Benefits:**
+- **Dynamic Interactions**: Simulator adapts responses based on agent behavior
+- **Goal-Oriented Testing**: Verify agents can complete user objectives through dialogue
+- **Realistic Conversations**: Generate authentic multi-turn interaction patterns
+- **No Predefined Scripts**: Test agents without hardcoded conversation paths
+- **Comprehensive Evaluation**: Combine with trace-based evaluators for full assessment
+
 ### Automated Experiment Generation
 
 Generate comprehensive test suites automatically from context descriptions:
@@ -356,8 +424,9 @@ reports[0].run_display()  # Interactive display with metrics breakdown
 
 For detailed guidance & examples, explore our documentation:
 
-- [User Guide](https://strandsagents.com/latest
-- [Evaluator Reference](https://strandsagents.com/latest/user-guide/evals-sdk/evaluators/)
+- [User Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/quickstart/)
+- [Evaluator Reference](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/evaluators/)
+- [Simulators Guide](https://strandsagents.com/latest/documentation/docs/user-guide/evals-sdk/simulators/)
 
 ## Contributing ❤️
 
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/pyproject.toml

@@ -138,7 +138,8 @@ disable_error_code = [
 disallow_untyped_decorators = false
 
 [tool.hatch.version]
-
+source = "vcs"  # Use git tags for versioning
+
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 testpaths = ["tests"]
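The `[tool.hatch.version]` table now declares `source = "vcs"`, so the build derives the package version from git tags (which is how this release surfaces as 0.1.2 in PKG-INFO) rather than from a hard-coded string; in Hatch this version source is typically supplied by the hatch-vcs plugin. As a minimal sketch, a consumer can confirm the resolved version of an installed copy with only the standard library; the distribution name is taken from the PKG-INFO diff above:

```python
# Minimal sketch: check which version of the package is actually installed.
# Assumes the distribution name "strands-agents-evals" from the PKG-INFO diff.
from importlib.metadata import PackageNotFoundError, version

try:
    print(version("strands-agents-evals"))  # e.g. "0.1.2" for this release
except PackageNotFoundError:
    print("strands-agents-evals is not installed in this environment")
```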
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/experiment.py

@@ -391,8 +391,8 @@ class Experiment(Generic[InputT, OutputT]):
                     "gen_ai.evaluation.case.input": serialize(case.input),
                 },
             ) as case_span:
+                # Task execution span - execute once
                 try:
-                    # Task execution span - execute once
                     with self._tracer.start_as_current_span(
                         "task_execution",
                         attributes={
@@ -414,9 +414,21 @@ class Experiment(Generic[InputT, OutputT]):
                         ),
                     }
                 )
-
-
+                except Exception as e:
+                    case_span.record_exception(e)
                     for evaluator in self._evaluators:
+                        eval_name = evaluator.get_type_name()
+                        evaluator_data[eval_name]["cases"].append(case.model_dump())
+                        evaluator_data[eval_name]["test_passes"].append(False)
+                        evaluator_data[eval_name]["scores"].append(0)
+                        evaluator_data[eval_name]["reasons"].append(f"Task execution error: {str(e)}")
+                        evaluator_data[eval_name]["detailed_results"].append([])
+                    continue
+
+                # Evaluate with each evaluator using the same task output
+                for evaluator in self._evaluators:
+                    eval_name = evaluator.get_type_name()
+                    try:
                         with self._tracer.start_as_current_span(
                             f"evaluator {evaluator.get_type_name()}",
                             attributes={
@@ -436,21 +448,16 @@ class Experiment(Generic[InputT, OutputT]):
                             }
                         )
 
-                        eval_name = evaluator.get_type_name()
                         evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
                         evaluator_data[eval_name]["test_passes"].append(aggregate_pass)
                         evaluator_data[eval_name]["scores"].append(aggregate_score)
                         evaluator_data[eval_name]["reasons"].append(aggregate_reason or "")
                         evaluator_data[eval_name]["detailed_results"].append(evaluation_outputs)
-
-
-                        case_span.record_exception(e)
-                        for evaluator in self._evaluators:
-                            eval_name = evaluator.get_type_name()
-                            evaluator_data[eval_name]["cases"].append(case.model_dump())
+                    except Exception as e:
+                        evaluator_data[eval_name]["cases"].append(evaluation_context.model_dump())
                         evaluator_data[eval_name]["test_passes"].append(False)
                         evaluator_data[eval_name]["scores"].append(0)
-                        evaluator_data[eval_name]["reasons"].append(f"
+                        evaluator_data[eval_name]["reasons"].append(f"Evaluator error: {str(e)}")
                         evaluator_data[eval_name]["detailed_results"].append([])
 
         reports = []
@@ -577,8 +584,8 @@ class Experiment(Generic[InputT, OutputT]):
 
         file_path.parent.mkdir(parents=True, exist_ok=True)
 
-        with open(file_path, "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
     @classmethod
     def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None = None):
@@ -646,7 +653,7 @@ class Experiment(Generic[InputT, OutputT]):
                 f"Only .json format is supported. Got file: {path}. Please provide a path with .json extension."
             )
 
-        with open(file_path, "r") as f:
+        with open(file_path, "r", encoding="utf-8") as f:
             data = json.load(f)
 
         return cls.from_dict(data, custom_evaluators)
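Two things change in `experiment.py`: task and evaluator failures are now recorded per evaluator (score 0 and an explanatory reason) instead of escaping the case loop, and report files are written and read with an explicit UTF-8 encoding and `ensure_ascii=False`. The I/O change matters whenever case inputs or evaluator reasons contain non-ASCII text; a small standalone sketch with plain `json` (not the `Experiment` API) shows the difference:

```python
import json

# A payload with non-ASCII text, as an evaluation reason or case input might contain.
data = {"reason": "réponse correcte ✓"}

# By default json.dump escapes non-ASCII characters: {"reason": "r\u00e9ponse correcte \u2713"}
print(json.dumps(data))

# With ensure_ascii=False the characters are written literally, which is why the
# file handles in the diff above are opened with an explicit encoding="utf-8".
print(json.dumps(data, ensure_ascii=False))

with open("report.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

with open("report.json", "r", encoding="utf-8") as f:
    assert json.load(f) == data  # round-trips unchanged
```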
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/src/strands_evals/extractors/tools_use_extractor.py

@@ -33,6 +33,7 @@ def extract_agent_tools_used_from_messages(agent_messages):
             tool_id = tool.get("toolUseId")
             # get the tool result from the next message
             tool_result = None
+            is_error = False
             next_message_i = i + 1
             while next_message_i < len(agent_messages):
                 next_message = agent_messages[next_message_i]
@@ -42,13 +43,16 @@ def extract_agent_tools_used_from_messages(agent_messages):
                 content = next_message.get("content")
                 if content:
                     tool_result_dict = content[0].get("toolResult")
-                    if tool_result_dict.get("toolUseId") == tool_id:
+                    if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
                         tool_result_content = tool_result_dict.get("content", [])
                         if len(tool_result_content) > 0:
                             tool_result = tool_result_content[0].get("text")
+                        is_error = tool_result_dict.get("status") == "error"
                         break
 
-            tools_used.append(
+            tools_used.append(
+                {"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
+            )
     return tools_used
 
 
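The extractor now guards against user messages whose content carries no `toolResult` entry and records whether a tool call failed via a new `is_error` flag. For reference, a short usage sketch that mirrors the message shapes used in the tests below; the import path is the one the integration test uses, and the `toolUse`/`toolResult` structure is the one shown in this diff:

```python
from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages

# An assistant turn that calls a tool, followed by a user turn carrying the matching result.
messages = [
    {"role": "assistant", "content": [
        {"toolUse": {"toolUseId": "t1", "name": "calculator", "input": {"expression": "2+2"}}},
    ]},
    {"role": "user", "content": [
        {"toolResult": {"status": "success", "content": [{"text": "Result: 4"}], "toolUseId": "t1"}},
    ]},
]

tools_used = extract_agent_tools_used_from_messages(messages)
# Each entry now carries the error flag added in this release, e.g.:
# [{"name": "calculator", "input": {"expression": "2+2"},
#   "tool_result": "Result: 4", "is_error": False}]
print(tools_used)
```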
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/extractors/test_tools_use_extractor.py

@@ -45,6 +45,7 @@ def test_tools_use_extractor_extract_from_messages_with_tools():
     assert result[0]["name"] == "calculator"
     assert result[0]["input"] == {"expression": "2+2"}
     assert result[0]["tool_result"] == "Result: 4"
+    assert result[0]["is_error"] is False
 
 
 def test_tools_use_extractor_extract_from_messages_no_tools():
@@ -59,6 +60,38 @@ def test_tools_use_extractor_extract_from_messages_no_tools():
     assert result == []
 
 
+def test_tools_use_extractor_extract_from_messages_with_error():
+    """Test extracting tool usage from messages with error status"""
+    messages = [
+        {"role": "user", "content": [{"text": "Calculate invalid"}]},
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool_123", "name": "calculator", "input": {"expression": "invalid"}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "toolResult": {
+                        "status": "error",
+                        "content": [{"text": "Invalid expression"}],
+                        "toolUseId": "tool_123",
+                    }
+                }
+            ],
+        },
+    ]
+
+    result = extract_agent_tools_used_from_messages(messages)
+
+    assert len(result) == 1
+    assert result[0]["name"] == "calculator"
+    assert result[0]["tool_result"] == "Invalid expression"
+    assert result[0]["is_error"] is True
+
+
 def test_tools_use_extractor_extract_from_messages_empty():
     """Test extracting tool usage from empty messages"""
     result = extract_agent_tools_used_from_messages([])
@@ -96,6 +129,7 @@ def test_tools_use_extractor_extract_from_messages_no_tool_result():
     assert result[0]["name"] == "calculator"
     assert result[0]["input"] == {"expression": "2+2"}
     assert result[0]["tool_result"] is None
+    assert result[0]["is_error"] is False
 
 
 def test_tools_use_extractor_extract_from_messages_malformed_tool_result():
@@ -209,3 +243,38 @@ def test_tools_use_extractor_extract_tools_description_empty():
     result = extract_tools_description(mock_agent, is_short=True)
 
     assert result == {}
+
+
+def test_tools_use_extractor_extract_from_messages_user_message_without_tool_result():
+    """Test extracting tool usage when user message content lacks toolResult key."""
+    messages = [
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool_abc", "name": "calculator", "input": {"expression": "5+5"}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [{"text": "Some user text without toolResult"}],  # No toolResult key
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "toolResult": {
+                        "status": "success",
+                        "content": [{"text": "Result: 10"}],
+                        "toolUseId": "tool_abc",
+                    }
+                }
+            ],
+        },
+    ]
+    result = extract_agent_tools_used_from_messages(messages)
+
+    assert len(result) == 1
+    assert result[0]["name"] == "calculator"
+    assert result[0]["input"] == {"expression": "5+5"}
+    assert result[0]["tool_result"] == "Result: 10"
+    assert result[0]["is_error"] is False
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/strands_evals/test_experiment.py
RENAMED

@@ -34,6 +34,16 @@ class MockEvaluator2(Evaluator[str, str]):
         return [EvaluationOutput(score=0.5, test_pass=True, reason="Async test evaluation 2")]
 
 
+class ThrowingEvaluator(Evaluator[str, str]):
+    """Evaluator that always throws an exception - used to test error isolation"""
+
+    def evaluate(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
+        raise RuntimeError("Evaluator exploded")
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
+        raise RuntimeError("Async evaluator exploded")
+
+
 @pytest.fixture
 def mock_evaluator():
     return MockEvaluator()
@@ -1052,3 +1062,29 @@ def test_experiment_run_evaluations_multiple_cases(mock_span, simple_task):
     assert len(reports) == 1
     assert len(reports[0].scores) == 2
     assert all(score == 1.0 for score in reports[0].scores)
+
+
+def test_experiment_run_evaluations_evaluator_error_isolated():
+    """Test that one evaluator failing doesn't affect other evaluators."""
+    case = Case(name="test", input="hello", expected_output="hello")
+
+    # MockEvaluator succeeds, ThrowingEvaluator fails
+    experiment = Experiment(cases=[case], evaluators=[MockEvaluator(), ThrowingEvaluator()])
+
+    def echo_task(c):
+        return c.input
+
+    reports = experiment.run_evaluations(echo_task)
+
+    assert len(reports) == 2
+
+    # First evaluator (MockEvaluator) should succeed
+    assert reports[0].scores[0] == 1.0
+    assert reports[0].test_passes[0] is True
+    assert reports[0].reasons[0] == "Mock evaluation"
+
+    # Second evaluator (ThrowingEvaluator) should fail with error message
+    assert reports[1].scores[0] == 0
+    assert reports[1].test_passes[0] is False
+    assert "Evaluator error" in reports[1].reasons[0]
+    assert "Evaluator exploded" in reports[1].reasons[0]
{strands_agents_evals-0.1.0 → strands_agents_evals-0.1.2}/tests/test_integration.py

@@ -348,3 +348,45 @@ async def test_async_dataset_with_interactions(interaction_case):
     assert len(report.cases) == 1
     assert report.cases[0].get("actual_interactions") is not None
     assert len(report.cases[0].get("actual_interactions")) == 2
+
+
+def test_integration_tool_error_extraction():
+    """Test that is_error field is correctly extracted from tool execution"""
+    from strands_evals.extractors.tools_use_extractor import extract_agent_tools_used_from_messages
+
+    # Create mock messages simulating tool success and error
+    messages = [
+        {"role": "user", "content": [{"text": "test"}]},
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool1", "name": "success_tool", "input": {}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"toolResult": {"status": "success", "content": [{"text": "ok"}], "toolUseId": "tool1"}},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [
+                {"toolUse": {"toolUseId": "tool2", "name": "error_tool", "input": {}}},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"toolResult": {"status": "error", "content": [{"text": "failed"}], "toolUseId": "tool2"}},
+            ],
+        },
+    ]
+
+    tools_used = extract_agent_tools_used_from_messages(messages)
+
+    assert len(tools_used) == 2
+    assert tools_used[0]["name"] == "success_tool"
+    assert tools_used[0]["is_error"] is False
+    assert tools_used[1]["name"] == "error_tool"
+    assert tools_used[1]["is_error"] is True