strands-agents-evals 0.1.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
- strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
- strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
- strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
- strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
- strands_evals/__init__.py +22 -0
- strands_evals/case.py +53 -0
- strands_evals/display/display_console.py +150 -0
- strands_evals/evaluators/__init__.py +23 -0
- strands_evals/evaluators/evaluator.py +182 -0
- strands_evals/evaluators/faithfulness_evaluator.py +116 -0
- strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
- strands_evals/evaluators/harmfulness_evaluator.py +135 -0
- strands_evals/evaluators/helpfulness_evaluator.py +148 -0
- strands_evals/evaluators/interactions_evaluator.py +244 -0
- strands_evals/evaluators/output_evaluator.py +72 -0
- strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
- strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
- strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
- strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
- strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
- strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
- strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
- strands_evals/evaluators/trajectory_evaluator.py +100 -0
- strands_evals/experiment.py +652 -0
- strands_evals/extractors/__init__.py +3 -0
- strands_evals/extractors/graph_extractor.py +30 -0
- strands_evals/extractors/swarm_extractor.py +73 -0
- strands_evals/extractors/tools_use_extractor.py +164 -0
- strands_evals/extractors/trace_extractor.py +166 -0
- strands_evals/generators/__init__.py +3 -0
- strands_evals/generators/experiment_generator.py +498 -0
- strands_evals/generators/prompt_template/prompt_templates.py +75 -0
- strands_evals/generators/topic_planner.py +60 -0
- strands_evals/mappers/__init__.py +6 -0
- strands_evals/mappers/session_mapper.py +27 -0
- strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
- strands_evals/simulation/README.md +323 -0
- strands_evals/simulation/__init__.py +6 -0
- strands_evals/simulation/actor_simulator.py +292 -0
- strands_evals/simulation/profiles/__init__.py +5 -0
- strands_evals/simulation/profiles/actor_profile.py +26 -0
- strands_evals/simulation/prompt_templates/__init__.py +11 -0
- strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
- strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
- strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
- strands_evals/simulation/tools/__init__.py +5 -0
- strands_evals/simulation/tools/goal_completion.py +93 -0
- strands_evals/telemetry/__init__.py +15 -0
- strands_evals/telemetry/_cloudwatch_logger.py +209 -0
- strands_evals/telemetry/config.py +207 -0
- strands_evals/telemetry/tracer.py +38 -0
- strands_evals/tools/evaluation_tools.py +67 -0
- strands_evals/types/__init__.py +11 -0
- strands_evals/types/evaluation.py +105 -0
- strands_evals/types/evaluation_report.py +244 -0
- strands_evals/types/simulation/__init__.py +5 -0
- strands_evals/types/simulation/actor.py +34 -0
- strands_evals/types/trace.py +205 -0

strands_evals/simulation/prompt_templates/actor_system_prompt.py
@@ -0,0 +1,64 @@
+"""
+Default system prompt for actor simulation.
+
+This module contains the default system prompt that configures the actor's behavior,
+communication style, and response protocols for realistic conversation simulation.
+"""
+
+from textwrap import dedent
+
+DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE = dedent("""## User Simulation
+
+Core Identity:
+- You are simulating a user seeking assistance from an AI assistant
+- You speak in first person only
+- You strictly follow your defined User Goal and User Profile throughout the conversation
+
+## User Profile
+{actor_profile}
+
+
+Response Protocols:
+When assistant requests information:
+- Provide brief, specific information
+- Maximum 2-3 sentences
+
+When assistant provides solutions/answers:
+- Ask follow-ups, seek clarification, or express satisfaction. Do not deviate from the User Goal.
+- While following up, do not increase the conversation scope beyond your User Goal.
+
+Communication Rules:
+1. STRICT maximum response length: 2-3 sentences
+2. You are seeking help, NOT providing help - never give solutions!
+3. Maintain your user profile and expertise level consistently
+4. Express more of your user profile - let your background, expertise level, and personality
+   shine through in your responses
+5. Don't break character by mentioning "assistant" or "AI" explicitly
+6. Address AI assistant responses in second person ("Your suggestion..." not "The assistant's suggestion...")
+7. Do not explicitly mention conversation redirection
+8. Never include meta-references or self-instructions in your responses. These reveal you
+   are a simulator and are not how a real human would communicate. Don't write phrases like:
+   - I need to respond as the user would ...
+   - As the simulated user, I should ...
+   - Here's how the user might respond ...
+   - Based on my user goal, I need to ...
+9. Follow the Exit Conditions strictly to stick to your User Goal.
+10. Use all relevant tools first to ground your responses, and then respond
+
+Exit Conditions:
+1. Use the get_conversation_goal_completion tool to check if your User Goal is met. When your User Goal is met:
+   - Just generate "<stop/>" to terminate the conversation
+2. If the conversation becomes unproductive or unsafe:
+   - Naturally steer back towards your User Goal
+   - If this becomes impossible, just generate: "<stop/>" to terminate the conversation
+
+CRITICAL BEHAVIORAL CONSTRAINTS:
+- You are ONLY a user seeking assistance, NEVER the one providing assistance.
+- NEVER generate comprehensive responses, detailed plans, or extensive information.
+- NEVER solve problems yourself - that's the assistant's job. Under no circumstances
+  may you use your tools to solve your user goal/sub-goals.
+- If you find yourself writing more than 3 sentences, you're doing it wrong.
+- Generate only "<stop/>" to terminate the conversation
+
+Response Format:
+Generate ONLY the next SHORT message (1-3 sentences). No explanations, no solutions, no comprehensive information.""")
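
The template's only placeholder is {actor_profile}, so turning it into a simulator system prompt is a single str.format call. A minimal sketch (the profile string below is invented for illustration; in the package the profile text comes from the actor_profile.py module under simulation/profiles/):

from strands_evals.simulation.prompt_templates.actor_system_prompt import (
    DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE,
)

# Hypothetical profile text standing in for a rendered actor profile.
profile = "Retired teacher, low tech confidence. Goal: recover a locked email account."
system_prompt = DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE.format(actor_profile=profile)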

strands_evals/simulation/prompt_templates/goal_completion.py
@@ -0,0 +1,27 @@
+"""
+Goal completion assessment prompt template for actor simulation.
+
+This module contains the prompt template used to evaluate whether a conversation
+has successfully achieved the actor's initial goals using a 3-point assessment scale.
+"""
+
+from textwrap import dedent
+
+GOAL_COMPLETION_PROMPT = dedent(
+    """Please evaluate the following conversation against its intended goals using this
+3-point assessment scale:
+
+1 = Does not meet the goal at all
+2 = Partially meets the goal with significant gaps
+3 = Fully meets the goal
+
+Initial Goal:
+{initial_goal}
+
+Conversation to evaluate:
+{conversation}
+
+Please provide:
+- A score (1-3)
+- A brief one-line justification"""
+)
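
GOAL_COMPLETION_PROMPT has two placeholders, {initial_goal} and {conversation}, and the tool in simulation/tools/goal_completion.py fills both with plain strings. A minimal sketch of that call, with invented values:

from strands_evals.simulation.prompt_templates.goal_completion import GOAL_COMPLETION_PROMPT

prompt = GOAL_COMPLETION_PROMPT.format(
    initial_goal="Recover access to a locked email account",  # hypothetical goal
    conversation="USER: I'm locked out.\n\nASSISTANT: Let's reset your password.",
)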

strands_evals/simulation/tools/goal_completion.py
@@ -0,0 +1,93 @@
+import logging
+
+from strands import Agent, tool
+from typing_extensions import Any
+
+from strands_evals.simulation.prompt_templates.goal_completion import GOAL_COMPLETION_PROMPT
+
+logger = logging.getLogger(__name__)
+
+
+@tool
+def get_conversation_goal_completion(initial_goal: str, conversation: list[dict[str, str]]) -> str:
+    """
+    Evaluate conversation goal completion using a 3-point assessment scale.
+
+    Analyzes the conversation against the actor's initial goal and provides a score
+    with justification.
+
+    Args:
+        initial_goal: The actor's original goal or objective.
+        conversation: List of conversation turns, each with 'role' and 'content' keys.
+
+    Returns:
+        Assessment string with score (1-3) and brief justification.
+
+    Raises:
+        ValueError: If the conversation format is invalid.
+    """
+    # Format conversation for the prompt
+    conversation_text = _format_conversation_for_assessment(conversation)
+
+    # Create the assessment prompt
+    prompt = GOAL_COMPLETION_PROMPT.format(initial_goal=initial_goal, conversation=conversation_text)
+
+    goal_completion_agent = Agent(callback_handler=None)
+    response = goal_completion_agent(prompt)
+    logger.info("Successfully completed goal completion assessment")
+    return str(response)
+
+
+def _format_conversation_for_assessment(conversation: list[dict[str, Any]]) -> str:
+    """
+    Format conversation history for goal completion assessment.
+
+    Args:
+        conversation: List of conversation turns with 'role' and 'content' keys.
+            Content can be either a string or a list of content blocks.
+
+    Returns:
+        Formatted conversation string with turns separated by blank lines.
+
+    Raises:
+        ValueError: If conversation format is invalid.
+    """
+    try:
+        formatted_turns = []
+
+        for i, turn in enumerate(conversation):
+            if not isinstance(turn, dict):
+                raise ValueError(f"Conversation turn {i} must be a dictionary")
+
+            role = turn.get("role", "").strip()
+            content_raw = turn.get("content", "")
+
+            # Handle both string format and list of content blocks
+            if isinstance(content_raw, str):
+                content = content_raw.strip()
+            elif isinstance(content_raw, list):
+                content_parts = []
+                for block in content_raw:
+                    if isinstance(block, dict) and "text" in block:
+                        content_parts.append(block["text"])
+                content = " ".join(content_parts).strip()
+            else:
+                logger.warning(f"Skipping conversation turn {i} with invalid content type: {type(content_raw)}")
+                continue
+
+            if not role or not content:
+                logger.warning(f"Skipping conversation turn {i} with missing role or content")
+                continue
+
+            formatted_turn = f"{role.upper()}: {content}"
+            formatted_turns.append(formatted_turn)
+
+        if not formatted_turns:
+            raise ValueError("No valid conversation turns found")
+
+        return "\n\n".join(formatted_turns)
+
+    except ValueError:
+        raise
+    except Exception as e:
+        raise ValueError("Error formatting conversation") from e
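
A quick way to see the formatter's behavior is to exercise _format_conversation_for_assessment directly (a private helper, so this is for illustration only, with invented turns). String content and {"text": ...} block lists are both flattened; turns with unsupported content are skipped with a warning:

turns = [
    {"role": "user", "content": "I'm locked out of my email."},
    {"role": "assistant", "content": [{"text": "Let's start with"}, {"text": "a password reset."}]},
    {"role": "assistant", "content": 42},  # skipped: content is neither str nor list
]
print(_format_conversation_for_assessment(turns))
# USER: I'm locked out of my email.
#
# ASSISTANT: Let's start with a password reset.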
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tracing module for strands_evals.
|
|
3
|
+
|
|
4
|
+
This module provides OpenTelemetry-based tracing capabilities for evaluation workflows,
|
|
5
|
+
allowing detailed observability into evaluators, evaluation data, and agent execution.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .config import StrandsEvalsTelemetry
|
|
9
|
+
from .tracer import get_tracer, serialize
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"StrandsEvalsTelemetry",
|
|
13
|
+
"get_tracer",
|
|
14
|
+
"serialize",
|
|
15
|
+
]
|
|
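
Everything listed in __all__ is importable from the package-level telemetry namespace, e.g.:

from strands_evals.telemetry import StrandsEvalsTelemetry, get_tracer, serialize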

strands_evals/telemetry/_cloudwatch_logger.py
@@ -0,0 +1,209 @@
+"""
+CloudWatch logging utilities for sending evaluation results directly to CloudWatch Logs.
+
+NOTE: This module is temporary and will be removed in the future once ADOT integration is complete.
+"""
+
+import json
+import logging
+import os
+import time
+
+import boto3
+
+logger = logging.getLogger(__name__)
+
+# Module-level CloudWatch client (lazy initialization)
+_cloudwatch_client = None
+
+
+def _get_cloudwatch_client():
+    """
+    Get or create the CloudWatch Logs client (singleton pattern).
+
+    Returns:
+        boto3 CloudWatch Logs client
+    """
+    global _cloudwatch_client
+    if _cloudwatch_client is None:
+        region = os.environ.get("AWS_REGION", "us-east-1")
+        _cloudwatch_client = boto3.client("logs", region_name=region)
+    return _cloudwatch_client
+
+
+def _parse_log_config_from_env(config_id: str):
+    """
+    Parse log group and stream from environment variables.
+
+    Args:
+        config_id: The config ID (not used for log group, only for ARN)
+
+    Returns:
+        Tuple of (destination_log_group, log_stream_name, service_name, resource_log_group)
+        - destination_log_group: From EVALUATION_RESULTS_LOG_GROUP env var (actual destination where logs are sent)
+        - resource_log_group: The log group from OTEL_RESOURCE_ATTRIBUTES for resource attributes in EMF
+    """
+    # Parse from OTEL_EXPORTER_OTLP_LOGS_HEADERS
+    logs_headers = os.environ.get("OTEL_EXPORTER_OTLP_LOGS_HEADERS", "")
+    log_stream = "default"
+
+    if logs_headers:
+        for header in logs_headers.split(","):
+            if "=" in header:
+                key, value = header.split("=", 1)
+                if key.strip() == "x-aws-log-stream":
+                    log_stream = value.strip()
+
+    # Destination log group is from EVALUATION_RESULTS_LOG_GROUP environment variable
+    destination_log_group = os.environ.get("EVALUATION_RESULTS_LOG_GROUP", "default_strands_evals_results")
+    destination_log_group = f"/aws/bedrock-agentcore/evaluations/results/{destination_log_group}"
+
+    # Get resource log group from OTEL_RESOURCE_ATTRIBUTES (for EMF resource attributes)
+    resource_log_group = None
+    resource_attrs = os.environ.get("OTEL_RESOURCE_ATTRIBUTES", "")
+    for attr in resource_attrs.split(","):
+        if "=" in attr:
+            key, value = attr.split("=", 1)
+            if key.strip() == "aws.log.group.names":
+                resource_log_group = value.strip()
+
+    # Get service name from OTEL_RESOURCE_ATTRIBUTES
+    service_name = None
+    for attr in resource_attrs.split(","):
+        if "=" in attr:
+            key, value = attr.split("=", 1)
+            if key.strip() == "service.name":
+                service_name = value.strip()
+                break
+
+    if not service_name:
+        raise ValueError("service.name must be set in OTEL_RESOURCE_ATTRIBUTES environment variable")
+
+    return destination_log_group, log_stream, service_name, resource_log_group
+
+
+def _send_to_cloudwatch(
+    message: str,
+    log_data: dict,
+    trace_id: str,
+    evaluator_name: str,
+    score: float,
+    config_id: str,
+    label: str = "",
+):
+    """
+    Send log event directly to CloudWatch Logs using boto3 in EMF format.
+
+    Args:
+        message: The log message
+        log_data: Dictionary containing the log event data
+        trace_id: The OpenTelemetry trace ID
+        evaluator_name: The name of the evaluator (e.g., "Custom.HelpfulnessEvaluator")
+        score: The evaluation score
+        config_id: The config ID for log group path
+        label: The evaluation label (optional)
+    """
+    try:
+        destination_log_group, log_stream, service_name, resource_log_group = _parse_log_config_from_env(config_id)
+        if not destination_log_group:
+            logger.warning("No destination log group configured, skipping CloudWatch logging")
+            return
+
+        # Get the singleton CloudWatch client
+        cloudwatch_client = _get_cloudwatch_client()
+
+        # Ensure destination log group exists
+        try:
+            cloudwatch_client.create_log_group(logGroupName=destination_log_group)
+        except cloudwatch_client.exceptions.ResourceAlreadyExistsException:
+            pass
+        except Exception as e:
+            logger.warning(f"Failed to create log group: {str(e)}")
+
+        # Ensure log stream exists
+        try:
+            cloudwatch_client.create_log_stream(logGroupName=destination_log_group, logStreamName=log_stream)
+        except cloudwatch_client.exceptions.ResourceAlreadyExistsException:
+            pass
+        except Exception as e:
+            logger.warning(f"Failed to create log stream: {str(e)}")
+
+        # Get sequence token for the log stream
+        sequence_token = None
+        try:
+            response = cloudwatch_client.describe_log_streams(
+                logGroupName=destination_log_group, logStreamNamePrefix=log_stream
+            )
+            if response["logStreams"]:
+                sequence_token = response["logStreams"][0].get("uploadSequenceToken")
+        except Exception as e:
+            logger.warning(f"Failed to get sequence token: {str(e)}")
+
+        # Get current timestamp
+        current_time_ns = time.time_ns()
+        current_time_ms = int(current_time_ns / 1_000_000)
+
+        # Extract online evaluation config ID from ARN
+        online_eval_config_id = ""
+        if "aws.bedrock_agentcore.online_evaluation_config.arn" in log_data:
+            arn = log_data["aws.bedrock_agentcore.online_evaluation_config.arn"]
+            if "/" in arn:
+                online_eval_config_id = arn.split("/")[-1]
+
+        emf_log = {
+            "resource": {
+                "attributes": {
+                    "aws.local.service": service_name,
+                    "aws.service.type": "gen_ai_agent",
+                    "telemetry.sdk.language": "python",
+                    "service.name": service_name,
+                    "aws.log.group.names": resource_log_group if resource_log_group else destination_log_group,
+                }
+            },
+            "timeUnixNano": current_time_ns,
+            "observedTimeUnixNano": current_time_ns,
+            "severityText": "INFO",
+            "body": message,
+            "attributes": {
+                "otelServiceName": service_name,
+                "otelTraceID": trace_id,
+                "otelTraceSampled": True,
+                **log_data,
+            },
+            "flags": 1,
+            "traceId": trace_id,
+            "onlineEvaluationConfigId": online_eval_config_id,
+            evaluator_name: score,
+        }
+
+        # Add label if provided
+        emf_log["label"] = label or "YES"
+        emf_log["service.name"] = service_name
+
+        # Add EMF metadata for CloudWatch metrics
+        emf_log["_aws"] = {
+            "Timestamp": current_time_ms,
+            "CloudWatchMetrics": [
+                {
+                    "Namespace": "Bedrock-AgentCore/Evaluations",
+                    "Dimensions": [
+                        ["service.name"],
+                        ["label", "service.name"],
+                        ["service.name", "onlineEvaluationConfigId"],
+                        ["label", "service.name", "onlineEvaluationConfigId"],
+                    ],
+                    "Metrics": [{"Name": evaluator_name, "Unit": "None"}],
+                }
+            ],
+        }
+
+        log_event = {"timestamp": current_time_ms, "message": json.dumps(emf_log)}
+        put_log_params = {"logGroupName": destination_log_group, "logStreamName": log_stream, "logEvents": [log_event]}
+
+        if sequence_token:
+            put_log_params["sequenceToken"] = sequence_token
+
+        cloudwatch_client.put_log_events(**put_log_params)
+
+    except Exception as e:
+        logger.warning(f"Failed to send log to CloudWatch: {str(e)}")
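
The module is driven entirely by environment variables. A sketch of a configuration it would accept (all values hypothetical), which can be checked against _parse_log_config_from_env above:

import os

# service.name is mandatory; aws.log.group.names feeds the EMF resource attributes.
os.environ["OTEL_RESOURCE_ATTRIBUTES"] = "service.name=my-agent,aws.log.group.names=/my/agent/logs"
# Optional log stream name, parsed from the x-aws-log-stream header.
os.environ["OTEL_EXPORTER_OTLP_LOGS_HEADERS"] = "x-aws-log-stream=eval-run-01"
# Suffix for the destination group; resolves to
# /aws/bedrock-agentcore/evaluations/results/my-eval-suite
os.environ["EVALUATION_RESULTS_LOG_GROUP"] = "my-eval-suite"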

strands_evals/telemetry/config.py
@@ -0,0 +1,207 @@
+"""OpenTelemetry configuration and setup utilities for strands_evals.
+
+This module provides centralized configuration and initialization functionality
+for OpenTelemetry tracing in evaluation workflows.
+"""
+
+import logging
+from typing import Any
+
+import opentelemetry.trace as trace_api
+from opentelemetry import propagate
+from opentelemetry.baggage.propagation import W3CBaggagePropagator
+from opentelemetry.propagators.composite import CompositePropagator
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+from opentelemetry.trace import TracerProvider
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+
+logger = logging.getLogger(__name__)
+
+
+def get_otel_resource() -> Resource:
+    """Create a standard OpenTelemetry resource with service information.
+
+    Returns:
+        Resource object with standard service information.
+    """
+    resource = Resource.create(
+        {
+            "service.name": "strands-evals",
+            "service.version": "0.1.0",
+            "telemetry.sdk.name": "opentelemetry",
+            "telemetry.sdk.language": "python",
+        }
+    )
+
+    return resource
+
+
+class StrandsEvalsTelemetry:
+    """OpenTelemetry configuration and setup for strands_evals.
+
+    Automatically initializes a tracer provider with text map propagators.
+    Trace exporters (console, OTLP) can be set up individually using dedicated methods
+    that support method chaining for convenient configuration.
+
+    Args:
+        tracer_provider: Optional pre-configured SDKTracerProvider. If None,
+            a new one will be created and set as the global tracer provider.
+
+    Environment Variables:
+        Environment variables are handled by the underlying OpenTelemetry SDK:
+        - OTEL_EXPORTER_OTLP_ENDPOINT: OTLP endpoint URL
+        - OTEL_EXPORTER_OTLP_HEADERS: Headers for OTLP requests
+
+    Examples:
+        Quick setup with method chaining:
+        >>> StrandsEvalsTelemetry().setup_console_exporter().setup_otlp_exporter()
+
+        Using a custom tracer provider:
+        >>> StrandsEvalsTelemetry(tracer_provider=my_provider).setup_console_exporter()
+
+        Step-by-step configuration:
+        >>> telemetry = StrandsEvalsTelemetry()
+        >>> telemetry.setup_console_exporter()
+        >>> telemetry.setup_otlp_exporter()
+
+    Note:
+        - The tracer provider is automatically initialized upon instantiation
+        - When no tracer_provider is provided, the instance sets itself as the global provider
+        - Exporters must be explicitly configured using the setup methods
+        - Failed exporter configurations are logged but do not raise exceptions
+        - All setup methods return self to enable method chaining
+    """
+
+    def __init__(
+        self,
+        tracer_provider: SDKTracerProvider | None = None,
+    ) -> None:
+        """Initialize the StrandsEvalsTelemetry instance.
+
+        Args:
+            tracer_provider: Optional pre-configured tracer provider.
+                If None, a new one will be created and set as global.
+
+        The instance is ready to use immediately after initialization, though
+        trace exporters must be configured separately using the setup methods.
+        """
+        self.resource = get_otel_resource()
+        self._in_memory_exporter: InMemorySpanExporter | None = None
+        self.tracer_provider: TracerProvider
+
+        if tracer_provider:
+            self.tracer_provider = tracer_provider
+        else:
+            self._initialize_tracer()
+
+    def _initialize_tracer(self) -> None:
+        """Initialize the OpenTelemetry tracer."""
+        logger.info("Initializing tracer for strands-evals")
+
+        # Create tracer provider
+        tracer_provider = SDKTracerProvider(resource=self.resource)
+
+        # Set as global tracer provider
+        trace_api.set_tracer_provider(tracer_provider)
+
+        # Get the global tracer provider (may be wrapped in a proxy)
+        self.tracer_provider = trace_api.get_tracer_provider()
+
+        # Set up propagators
+        propagate.set_global_textmap(
+            CompositePropagator(
+                [
+                    W3CBaggagePropagator(),
+                    TraceContextTextMapPropagator(),
+                ]
+            )
+        )
+
+    @property
+    def in_memory_exporter(self) -> InMemorySpanExporter:
+        """Get the in-memory exporter instance.
+
+        Returns:
+            The configured in-memory span exporter.
+
+        Raises:
+            RuntimeError: If setup_in_memory_exporter() has not been called.
+        """
+        if self._in_memory_exporter is None:
+            raise RuntimeError(
+                "In-memory exporter is not configured. Call setup_in_memory_exporter() before accessing this property."
+            )
+        return self._in_memory_exporter
+
+    def setup_console_exporter(self, **kwargs: Any) -> "StrandsEvalsTelemetry":
+        """Set up console exporter for the tracer provider.
+
+        Args:
+            **kwargs: Optional keyword arguments passed directly to
+                OpenTelemetry's ConsoleSpanExporter initializer.
+
+        Returns:
+            self: Enables method chaining.
+
+        This method configures a SimpleSpanProcessor with a ConsoleSpanExporter,
+        allowing trace data to be output to the console. Any additional keyword
+        arguments provided will be forwarded to the ConsoleSpanExporter.
+        """
+        try:
+            logger.info("Enabling console export for strands-evals")
+            console_processor = SimpleSpanProcessor(ConsoleSpanExporter(**kwargs))
+            self.tracer_provider.add_span_processor(console_processor)
+        except Exception as e:
+            logger.exception("error=<%s> | Failed to configure console exporter", e)
+        return self
+
+    def setup_in_memory_exporter(self, **kwargs: Any) -> "StrandsEvalsTelemetry":
+        """Set up in-memory exporter for the tracer provider.
+
+        Args:
+            **kwargs: Accepted for signature consistency with the other setup
+                methods; InMemorySpanExporter takes no initializer arguments.
+
+        Returns:
+            self: Enables method chaining.
+
+        This method configures a SimpleSpanProcessor with an InMemorySpanExporter,
+        allowing trace data to be stored in memory for testing and debugging purposes.
+        Unlike the other setup methods, keyword arguments are not forwarded to the exporter.
+        """
+        try:
+            logger.info("Enabling in-memory export for strands-evals")
+            self._in_memory_exporter = InMemorySpanExporter()
+            span_processor = SimpleSpanProcessor(self._in_memory_exporter)
+            self.tracer_provider.add_span_processor(span_processor)
+        except Exception as e:
+            logger.exception("error=<%s> | Failed to configure in-memory exporter", e)
+        return self
+
+    def setup_otlp_exporter(self, **kwargs: Any) -> "StrandsEvalsTelemetry":
+        """Set up OTLP exporter for the tracer provider.
+
+        Args:
+            **kwargs: Optional keyword arguments passed directly to
+                OpenTelemetry's OTLPSpanExporter initializer.
+
+        Returns:
+            self: Enables method chaining.
+
+        This method configures a BatchSpanProcessor with an OTLPSpanExporter,
+        allowing trace data to be exported to an OTLP endpoint. Any additional
+        keyword arguments provided will be forwarded to the OTLPSpanExporter.
+        """
+        from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+
+        try:
+            otlp_exporter = OTLPSpanExporter(**kwargs)
+            batch_processor = BatchSpanProcessor(otlp_exporter)
+            self.tracer_provider.add_span_processor(batch_processor)
+            logger.info("OTLP exporter configured for strands-evals")
+        except Exception as e:
+            logger.exception("error=<%s> | Failed to configure OTLP exporter", e)
+        return self
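
The in-memory exporter pairs naturally with tests: configure it, run a traced workflow, then read the captured spans back. A minimal sketch, using the standard get_finished_spans() accessor on OpenTelemetry's InMemorySpanExporter:

from strands_evals.telemetry import StrandsEvalsTelemetry

telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
# ... run an evaluation or any other traced code here ...
spans = telemetry.in_memory_exporter.get_finished_spans()
print([span.name for span in spans])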

strands_evals/telemetry/tracer.py
@@ -0,0 +1,38 @@
+"""OpenTelemetry tracing for strands_evals.
+
+This module provides a simple way to get the OpenTelemetry tracer
+for evaluation workflows.
+"""
+
+import json
+import logging
+from typing import Any
+
+import opentelemetry.trace as trace_api
+from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
+
+logger = logging.getLogger(__name__)
+
+
+def get_tracer() -> trace_api.Tracer:
+    """Get the OpenTelemetry tracer for strands_evals.
+
+    Returns:
+        OpenTelemetry Tracer instance from the global tracer provider
+    """
+    tracer_provider = trace_api.get_tracer_provider()
+    if not isinstance(tracer_provider, SDKTracerProvider):
+        tracer_provider = trace_api.NoOpTracerProvider()
+    return tracer_provider.get_tracer("strands-evals")
+
+
+def serialize(obj: Any) -> str:
+    """Serialize an object to JSON string.
+
+    Args:
+        obj: The object to serialize
+
+    Returns:
+        JSON string representation
+    """
+    return json.dumps(obj, ensure_ascii=False, default=str)