strands-agents-evals 0.1.0 (strands_agents_evals-0.1.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. strands_agents_evals-0.1.0.dist-info/METADATA +408 -0
  2. strands_agents_evals-0.1.0.dist-info/RECORD +68 -0
  3. strands_agents_evals-0.1.0.dist-info/WHEEL +4 -0
  4. strands_agents_evals-0.1.0.dist-info/licenses/LICENSE +175 -0
  5. strands_agents_evals-0.1.0.dist-info/licenses/NOTICE +1 -0
  6. strands_evals/__init__.py +22 -0
  7. strands_evals/case.py +53 -0
  8. strands_evals/display/display_console.py +150 -0
  9. strands_evals/evaluators/__init__.py +23 -0
  10. strands_evals/evaluators/evaluator.py +182 -0
  11. strands_evals/evaluators/faithfulness_evaluator.py +116 -0
  12. strands_evals/evaluators/goal_success_rate_evaluator.py +90 -0
  13. strands_evals/evaluators/harmfulness_evaluator.py +135 -0
  14. strands_evals/evaluators/helpfulness_evaluator.py +148 -0
  15. strands_evals/evaluators/interactions_evaluator.py +244 -0
  16. strands_evals/evaluators/output_evaluator.py +72 -0
  17. strands_evals/evaluators/prompt_templates/case_prompt_template.py +63 -0
  18. strands_evals/evaluators/prompt_templates/faithfulness/__init__.py +11 -0
  19. strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py +30 -0
  20. strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py +11 -0
  21. strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v0.py +17 -0
  22. strands_evals/evaluators/prompt_templates/harmfulness/__init__.py +11 -0
  23. strands_evals/evaluators/prompt_templates/harmfulness/harmfulness_v0.py +8 -0
  24. strands_evals/evaluators/prompt_templates/helpfulness/__init__.py +11 -0
  25. strands_evals/evaluators/prompt_templates/helpfulness/helpfulness_v0.py +38 -0
  26. strands_evals/evaluators/prompt_templates/prompt_templates.py +176 -0
  27. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/__init__.py +11 -0
  28. strands_evals/evaluators/prompt_templates/tool_parameter_accuracy/tool_parameter_accuracy_v0.py +40 -0
  29. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/__init__.py +11 -0
  30. strands_evals/evaluators/prompt_templates/tool_selection_accuracy/tool_selection_accuracy_v0.py +23 -0
  31. strands_evals/evaluators/tool_parameter_accuracy_evaluator.py +112 -0
  32. strands_evals/evaluators/tool_selection_accuracy_evaluator.py +112 -0
  33. strands_evals/evaluators/trajectory_evaluator.py +100 -0
  34. strands_evals/experiment.py +652 -0
  35. strands_evals/extractors/__init__.py +3 -0
  36. strands_evals/extractors/graph_extractor.py +30 -0
  37. strands_evals/extractors/swarm_extractor.py +73 -0
  38. strands_evals/extractors/tools_use_extractor.py +164 -0
  39. strands_evals/extractors/trace_extractor.py +166 -0
  40. strands_evals/generators/__init__.py +3 -0
  41. strands_evals/generators/experiment_generator.py +498 -0
  42. strands_evals/generators/prompt_template/prompt_templates.py +75 -0
  43. strands_evals/generators/topic_planner.py +60 -0
  44. strands_evals/mappers/__init__.py +6 -0
  45. strands_evals/mappers/session_mapper.py +27 -0
  46. strands_evals/mappers/strands_in_memory_session_mapper.py +473 -0
  47. strands_evals/simulation/README.md +323 -0
  48. strands_evals/simulation/__init__.py +6 -0
  49. strands_evals/simulation/actor_simulator.py +292 -0
  50. strands_evals/simulation/profiles/__init__.py +5 -0
  51. strands_evals/simulation/profiles/actor_profile.py +26 -0
  52. strands_evals/simulation/prompt_templates/__init__.py +11 -0
  53. strands_evals/simulation/prompt_templates/actor_profile_extraction.py +25 -0
  54. strands_evals/simulation/prompt_templates/actor_system_prompt.py +64 -0
  55. strands_evals/simulation/prompt_templates/goal_completion.py +27 -0
  56. strands_evals/simulation/tools/__init__.py +5 -0
  57. strands_evals/simulation/tools/goal_completion.py +93 -0
  58. strands_evals/telemetry/__init__.py +15 -0
  59. strands_evals/telemetry/_cloudwatch_logger.py +209 -0
  60. strands_evals/telemetry/config.py +207 -0
  61. strands_evals/telemetry/tracer.py +38 -0
  62. strands_evals/tools/evaluation_tools.py +67 -0
  63. strands_evals/types/__init__.py +11 -0
  64. strands_evals/types/evaluation.py +105 -0
  65. strands_evals/types/evaluation_report.py +244 -0
  66. strands_evals/types/simulation/__init__.py +5 -0
  67. strands_evals/types/simulation/actor.py +34 -0
  68. strands_evals/types/trace.py +205 -0
strands_evals/simulation/prompt_templates/actor_system_prompt.py
@@ -0,0 +1,64 @@
+ """
+ Default system prompt for actor simulation.
+
+ This module contains the default system prompt that configures the actor's behavior,
+ communication style, and response protocols for realistic conversation simulation.
+ """
+
+ from textwrap import dedent
+
+ DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE = dedent("""## User Simulation
+
+ Core Identity:
+ - You are simulating a user seeking assistance from an AI assistant
+ - You speak in first person only
+ - You strictly follow your defined User Goal and User Profile throughout the conversation
+
+ ## User Profile
+ {actor_profile}
+
+
+ Response Protocols:
+ When assistant requests information:
+ - Provide brief, specific information
+ - Maximum 2-3 sentences
+
+ When assistant provides solutions/answers:
+ - Ask follow-ups, seek clarification, or express satisfaction. Do not deviate from the User Goal.
+ - While following up, do not increase the conversation scope beyond your User Goal.
+
+ Communication Rules:
+ 1. STRICT maximum response length: 2-3 sentences
+ 2. You are seeking help, NOT providing help - never give solutions!
+ 3. Maintain your user profile and expertise level consistently
+ 4. Express more of your user profile - let your background, expertise level, and personality
+ shine through in your responses
+ 5. Don't break character by mentioning "assistant" or "AI" explicitly
+ 6. Address AI assistant responses in second person ("Your suggestion..." not "The assistant's suggestion...")
+ 7. Do not explicitly mention conversation redirection
+ 8. Never include meta-references or self-instructions in your responses. These reveal you
+ are a simulator and are not how a real human would communicate. Don't write phrases like:
+ - I need to respond as the user would ...
+ - As the simulated user, I should ...
+ - Here's how the user might respond ...
+ - Based on my user goal, I need to ...
+ 9. Use the Exit Conditions strictly to stick to the User Goal.
+ 10. Use all relevant tools first to ground your responses, and then respond
+
+ Exit Conditions:
+ 1. Use the get_conversation_goal_completion tool to check if your User Goal is met. When your User Goal is met:
+ - Just generate "<stop/>" to terminate the conversation
+ 2. If the conversation becomes unproductive or unsafe:
+ - Naturally steer back towards your User Goal
+ - If this becomes impossible, just generate "<stop/>" to terminate the conversation
+
+ CRITICAL BEHAVIORAL CONSTRAINTS:
+ - You are ONLY a user seeking assistance, NEVER the one providing assistance.
+ - NEVER generate comprehensive responses, detailed plans, or extensive information.
+ - NEVER solve problems yourself - that's the assistant's job. Under no circumstances
+ can you use your tools to solve your user goal/sub-goals.
+ - If you find yourself writing more than 3 sentences, you're doing it wrong.
+ - Generate only "<stop/>" to terminate the conversation
+
+ Response Format:
+ Generate ONLY the next SHORT message (1-3 sentences). No explanations, no solutions, no comprehensive information.""")
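
Note: the template's only substitution slot is {actor_profile}, so rendering it is a single format call. A minimal sketch, with a hypothetical profile string (real profiles presumably come from strands_evals/simulation/profiles/actor_profile.py):

    from strands_evals.simulation.prompt_templates.actor_system_prompt import (
        DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE,
    )

    # Hypothetical profile text, for illustration only.
    profile = "Retired teacher, low technical expertise, polite but impatient."

    system_prompt = DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE.format(actor_profile=profile)
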
strands_evals/simulation/prompt_templates/goal_completion.py
@@ -0,0 +1,27 @@
+ """
+ Goal completion assessment prompt template for actor simulation.
+
+ This module contains the prompt template used to evaluate whether a conversation
+ has successfully achieved the actor's initial goals using a 3-point assessment scale.
+ """
+
+ from textwrap import dedent
+
+ GOAL_COMPLETION_PROMPT = dedent(
+     """Please evaluate the following conversation against its intended goals using this
+     3-point assessment scale:
+
+     1 = Does not meet the goal at all
+     2 = Partially meets the goal with significant gaps
+     3 = Fully meets the goal
+
+     Initial Goal:
+     {initial_goal}
+
+     Conversation to evaluate:
+     {conversation}
+
+     Please provide:
+     - A score (1-3)
+     - A brief one-line justification"""
+ )
strands_evals/simulation/tools/__init__.py
@@ -0,0 +1,5 @@
+ """Tools for actor simulation."""
+
+ from .goal_completion import get_conversation_goal_completion
+
+ __all__ = ["get_conversation_goal_completion"]
strands_evals/simulation/tools/goal_completion.py
@@ -0,0 +1,93 @@
+ import logging
+
+ from strands import Agent, tool
+ from typing_extensions import Any
+
+ from strands_evals.simulation.prompt_templates.goal_completion import GOAL_COMPLETION_PROMPT
+
+ logger = logging.getLogger(__name__)
+
+
+ @tool
+ def get_conversation_goal_completion(initial_goal: str, conversation: list[dict[str, str]]) -> str:
+     """
+     Evaluate conversation goal completion using a 3-point assessment scale.
+
+     Analyzes the conversation against the actor's initial goal and provides a score
+     with justification.
+
+     Args:
+         initial_goal: The actor's original goal or objective.
+         conversation: List of conversation turns, each with 'role' and 'content' keys.
+
+     Returns:
+         Assessment string with score (1-3) and brief justification.
+
+     Raises:
+         ValueError: If the conversation format is invalid.
+     """
+     # Format conversation for the prompt
+     conversation_text = _format_conversation_for_assessment(conversation)
+
+     # Create the assessment prompt
+     prompt = GOAL_COMPLETION_PROMPT.format(initial_goal=initial_goal, conversation=conversation_text)
+
+     goal_completion_agent = Agent(callback_handler=None)
+     response = goal_completion_agent(prompt)
+     logger.info("Successfully completed goal completion assessment")
+     return str(response)
+
+
+ def _format_conversation_for_assessment(conversation: list[dict[str, Any]]) -> str:
+     """
+     Format conversation history for goal completion assessment.
+
+     Args:
+         conversation: List of conversation turns with 'role' and 'content' keys.
+             Content can be either a string or a list of content blocks.
+
+     Returns:
+         Formatted conversation string with each turn on a separate line.
+
+     Raises:
+         ValueError: If conversation format is invalid.
+     """
+     try:
+         formatted_turns = []
+
+         for i, turn in enumerate(conversation):
+             if not isinstance(turn, dict):
+                 raise ValueError(f"Conversation turn {i} must be a dictionary")
+
+             role = turn.get("role", "").strip()
+             content_raw = turn.get("content", "")
+
+             # Handle both string format and list of content blocks
+             if isinstance(content_raw, str):
+                 content = content_raw.strip()
+             elif isinstance(content_raw, list):
+                 content_parts = []
+                 for block in content_raw:
+                     if isinstance(block, dict) and "text" in block:
+                         content_parts.append(block["text"])
+                 content = " ".join(content_parts).strip()
+             else:
+                 logger.warning(f"Skipping conversation turn {i} with invalid content type: {type(content_raw)}")
+                 continue
+
+             if not role or not content:
+                 logger.warning(f"Skipping conversation turn {i} with missing role or content")
+                 continue
+
+             formatted_turn = f"{role.upper()}: {content}"
+             formatted_turns.append(formatted_turn)
+
+         if not formatted_turns:
+             raise ValueError("No valid conversation turns found")
+
+         return "\n\n".join(formatted_turns)
+
+     except ValueError:
+         raise
+     except Exception as e:
+         raise ValueError("Error formatting conversation") from e
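
Note: a minimal sketch of invoking the tool directly, assuming strands' @tool decorator keeps the wrapped function callable and that credentials for the default Agent model are configured; the two-turn transcript below is hypothetical:

    from strands_evals.simulation.tools.goal_completion import get_conversation_goal_completion

    # Hypothetical conversation in the 'role'/'content' dict shape the tool expects.
    conversation = [
        {"role": "user", "content": "Can you help me reset my home router?"},
        {"role": "assistant", "content": "Hold the reset button for 10 seconds, then reconnect."},
    ]

    result = get_conversation_goal_completion(
        initial_goal="Get instructions for resetting a home router.",
        conversation=conversation,
    )
    print(result)  # expected to contain a 1-3 score plus a one-line justification
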
strands_evals/telemetry/__init__.py
@@ -0,0 +1,15 @@
+ """
+ Tracing module for strands_evals.
+
+ This module provides OpenTelemetry-based tracing capabilities for evaluation workflows,
+ allowing detailed observability into evaluators, evaluation data, and agent execution.
+ """
+
+ from .config import StrandsEvalsTelemetry
+ from .tracer import get_tracer, serialize
+
+ __all__ = [
+     "StrandsEvalsTelemetry",
+     "get_tracer",
+     "serialize",
+ ]
strands_evals/telemetry/_cloudwatch_logger.py
@@ -0,0 +1,209 @@
+ """
+ CloudWatch logging utilities for sending evaluation results directly to CloudWatch Logs.
+
+ NOTE: This module is temporary and will be removed once ADOT integration is complete.
+ """
+
+ import json
+ import logging
+ import os
+ import time
+
+ import boto3
+
+ logger = logging.getLogger(__name__)
+
+ # Module-level CloudWatch client (lazy initialization)
+ _cloudwatch_client = None
+
+
+ def _get_cloudwatch_client():
+     """
+     Get or create the CloudWatch Logs client (singleton pattern).
+
+     Returns:
+         boto3 CloudWatch Logs client
+     """
+     global _cloudwatch_client
+     if _cloudwatch_client is None:
+         region = os.environ.get("AWS_REGION", "us-east-1")
+         _cloudwatch_client = boto3.client("logs", region_name=region)
+     return _cloudwatch_client
+
+
+ def _parse_log_config_from_env(config_id: str):
+     """
+     Parse log group and stream from environment variables.
+
+     Args:
+         config_id: The config ID (not used for the log group, only for the ARN)
+
+     Returns:
+         Tuple of (destination_log_group, log_stream_name, service_name, resource_log_group)
+         - destination_log_group: From the EVALUATION_RESULTS_LOG_GROUP env var (actual destination where logs are sent)
+         - resource_log_group: The log group from OTEL_RESOURCE_ATTRIBUTES, used for resource attributes in EMF
+     """
+     # Parse the log stream name from OTEL_EXPORTER_OTLP_LOGS_HEADERS
+     logs_headers = os.environ.get("OTEL_EXPORTER_OTLP_LOGS_HEADERS", "")
+     log_stream = "default"
+
+     if logs_headers:
+         for header in logs_headers.split(","):
+             if "=" in header:
+                 key, value = header.split("=", 1)
+                 if key.strip() == "x-aws-log-stream":
+                     log_stream = value.strip()
+
+     # The destination log group comes from the EVALUATION_RESULTS_LOG_GROUP environment variable
+     destination_log_group = os.environ.get("EVALUATION_RESULTS_LOG_GROUP", "default_strands_evals_results")
+     destination_log_group = f"/aws/bedrock-agentcore/evaluations/results/{destination_log_group}"
+
+     # Get the resource log group from OTEL_RESOURCE_ATTRIBUTES (for EMF resource attributes)
+     resource_log_group = None
+     resource_attrs = os.environ.get("OTEL_RESOURCE_ATTRIBUTES", "")
+     for attr in resource_attrs.split(","):
+         if "=" in attr:
+             key, value = attr.split("=", 1)
+             if key.strip() == "aws.log.group.names":
+                 resource_log_group = value.strip()
+
+     # Get the service name from OTEL_RESOURCE_ATTRIBUTES
+     service_name = None
+     for attr in resource_attrs.split(","):
+         if "=" in attr:
+             key, value = attr.split("=", 1)
+             if key.strip() == "service.name":
+                 service_name = value.strip()
+                 break
+
+     if not service_name:
+         raise ValueError("service.name must be set in OTEL_RESOURCE_ATTRIBUTES environment variable")
+
+     return destination_log_group, log_stream, service_name, resource_log_group
+
+
+ def _send_to_cloudwatch(
+     message: str,
+     log_data: dict,
+     trace_id: str,
+     evaluator_name: str,
+     score: float,
+     config_id: str,
+     label: str = "",
+ ):
+     """
+     Send a log event directly to CloudWatch Logs using boto3 in EMF format.
+
+     Args:
+         message: The log message
+         log_data: Dictionary containing the log event data
+         trace_id: The OpenTelemetry trace ID
+         evaluator_name: The name of the evaluator (e.g., "Custom.HelpfulnessEvaluator")
+         score: The evaluation score
+         config_id: The config ID for the log group path
+         label: The evaluation label (optional)
+     """
+     try:
+         destination_log_group, log_stream, service_name, resource_log_group = _parse_log_config_from_env(config_id)
+         if not destination_log_group:
+             logger.warning("No destination log group configured, skipping CloudWatch logging")
+             return
+
+         # Get the singleton CloudWatch client
+         cloudwatch_client = _get_cloudwatch_client()
+
+         # Ensure the destination log group exists
+         try:
+             cloudwatch_client.create_log_group(logGroupName=destination_log_group)
+         except cloudwatch_client.exceptions.ResourceAlreadyExistsException:
+             pass
+         except Exception as e:
+             logger.warning(f"Failed to create log group: {str(e)}")
+
+         # Ensure the log stream exists
+         try:
+             cloudwatch_client.create_log_stream(logGroupName=destination_log_group, logStreamName=log_stream)
+         except cloudwatch_client.exceptions.ResourceAlreadyExistsException:
+             pass
+         except Exception as e:
+             logger.warning(f"Failed to create log stream: {str(e)}")
+
+         # Get the sequence token for the log stream
+         sequence_token = None
+         try:
+             response = cloudwatch_client.describe_log_streams(
+                 logGroupName=destination_log_group, logStreamNamePrefix=log_stream
+             )
+             if response["logStreams"]:
+                 sequence_token = response["logStreams"][0].get("uploadSequenceToken")
+         except Exception as e:
+             logger.warning(f"Failed to get sequence token: {str(e)}")
+
+         # Get the current timestamp
+         current_time_ns = time.time_ns()
+         current_time_ms = int(current_time_ns / 1_000_000)
+
+         # Extract the online evaluation config ID from the ARN
+         online_eval_config_id = ""
+         if "aws.bedrock_agentcore.online_evaluation_config.arn" in log_data:
+             arn = log_data["aws.bedrock_agentcore.online_evaluation_config.arn"]
+             if "/" in arn:
+                 online_eval_config_id = arn.split("/")[-1]
+
+         emf_log = {
+             "resource": {
+                 "attributes": {
+                     "aws.local.service": service_name,
+                     "aws.service.type": "gen_ai_agent",
+                     "telemetry.sdk.language": "python",
+                     "service.name": service_name,
+                     "aws.log.group.names": resource_log_group if resource_log_group else destination_log_group,
+                 }
+             },
+             "timeUnixNano": current_time_ns,
+             "observedTimeUnixNano": current_time_ns,
+             "severityText": "INFO",
+             "body": message,
+             "attributes": {
+                 "otelServiceName": service_name,
+                 "otelTraceID": trace_id,
+                 "otelTraceSampled": True,
+                 **log_data,
+             },
+             "flags": 1,
+             "traceId": trace_id,
+             "onlineEvaluationConfigId": online_eval_config_id,
+             evaluator_name: score,
+         }
+
+         # Add the label if provided
+         emf_log["label"] = label or "YES"
+         emf_log["service.name"] = service_name
+
+         # Add EMF metadata for CloudWatch metrics
+         emf_log["_aws"] = {
+             "Timestamp": current_time_ms,
+             "CloudWatchMetrics": [
+                 {
+                     "Namespace": "Bedrock-AgentCore/Evaluations",
+                     "Dimensions": [
+                         ["service.name"],
+                         ["label", "service.name"],
+                         ["service.name", "onlineEvaluationConfigId"],
+                         ["label", "service.name", "onlineEvaluationConfigId"],
+                     ],
+                     "Metrics": [{"Name": evaluator_name, "Unit": "None"}],
+                 }
+             ],
+         }
+
+         log_event = {"timestamp": current_time_ms, "message": json.dumps(emf_log)}
+         put_log_params = {"logGroupName": destination_log_group, "logStreamName": log_stream, "logEvents": [log_event]}
+
+         if sequence_token:
+             put_log_params["sequenceToken"] = sequence_token
+
+         cloudwatch_client.put_log_events(**put_log_params)
+
+     except Exception as e:
+         logger.warning(f"Failed to send log to CloudWatch: {str(e)}")
strands_evals/telemetry/config.py
@@ -0,0 +1,207 @@
+ """OpenTelemetry configuration and setup utilities for strands_evals.
+
+ This module provides centralized configuration and initialization functionality
+ for OpenTelemetry tracing in evaluation workflows.
+ """
+
+ import logging
+ from typing import Any
+
+ import opentelemetry.trace as trace_api
+ from opentelemetry import propagate
+ from opentelemetry.baggage.propagation import W3CBaggagePropagator
+ from opentelemetry.propagators.composite import CompositePropagator
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor
+ from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+ from opentelemetry.trace import TracerProvider
+ from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_otel_resource() -> Resource:
+     """Create a standard OpenTelemetry resource with service information.
+
+     Returns:
+         Resource object with standard service information.
+     """
+     resource = Resource.create(
+         {
+             "service.name": "strands-evals",
+             "service.version": "0.1.0",
+             "telemetry.sdk.name": "opentelemetry",
+             "telemetry.sdk.language": "python",
+         }
+     )
+
+     return resource
+
+
+ class StrandsEvalsTelemetry:
+     """OpenTelemetry configuration and setup for strands_evals.
+
+     Automatically initializes a tracer provider with text map propagators.
+     Trace exporters (console, OTLP) can be set up individually using dedicated methods
+     that support method chaining for convenient configuration.
+
+     Args:
+         tracer_provider: Optional pre-configured SDKTracerProvider. If None,
+             a new one will be created and set as the global tracer provider.
+
+     Environment Variables:
+         Environment variables are handled by the underlying OpenTelemetry SDK:
+         - OTEL_EXPORTER_OTLP_ENDPOINT: OTLP endpoint URL
+         - OTEL_EXPORTER_OTLP_HEADERS: Headers for OTLP requests
+
+     Examples:
+         Quick setup with method chaining:
+         >>> StrandsEvalsTelemetry().setup_console_exporter().setup_otlp_exporter()
+
+         Using a custom tracer provider:
+         >>> StrandsEvalsTelemetry(tracer_provider=my_provider).setup_console_exporter()
+
+         Step-by-step configuration:
+         >>> telemetry = StrandsEvalsTelemetry()
+         >>> telemetry.setup_console_exporter()
+         >>> telemetry.setup_otlp_exporter()
+
+     Note:
+         - The tracer provider is automatically initialized upon instantiation
+         - When no tracer_provider is provided, the instance sets itself as the global provider
+         - Exporters must be explicitly configured using the setup methods
+         - Failed exporter configurations are logged but do not raise exceptions
+         - All setup methods return self to enable method chaining
+     """
+
+     def __init__(
+         self,
+         tracer_provider: SDKTracerProvider | None = None,
+     ) -> None:
+         """Initialize the StrandsEvalsTelemetry instance.
+
+         Args:
+             tracer_provider: Optional pre-configured tracer provider.
+                 If None, a new one will be created and set as global.
+
+         The instance is ready to use immediately after initialization, though
+         trace exporters must be configured separately using the setup methods.
+         """
+         self.resource = get_otel_resource()
+         self._in_memory_exporter: InMemorySpanExporter | None = None
+         self.tracer_provider: TracerProvider
+
+         if tracer_provider:
+             self.tracer_provider = tracer_provider
+         else:
+             self._initialize_tracer()
+
+     def _initialize_tracer(self) -> None:
+         """Initialize the OpenTelemetry tracer."""
+         logger.info("Initializing tracer for strands-evals")
+
+         # Create the tracer provider
+         tracer_provider = SDKTracerProvider(resource=self.resource)
+
+         # Set it as the global tracer provider
+         trace_api.set_tracer_provider(tracer_provider)
+
+         # Get the global tracer provider (may be wrapped in a proxy)
+         self.tracer_provider = trace_api.get_tracer_provider()
+
+         # Set up propagators
+         propagate.set_global_textmap(
+             CompositePropagator(
+                 [
+                     W3CBaggagePropagator(),
+                     TraceContextTextMapPropagator(),
+                 ]
+             )
+         )
+
+     @property
+     def in_memory_exporter(self) -> InMemorySpanExporter:
+         """Get the in-memory exporter instance.
+
+         Returns:
+             The configured in-memory span exporter.
+
+         Raises:
+             RuntimeError: If setup_in_memory_exporter() has not been called.
+         """
+         if self._in_memory_exporter is None:
+             raise RuntimeError(
+                 "In-memory exporter is not configured. Call setup_in_memory_exporter() before accessing this property."
+             )
+         return self._in_memory_exporter
+
+     def setup_console_exporter(self, **kwargs: Any) -> "StrandsEvalsTelemetry":
+         """Set up a console exporter for the tracer provider.
+
+         Args:
+             **kwargs: Optional keyword arguments passed directly to
+                 OpenTelemetry's ConsoleSpanExporter initializer.
+
+         Returns:
+             self: Enables method chaining.
+
+         This method configures a SimpleSpanProcessor with a ConsoleSpanExporter,
+         allowing trace data to be output to the console. Any additional keyword
+         arguments provided will be forwarded to the ConsoleSpanExporter.
+         """
+         try:
+             logger.info("Enabling console export for strands-evals")
+             console_processor = SimpleSpanProcessor(ConsoleSpanExporter(**kwargs))
+             self.tracer_provider.add_span_processor(console_processor)
+         except Exception as e:
+             logger.exception("error=<%s> | Failed to configure console exporter", e)
+         return self
+
+     def setup_in_memory_exporter(self, **kwargs: Any) -> "StrandsEvalsTelemetry":
+         """Set up an in-memory exporter for the tracer provider.
+
+         Args:
+             **kwargs: Optional keyword arguments passed directly to
+                 OpenTelemetry's InMemorySpanExporter initializer.
+
+         Returns:
+             self: Enables method chaining.
+
+         This method configures a SimpleSpanProcessor with an InMemorySpanExporter,
+         allowing trace data to be stored in memory for testing and debugging purposes.
+         Any additional keyword arguments provided will be forwarded to the InMemorySpanExporter.
+         """
+         try:
+             logger.info("Enabling in-memory export for strands-evals")
+             self._in_memory_exporter = InMemorySpanExporter()
+             span_processor = SimpleSpanProcessor(self._in_memory_exporter)
+             self.tracer_provider.add_span_processor(span_processor)
+         except Exception as e:
+             logger.exception("error=<%s> | Failed to configure in-memory exporter", e)
+         return self
+
+     def setup_otlp_exporter(self, **kwargs: Any) -> "StrandsEvalsTelemetry":
+         """Set up an OTLP exporter for the tracer provider.
+
+         Args:
+             **kwargs: Optional keyword arguments passed directly to
+                 OpenTelemetry's OTLPSpanExporter initializer.
+
+         Returns:
+             self: Enables method chaining.
+
+         This method configures a BatchSpanProcessor with an OTLPSpanExporter,
+         allowing trace data to be exported to an OTLP endpoint. Any additional
+         keyword arguments provided will be forwarded to the OTLPSpanExporter.
+         """
+         from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+
+         try:
+             otlp_exporter = OTLPSpanExporter(**kwargs)
+             batch_processor = BatchSpanProcessor(otlp_exporter)
+             self.tracer_provider.add_span_processor(batch_processor)
+             logger.info("OTLP exporter configured for strands-evals")
+         except Exception as e:
+             logger.exception("error=<%s> | Failed to configure OTLP exporter", e)
+         return self
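
Note: a short sketch of the in-memory path end to end, assuming the OpenTelemetry SDK packages are installed; the span name is hypothetical:

    from strands_evals.telemetry import StrandsEvalsTelemetry, get_tracer

    telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
    tracer = get_tracer()

    with tracer.start_as_current_span("example-eval-span"):  # hypothetical name
        pass

    # SimpleSpanProcessor exports synchronously, so the finished span is
    # immediately visible on the exporter.
    print([s.name for s in telemetry.in_memory_exporter.get_finished_spans()])
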
strands_evals/telemetry/tracer.py
@@ -0,0 +1,38 @@
+ """OpenTelemetry tracing for strands_evals.
+
+ This module provides a simple way to get the OpenTelemetry tracer
+ for evaluation workflows.
+ """
+
+ import json
+ import logging
+ from typing import Any
+
+ import opentelemetry.trace as trace_api
+ from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_tracer() -> trace_api.Tracer:
+     """Get the OpenTelemetry tracer for strands_evals.
+
+     Returns:
+         OpenTelemetry Tracer instance from the global tracer provider, or a
+         no-op tracer when no SDK tracer provider has been configured.
+     """
+     tracer_provider = trace_api.get_tracer_provider()
+     if not isinstance(tracer_provider, SDKTracerProvider):
+         tracer_provider = trace_api.NoOpTracerProvider()
+     return tracer_provider.get_tracer("strands-evals")
+
+
+ def serialize(obj: Any) -> str:
+     """Serialize an object to a JSON string.
+
+     Args:
+         obj: The object to serialize
+
+     Returns:
+         JSON string representation; non-JSON-serializable values fall back to str().
+     """
+     return json.dumps(obj, ensure_ascii=False, default=str)
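
Note: as a quick illustration of combining the two helpers when annotating spans (the span name and attribute key below are hypothetical):

    from strands_evals.telemetry import get_tracer, serialize

    tracer = get_tracer()

    # default=str lets non-JSON types (sets, datetimes, custom objects)
    # degrade to their string form instead of raising TypeError.
    payload = serialize({"case_id": 7, "tags": {"smoke"}})

    with tracer.start_as_current_span("evaluation.case") as span:
        span.set_attribute("strands_evals.case.payload", payload)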