langwatch-scenario 0.4.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/METADATA +210 -86
- langwatch_scenario-0.7.1.dist-info/RECORD +237 -0
- scenario/__init__.py +12 -118
- scenario/_events/__init__.py +64 -0
- scenario/_events/event_bus.py +185 -0
- scenario/_events/event_reporter.py +83 -0
- scenario/_events/events.py +162 -0
- scenario/_events/messages.py +58 -0
- scenario/_events/utils.py +97 -0
- scenario/_generated/langwatch_api_client/README.md +139 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/__init__.py +13 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/__init__.py +1 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/__init__.py +1 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/delete_api_annotations_id.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/delete_api_prompts_by_id.py +218 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/delete_api_scenario_events.py +183 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_annotations.py +136 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_annotations_id.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_annotations_trace_id.py +160 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_dataset_by_slug_or_id.py +229 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_prompts.py +188 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_prompts_by_id.py +218 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_prompts_by_id_versions.py +218 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_trace_id.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/patch_api_annotations_id.py +178 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_annotations_trace_id.py +178 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_dataset_by_slug_entries.py +108 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_prompts.py +187 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_prompts_by_id_versions.py +241 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_scenario_events.py +229 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_trace_id_share.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_trace_id_unshare.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/put_api_prompts_by_id.py +241 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/traces/__init__.py +1 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/api/traces/post_api_trace_search.py +168 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/client.py +268 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/errors.py +16 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/__init__.py +455 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/annotation.py +131 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/dataset_post_entries.py +74 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/dataset_post_entries_entries_item.py +44 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_annotations_id_response_200.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_200.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_404.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_200.py +81 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_400.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_401.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/error.py +67 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/evaluation.py +164 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/evaluation_timestamps.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_200.py +75 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_200_data_item.py +109 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_200_data_item_entry.py +44 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_400.py +78 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_401.py +78 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_404.py +78 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_422.py +67 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_500.py +78 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200.py +172 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_messages_item.py +69 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_messages_item_role.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0.py +81 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0_json_schema.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0_json_schema_schema.py +44 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0_type.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_404.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data.py +204 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations.py +101 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item.py +79 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item_type.py +18 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations_rows_item.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_inputs_item.py +71 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_inputs_item_type.py +16 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_messages_item.py +71 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_messages_item_role.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_outputs_item.py +98 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_outputs_item_json_schema.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_outputs_item_type.py +11 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_prompting_technique.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_404.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item.py +172 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_messages_item.py +69 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_messages_item_role.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0.py +81 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0_json_schema.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0_json_schema_schema.py +44 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0_type.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200.py +249 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_error_type_0.py +79 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_evaluations_item.py +152 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_evaluations_item_error.py +79 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_evaluations_item_timestamps.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_input.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_metadata.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_metrics.py +95 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_output.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item.py +271 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_error_type_0.py +79 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_input.py +90 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_input_value_item.py +69 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_metrics.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_output.py +89 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_output_value_item.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_params.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_timestamps.py +95 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_timestamps.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/input_.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/metadata.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/metrics.py +115 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/output.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/pagination.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/patch_api_annotations_id_body.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/patch_api_annotations_id_response_200.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_annotations_trace_id_body.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_body.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body.py +147 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data.py +207 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations.py +106 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations_columns_item.py +79 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations_columns_item_type.py +18 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations_rows_item.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_inputs_item.py +71 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_inputs_item_type.py +16 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_messages_item.py +71 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_messages_item_role.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_outputs_item.py +98 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_outputs_item_json_schema.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_outputs_item_type.py +11 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_prompting_technique.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200.py +155 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data.py +206 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations.py +101 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item.py +79 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item_type.py +18 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations_rows_item.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_inputs_item.py +71 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_inputs_item_type.py +16 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_messages_item.py +71 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_messages_item_role.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_outputs_item.py +98 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_outputs_item_json_schema.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_outputs_item_type.py +11 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_prompting_technique.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_404.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200.py +172 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_messages_item.py +69 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_messages_item_role.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0.py +81 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0_json_schema.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0_json_schema_schema.py +44 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0_type.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_0.py +127 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_0_metadata.py +68 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1.py +164 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1_results_type_0.py +98 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1_results_type_0_verdict.py +10 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1_status.py +13 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2.py +245 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_0.py +88 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_1.py +88 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_2.py +120 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_2_tool_calls_item.py +87 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_2_tool_calls_item_function.py +67 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_3.py +88 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_4.py +85 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_201.py +81 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_400.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_401.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_trace_id_share_response_200.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_trace_id_unshare_response_200.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_body.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_200.py +75 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_400.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_400_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_401.py +61 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_401_error.py +8 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_404.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_500.py +59 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/search_request.py +133 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/search_request_filters.py +51 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/search_response.py +93 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/timestamps.py +77 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/models/trace.py +225 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/py.typed +1 -0
- scenario/_generated/langwatch_api_client/lang_watch_api_client/types.py +46 -0
- scenario/_generated/langwatch_api_client/pyproject.toml +27 -0
- scenario/_utils/__init__.py +32 -0
- scenario/_utils/ids.py +58 -0
- scenario/_utils/message_conversion.py +103 -0
- scenario/{utils.py → _utils/utils.py} +21 -110
- scenario/agent_adapter.py +8 -4
- scenario/cache.py +4 -3
- scenario/config.py +7 -5
- scenario/judge_agent.py +13 -29
- scenario/pytest_plugin.py +6 -51
- scenario/scenario_executor.py +372 -215
- scenario/scenario_state.py +6 -6
- scenario/script.py +9 -9
- scenario/types.py +15 -8
- scenario/user_simulator_agent.py +4 -11
- langwatch_scenario-0.4.0.dist-info/RECORD +0 -18
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/top_level.txt +0 -0
- /scenario/{error_messages.py → _error_messages.py} +0 -0
scenario/scenario_executor.py
CHANGED
@@ -12,37 +12,55 @@ from typing import (
|
|
12
12
|
Callable,
|
13
13
|
Dict,
|
14
14
|
List,
|
15
|
-
Any,
|
16
15
|
Optional,
|
17
16
|
Set,
|
18
17
|
Tuple,
|
19
18
|
Union,
|
19
|
+
TypedDict,
|
20
20
|
)
|
21
21
|
import time
|
22
|
+
import warnings
|
22
23
|
import termcolor
|
23
24
|
import asyncio
|
24
25
|
import concurrent.futures
|
25
26
|
|
26
27
|
from scenario.config import ScenarioConfig
|
27
|
-
from scenario.
|
28
|
-
await_if_awaitable,
|
29
|
-
check_valid_return_type,
|
28
|
+
from scenario._utils import (
|
30
29
|
convert_agent_return_types_to_openai_messages,
|
30
|
+
check_valid_return_type,
|
31
31
|
print_openai_messages,
|
32
32
|
show_spinner,
|
33
|
+
await_if_awaitable,
|
34
|
+
get_or_create_batch_run_id,
|
35
|
+
generate_scenario_run_id,
|
33
36
|
)
|
34
37
|
from openai.types.chat import (
|
35
38
|
ChatCompletionMessageParam,
|
36
39
|
ChatCompletionUserMessageParam,
|
40
|
+
ChatCompletionAssistantMessageParam,
|
37
41
|
)
|
38
42
|
|
39
43
|
from .types import AgentInput, AgentRole, ScenarioResult, ScriptStep
|
40
|
-
from .
|
44
|
+
from ._error_messages import agent_response_not_awaitable
|
41
45
|
from .cache import context_scenario
|
42
46
|
from .agent_adapter import AgentAdapter
|
43
47
|
from .script import proceed
|
44
48
|
from pksuid import PKSUID
|
45
49
|
from .scenario_state import ScenarioState
|
50
|
+
from ._events import (
|
51
|
+
ScenarioEventBus,
|
52
|
+
ScenarioEvent,
|
53
|
+
ScenarioRunStartedEvent,
|
54
|
+
ScenarioMessageSnapshotEvent,
|
55
|
+
ScenarioRunFinishedEvent,
|
56
|
+
ScenarioRunStartedEventMetadata,
|
57
|
+
ScenarioRunFinishedEventResults,
|
58
|
+
ScenarioRunFinishedEventVerdict,
|
59
|
+
ScenarioRunFinishedEventStatus,
|
60
|
+
convert_messages_to_api_client_messages,
|
61
|
+
)
|
62
|
+
from rx.subject.subject import Subject
|
63
|
+
from rx.core.observable.observable import Observable
|
46
64
|
|
47
65
|
|
48
66
|
class ScenarioExecutor:
|
@@ -66,41 +84,8 @@ class ScenarioExecutor:
|
|
66
84
|
agents: List of agent adapters participating in the scenario
|
67
85
|
script: Optional list of script steps to control scenario flow
|
68
86
|
config: Configuration settings for execution behavior
|
69
|
-
|
70
|
-
Example:
|
71
|
-
```python
|
72
|
-
# Direct instantiation (less common)
|
73
|
-
executor = ScenarioExecutor(
|
74
|
-
name="weather query test",
|
75
|
-
description="User asks about weather, agent should provide helpful response",
|
76
|
-
agents=[
|
77
|
-
weather_agent,
|
78
|
-
scenario.UserSimulatorAgent(),
|
79
|
-
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
80
|
-
],
|
81
|
-
max_turns=10,
|
82
|
-
verbose=True
|
83
|
-
)
|
84
|
-
result = await executor._run()
|
85
|
-
|
86
|
-
# Preferred high-level API
|
87
|
-
result = await scenario.run(
|
88
|
-
name="weather query test",
|
89
|
-
description="User asks about weather, agent should provide helpful response",
|
90
|
-
agents=[
|
91
|
-
weather_agent,
|
92
|
-
scenario.UserSimulatorAgent(),
|
93
|
-
scenario.JudgeAgent(criteria=["Agent provides helpful weather info"])
|
94
|
-
]
|
95
|
-
)
|
96
|
-
```
|
97
|
-
|
98
|
-
Note:
|
99
|
-
- Scenarios run in isolated thread pools to support parallel execution
|
100
|
-
- All agent interactions are cached when cache_key is configured
|
101
|
-
- Debug mode allows step-by-step execution with user intervention
|
102
|
-
- Results include detailed timing information and conversation history
|
103
87
|
"""
|
88
|
+
|
104
89
|
name: str
|
105
90
|
description: str
|
106
91
|
agents: List[AgentAdapter]
|
@@ -115,6 +100,11 @@ class ScenarioExecutor:
|
|
115
100
|
_pending_roles_on_turn: List[AgentRole] = []
|
116
101
|
_pending_agents_on_turn: Set[AgentAdapter] = set()
|
117
102
|
_agent_times: Dict[int, float] = {}
|
103
|
+
_events: Subject
|
104
|
+
|
105
|
+
event_bus: ScenarioEventBus
|
106
|
+
|
107
|
+
batch_run_id: str
|
118
108
|
|
119
109
|
def __init__(
|
120
110
|
self,
|
@@ -127,6 +117,7 @@ class ScenarioExecutor:
|
|
127
117
|
verbose: Optional[Union[bool, int]] = None,
|
128
118
|
cache_key: Optional[str] = None,
|
129
119
|
debug: Optional[bool] = None,
|
120
|
+
event_bus: Optional[ScenarioEventBus] = None,
|
130
121
|
):
|
131
122
|
"""
|
132
123
|
Initialize a scenario executor.
|
@@ -147,26 +138,7 @@ class ScenarioExecutor:
|
|
147
138
|
Overrides global configuration for this scenario.
|
148
139
|
debug: Whether to enable debug mode with step-by-step execution.
|
149
140
|
Overrides global configuration for this scenario.
|
150
|
-
|
151
|
-
Example:
|
152
|
-
```python
|
153
|
-
executor = ScenarioExecutor(
|
154
|
-
name="customer service test",
|
155
|
-
description="Customer has a billing question and needs help",
|
156
|
-
agents=[
|
157
|
-
customer_service_agent,
|
158
|
-
scenario.UserSimulatorAgent(),
|
159
|
-
scenario.JudgeAgent(criteria=[
|
160
|
-
"Agent is polite and professional",
|
161
|
-
"Agent addresses the billing question",
|
162
|
-
"Agent provides clear next steps"
|
163
|
-
])
|
164
|
-
],
|
165
|
-
max_turns=15,
|
166
|
-
verbose=True,
|
167
|
-
debug=False
|
168
|
-
)
|
169
|
-
```
|
141
|
+
event_bus: Optional event bus that will subscribe to this executor's events
|
170
142
|
"""
|
171
143
|
self.name = name
|
172
144
|
self.description = description
|
@@ -183,115 +155,33 @@ class ScenarioExecutor:
|
|
183
155
|
|
184
156
|
self.reset()
|
185
157
|
|
186
|
-
|
187
|
-
|
188
|
-
cls,
|
189
|
-
name: str,
|
190
|
-
description: str,
|
191
|
-
agents: List[AgentAdapter] = [],
|
192
|
-
max_turns: Optional[int] = None,
|
193
|
-
verbose: Optional[Union[bool, int]] = None,
|
194
|
-
cache_key: Optional[str] = None,
|
195
|
-
debug: Optional[bool] = None,
|
196
|
-
script: Optional[List[ScriptStep]] = None,
|
197
|
-
) -> ScenarioResult:
|
198
|
-
"""
|
199
|
-
High-level interface for running a scenario test.
|
200
|
-
|
201
|
-
This is the main entry point for executing scenario tests. It creates a
|
202
|
-
ScenarioExecutor instance and runs it in an isolated thread pool to support
|
203
|
-
parallel execution and prevent blocking.
|
158
|
+
# Create executor's own event stream
|
159
|
+
self._events = Subject()
|
204
160
|
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
agents: List of agent adapters (agent under test, user simulator, judge)
|
209
|
-
max_turns: Maximum conversation turns before timeout (default: 10)
|
210
|
-
verbose: Show detailed output during execution
|
211
|
-
cache_key: Cache key for deterministic behavior
|
212
|
-
debug: Enable debug mode for step-by-step execution
|
213
|
-
script: Optional script steps to control scenario flow
|
161
|
+
# Create and configure event bus to subscribe to our events
|
162
|
+
self.event_bus = event_bus or ScenarioEventBus()
|
163
|
+
self.event_bus.subscribe_to_events(self._events)
|
214
164
|
|
215
|
-
|
216
|
-
ScenarioResult containing the test outcome, conversation history,
|
217
|
-
success/failure status, and detailed reasoning
|
218
|
-
|
219
|
-
Example:
|
220
|
-
```python
|
221
|
-
import scenario
|
165
|
+
self.batch_run_id = get_or_create_batch_run_id()
|
222
166
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
agents=[
|
228
|
-
my_agent,
|
229
|
-
scenario.UserSimulatorAgent(),
|
230
|
-
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
231
|
-
]
|
232
|
-
)
|
167
|
+
@property
|
168
|
+
def events(self) -> Observable:
|
169
|
+
"""Expose event stream for subscribers like the event bus."""
|
170
|
+
return self._events
|
233
171
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
description="Test specific conversation flow",
|
238
|
-
agents=[
|
239
|
-
my_agent,
|
240
|
-
scenario.UserSimulatorAgent(),
|
241
|
-
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
242
|
-
],
|
243
|
-
script=[
|
244
|
-
scenario.user("Hello"),
|
245
|
-
scenario.agent(),
|
246
|
-
custom_eval,
|
247
|
-
scenario.succeed()
|
248
|
-
]
|
249
|
-
)
|
172
|
+
def _emit_event(self, event: ScenarioEvent) -> None:
|
173
|
+
"""
|
174
|
+
Emit a domain event to all subscribers.
|
250
175
|
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
print(f"Conversation had {len(result.messages)} messages")
|
255
|
-
```
|
176
|
+
This method publishes scenario events to the internal event stream,
|
177
|
+
which subscribers (like the event bus) can observe and react to.
|
178
|
+
The timestamp is automatically set to the current time.
|
256
179
|
|
257
|
-
|
258
|
-
|
259
|
-
- Blocks until scenario completes or times out
|
260
|
-
- All agent calls are automatically cached when cache_key is set
|
261
|
-
- Exception handling ensures clean resource cleanup
|
180
|
+
Args:
|
181
|
+
event: The scenario event to emit
|
262
182
|
"""
|
263
|
-
|
264
|
-
|
265
|
-
description=description,
|
266
|
-
agents=agents,
|
267
|
-
max_turns=max_turns,
|
268
|
-
verbose=verbose,
|
269
|
-
cache_key=cache_key,
|
270
|
-
debug=debug,
|
271
|
-
script=script,
|
272
|
-
)
|
273
|
-
|
274
|
-
# We'll use a thread pool to run the execution logic, we
|
275
|
-
# require a separate thread because even though asyncio is
|
276
|
-
# being used throughout, any user code on the callback can
|
277
|
-
# be blocking, preventing them from running scenarios in parallel
|
278
|
-
with concurrent.futures.ThreadPoolExecutor() as executor:
|
279
|
-
|
280
|
-
def run_in_thread():
|
281
|
-
loop = asyncio.new_event_loop()
|
282
|
-
asyncio.set_event_loop(loop)
|
283
|
-
|
284
|
-
try:
|
285
|
-
return loop.run_until_complete(scenario._run())
|
286
|
-
finally:
|
287
|
-
loop.close()
|
288
|
-
|
289
|
-
# Run the function in the thread pool and await its result
|
290
|
-
# This converts the thread's execution into a Future that the current
|
291
|
-
# event loop can await without blocking
|
292
|
-
loop = asyncio.get_event_loop()
|
293
|
-
result = await loop.run_in_executor(executor, run_in_thread)
|
294
|
-
return result
|
183
|
+
event.timestamp = int(time.time() * 1000)
|
184
|
+
self._events.on_next(event)
|
295
185
|
|
296
186
|
def reset(self):
|
297
187
|
"""
|
@@ -300,18 +190,6 @@ class ScenarioExecutor:
|
|
300
190
|
This method reinitializes all internal state for a fresh scenario run,
|
301
191
|
including conversation history, turn counters, and agent timing information.
|
302
192
|
Called automatically during initialization and can be used to rerun scenarios.
|
303
|
-
|
304
|
-
Example:
|
305
|
-
```python
|
306
|
-
executor = ScenarioExecutor(...)
|
307
|
-
|
308
|
-
# Run first test
|
309
|
-
result1 = await executor._run()
|
310
|
-
|
311
|
-
# Reset and run again
|
312
|
-
executor.reset()
|
313
|
-
result2 = await executor._run()
|
314
|
-
```
|
315
193
|
"""
|
316
194
|
self._state = ScenarioState(
|
317
195
|
description=self.description,
|
@@ -349,24 +227,24 @@ class ScenarioExecutor:
|
|
349
227
|
Used to avoid broadcasting the message back to its creator.
|
350
228
|
|
351
229
|
Example:
|
352
|
-
```
|
230
|
+
```
|
353
231
|
def inject_system_message(state: ScenarioState) -> None:
|
354
|
-
state.
|
232
|
+
state.add_message({
|
355
233
|
"role": "system",
|
356
234
|
"content": "The user is now in a hurry"
|
357
235
|
})
|
358
236
|
|
359
237
|
# Use in script
|
360
238
|
result = await scenario.run(
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
239
|
+
name="system message test",
|
240
|
+
agents=[agent, user_sim, judge],
|
241
|
+
script=[
|
242
|
+
scenario.user("Hello"),
|
243
|
+
scenario.agent(),
|
244
|
+
inject_system_message,
|
245
|
+
scenario.user(), # Will see the system message
|
246
|
+
scenario.succeed()
|
247
|
+
]
|
370
248
|
)
|
371
249
|
```
|
372
250
|
"""
|
@@ -396,7 +274,7 @@ class ScenarioExecutor:
|
|
396
274
|
from_agent_idx: Index of the agent that generated these messages
|
397
275
|
|
398
276
|
Example:
|
399
|
-
```
|
277
|
+
```
|
400
278
|
# Agent returns multiple messages for a complex interaction
|
401
279
|
messages = [
|
402
280
|
{"role": "assistant", "content": "Let me search for that..."},
|
@@ -476,7 +354,11 @@ class ScenarioExecutor:
|
|
476
354
|
self, role: AgentRole
|
477
355
|
) -> Tuple[int, Optional[AgentAdapter]]:
|
478
356
|
for idx, agent in enumerate(self.agents):
|
479
|
-
if
|
357
|
+
if (
|
358
|
+
role == agent.role
|
359
|
+
and agent in self._pending_agents_on_turn
|
360
|
+
and agent.role in self._pending_roles_on_turn
|
361
|
+
):
|
480
362
|
return idx, agent
|
481
363
|
return -1, None
|
482
364
|
|
@@ -503,7 +385,7 @@ class ScenarioExecutor:
|
|
503
385
|
agent_time=agent_time,
|
504
386
|
)
|
505
387
|
|
506
|
-
async def
|
388
|
+
async def run(self) -> ScenarioResult:
|
507
389
|
"""
|
508
390
|
Run a scenario against the agent under test.
|
509
391
|
|
@@ -513,30 +395,63 @@ class ScenarioExecutor:
|
|
513
395
|
Returns:
|
514
396
|
ScenarioResult containing the test outcome
|
515
397
|
"""
|
398
|
+
scenario_run_id = generate_scenario_run_id()
|
516
399
|
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
self.reset()
|
521
|
-
|
522
|
-
for script_step in self.script:
|
523
|
-
callable = script_step(self._state)
|
524
|
-
if isinstance(callable, Awaitable):
|
525
|
-
result = await callable
|
526
|
-
else:
|
527
|
-
result = callable
|
400
|
+
try:
|
401
|
+
self._emit_run_started_event(scenario_run_id)
|
528
402
|
|
529
|
-
if
|
530
|
-
|
403
|
+
if self.config.verbose:
|
404
|
+
print("") # new line
|
405
|
+
|
406
|
+
self.reset()
|
407
|
+
|
408
|
+
for script_step in self.script:
|
409
|
+
callable = script_step(self._state)
|
410
|
+
if isinstance(callable, Awaitable):
|
411
|
+
result = await callable
|
412
|
+
else:
|
413
|
+
result = callable
|
414
|
+
self._emit_message_snapshot_event(scenario_run_id)
|
415
|
+
|
416
|
+
if isinstance(result, ScenarioResult):
|
417
|
+
status = (
|
418
|
+
ScenarioRunFinishedEventStatus.SUCCESS
|
419
|
+
if result.success
|
420
|
+
else ScenarioRunFinishedEventStatus.FAILED
|
421
|
+
)
|
422
|
+
self._emit_run_finished_event(scenario_run_id, result, status)
|
423
|
+
return result
|
531
424
|
|
532
|
-
|
533
|
-
|
425
|
+
result = self._reached_max_turns(
|
426
|
+
"""Reached end of script without conclusion, add one of the following to the end of the script:
|
534
427
|
|
535
428
|
- `scenario.proceed()` to let the simulation continue to play out
|
536
429
|
- `scenario.judge()` to force criteria judgement
|
537
430
|
- `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
|
538
|
-
|
539
|
-
|
431
|
+
"""
|
432
|
+
)
|
433
|
+
|
434
|
+
status = (
|
435
|
+
ScenarioRunFinishedEventStatus.SUCCESS
|
436
|
+
if result.success
|
437
|
+
else ScenarioRunFinishedEventStatus.FAILED
|
438
|
+
)
|
439
|
+
self._emit_run_finished_event(scenario_run_id, result, status)
|
440
|
+
return result
|
441
|
+
|
442
|
+
except Exception as e:
|
443
|
+
# Publish failure event before propagating the error
|
444
|
+
error_result = ScenarioResult(
|
445
|
+
success=False,
|
446
|
+
messages=self._state.messages,
|
447
|
+
reasoning=f"Scenario failed with error: {str(e)}",
|
448
|
+
total_time=time.time() - self._total_start_time,
|
449
|
+
agent_time=0,
|
450
|
+
)
|
451
|
+
self._emit_run_finished_event(
|
452
|
+
scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR
|
453
|
+
)
|
454
|
+
raise # Re-raise the exception after cleanup
|
540
455
|
|
541
456
|
async def _call_agent(
|
542
457
|
self, idx: int, role: AgentRole, request_judgment: bool = False
|
@@ -577,16 +492,19 @@ class ScenarioExecutor:
|
|
577
492
|
):
|
578
493
|
start_time = time.time()
|
579
494
|
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
495
|
+
# Prevent pydantic validation warnings which should already be disabled
|
496
|
+
with warnings.catch_warnings():
|
497
|
+
warnings.simplefilter("ignore")
|
498
|
+
agent_response = agent.call(
|
499
|
+
AgentInput(
|
500
|
+
# TODO: test thread_id
|
501
|
+
thread_id=self._state.thread_id,
|
502
|
+
messages=self._state.messages,
|
503
|
+
new_messages=self._pending_messages.get(idx, []),
|
504
|
+
judgment_request=request_judgment,
|
505
|
+
scenario_state=self._state,
|
506
|
+
)
|
588
507
|
)
|
589
|
-
)
|
590
508
|
if not isinstance(agent_response, Awaitable):
|
591
509
|
raise Exception(
|
592
510
|
agent_response_not_awaitable(agent.__class__.__name__),
|
@@ -708,15 +626,24 @@ class ScenarioExecutor:
|
|
708
626
|
reasoning=reasoning or "Scenario marked as failed with scenario.fail()",
|
709
627
|
)
|
710
628
|
|
629
|
+
def _consume_until_role(self, role: AgentRole) -> None:
|
630
|
+
while len(self._pending_roles_on_turn) > 0:
|
631
|
+
next_role = self._pending_roles_on_turn[0]
|
632
|
+
if next_role == role:
|
633
|
+
break
|
634
|
+
self._pending_roles_on_turn.pop(0)
|
635
|
+
|
711
636
|
async def _script_call_agent(
|
712
637
|
self,
|
713
638
|
role: AgentRole,
|
714
639
|
content: Optional[Union[str, ChatCompletionMessageParam]] = None,
|
715
640
|
request_judgment: bool = False,
|
716
641
|
) -> Optional[ScenarioResult]:
|
642
|
+
self._consume_until_role(role)
|
717
643
|
idx, next_agent = self._next_agent_for_role(role)
|
718
644
|
if not next_agent:
|
719
645
|
self._new_turn()
|
646
|
+
self._consume_until_role(role)
|
720
647
|
idx, next_agent = self._next_agent_for_role(role)
|
721
648
|
|
722
649
|
if not next_agent:
|
@@ -738,11 +665,16 @@ class ScenarioExecutor:
|
|
738
665
|
)
|
739
666
|
|
740
667
|
self._pending_agents_on_turn.remove(next_agent)
|
741
|
-
self._pending_roles_on_turn.remove(role)
|
742
668
|
|
743
669
|
if content:
|
744
670
|
if isinstance(content, str):
|
745
|
-
message =
|
671
|
+
message = (
|
672
|
+
ChatCompletionUserMessageParam(role="user", content=content)
|
673
|
+
if role == AgentRole.USER
|
674
|
+
else ChatCompletionAssistantMessageParam(
|
675
|
+
role="assistant", content=content
|
676
|
+
)
|
677
|
+
)
|
746
678
|
else:
|
747
679
|
message = content
|
748
680
|
|
@@ -756,3 +688,228 @@ class ScenarioExecutor:
|
|
756
688
|
)
|
757
689
|
if isinstance(result, ScenarioResult):
|
758
690
|
return result
|
691
|
+
|
692
|
+
# Event handling methods
|
693
|
+
|
694
|
+
class _CommonEventFields(TypedDict):
|
695
|
+
"""
|
696
|
+
Common fields shared across all scenario events.
|
697
|
+
|
698
|
+
These fields provide consistent identification and timing information
|
699
|
+
for all events emitted during scenario execution.
|
700
|
+
|
701
|
+
Attributes:
|
702
|
+
batch_run_id: Unique identifier for the batch of scenario runs
|
703
|
+
scenario_run_id: Unique identifier for this specific scenario run
|
704
|
+
scenario_id: Human-readable name/identifier for the scenario
|
705
|
+
timestamp: Unix timestamp in milliseconds when the event occurred
|
706
|
+
"""
|
707
|
+
|
708
|
+
batch_run_id: str
|
709
|
+
scenario_run_id: str
|
710
|
+
scenario_id: str
|
711
|
+
timestamp: int
|
712
|
+
|
713
|
+
def _create_common_event_fields(self, scenario_run_id: str) -> _CommonEventFields:
|
714
|
+
"""
|
715
|
+
Create common fields used across all scenario events.
|
716
|
+
|
717
|
+
This method generates the standard fields that every scenario event
|
718
|
+
must include for proper identification and timing.
|
719
|
+
|
720
|
+
Args:
|
721
|
+
scenario_run_id: Unique identifier for the current scenario run
|
722
|
+
|
723
|
+
Returns:
|
724
|
+
Dictionary containing common event fields with current timestamp
|
725
|
+
"""
|
726
|
+
return {
|
727
|
+
"batch_run_id": self.batch_run_id,
|
728
|
+
"scenario_run_id": scenario_run_id,
|
729
|
+
"scenario_id": self.name,
|
730
|
+
"timestamp": int(time.time() * 1000),
|
731
|
+
}
|
732
|
+
|
733
|
+
def _emit_run_started_event(self, scenario_run_id: str) -> None:
|
734
|
+
"""
|
735
|
+
Emit a scenario run started event.
|
736
|
+
|
737
|
+
This event is published when a scenario begins execution. It includes
|
738
|
+
metadata about the scenario such as name and description, and is used
|
739
|
+
to track the start of scenario runs in monitoring systems.
|
740
|
+
|
741
|
+
Args:
|
742
|
+
scenario_run_id: Unique identifier for the current scenario run
|
743
|
+
"""
|
744
|
+
common_fields = self._create_common_event_fields(scenario_run_id)
|
745
|
+
metadata = ScenarioRunStartedEventMetadata(
|
746
|
+
name=self.name,
|
747
|
+
description=self.description,
|
748
|
+
)
|
749
|
+
|
750
|
+
event = ScenarioRunStartedEvent(
|
751
|
+
**common_fields,
|
752
|
+
metadata=metadata,
|
753
|
+
)
|
754
|
+
self._emit_event(event)
|
755
|
+
|
756
|
+
def _emit_message_snapshot_event(self, scenario_run_id: str) -> None:
|
757
|
+
"""
|
758
|
+
Emit a message snapshot event.
|
759
|
+
|
760
|
+
This event captures the current state of the conversation during
|
761
|
+
scenario execution. It's published whenever messages are added to
|
762
|
+
the conversation, allowing real-time tracking of scenario progress.
|
763
|
+
"""
|
764
|
+
common_fields = self._create_common_event_fields(scenario_run_id)
|
765
|
+
|
766
|
+
event = ScenarioMessageSnapshotEvent(
|
767
|
+
**common_fields,
|
768
|
+
messages=convert_messages_to_api_client_messages(self._state.messages),
|
769
|
+
)
|
770
|
+
self._emit_event(event)
|
771
|
+
|
772
|
+
def _emit_run_finished_event(
|
773
|
+
self,
|
774
|
+
scenario_run_id: str,
|
775
|
+
result: ScenarioResult,
|
776
|
+
status: ScenarioRunFinishedEventStatus,
|
777
|
+
) -> None:
|
778
|
+
"""
|
779
|
+
Emit a scenario run finished event.
|
780
|
+
|
781
|
+
This event is published when a scenario completes execution, whether
|
782
|
+
successfully or with an error. It includes the final results, verdict,
|
783
|
+
and reasoning for the scenario outcome.
|
784
|
+
|
785
|
+
Args:
|
786
|
+
scenario_run_id: Unique identifier for the current scenario run
|
787
|
+
result: The final scenario result containing success/failure status
|
788
|
+
status: The execution status (SUCCESS, FAILED, or ERROR)
|
789
|
+
"""
|
790
|
+
common_fields = self._create_common_event_fields(scenario_run_id)
|
791
|
+
|
792
|
+
results = ScenarioRunFinishedEventResults(
|
793
|
+
verdict=(
|
794
|
+
ScenarioRunFinishedEventVerdict.SUCCESS
|
795
|
+
if result.success
|
796
|
+
else ScenarioRunFinishedEventVerdict.FAILURE
|
797
|
+
),
|
798
|
+
reasoning=result.reasoning or "",
|
799
|
+
met_criteria=result.passed_criteria,
|
800
|
+
unmet_criteria=result.failed_criteria,
|
801
|
+
)
|
802
|
+
|
803
|
+
event = ScenarioRunFinishedEvent(
|
804
|
+
**common_fields,
|
805
|
+
status=status,
|
806
|
+
results=results,
|
807
|
+
)
|
808
|
+
self._emit_event(event)
|
809
|
+
|
810
|
+
# Signal end of event stream
|
811
|
+
self._events.on_completed()
|
812
|
+
|
813
|
+
|
814
|
+
async def run(
|
815
|
+
name: str,
|
816
|
+
description: str,
|
817
|
+
agents: List[AgentAdapter] = [],
|
818
|
+
max_turns: Optional[int] = None,
|
819
|
+
verbose: Optional[Union[bool, int]] = None,
|
820
|
+
cache_key: Optional[str] = None,
|
821
|
+
debug: Optional[bool] = None,
|
822
|
+
script: Optional[List[ScriptStep]] = None,
|
823
|
+
) -> ScenarioResult:
|
824
|
+
"""
|
825
|
+
High-level interface for running a scenario test.
|
826
|
+
|
827
|
+
This is the main entry point for executing scenario tests. It creates a
|
828
|
+
ScenarioExecutor instance and runs it in an isolated thread pool to support
|
829
|
+
parallel execution and prevent blocking.
|
830
|
+
|
831
|
+
Args:
|
832
|
+
name: Human-readable name for the scenario
|
833
|
+
description: Detailed description of what the scenario tests
|
834
|
+
agents: List of agent adapters (agent under test, user simulator, judge)
|
835
|
+
max_turns: Maximum conversation turns before timeout (default: 10)
|
836
|
+
verbose: Show detailed output during execution
|
837
|
+
cache_key: Cache key for deterministic behavior
|
838
|
+
debug: Enable debug mode for step-by-step execution
|
839
|
+
script: Optional script steps to control scenario flow
|
840
|
+
|
841
|
+
Returns:
|
842
|
+
ScenarioResult containing the test outcome, conversation history,
|
843
|
+
success/failure status, and detailed reasoning
|
844
|
+
|
845
|
+
Example:
|
846
|
+
```
|
847
|
+
import scenario
|
848
|
+
|
849
|
+
# Simple scenario with automatic flow
|
850
|
+
result = await scenario.run(
|
851
|
+
name="help request",
|
852
|
+
description="User asks for help with a technical problem",
|
853
|
+
agents=[
|
854
|
+
my_agent,
|
855
|
+
scenario.UserSimulatorAgent(),
|
856
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
857
|
+
]
|
858
|
+
)
|
859
|
+
|
860
|
+
# Scripted scenario with custom evaluations
|
861
|
+
result = await scenario.run(
|
862
|
+
name="custom interaction",
|
863
|
+
description="Test specific conversation flow",
|
864
|
+
agents=[
|
865
|
+
my_agent,
|
866
|
+
scenario.UserSimulatorAgent(),
|
867
|
+
scenario.JudgeAgent(criteria=["Agent provides helpful response"])
|
868
|
+
],
|
869
|
+
script=[
|
870
|
+
scenario.user("Hello"),
|
871
|
+
scenario.agent(),
|
872
|
+
custom_eval,
|
873
|
+
scenario.succeed()
|
874
|
+
]
|
875
|
+
)
|
876
|
+
|
877
|
+
# Results analysis
|
878
|
+
print(f"Test {'PASSED' if result.success else 'FAILED'}")
|
879
|
+
print(f"Reasoning: {result.reasoning}")
|
880
|
+
print(f"Conversation had {len(result.messages)} messages")
|
881
|
+
```
|
882
|
+
"""
|
883
|
+
scenario = ScenarioExecutor(
|
884
|
+
name=name,
|
885
|
+
description=description,
|
886
|
+
agents=agents,
|
887
|
+
max_turns=max_turns,
|
888
|
+
verbose=verbose,
|
889
|
+
cache_key=cache_key,
|
890
|
+
debug=debug,
|
891
|
+
script=script,
|
892
|
+
)
|
893
|
+
|
894
|
+
# We'll use a thread pool to run the execution logic, we
|
895
|
+
# require a separate thread because even though asyncio is
|
896
|
+
# being used throughout, any user code on the callback can
|
897
|
+
# be blocking, preventing them from running scenarios in parallel
|
898
|
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
899
|
+
|
900
|
+
def run_in_thread():
|
901
|
+
loop = asyncio.new_event_loop()
|
902
|
+
asyncio.set_event_loop(loop)
|
903
|
+
|
904
|
+
try:
|
905
|
+
return loop.run_until_complete(scenario.run())
|
906
|
+
finally:
|
907
|
+
scenario.event_bus.drain()
|
908
|
+
loop.close()
|
909
|
+
|
910
|
+
# Run the function in the thread pool and await its result
|
911
|
+
# This converts the thread's execution into a Future that the current
|
912
|
+
# event loop can await without blocking
|
913
|
+
loop = asyncio.get_event_loop()
|
914
|
+
result = await loop.run_in_executor(executor, run_in_thread)
|
915
|
+
return result
|