aeri-python 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aeri/__init__.py +72 -0
- aeri/_client/_validation.py +204 -0
- aeri/_client/attributes.py +188 -0
- aeri/_client/client.py +3761 -0
- aeri/_client/constants.py +65 -0
- aeri/_client/datasets.py +302 -0
- aeri/_client/environment_variables.py +158 -0
- aeri/_client/get_client.py +149 -0
- aeri/_client/observe.py +661 -0
- aeri/_client/propagation.py +475 -0
- aeri/_client/resource_manager.py +510 -0
- aeri/_client/span.py +1519 -0
- aeri/_client/span_filter.py +76 -0
- aeri/_client/span_processor.py +206 -0
- aeri/_client/utils.py +132 -0
- aeri/_task_manager/media_manager.py +331 -0
- aeri/_task_manager/media_upload_consumer.py +44 -0
- aeri/_task_manager/media_upload_queue.py +12 -0
- aeri/_task_manager/score_ingestion_consumer.py +208 -0
- aeri/_task_manager/task_manager.py +475 -0
- aeri/_utils/__init__.py +19 -0
- aeri/_utils/environment.py +34 -0
- aeri/_utils/error_logging.py +47 -0
- aeri/_utils/parse_error.py +99 -0
- aeri/_utils/prompt_cache.py +188 -0
- aeri/_utils/request.py +137 -0
- aeri/_utils/serializer.py +205 -0
- aeri/api/.fern/metadata.json +14 -0
- aeri/api/__init__.py +836 -0
- aeri/api/annotation_queues/__init__.py +82 -0
- aeri/api/annotation_queues/client.py +1111 -0
- aeri/api/annotation_queues/raw_client.py +2288 -0
- aeri/api/annotation_queues/types/__init__.py +84 -0
- aeri/api/annotation_queues/types/annotation_queue.py +28 -0
- aeri/api/annotation_queues/types/annotation_queue_assignment_request.py +16 -0
- aeri/api/annotation_queues/types/annotation_queue_item.py +34 -0
- aeri/api/annotation_queues/types/annotation_queue_object_type.py +26 -0
- aeri/api/annotation_queues/types/annotation_queue_status.py +22 -0
- aeri/api/annotation_queues/types/create_annotation_queue_assignment_response.py +18 -0
- aeri/api/annotation_queues/types/create_annotation_queue_item_request.py +25 -0
- aeri/api/annotation_queues/types/create_annotation_queue_request.py +20 -0
- aeri/api/annotation_queues/types/delete_annotation_queue_assignment_response.py +14 -0
- aeri/api/annotation_queues/types/delete_annotation_queue_item_response.py +15 -0
- aeri/api/annotation_queues/types/paginated_annotation_queue_items.py +17 -0
- aeri/api/annotation_queues/types/paginated_annotation_queues.py +17 -0
- aeri/api/annotation_queues/types/update_annotation_queue_item_request.py +15 -0
- aeri/api/blob_storage_integrations/__init__.py +73 -0
- aeri/api/blob_storage_integrations/client.py +550 -0
- aeri/api/blob_storage_integrations/raw_client.py +976 -0
- aeri/api/blob_storage_integrations/types/__init__.py +77 -0
- aeri/api/blob_storage_integrations/types/blob_storage_export_frequency.py +26 -0
- aeri/api/blob_storage_integrations/types/blob_storage_export_mode.py +26 -0
- aeri/api/blob_storage_integrations/types/blob_storage_integration_deletion_response.py +14 -0
- aeri/api/blob_storage_integrations/types/blob_storage_integration_file_type.py +26 -0
- aeri/api/blob_storage_integrations/types/blob_storage_integration_response.py +64 -0
- aeri/api/blob_storage_integrations/types/blob_storage_integration_status_response.py +50 -0
- aeri/api/blob_storage_integrations/types/blob_storage_integration_type.py +26 -0
- aeri/api/blob_storage_integrations/types/blob_storage_integrations_response.py +15 -0
- aeri/api/blob_storage_integrations/types/blob_storage_sync_status.py +47 -0
- aeri/api/blob_storage_integrations/types/create_blob_storage_integration_request.py +91 -0
- aeri/api/client.py +679 -0
- aeri/api/comments/__init__.py +44 -0
- aeri/api/comments/client.py +407 -0
- aeri/api/comments/raw_client.py +750 -0
- aeri/api/comments/types/__init__.py +46 -0
- aeri/api/comments/types/create_comment_request.py +47 -0
- aeri/api/comments/types/create_comment_response.py +17 -0
- aeri/api/comments/types/get_comments_response.py +17 -0
- aeri/api/commons/__init__.py +210 -0
- aeri/api/commons/errors/__init__.py +56 -0
- aeri/api/commons/errors/access_denied_error.py +12 -0
- aeri/api/commons/errors/error.py +12 -0
- aeri/api/commons/errors/method_not_allowed_error.py +12 -0
- aeri/api/commons/errors/not_found_error.py +12 -0
- aeri/api/commons/errors/unauthorized_error.py +12 -0
- aeri/api/commons/types/__init__.py +190 -0
- aeri/api/commons/types/base_score.py +90 -0
- aeri/api/commons/types/base_score_v1.py +70 -0
- aeri/api/commons/types/boolean_score.py +26 -0
- aeri/api/commons/types/boolean_score_v1.py +26 -0
- aeri/api/commons/types/categorical_score.py +26 -0
- aeri/api/commons/types/categorical_score_v1.py +26 -0
- aeri/api/commons/types/comment.py +36 -0
- aeri/api/commons/types/comment_object_type.py +30 -0
- aeri/api/commons/types/config_category.py +15 -0
- aeri/api/commons/types/correction_score.py +26 -0
- aeri/api/commons/types/create_score_value.py +5 -0
- aeri/api/commons/types/dataset.py +49 -0
- aeri/api/commons/types/dataset_item.py +58 -0
- aeri/api/commons/types/dataset_run.py +63 -0
- aeri/api/commons/types/dataset_run_item.py +40 -0
- aeri/api/commons/types/dataset_run_with_items.py +19 -0
- aeri/api/commons/types/dataset_status.py +22 -0
- aeri/api/commons/types/map_value.py +11 -0
- aeri/api/commons/types/model.py +125 -0
- aeri/api/commons/types/model_price.py +14 -0
- aeri/api/commons/types/model_usage_unit.py +42 -0
- aeri/api/commons/types/numeric_score.py +17 -0
- aeri/api/commons/types/numeric_score_v1.py +17 -0
- aeri/api/commons/types/observation.py +142 -0
- aeri/api/commons/types/observation_level.py +30 -0
- aeri/api/commons/types/observation_v2.py +235 -0
- aeri/api/commons/types/observations_view.py +89 -0
- aeri/api/commons/types/pricing_tier.py +91 -0
- aeri/api/commons/types/pricing_tier_condition.py +68 -0
- aeri/api/commons/types/pricing_tier_input.py +76 -0
- aeri/api/commons/types/pricing_tier_operator.py +42 -0
- aeri/api/commons/types/score.py +201 -0
- aeri/api/commons/types/score_config.py +66 -0
- aeri/api/commons/types/score_config_data_type.py +26 -0
- aeri/api/commons/types/score_data_type.py +30 -0
- aeri/api/commons/types/score_source.py +26 -0
- aeri/api/commons/types/score_v1.py +131 -0
- aeri/api/commons/types/session.py +25 -0
- aeri/api/commons/types/session_with_traces.py +15 -0
- aeri/api/commons/types/trace.py +84 -0
- aeri/api/commons/types/trace_with_details.py +43 -0
- aeri/api/commons/types/trace_with_full_details.py +45 -0
- aeri/api/commons/types/usage.py +59 -0
- aeri/api/core/__init__.py +111 -0
- aeri/api/core/api_error.py +23 -0
- aeri/api/core/client_wrapper.py +141 -0
- aeri/api/core/datetime_utils.py +30 -0
- aeri/api/core/enum.py +20 -0
- aeri/api/core/file.py +70 -0
- aeri/api/core/force_multipart.py +18 -0
- aeri/api/core/http_client.py +711 -0
- aeri/api/core/http_response.py +55 -0
- aeri/api/core/http_sse/__init__.py +48 -0
- aeri/api/core/http_sse/_api.py +114 -0
- aeri/api/core/http_sse/_decoders.py +66 -0
- aeri/api/core/http_sse/_exceptions.py +7 -0
- aeri/api/core/http_sse/_models.py +17 -0
- aeri/api/core/jsonable_encoder.py +102 -0
- aeri/api/core/pydantic_utilities.py +310 -0
- aeri/api/core/query_encoder.py +60 -0
- aeri/api/core/remove_none_from_dict.py +11 -0
- aeri/api/core/request_options.py +35 -0
- aeri/api/core/serialization.py +282 -0
- aeri/api/dataset_items/__init__.py +52 -0
- aeri/api/dataset_items/client.py +499 -0
- aeri/api/dataset_items/raw_client.py +973 -0
- aeri/api/dataset_items/types/__init__.py +50 -0
- aeri/api/dataset_items/types/create_dataset_item_request.py +37 -0
- aeri/api/dataset_items/types/delete_dataset_item_response.py +17 -0
- aeri/api/dataset_items/types/paginated_dataset_items.py +17 -0
- aeri/api/dataset_run_items/__init__.py +43 -0
- aeri/api/dataset_run_items/client.py +323 -0
- aeri/api/dataset_run_items/raw_client.py +547 -0
- aeri/api/dataset_run_items/types/__init__.py +44 -0
- aeri/api/dataset_run_items/types/create_dataset_run_item_request.py +51 -0
- aeri/api/dataset_run_items/types/paginated_dataset_run_items.py +17 -0
- aeri/api/datasets/__init__.py +55 -0
- aeri/api/datasets/client.py +661 -0
- aeri/api/datasets/raw_client.py +1368 -0
- aeri/api/datasets/types/__init__.py +53 -0
- aeri/api/datasets/types/create_dataset_request.py +31 -0
- aeri/api/datasets/types/delete_dataset_run_response.py +14 -0
- aeri/api/datasets/types/paginated_dataset_runs.py +17 -0
- aeri/api/datasets/types/paginated_datasets.py +17 -0
- aeri/api/health/__init__.py +44 -0
- aeri/api/health/client.py +112 -0
- aeri/api/health/errors/__init__.py +42 -0
- aeri/api/health/errors/service_unavailable_error.py +13 -0
- aeri/api/health/raw_client.py +227 -0
- aeri/api/health/types/__init__.py +40 -0
- aeri/api/health/types/health_response.py +30 -0
- aeri/api/ingestion/__init__.py +169 -0
- aeri/api/ingestion/client.py +221 -0
- aeri/api/ingestion/raw_client.py +293 -0
- aeri/api/ingestion/types/__init__.py +169 -0
- aeri/api/ingestion/types/base_event.py +27 -0
- aeri/api/ingestion/types/create_event_body.py +14 -0
- aeri/api/ingestion/types/create_event_event.py +15 -0
- aeri/api/ingestion/types/create_generation_body.py +40 -0
- aeri/api/ingestion/types/create_generation_event.py +15 -0
- aeri/api/ingestion/types/create_observation_event.py +15 -0
- aeri/api/ingestion/types/create_span_body.py +19 -0
- aeri/api/ingestion/types/create_span_event.py +15 -0
- aeri/api/ingestion/types/ingestion_error.py +17 -0
- aeri/api/ingestion/types/ingestion_event.py +155 -0
- aeri/api/ingestion/types/ingestion_response.py +17 -0
- aeri/api/ingestion/types/ingestion_success.py +15 -0
- aeri/api/ingestion/types/ingestion_usage.py +8 -0
- aeri/api/ingestion/types/observation_body.py +53 -0
- aeri/api/ingestion/types/observation_type.py +54 -0
- aeri/api/ingestion/types/open_ai_completion_usage_schema.py +26 -0
- aeri/api/ingestion/types/open_ai_response_usage_schema.py +24 -0
- aeri/api/ingestion/types/open_ai_usage.py +28 -0
- aeri/api/ingestion/types/optional_observation_body.py +36 -0
- aeri/api/ingestion/types/score_body.py +75 -0
- aeri/api/ingestion/types/score_event.py +15 -0
- aeri/api/ingestion/types/sdk_log_body.py +14 -0
- aeri/api/ingestion/types/sdk_log_event.py +15 -0
- aeri/api/ingestion/types/trace_body.py +36 -0
- aeri/api/ingestion/types/trace_event.py +15 -0
- aeri/api/ingestion/types/update_event_body.py +14 -0
- aeri/api/ingestion/types/update_generation_body.py +40 -0
- aeri/api/ingestion/types/update_generation_event.py +15 -0
- aeri/api/ingestion/types/update_observation_event.py +15 -0
- aeri/api/ingestion/types/update_span_body.py +19 -0
- aeri/api/ingestion/types/update_span_event.py +15 -0
- aeri/api/ingestion/types/usage_details.py +10 -0
- aeri/api/legacy/__init__.py +61 -0
- aeri/api/legacy/client.py +105 -0
- aeri/api/legacy/metrics_v1/__init__.py +40 -0
- aeri/api/legacy/metrics_v1/client.py +214 -0
- aeri/api/legacy/metrics_v1/raw_client.py +322 -0
- aeri/api/legacy/metrics_v1/types/__init__.py +40 -0
- aeri/api/legacy/metrics_v1/types/metrics_response.py +19 -0
- aeri/api/legacy/observations_v1/__init__.py +43 -0
- aeri/api/legacy/observations_v1/client.py +523 -0
- aeri/api/legacy/observations_v1/raw_client.py +759 -0
- aeri/api/legacy/observations_v1/types/__init__.py +44 -0
- aeri/api/legacy/observations_v1/types/observations.py +17 -0
- aeri/api/legacy/observations_v1/types/observations_views.py +17 -0
- aeri/api/legacy/raw_client.py +13 -0
- aeri/api/legacy/score_v1/__init__.py +43 -0
- aeri/api/legacy/score_v1/client.py +329 -0
- aeri/api/legacy/score_v1/raw_client.py +545 -0
- aeri/api/legacy/score_v1/types/__init__.py +44 -0
- aeri/api/legacy/score_v1/types/create_score_request.py +75 -0
- aeri/api/legacy/score_v1/types/create_score_response.py +17 -0
- aeri/api/llm_connections/__init__.py +55 -0
- aeri/api/llm_connections/client.py +311 -0
- aeri/api/llm_connections/raw_client.py +541 -0
- aeri/api/llm_connections/types/__init__.py +53 -0
- aeri/api/llm_connections/types/llm_adapter.py +38 -0
- aeri/api/llm_connections/types/llm_connection.py +77 -0
- aeri/api/llm_connections/types/paginated_llm_connections.py +17 -0
- aeri/api/llm_connections/types/upsert_llm_connection_request.py +69 -0
- aeri/api/media/__init__.py +58 -0
- aeri/api/media/client.py +427 -0
- aeri/api/media/raw_client.py +739 -0
- aeri/api/media/types/__init__.py +56 -0
- aeri/api/media/types/get_media_response.py +55 -0
- aeri/api/media/types/get_media_upload_url_request.py +51 -0
- aeri/api/media/types/get_media_upload_url_response.py +28 -0
- aeri/api/media/types/media_content_type.py +232 -0
- aeri/api/media/types/patch_media_body.py +43 -0
- aeri/api/metrics/__init__.py +40 -0
- aeri/api/metrics/client.py +422 -0
- aeri/api/metrics/raw_client.py +530 -0
- aeri/api/metrics/types/__init__.py +40 -0
- aeri/api/metrics/types/metrics_v2response.py +19 -0
- aeri/api/models/__init__.py +43 -0
- aeri/api/models/client.py +523 -0
- aeri/api/models/raw_client.py +993 -0
- aeri/api/models/types/__init__.py +44 -0
- aeri/api/models/types/create_model_request.py +103 -0
- aeri/api/models/types/paginated_models.py +17 -0
- aeri/api/observations/__init__.py +43 -0
- aeri/api/observations/client.py +522 -0
- aeri/api/observations/raw_client.py +641 -0
- aeri/api/observations/types/__init__.py +44 -0
- aeri/api/observations/types/observations_v2meta.py +21 -0
- aeri/api/observations/types/observations_v2response.py +28 -0
- aeri/api/opentelemetry/__init__.py +67 -0
- aeri/api/opentelemetry/client.py +276 -0
- aeri/api/opentelemetry/raw_client.py +291 -0
- aeri/api/opentelemetry/types/__init__.py +65 -0
- aeri/api/opentelemetry/types/otel_attribute.py +27 -0
- aeri/api/opentelemetry/types/otel_attribute_value.py +46 -0
- aeri/api/opentelemetry/types/otel_resource.py +24 -0
- aeri/api/opentelemetry/types/otel_resource_span.py +32 -0
- aeri/api/opentelemetry/types/otel_scope.py +34 -0
- aeri/api/opentelemetry/types/otel_scope_span.py +28 -0
- aeri/api/opentelemetry/types/otel_span.py +76 -0
- aeri/api/opentelemetry/types/otel_trace_response.py +16 -0
- aeri/api/organizations/__init__.py +73 -0
- aeri/api/organizations/client.py +756 -0
- aeri/api/organizations/raw_client.py +1707 -0
- aeri/api/organizations/types/__init__.py +71 -0
- aeri/api/organizations/types/delete_membership_request.py +16 -0
- aeri/api/organizations/types/membership_deletion_response.py +17 -0
- aeri/api/organizations/types/membership_request.py +18 -0
- aeri/api/organizations/types/membership_response.py +20 -0
- aeri/api/organizations/types/membership_role.py +30 -0
- aeri/api/organizations/types/memberships_response.py +15 -0
- aeri/api/organizations/types/organization_api_key.py +31 -0
- aeri/api/organizations/types/organization_api_keys_response.py +19 -0
- aeri/api/organizations/types/organization_project.py +25 -0
- aeri/api/organizations/types/organization_projects_response.py +15 -0
- aeri/api/projects/__init__.py +67 -0
- aeri/api/projects/client.py +760 -0
- aeri/api/projects/raw_client.py +1577 -0
- aeri/api/projects/types/__init__.py +65 -0
- aeri/api/projects/types/api_key_deletion_response.py +18 -0
- aeri/api/projects/types/api_key_list.py +23 -0
- aeri/api/projects/types/api_key_response.py +30 -0
- aeri/api/projects/types/api_key_summary.py +35 -0
- aeri/api/projects/types/organization.py +22 -0
- aeri/api/projects/types/project.py +34 -0
- aeri/api/projects/types/project_deletion_response.py +15 -0
- aeri/api/projects/types/projects.py +15 -0
- aeri/api/prompt_version/__init__.py +4 -0
- aeri/api/prompt_version/client.py +157 -0
- aeri/api/prompt_version/raw_client.py +264 -0
- aeri/api/prompts/__init__.py +100 -0
- aeri/api/prompts/client.py +550 -0
- aeri/api/prompts/raw_client.py +987 -0
- aeri/api/prompts/types/__init__.py +96 -0
- aeri/api/prompts/types/base_prompt.py +42 -0
- aeri/api/prompts/types/chat_message.py +17 -0
- aeri/api/prompts/types/chat_message_type.py +15 -0
- aeri/api/prompts/types/chat_message_with_placeholders.py +8 -0
- aeri/api/prompts/types/chat_prompt.py +15 -0
- aeri/api/prompts/types/create_chat_prompt_request.py +37 -0
- aeri/api/prompts/types/create_chat_prompt_type.py +15 -0
- aeri/api/prompts/types/create_prompt_request.py +8 -0
- aeri/api/prompts/types/create_text_prompt_request.py +36 -0
- aeri/api/prompts/types/create_text_prompt_type.py +15 -0
- aeri/api/prompts/types/placeholder_message.py +16 -0
- aeri/api/prompts/types/placeholder_message_type.py +15 -0
- aeri/api/prompts/types/prompt.py +58 -0
- aeri/api/prompts/types/prompt_meta.py +35 -0
- aeri/api/prompts/types/prompt_meta_list_response.py +17 -0
- aeri/api/prompts/types/prompt_type.py +20 -0
- aeri/api/prompts/types/text_prompt.py +14 -0
- aeri/api/scim/__init__.py +94 -0
- aeri/api/scim/client.py +686 -0
- aeri/api/scim/raw_client.py +1528 -0
- aeri/api/scim/types/__init__.py +92 -0
- aeri/api/scim/types/authentication_scheme.py +20 -0
- aeri/api/scim/types/bulk_config.py +22 -0
- aeri/api/scim/types/empty_response.py +16 -0
- aeri/api/scim/types/filter_config.py +17 -0
- aeri/api/scim/types/resource_meta.py +17 -0
- aeri/api/scim/types/resource_type.py +27 -0
- aeri/api/scim/types/resource_types_response.py +21 -0
- aeri/api/scim/types/schema_extension.py +17 -0
- aeri/api/scim/types/schema_resource.py +19 -0
- aeri/api/scim/types/schemas_response.py +21 -0
- aeri/api/scim/types/scim_email.py +16 -0
- aeri/api/scim/types/scim_feature_support.py +14 -0
- aeri/api/scim/types/scim_name.py +14 -0
- aeri/api/scim/types/scim_user.py +24 -0
- aeri/api/scim/types/scim_users_list_response.py +25 -0
- aeri/api/scim/types/service_provider_config.py +36 -0
- aeri/api/scim/types/user_meta.py +20 -0
- aeri/api/score_configs/__init__.py +44 -0
- aeri/api/score_configs/client.py +526 -0
- aeri/api/score_configs/raw_client.py +1012 -0
- aeri/api/score_configs/types/__init__.py +46 -0
- aeri/api/score_configs/types/create_score_config_request.py +46 -0
- aeri/api/score_configs/types/score_configs.py +17 -0
- aeri/api/score_configs/types/update_score_config_request.py +53 -0
- aeri/api/scores/__init__.py +76 -0
- aeri/api/scores/client.py +420 -0
- aeri/api/scores/raw_client.py +656 -0
- aeri/api/scores/types/__init__.py +76 -0
- aeri/api/scores/types/get_scores_response.py +17 -0
- aeri/api/scores/types/get_scores_response_data.py +211 -0
- aeri/api/scores/types/get_scores_response_data_boolean.py +15 -0
- aeri/api/scores/types/get_scores_response_data_categorical.py +15 -0
- aeri/api/scores/types/get_scores_response_data_correction.py +15 -0
- aeri/api/scores/types/get_scores_response_data_numeric.py +15 -0
- aeri/api/scores/types/get_scores_response_trace_data.py +38 -0
- aeri/api/sessions/__init__.py +40 -0
- aeri/api/sessions/client.py +262 -0
- aeri/api/sessions/raw_client.py +500 -0
- aeri/api/sessions/types/__init__.py +40 -0
- aeri/api/sessions/types/paginated_sessions.py +17 -0
- aeri/api/trace/__init__.py +44 -0
- aeri/api/trace/client.py +728 -0
- aeri/api/trace/raw_client.py +1208 -0
- aeri/api/trace/types/__init__.py +46 -0
- aeri/api/trace/types/delete_trace_response.py +14 -0
- aeri/api/trace/types/sort.py +14 -0
- aeri/api/trace/types/traces.py +17 -0
- aeri/api/utils/__init__.py +44 -0
- aeri/api/utils/pagination/__init__.py +40 -0
- aeri/api/utils/pagination/types/__init__.py +40 -0
- aeri/api/utils/pagination/types/meta_response.py +38 -0
- aeri/batch_evaluation.py +1643 -0
- aeri/experiment.py +1044 -0
- aeri/langchain/CallbackHandler.py +1377 -0
- aeri/langchain/__init__.py +5 -0
- aeri/langchain/utils.py +212 -0
- aeri/logger.py +28 -0
- aeri/media.py +352 -0
- aeri/model.py +477 -0
- aeri/openai.py +1124 -0
- aeri/py.typed +0 -0
- aeri/span_filter.py +17 -0
- aeri/types.py +79 -0
- aeri/version.py +3 -0
- aeri_python-4.0.0.dist-info/METADATA +51 -0
- aeri_python-4.0.0.dist-info/RECORD +391 -0
- aeri_python-4.0.0.dist-info/WHEEL +4 -0
- aeri_python-4.0.0.dist-info/licenses/LICENSE +21 -0
aeri/batch_evaluation.py
ADDED
|
@@ -0,0 +1,1643 @@
|
|
|
1
|
+
"""Batch evaluation functionality for Aeri.
|
|
2
|
+
|
|
3
|
+
This module provides comprehensive batch evaluation capabilities for running evaluations
|
|
4
|
+
on traces and observations fetched from Aeri. It includes type definitions,
|
|
5
|
+
protocols, result classes, and the implementation for large-scale evaluation workflows
|
|
6
|
+
with error handling, retry logic, and resume capability.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import time
|
|
12
|
+
from typing import (
|
|
13
|
+
TYPE_CHECKING,
|
|
14
|
+
Any,
|
|
15
|
+
Awaitable,
|
|
16
|
+
Dict,
|
|
17
|
+
List,
|
|
18
|
+
Optional,
|
|
19
|
+
Protocol,
|
|
20
|
+
Set,
|
|
21
|
+
Tuple,
|
|
22
|
+
Union,
|
|
23
|
+
cast,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from aeri.api import (
|
|
27
|
+
ObservationsView,
|
|
28
|
+
TraceWithFullDetails,
|
|
29
|
+
)
|
|
30
|
+
from aeri.experiment import Evaluation, EvaluatorFunction
|
|
31
|
+
from aeri.logger import aeri_logger as logger
|
|
32
|
+
|
|
33
|
+
if TYPE_CHECKING:
|
|
34
|
+
from aeri._client.client import Aeri
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EvaluatorInputs:
    """Input data structure for evaluators, returned by mapper functions.

    A strongly-typed container for transforming API response objects
    (traces, observations) into the standardized format expected by
    evaluator functions. It ensures consistent access to input, output,
    expected output, and metadata regardless of the source entity type.

    Attributes:
        input: The input data that was provided to generate the output being
            evaluated. For traces, this might be the initial prompt or request;
            for observations, the span's input. The exact meaning depends on
            your use case.
        output: The actual output that was produced and needs to be evaluated.
            For traces, typically the final response; for observations, the
            generation output or span result.
        expected_output: Optional ground truth or expected result for
            comparison. May be ``None`` if no ground truth is available for
            the entity being evaluated.
        metadata: Optional structured metadata providing additional context
            for evaluation (entity details, execution context, user
            attributes, or any other relevant data).

    Examples:
        Simple mapper for traces:
        ```python
        from aeri import EvaluatorInputs

        def trace_mapper(trace):
            return EvaluatorInputs(
                input=trace.input,
                output=trace.output,
                expected_output=None,  # No ground truth available
                metadata={"user_id": trace.user_id, "tags": trace.tags},
            )
        ```

        Mapper for observations extracting specific fields:
        ```python
        def observation_mapper(observation):
            # Extract input/output from observation's data
            input_data = observation.input if hasattr(observation, 'input') else None
            output_data = observation.output if hasattr(observation, 'output') else None

            return EvaluatorInputs(
                input=input_data,
                output=output_data,
                expected_output=None,
                metadata={
                    "observation_type": observation.type,
                    "model": observation.model,
                    "latency_ms": observation.end_time - observation.start_time,
                },
            )
        ```

    Note:
        All arguments must be passed as keywords when instantiating this class.
    """

    # Type annotations for the container's public attributes.
    input: Any
    output: Any
    expected_output: Any
    metadata: Optional[Dict[str, Any]]

    def __init__(
        self,
        *,
        input: Any,
        output: Any,
        expected_output: Any = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Initialize EvaluatorInputs with the provided data.

        Args:
            input: The input data for evaluation.
            output: The output data to be evaluated.
            expected_output: Optional ground truth for comparison.
            metadata: Optional additional context for evaluation.

        Note:
            All arguments must be provided as keywords.
        """
        self.input = input
        self.output = output
        self.expected_output = expected_output
        self.metadata = metadata

    def __repr__(self) -> str:
        """Return a debug-friendly representation of this container."""
        return (
            f"{type(self).__name__}(input={self.input!r}, "
            f"output={self.output!r}, "
            f"expected_output={self.expected_output!r}, "
            f"metadata={self.metadata!r})"
        )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class MapperFunction(Protocol):
    """Protocol for mapper callables used by batch evaluation.

    A mapper adapts one raw API entity (a trace or an observation) into the
    standardized ``EvaluatorInputs`` structure that evaluator functions
    consume. Implementations decide how to extract evaluation-relevant data
    from each entity type.

    Implementations must:
    - Accept a single keyword-only ``item`` parameter (trace or observation).
    - Return an ``EvaluatorInputs`` carrying input, output, expected_output,
      and metadata.
    - May be synchronous or asynchronous.
    - Should tolerate missing or malformed fields on the entity.
    """

    def __call__(
        self,
        *,
        item: Union["TraceWithFullDetails", "ObservationsView"],
        **kwargs: Dict[str, Any],
    ) -> Union[EvaluatorInputs, Awaitable[EvaluatorInputs]]:
        """Map an API response object to evaluator inputs.

        Args:
            item: The entity to transform. Its concrete type follows the
                evaluation scope:
                - ``TraceWithFullDetails`` when evaluating traces.
                - ``ObservationsView`` when evaluating observations.

        Returns:
            An ``EvaluatorInputs`` instance (or an awaitable resolving to
            one, for async mappers that need extra processing) holding:
            - input: the data that generated the output
            - output: the output under evaluation
            - expected_output: optional ground truth
            - metadata: optional additional context

        Examples:
            Basic trace mapper:
            ```python
            def map_trace(trace):
                return EvaluatorInputs(
                    input=trace.input,
                    output=trace.output,
                    expected_output=None,
                    metadata={"trace_id": trace.id, "user": trace.user_id},
                )
            ```

            Observation mapper with conditional logic:
            ```python
            def map_observation(observation):
                if observation.type == "GENERATION":
                    input_data = observation.input
                    output_data = observation.output
                else:
                    # Other observation types keep their data in metadata.
                    input_data = observation.metadata.get("input")
                    output_data = observation.metadata.get("output")

                return EvaluatorInputs(
                    input=input_data,
                    output=output_data,
                    expected_output=None,
                    metadata={"obs_id": observation.id, "type": observation.type},
                )
            ```

            Async mapper (when additional processing is needed):
            ```python
            async def map_trace_async(trace):
                processed_output = await some_async_transformation(trace.output)

                return EvaluatorInputs(
                    input=trace.input,
                    output=processed_output,
                    expected_output=None,
                    metadata={"trace_id": trace.id},
                )
            ```
        """
        ...
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class CompositeEvaluatorFunction(Protocol):
|
|
214
|
+
"""Protocol defining the interface for composite evaluator functions.
|
|
215
|
+
|
|
216
|
+
Composite evaluators create aggregate scores from multiple item-level evaluations.
|
|
217
|
+
This is commonly used to compute weighted averages, combined metrics, or other
|
|
218
|
+
composite assessments based on individual evaluation results.
|
|
219
|
+
|
|
220
|
+
Composite evaluators:
|
|
221
|
+
- Accept the same inputs as item-level evaluators (input, output, expected_output, metadata)
|
|
222
|
+
plus the list of evaluations
|
|
223
|
+
- Return either a single Evaluation, a list of Evaluations, or a dict
|
|
224
|
+
- Can be either synchronous or asynchronous
|
|
225
|
+
- Have access to both raw item data and evaluation results
|
|
226
|
+
"""
|
|
227
|
+
|
|
228
|
+
def __call__(
|
|
229
|
+
self,
|
|
230
|
+
*,
|
|
231
|
+
input: Optional[Any] = None,
|
|
232
|
+
output: Optional[Any] = None,
|
|
233
|
+
expected_output: Optional[Any] = None,
|
|
234
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
235
|
+
evaluations: List[Evaluation],
|
|
236
|
+
**kwargs: Dict[str, Any],
|
|
237
|
+
) -> Union[
|
|
238
|
+
Evaluation,
|
|
239
|
+
List[Evaluation],
|
|
240
|
+
Dict[str, Any],
|
|
241
|
+
Awaitable[Evaluation],
|
|
242
|
+
Awaitable[List[Evaluation]],
|
|
243
|
+
Awaitable[Dict[str, Any]],
|
|
244
|
+
]:
|
|
245
|
+
r"""Create a composite evaluation from item-level evaluation results.
|
|
246
|
+
|
|
247
|
+
This method combines multiple evaluation scores into a single composite metric.
|
|
248
|
+
Common use cases include weighted averages, pass/fail decisions based on multiple
|
|
249
|
+
criteria, or custom scoring logic that considers multiple dimensions.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
input: The input data that was provided to the system being evaluated.
|
|
253
|
+
output: The output generated by the system being evaluated.
|
|
254
|
+
expected_output: The expected/reference output for comparison (if available).
|
|
255
|
+
metadata: Additional metadata about the evaluation context.
|
|
256
|
+
evaluations: List of evaluation results from item-level evaluators.
|
|
257
|
+
Each evaluation contains name, value, comment, and metadata.
|
|
258
|
+
|
|
259
|
+
Returns:
|
|
260
|
+
Can return any of:
|
|
261
|
+
- Evaluation: A single composite evaluation result
|
|
262
|
+
- List[Evaluation]: Multiple composite evaluations
|
|
263
|
+
- Dict: A dict that will be converted to an Evaluation
|
|
264
|
+
- name: Identifier for the composite metric (e.g., "composite_score")
|
|
265
|
+
- value: The computed composite value
|
|
266
|
+
- comment: Optional explanation of how the score was computed
|
|
267
|
+
- metadata: Optional details about the composition logic
|
|
268
|
+
|
|
269
|
+
Can return either a direct Evaluation instance or an awaitable
|
|
270
|
+
(for async composite evaluators).
|
|
271
|
+
|
|
272
|
+
Examples:
|
|
273
|
+
Simple weighted average:
|
|
274
|
+
```python
|
|
275
|
+
def weighted_composite(*, input, output, expected_output, metadata, evaluations):
|
|
276
|
+
weights = {
|
|
277
|
+
"accuracy": 0.5,
|
|
278
|
+
"relevance": 0.3,
|
|
279
|
+
"safety": 0.2
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
total_score = 0.0
|
|
283
|
+
total_weight = 0.0
|
|
284
|
+
|
|
285
|
+
for eval in evaluations:
|
|
286
|
+
if eval.name in weights and isinstance(eval.value, (int, float)):
|
|
287
|
+
total_score += eval.value * weights[eval.name]
|
|
288
|
+
total_weight += weights[eval.name]
|
|
289
|
+
|
|
290
|
+
final_score = total_score / total_weight if total_weight > 0 else 0.0
|
|
291
|
+
|
|
292
|
+
return Evaluation(
|
|
293
|
+
name="composite_score",
|
|
294
|
+
value=final_score,
|
|
295
|
+
comment=f"Weighted average of {len(evaluations)} metrics"
|
|
296
|
+
)
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
Pass/fail composite based on thresholds:
|
|
300
|
+
```python
|
|
301
|
+
def pass_fail_composite(*, input, output, expected_output, metadata, evaluations):
|
|
302
|
+
# Must pass all criteria
|
|
303
|
+
thresholds = {
|
|
304
|
+
"accuracy": 0.7,
|
|
305
|
+
"safety": 0.9,
|
|
306
|
+
"relevance": 0.6
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
passes = True
|
|
310
|
+
failing_metrics = []
|
|
311
|
+
|
|
312
|
+
for metric, threshold in thresholds.items():
|
|
313
|
+
eval_result = next((e for e in evaluations if e.name == metric), None)
|
|
314
|
+
if eval_result and isinstance(eval_result.value, (int, float)):
|
|
315
|
+
if eval_result.value < threshold:
|
|
316
|
+
passes = False
|
|
317
|
+
failing_metrics.append(metric)
|
|
318
|
+
|
|
319
|
+
return Evaluation(
|
|
320
|
+
name="passes_all_checks",
|
|
321
|
+
value=passes,
|
|
322
|
+
comment=f"Failed: {', '.join(failing_metrics)}" if failing_metrics else "All checks passed",
|
|
323
|
+
data_type="BOOLEAN"
|
|
324
|
+
)
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
Async composite with external scoring:
|
|
328
|
+
```python
|
|
329
|
+
async def llm_composite(*, input, output, expected_output, metadata, evaluations):
|
|
330
|
+
# Use LLM to synthesize multiple evaluation results
|
|
331
|
+
eval_summary = "\n".join(
|
|
332
|
+
f"- {e.name}: {e.value}" for e in evaluations
|
|
333
|
+
)
|
|
334
|
+
|
|
335
|
+
prompt = f"Given these evaluation scores:\n{eval_summary}\n"
|
|
336
|
+
prompt += f"For the output: {output}\n"
|
|
337
|
+
prompt += "Provide an overall quality score from 0-1."
|
|
338
|
+
|
|
339
|
+
response = await openai.chat.completions.create(
|
|
340
|
+
model="gpt-4",
|
|
341
|
+
messages=[{"role": "user", "content": prompt}]
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
score = float(response.choices[0].message.content.strip())
|
|
345
|
+
|
|
346
|
+
return Evaluation(
|
|
347
|
+
name="llm_composite_score",
|
|
348
|
+
value=score,
|
|
349
|
+
comment="LLM-synthesized composite score"
|
|
350
|
+
)
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
Context-aware composite:
|
|
354
|
+
```python
|
|
355
|
+
def context_composite(*, input, output, expected_output, metadata, evaluations):
|
|
356
|
+
# Adjust weighting based on metadata
|
|
357
|
+
base_weights = {"accuracy": 0.5, "speed": 0.3, "cost": 0.2}
|
|
358
|
+
|
|
359
|
+
# If metadata indicates high importance, prioritize accuracy
|
|
360
|
+
if metadata and metadata.get('importance') == 'high':
|
|
361
|
+
weights = {"accuracy": 0.7, "speed": 0.2, "cost": 0.1}
|
|
362
|
+
else:
|
|
363
|
+
weights = base_weights
|
|
364
|
+
|
|
365
|
+
total = sum(
|
|
366
|
+
e.value * weights.get(e.name, 0)
|
|
367
|
+
for e in evaluations
|
|
368
|
+
if isinstance(e.value, (int, float))
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
return Evaluation(
|
|
372
|
+
name="weighted_composite",
|
|
373
|
+
value=total,
|
|
374
|
+
comment="Context-aware weighted composite"
|
|
375
|
+
)
|
|
376
|
+
```
|
|
377
|
+
"""
|
|
378
|
+
...
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
class EvaluatorStats:
    """Per-evaluator performance metrics gathered during a batch evaluation run.

    Tracks how often a single evaluator was invoked, how often it succeeded or
    failed, and how many scores it produced, so that unreliable or noisy
    evaluators can be identified after a run.

    Attributes:
        name: The evaluator function's name (taken from ``__name__``).
        total_runs: Number of times the evaluator was invoked.
        successful_runs: Number of invocations that completed without error.
        failed_runs: Number of invocations that raised or otherwise failed.
        total_scores_created: Total number of scores produced by this
            evaluator. May exceed ``successful_runs`` when a single run
            returns multiple scores.

    Examples:
        Inspecting stats after a batch run:
        ```python
        result = client.run_batched_evaluation(...)

        for stats in result.evaluator_stats:
            print(f"Evaluator: {stats.name}")
            print(f"  Success rate: {stats.successful_runs / stats.total_runs:.1%}")
            if stats.failed_runs > 0:
                print(f"  Failed {stats.failed_runs} times - consider debugging")
        ```

    Note:
        All constructor arguments must be passed as keywords.
    """

    def __init__(
        self,
        *,
        name: str,
        total_runs: int = 0,
        successful_runs: int = 0,
        failed_runs: int = 0,
        total_scores_created: int = 0,
    ):
        """Record the supplied counters on the instance.

        Args:
            name: The evaluator function name.
            total_runs: Total evaluator invocations.
            successful_runs: Successful completions.
            failed_runs: Number of failures.
            total_scores_created: Scores produced by this evaluator.

        Note:
            All arguments must be provided as keywords.
        """
        self.name = name
        self.total_runs = total_runs
        self.successful_runs = successful_runs
        self.failed_runs = failed_runs
        self.total_scores_created = total_scores_created
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
class BatchEvaluationResumeToken:
    """State required to resume an interrupted batch evaluation run.

    Captures the original query (scope and filter) together with the timestamp
    and ID of the last successfully processed item. A follow-up run uses
    timestamp-based filtering built from this token to skip everything that
    was already evaluated, even if the underlying dataset changed between
    runs.

    Attributes:
        scope: The type of items being evaluated ("traces", "observations").
        filter: The original JSON filter string used to query items, if any.
        last_processed_timestamp: ISO 8601 timestamp of the last successfully
            processed item; used to fetch only items after this point.
        last_processed_id: ID of the last successfully processed item, kept
            for reference.
        items_processed: Count of items completed before the interruption.

    Examples:
        Resuming after a failure:
        ```python
        result = client.run_batched_evaluation(...)

        if not result.completed and result.resume_token:
            # Continue from where the previous run stopped.
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token,
            )
        ```

        The token's fields are plain strings/ints, so it can be serialized
        (e.g. to JSON) and reconstructed later with keyword arguments.

    Note:
        All constructor arguments must be passed as keywords. Items created
        after the initial run but before ``last_processed_timestamp`` are
        intentionally skipped on resume to avoid duplicate evaluation.
    """

    def __init__(
        self,
        *,
        scope: str,
        filter: Optional[str],
        last_processed_timestamp: str,
        last_processed_id: str,
        items_processed: int,
    ):
        """Store the provided resume state on the instance.

        Args:
            scope: The scope type ("traces", "observations").
            filter: The original JSON filter string.
            last_processed_timestamp: ISO 8601 timestamp of the last
                processed item.
            last_processed_id: ID of the last processed item.
            items_processed: Count of items processed before interruption.

        Note:
            All arguments must be provided as keywords.
        """
        self.scope = scope
        self.filter = filter
        self.last_processed_timestamp = last_processed_timestamp
        self.last_processed_id = last_processed_id
        self.items_processed = items_processed
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
class BatchEvaluationResult:
    """Complete result structure for a batch evaluation execution.

    Aggregates everything a caller needs to inspect after a run: item and
    score counts, per-evaluator statistics, timing, error details, per-item
    evaluation results, and — when the run did not finish — a resume token
    for continuing where it left off.

    Attributes:
        total_items_fetched: Total number of items fetched from the API.
        total_items_processed: Number of items successfully evaluated.
        total_items_failed: Number of items that failed during evaluation.
        total_scores_created: Total scores created by item-level evaluators.
        total_composite_scores_created: Scores created by the composite evaluator.
        total_evaluations_failed: Individual evaluator failures across all items.
        evaluator_stats: Per-evaluator statistics (success/failure rates, scores created).
        resume_token: Token for resuming if interrupted (None if completed).
        completed: True if all items were processed, False if stopped early or failed.
        duration_seconds: Total wall-clock time of the batch evaluation.
        failed_item_ids: IDs of the items that failed evaluation.
        error_summary: Mapping of error type name to occurrence count.
        has_more_items: True if the max_items limit was reached but more items exist.
        item_evaluations: Mapping of item ID to its evaluation results
            (both item-level and composite).

    Examples:
        Basic result inspection:
        ```python
        result = client.run_batched_evaluation(...)

        print(f"Processed: {result.total_items_processed}/{result.total_items_fetched}")
        print(f"Scores created: {result.total_scores_created}")
        print(f"Duration: {result.duration_seconds:.2f}s")
        ```

        Handling incomplete runs:
        ```python
        if not result.completed and result.resume_token:
            result = client.run_batched_evaluation(
                scope=result.resume_token.scope,
                mapper=my_mapper,
                evaluators=my_evaluators,
                resume_from=result.resume_token,
            )
        ```

    Note:
        All constructor arguments must be passed as keywords.
    """

    def __init__(
        self,
        *,
        total_items_fetched: int,
        total_items_processed: int,
        total_items_failed: int,
        total_scores_created: int,
        total_composite_scores_created: int,
        total_evaluations_failed: int,
        evaluator_stats: List[EvaluatorStats],
        resume_token: Optional[BatchEvaluationResumeToken],
        completed: bool,
        duration_seconds: float,
        failed_item_ids: List[str],
        error_summary: Dict[str, int],
        has_more_items: bool,
        item_evaluations: Dict[str, List["Evaluation"]],
    ):
        """Initialize BatchEvaluationResult with comprehensive statistics.

        Args:
            total_items_fetched: Total items fetched from API.
            total_items_processed: Items successfully evaluated.
            total_items_failed: Items that failed evaluation.
            total_scores_created: Scores from item-level evaluators.
            total_composite_scores_created: Scores from composite evaluator.
            total_evaluations_failed: Individual evaluator failures.
            evaluator_stats: Per-evaluator statistics.
            resume_token: Token for resuming (None if completed).
            completed: Whether all items were processed.
            duration_seconds: Total execution time.
            failed_item_ids: IDs of failed items.
            error_summary: Error types and counts.
            has_more_items: Whether more items exist beyond max_items.
            item_evaluations: Mapping of item IDs to their evaluation results.

        Note:
            All arguments must be provided as keywords.
        """
        self.total_items_fetched = total_items_fetched
        self.total_items_processed = total_items_processed
        self.total_items_failed = total_items_failed
        self.total_scores_created = total_scores_created
        self.total_composite_scores_created = total_composite_scores_created
        self.total_evaluations_failed = total_evaluations_failed
        self.evaluator_stats = evaluator_stats
        self.resume_token = resume_token
        self.completed = completed
        self.duration_seconds = duration_seconds
        self.failed_item_ids = failed_item_ids
        self.error_summary = error_summary
        self.has_more_items = has_more_items
        self.item_evaluations = item_evaluations

    def __str__(self) -> str:
        """Return a formatted multi-line summary of the batch evaluation results.

        Returns:
            A multi-line string covering status, counts, per-evaluator
            performance, throughput, errors, and resume hints.
        """
        lines = []
        lines.append("=" * 60)
        lines.append("Batch Evaluation Results")
        lines.append("=" * 60)

        # Summary statistics
        lines.append(f"\nStatus: {'Completed' if self.completed else 'Incomplete'}")
        lines.append(f"Duration: {self.duration_seconds:.2f}s")
        lines.append(f"\nItems fetched: {self.total_items_fetched}")
        lines.append(f"Items processed: {self.total_items_processed}")

        if self.total_items_failed > 0:
            lines.append(f"Items failed: {self.total_items_failed}")

        # Success rate (guard against division by zero on empty fetches)
        if self.total_items_fetched > 0:
            success_rate = self.total_items_processed / self.total_items_fetched * 100
            lines.append(f"Success rate: {success_rate:.1f}%")

        # Scores created
        lines.append(f"\nScores created: {self.total_scores_created}")
        if self.total_composite_scores_created > 0:
            lines.append(f"Composite scores: {self.total_composite_scores_created}")

        total_scores = self.total_scores_created + self.total_composite_scores_created
        lines.append(f"Total scores: {total_scores}")

        # Evaluator statistics
        if self.evaluator_stats:
            lines.append("\nEvaluator Performance:")
            for stats in self.evaluator_stats:
                lines.append(f"  {stats.name}:")
                if stats.total_runs > 0:
                    # total_runs > 0 is guaranteed by the guard above, so we
                    # can divide directly (no zero-fallback needed).
                    success_rate = stats.successful_runs / stats.total_runs * 100
                    lines.append(
                        f"    Runs: {stats.successful_runs}/{stats.total_runs} "
                        f"({success_rate:.1f}% success)"
                    )
                lines.append(f"    Scores created: {stats.total_scores_created}")
                if stats.failed_runs > 0:
                    lines.append(f"    Failed runs: {stats.failed_runs}")

        # Performance metrics
        if self.total_items_processed > 0 and self.duration_seconds > 0:
            items_per_sec = self.total_items_processed / self.duration_seconds
            lines.append("\nPerformance:")
            lines.append(f"  Throughput: {items_per_sec:.2f} items/second")
            if self.total_scores_created > 0:
                avg_scores = self.total_scores_created / self.total_items_processed
                lines.append(f"  Avg scores per item: {avg_scores:.2f}")

        # Errors and warnings
        if self.error_summary:
            lines.append("\nErrors encountered:")
            for error_type, count in self.error_summary.items():
                lines.append(f"  {error_type}: {count}")

        # Incomplete run information
        if not self.completed:
            lines.append("\nWarning: Evaluation incomplete")
            if self.resume_token:
                lines.append(
                    f"  Last processed: {self.resume_token.last_processed_timestamp}"
                )
                lines.append(f"  Items processed: {self.resume_token.items_processed}")
                lines.append("  Use resume_from parameter to continue")

        if self.has_more_items:
            lines.append("\nNote: More items available beyond max_items limit")

        lines.append("=" * 60)
        return "\n".join(lines)
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class BatchEvaluationRunner:
|
|
817
|
+
"""Handles batch evaluation execution for a Aeri client.
|
|
818
|
+
|
|
819
|
+
This class encapsulates all the logic for fetching items, running evaluators,
|
|
820
|
+
creating scores, and managing the evaluation lifecycle. It provides a clean
|
|
821
|
+
separation of concerns from the main Aeri client class.
|
|
822
|
+
|
|
823
|
+
The runner uses a streaming/pipeline approach to process items in batches,
|
|
824
|
+
avoiding loading the entire dataset into memory. This makes it suitable for
|
|
825
|
+
evaluating large numbers of items.
|
|
826
|
+
|
|
827
|
+
Attributes:
|
|
828
|
+
client: The Aeri client instance used for API calls and score creation.
|
|
829
|
+
"""
|
|
830
|
+
|
|
831
|
+
def __init__(self, client: "Aeri"):
    """Create a runner bound to a specific Aeri client.

    Args:
        client: The Aeri client instance used for API calls and score
            creation during batch evaluation.
    """
    self.client = client
|
|
838
|
+
|
|
839
|
+
async def run_async(
|
|
840
|
+
self,
|
|
841
|
+
*,
|
|
842
|
+
scope: str,
|
|
843
|
+
mapper: MapperFunction,
|
|
844
|
+
evaluators: List[EvaluatorFunction],
|
|
845
|
+
filter: Optional[str] = None,
|
|
846
|
+
fetch_batch_size: int = 50,
|
|
847
|
+
fetch_trace_fields: Optional[str] = "io",
|
|
848
|
+
max_items: Optional[int] = None,
|
|
849
|
+
max_concurrency: int = 5,
|
|
850
|
+
composite_evaluator: Optional[CompositeEvaluatorFunction] = None,
|
|
851
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
852
|
+
_add_observation_scores_to_trace: bool = False,
|
|
853
|
+
_additional_trace_tags: Optional[List[str]] = None,
|
|
854
|
+
max_retries: int = 3,
|
|
855
|
+
verbose: bool = False,
|
|
856
|
+
resume_from: Optional[BatchEvaluationResumeToken] = None,
|
|
857
|
+
) -> BatchEvaluationResult:
|
|
858
|
+
"""Run batch evaluation asynchronously.
|
|
859
|
+
|
|
860
|
+
This is the main implementation method that orchestrates the entire batch
|
|
861
|
+
evaluation process: fetching items, mapping, evaluating, creating scores,
|
|
862
|
+
and tracking statistics.
|
|
863
|
+
|
|
864
|
+
Args:
|
|
865
|
+
scope: The type of items to evaluate ("traces", "observations").
|
|
866
|
+
mapper: Function to transform API response items to evaluator inputs.
|
|
867
|
+
evaluators: List of evaluation functions to run on each item.
|
|
868
|
+
filter: JSON filter string for querying items.
|
|
869
|
+
fetch_batch_size: Number of items to fetch per API call.
|
|
870
|
+
fetch_trace_fields: Comma-separated list of fields to include when fetching traces. Available field groups: 'core' (always included), 'io' (input, output, metadata), 'scores', 'observations', 'metrics'. If not specified, all fields are returned. Example: 'core,scores,metrics'. Note: Excluded 'observations' or 'scores' fields return empty arrays; excluded 'metrics' returns -1 for 'totalCost' and 'latency'. Only relevant if scope is 'traces'. Default: 'io'
|
|
871
|
+
max_items: Maximum number of items to process (None = all).
|
|
872
|
+
max_concurrency: Maximum number of concurrent evaluations.
|
|
873
|
+
composite_evaluator: Optional function to create composite scores.
|
|
874
|
+
metadata: Metadata to add to all created scores.
|
|
875
|
+
_add_observation_scores_to_trace: Private option to duplicate
|
|
876
|
+
observation-level scores onto the parent trace.
|
|
877
|
+
_additional_trace_tags: Private option to add tags on traces via
|
|
878
|
+
ingestion trace-create events.
|
|
879
|
+
max_retries: Maximum retries for failed batch fetches.
|
|
880
|
+
verbose: If True, log progress to console.
|
|
881
|
+
resume_from: Resume token from a previous failed run.
|
|
882
|
+
|
|
883
|
+
Returns:
|
|
884
|
+
BatchEvaluationResult with comprehensive statistics.
|
|
885
|
+
"""
|
|
886
|
+
start_time = time.time()
|
|
887
|
+
|
|
888
|
+
# Initialize tracking variables
|
|
889
|
+
total_items_fetched = 0
|
|
890
|
+
total_items_processed = 0
|
|
891
|
+
total_items_failed = 0
|
|
892
|
+
total_scores_created = 0
|
|
893
|
+
total_composite_scores_created = 0
|
|
894
|
+
total_evaluations_failed = 0
|
|
895
|
+
failed_item_ids: List[str] = []
|
|
896
|
+
error_summary: Dict[str, int] = {}
|
|
897
|
+
item_evaluations: Dict[str, List[Evaluation]] = {}
|
|
898
|
+
|
|
899
|
+
# Initialize evaluator stats
|
|
900
|
+
evaluator_stats_dict = {
|
|
901
|
+
getattr(evaluator, "__name__", "unknown_evaluator"): EvaluatorStats(
|
|
902
|
+
name=getattr(evaluator, "__name__", "unknown_evaluator")
|
|
903
|
+
)
|
|
904
|
+
for evaluator in evaluators
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
# Handle resume token by modifying filter
|
|
908
|
+
effective_filter = self._build_timestamp_filter(filter, resume_from)
|
|
909
|
+
normalized_additional_trace_tags = (
|
|
910
|
+
self._dedupe_tags(_additional_trace_tags)
|
|
911
|
+
if _additional_trace_tags is not None
|
|
912
|
+
else []
|
|
913
|
+
)
|
|
914
|
+
updated_trace_ids: Set[str] = set()
|
|
915
|
+
|
|
916
|
+
# Create semaphore for concurrency control
|
|
917
|
+
semaphore = asyncio.Semaphore(max_concurrency)
|
|
918
|
+
|
|
919
|
+
# Pagination state
|
|
920
|
+
page = 1
|
|
921
|
+
has_more = True
|
|
922
|
+
last_item_timestamp: Optional[str] = None
|
|
923
|
+
last_item_id: Optional[str] = None
|
|
924
|
+
|
|
925
|
+
if verbose:
|
|
926
|
+
logger.info(f"Starting batch evaluation on {scope}")
|
|
927
|
+
if scope == "traces" and fetch_trace_fields:
|
|
928
|
+
logger.info(f"Fetching trace fields: {fetch_trace_fields}")
|
|
929
|
+
if resume_from:
|
|
930
|
+
logger.info(
|
|
931
|
+
f"Resuming from {resume_from.last_processed_timestamp} "
|
|
932
|
+
f"({resume_from.items_processed} items already processed)"
|
|
933
|
+
)
|
|
934
|
+
|
|
935
|
+
# Main pagination loop
|
|
936
|
+
while has_more:
|
|
937
|
+
# Check if we've reached max_items
|
|
938
|
+
if max_items is not None and total_items_fetched >= max_items:
|
|
939
|
+
if verbose:
|
|
940
|
+
logger.info(f"Reached max_items limit ({max_items})")
|
|
941
|
+
has_more = True # More items may exist
|
|
942
|
+
break
|
|
943
|
+
|
|
944
|
+
# Fetch next batch with retry logic
|
|
945
|
+
try:
|
|
946
|
+
items = await self._fetch_batch_with_retry(
|
|
947
|
+
scope=scope,
|
|
948
|
+
filter=effective_filter,
|
|
949
|
+
page=page,
|
|
950
|
+
limit=fetch_batch_size,
|
|
951
|
+
max_retries=max_retries,
|
|
952
|
+
fields=fetch_trace_fields,
|
|
953
|
+
)
|
|
954
|
+
except Exception as e:
|
|
955
|
+
# Failed after max_retries - create resume token and return
|
|
956
|
+
error_msg = f"Failed to fetch batch after {max_retries} retries"
|
|
957
|
+
logger.error(f"{error_msg}: {e}")
|
|
958
|
+
|
|
959
|
+
resume_token = BatchEvaluationResumeToken(
|
|
960
|
+
scope=scope,
|
|
961
|
+
filter=filter, # Original filter, not modified
|
|
962
|
+
last_processed_timestamp=last_item_timestamp or "",
|
|
963
|
+
last_processed_id=last_item_id or "",
|
|
964
|
+
items_processed=total_items_processed,
|
|
965
|
+
)
|
|
966
|
+
|
|
967
|
+
return self._build_result(
|
|
968
|
+
total_items_fetched=total_items_fetched,
|
|
969
|
+
total_items_processed=total_items_processed,
|
|
970
|
+
total_items_failed=total_items_failed,
|
|
971
|
+
total_scores_created=total_scores_created,
|
|
972
|
+
total_composite_scores_created=total_composite_scores_created,
|
|
973
|
+
total_evaluations_failed=total_evaluations_failed,
|
|
974
|
+
evaluator_stats_dict=evaluator_stats_dict,
|
|
975
|
+
resume_token=resume_token,
|
|
976
|
+
completed=False,
|
|
977
|
+
start_time=start_time,
|
|
978
|
+
failed_item_ids=failed_item_ids,
|
|
979
|
+
error_summary=error_summary,
|
|
980
|
+
has_more_items=has_more,
|
|
981
|
+
item_evaluations=item_evaluations,
|
|
982
|
+
)
|
|
983
|
+
|
|
984
|
+
# Check if we got any items
|
|
985
|
+
if not items:
|
|
986
|
+
has_more = False
|
|
987
|
+
if verbose:
|
|
988
|
+
logger.info("No more items to fetch")
|
|
989
|
+
break
|
|
990
|
+
|
|
991
|
+
total_items_fetched += len(items)
|
|
992
|
+
|
|
993
|
+
if verbose:
|
|
994
|
+
logger.info(f"Fetched batch {page} ({len(items)} items)")
|
|
995
|
+
|
|
996
|
+
# Limit items if max_items would be exceeded
|
|
997
|
+
items_to_process = items
|
|
998
|
+
if max_items is not None:
|
|
999
|
+
remaining_capacity = max_items - total_items_processed
|
|
1000
|
+
if len(items) > remaining_capacity:
|
|
1001
|
+
items_to_process = items[:remaining_capacity]
|
|
1002
|
+
if verbose:
|
|
1003
|
+
logger.info(
|
|
1004
|
+
f"Limiting batch to {len(items_to_process)} items "
|
|
1005
|
+
f"to respect max_items={max_items}"
|
|
1006
|
+
)
|
|
1007
|
+
|
|
1008
|
+
# Process items concurrently
|
|
1009
|
+
async def process_item(
|
|
1010
|
+
item: Union[TraceWithFullDetails, ObservationsView],
|
|
1011
|
+
) -> Tuple[str, Union[Tuple[int, int, int, List[Evaluation]], Exception]]:
|
|
1012
|
+
"""Process a single item and return (item_id, result)."""
|
|
1013
|
+
async with semaphore:
|
|
1014
|
+
item_id = self._get_item_id(item, scope)
|
|
1015
|
+
try:
|
|
1016
|
+
result = await self._process_batch_evaluation_item(
|
|
1017
|
+
item=item,
|
|
1018
|
+
scope=scope,
|
|
1019
|
+
mapper=mapper,
|
|
1020
|
+
evaluators=evaluators,
|
|
1021
|
+
composite_evaluator=composite_evaluator,
|
|
1022
|
+
metadata=metadata,
|
|
1023
|
+
_add_observation_scores_to_trace=_add_observation_scores_to_trace,
|
|
1024
|
+
evaluator_stats_dict=evaluator_stats_dict,
|
|
1025
|
+
)
|
|
1026
|
+
return (item_id, result)
|
|
1027
|
+
except Exception as e:
|
|
1028
|
+
return (item_id, e)
|
|
1029
|
+
|
|
1030
|
+
# Run all items in batch concurrently
|
|
1031
|
+
tasks = [process_item(item) for item in items_to_process]
|
|
1032
|
+
results = await asyncio.gather(*tasks)
|
|
1033
|
+
|
|
1034
|
+
# Process results and update statistics
|
|
1035
|
+
for item, (item_id, result) in zip(items_to_process, results):
|
|
1036
|
+
if isinstance(result, Exception):
|
|
1037
|
+
# Item processing failed
|
|
1038
|
+
total_items_failed += 1
|
|
1039
|
+
failed_item_ids.append(item_id)
|
|
1040
|
+
error_type = type(result).__name__
|
|
1041
|
+
error_summary[error_type] = error_summary.get(error_type, 0) + 1
|
|
1042
|
+
logger.warning(f"Item {item_id} failed: {result}")
|
|
1043
|
+
else:
|
|
1044
|
+
# Item processed successfully
|
|
1045
|
+
total_items_processed += 1
|
|
1046
|
+
scores_created, composite_created, evals_failed, evaluations = (
|
|
1047
|
+
result
|
|
1048
|
+
)
|
|
1049
|
+
total_scores_created += scores_created
|
|
1050
|
+
total_composite_scores_created += composite_created
|
|
1051
|
+
total_evaluations_failed += evals_failed
|
|
1052
|
+
|
|
1053
|
+
# Store evaluations for this item
|
|
1054
|
+
item_evaluations[item_id] = evaluations
|
|
1055
|
+
|
|
1056
|
+
if normalized_additional_trace_tags:
|
|
1057
|
+
trace_id = (
|
|
1058
|
+
item_id
|
|
1059
|
+
if scope == "traces"
|
|
1060
|
+
else cast(ObservationsView, item).trace_id
|
|
1061
|
+
)
|
|
1062
|
+
|
|
1063
|
+
if trace_id and trace_id not in updated_trace_ids:
|
|
1064
|
+
self.client._create_trace_tags_via_ingestion(
|
|
1065
|
+
trace_id=trace_id,
|
|
1066
|
+
tags=normalized_additional_trace_tags,
|
|
1067
|
+
)
|
|
1068
|
+
updated_trace_ids.add(trace_id)
|
|
1069
|
+
|
|
1070
|
+
# Update last processed tracking
|
|
1071
|
+
last_item_timestamp = self._get_item_timestamp(item, scope)
|
|
1072
|
+
last_item_id = item_id
|
|
1073
|
+
|
|
1074
|
+
if verbose:
|
|
1075
|
+
if max_items is not None and max_items > 0:
|
|
1076
|
+
progress_pct = total_items_processed / max_items * 100
|
|
1077
|
+
logger.info(
|
|
1078
|
+
f"Progress: {total_items_processed}/{max_items} items "
|
|
1079
|
+
f"({progress_pct:.1f}%), {total_scores_created} scores created"
|
|
1080
|
+
)
|
|
1081
|
+
else:
|
|
1082
|
+
logger.info(
|
|
1083
|
+
f"Progress: {total_items_processed} items processed, "
|
|
1084
|
+
f"{total_scores_created} scores created"
|
|
1085
|
+
)
|
|
1086
|
+
|
|
1087
|
+
# Check if we should continue to next page
|
|
1088
|
+
if len(items) < fetch_batch_size:
|
|
1089
|
+
# Last page - no more items available
|
|
1090
|
+
has_more = False
|
|
1091
|
+
else:
|
|
1092
|
+
page += 1
|
|
1093
|
+
|
|
1094
|
+
# Check max_items again before next fetch
|
|
1095
|
+
if max_items is not None and total_items_fetched >= max_items:
|
|
1096
|
+
has_more = True # More items exist but we're stopping
|
|
1097
|
+
break
|
|
1098
|
+
|
|
1099
|
+
# Flush all scores to Aeri
|
|
1100
|
+
if verbose:
|
|
1101
|
+
logger.info("Flushing scores to Aeri...")
|
|
1102
|
+
self.client.flush()
|
|
1103
|
+
|
|
1104
|
+
# Build final result
|
|
1105
|
+
duration = time.time() - start_time
|
|
1106
|
+
|
|
1107
|
+
if verbose:
|
|
1108
|
+
logger.info(
|
|
1109
|
+
f"Batch evaluation complete: {total_items_processed} items processed "
|
|
1110
|
+
f"in {duration:.2f}s"
|
|
1111
|
+
)
|
|
1112
|
+
|
|
1113
|
+
# Completed successfully if we either:
|
|
1114
|
+
# 1. Ran out of items (has_more is False), OR
|
|
1115
|
+
# 2. Hit max_items limit (intentionally stopped)
|
|
1116
|
+
completed_successfully = not has_more or (
|
|
1117
|
+
max_items is not None and total_items_fetched >= max_items
|
|
1118
|
+
)
|
|
1119
|
+
|
|
1120
|
+
return self._build_result(
|
|
1121
|
+
total_items_fetched=total_items_fetched,
|
|
1122
|
+
total_items_processed=total_items_processed,
|
|
1123
|
+
total_items_failed=total_items_failed,
|
|
1124
|
+
total_scores_created=total_scores_created,
|
|
1125
|
+
total_composite_scores_created=total_composite_scores_created,
|
|
1126
|
+
total_evaluations_failed=total_evaluations_failed,
|
|
1127
|
+
evaluator_stats_dict=evaluator_stats_dict,
|
|
1128
|
+
resume_token=None, # No resume needed on successful completion
|
|
1129
|
+
completed=completed_successfully,
|
|
1130
|
+
start_time=start_time,
|
|
1131
|
+
failed_item_ids=failed_item_ids,
|
|
1132
|
+
error_summary=error_summary,
|
|
1133
|
+
has_more_items=(
|
|
1134
|
+
has_more and max_items is not None and total_items_fetched >= max_items
|
|
1135
|
+
),
|
|
1136
|
+
item_evaluations=item_evaluations,
|
|
1137
|
+
)
|
|
1138
|
+
|
|
1139
|
+
async def _fetch_batch_with_retry(
    self,
    *,
    scope: str,
    filter: Optional[str],
    page: int,
    limit: int,
    max_retries: int,
    fields: Optional[str],
) -> List[Union[TraceWithFullDetails, ObservationsView]]:
    """Fetch a single page of items, delegating retry handling to the API client.

    Retries are not implemented here; ``max_retries`` is forwarded to the
    client via ``request_options`` and the underlying HTTP layer retries.

    Args:
        scope: The type of items ("traces", "observations").
        filter: JSON filter string for querying.
        page: Page number (1-indexed).
        limit: Number of items per page.
        max_retries: Maximum number of retry attempts (forwarded to the client).
        fields: Trace fields to fetch (only used for the "traces" scope).

    Returns:
        List of items from the API.

    Raises:
        ValueError: If ``scope`` is not "traces" or "observations".
        Exception: If all retry attempts fail.
    """
    if scope == "traces":
        response = self.client.api.trace.list(
            page=page,
            limit=limit,
            filter=filter,
            request_options={"max_retries": max_retries},
            fields=fields,
        )  # type: ignore
        return list(response.data)  # type: ignore
    elif scope == "observations":
        # NOTE: the observations endpoint does not support field selection,
        # so `fields` is intentionally not forwarded here.
        response = self.client.api.legacy.observations_v1.get_many(
            page=page,
            limit=limit,
            filter=filter,
            request_options={"max_retries": max_retries},
        )  # type: ignore
        return list(response.data)  # type: ignore
    else:
        error_message = f"Invalid scope: {scope}"
        raise ValueError(error_message)
|
|
1186
|
+
|
|
1187
|
+
async def _process_batch_evaluation_item(
    self,
    item: Union[TraceWithFullDetails, ObservationsView],
    scope: str,
    mapper: MapperFunction,
    evaluators: List[EvaluatorFunction],
    composite_evaluator: Optional[CompositeEvaluatorFunction],
    metadata: Optional[Dict[str, Any]],
    _add_observation_scores_to_trace: bool,
    evaluator_stats_dict: Dict[str, EvaluatorStats],
) -> Tuple[int, int, int, List[Evaluation]]:
    """Process a single item: map it, run all evaluators, and create scores.

    Individual evaluator failures are tolerated (logged and counted); a
    mapper failure propagates to the caller, failing the whole item.

    Args:
        item: The API response object to evaluate.
        scope: The type of item ("traces", "observations").
        mapper: Function to transform item to evaluator inputs.
        evaluators: List of evaluator functions.
        composite_evaluator: Optional composite evaluator function that runs
            over the aggregated item-level evaluations.
        metadata: Additional metadata to add to scores.
        _add_observation_scores_to_trace: Whether to duplicate
            observation-level scores at trace level.
        evaluator_stats_dict: Dictionary tracking evaluator statistics;
            mutated in place as evaluators run.

    Returns:
        Tuple of (scores_created, composite_scores_created, evaluations_failed, all_evaluations).

    Raises:
        Exception: If mapping fails or item processing encounters fatal error.
    """
    scores_created = 0
    composite_scores_created = 0
    evaluations_failed = 0

    # Run mapper to transform item. Not wrapped in try/except: a mapper
    # failure means no evaluator can run, so it propagates to the caller.
    evaluator_inputs = await self._run_mapper(mapper, item)

    # Run all evaluators; each failure is isolated so the rest still run.
    evaluations: List[Evaluation] = []
    for evaluator in evaluators:
        evaluator_name = getattr(evaluator, "__name__", "unknown_evaluator")
        stats = evaluator_stats_dict[evaluator_name]
        # total_runs is incremented before the attempt so failed runs are
        # included in the denominator.
        stats.total_runs += 1

        try:
            eval_results = await self._run_evaluator_internal(
                evaluator,
                input=evaluator_inputs.input,
                output=evaluator_inputs.output,
                expected_output=evaluator_inputs.expected_output,
                metadata=evaluator_inputs.metadata,
            )

            stats.successful_runs += 1
            stats.total_scores_created += len(eval_results)
            evaluations.extend(eval_results)

        except Exception as e:
            # Evaluator failed - log warning and continue with other evaluators
            stats.failed_runs += 1
            evaluations_failed += 1
            logger.warning(
                f"Evaluator {evaluator_name} failed on item "
                f"{self._get_item_id(item, scope)}: {e}"
            )

    # Create scores for item-level evaluations.
    item_id = self._get_item_id(item, scope)
    for evaluation in evaluations:
        scores_created += self._create_score_for_scope(
            scope=scope,
            item_id=item_id,
            # Observations need their parent trace_id for score linkage.
            trace_id=cast(ObservationsView, item).trace_id
            if scope == "observations"
            else None,
            evaluation=evaluation,
            additional_metadata=metadata,
            add_observation_score_to_trace=_add_observation_scores_to_trace,
        )

    # Run composite evaluator only if provided and at least one item-level
    # evaluation succeeded (composite evaluators consume those results).
    if composite_evaluator and evaluations:
        try:
            composite_evals = await self._run_composite_evaluator(
                composite_evaluator,
                input=evaluator_inputs.input,
                output=evaluator_inputs.output,
                expected_output=evaluator_inputs.expected_output,
                metadata=evaluator_inputs.metadata,
                evaluations=evaluations,
            )

            # Create scores for all composite evaluations
            for composite_eval in composite_evals:
                composite_scores_created += self._create_score_for_scope(
                    scope=scope,
                    item_id=item_id,
                    trace_id=cast(ObservationsView, item).trace_id
                    if scope == "observations"
                    else None,
                    evaluation=composite_eval,
                    additional_metadata=metadata,
                    add_observation_score_to_trace=_add_observation_scores_to_trace,
                )

            # Composite evaluations are appended to the returned list so the
            # caller sees both item-level and composite results together.
            evaluations.extend(composite_evals)

        except Exception as e:
            # Composite failure is best-effort: item still counts as processed.
            logger.warning(f"Composite evaluator failed on item {item_id}: {e}")

    return (
        scores_created,
        composite_scores_created,
        evaluations_failed,
        evaluations,
    )
|
|
1304
|
+
|
|
1305
|
+
async def _run_evaluator_internal(
    self,
    evaluator: EvaluatorFunction,
    **kwargs: Any,
) -> List[Evaluation]:
    """Invoke an evaluator and normalize its return value into a list.

    Exceptions are deliberately NOT caught here (unlike
    experiment._run_evaluator), so the caller can record failures in its
    per-evaluator statistics.

    Args:
        evaluator: The evaluator function to run.
        **kwargs: Arguments forwarded to the evaluator.

    Returns:
        List of Evaluation objects (empty for unrecognized return types).

    Raises:
        Exception: Whatever the evaluator itself raises.
    """
    raw = evaluator(**kwargs)

    # Transparently support async evaluators.
    if asyncio.iscoroutine(raw):
        raw = await raw

    # Single result (dict or Evaluation) -> one-element list; list passes
    # through; anything else is treated as "no evaluations produced".
    if isinstance(raw, (dict, Evaluation)):
        return [raw]  # type: ignore
    if isinstance(raw, list):
        return raw
    return []
|
|
1338
|
+
|
|
1339
|
+
async def _run_mapper(
|
|
1340
|
+
self,
|
|
1341
|
+
mapper: MapperFunction,
|
|
1342
|
+
item: Union[TraceWithFullDetails, ObservationsView],
|
|
1343
|
+
) -> EvaluatorInputs:
|
|
1344
|
+
"""Run mapper function (handles both sync and async mappers).
|
|
1345
|
+
|
|
1346
|
+
Args:
|
|
1347
|
+
mapper: The mapper function to run.
|
|
1348
|
+
item: The API response object to map.
|
|
1349
|
+
|
|
1350
|
+
Returns:
|
|
1351
|
+
EvaluatorInputs instance.
|
|
1352
|
+
|
|
1353
|
+
Raises:
|
|
1354
|
+
Exception: If mapper raises an exception.
|
|
1355
|
+
"""
|
|
1356
|
+
result = mapper(item=item)
|
|
1357
|
+
if asyncio.iscoroutine(result):
|
|
1358
|
+
return await result # type: ignore
|
|
1359
|
+
return result # type: ignore
|
|
1360
|
+
|
|
1361
|
+
async def _run_composite_evaluator(
    self,
    composite_evaluator: CompositeEvaluatorFunction,
    input: Optional[Any],
    output: Optional[Any],
    expected_output: Optional[Any],
    metadata: Optional[Dict[str, Any]],
    evaluations: List[Evaluation],
) -> List[Evaluation]:
    """Invoke the composite evaluator (sync or async) and normalize its result.

    Args:
        composite_evaluator: The composite evaluator function.
        input: The input data provided to the system.
        output: The output generated by the system.
        expected_output: The expected/reference output.
        metadata: Additional metadata about the evaluation context.
        evaluations: List of item-level evaluations to aggregate over.

    Returns:
        List of Evaluation objects (normalized from single or list return;
        unrecognized return types yield an empty list).

    Raises:
        Exception: Whatever the composite evaluator itself raises.
    """
    outcome = composite_evaluator(
        input=input,
        output=output,
        expected_output=expected_output,
        metadata=metadata,
        evaluations=evaluations,
    )

    # Support async composite evaluators.
    if asyncio.iscoroutine(outcome):
        outcome = await outcome

    # Same normalization contract as _run_evaluator_internal.
    if isinstance(outcome, (dict, Evaluation)):
        return [outcome]  # type: ignore
    if isinstance(outcome, list):
        return outcome
    return []
|
|
1403
|
+
|
|
1404
|
+
def _create_score_for_scope(
    self,
    *,
    scope: str,
    item_id: str,
    trace_id: Optional[str] = None,
    evaluation: Evaluation,
    additional_metadata: Optional[Dict[str, Any]],
    add_observation_score_to_trace: bool = False,
) -> int:
    """Create a score linked to the appropriate entity based on scope.

    Args:
        scope: The type of entity ("traces", "observations").
        item_id: The ID of the entity.
        trace_id: The trace ID of the entity; required if scope=observations.
        evaluation: The evaluation result to create a score from.
        additional_metadata: Additional metadata to merge with evaluation
            metadata (additional_metadata wins on key conflicts).
        add_observation_score_to_trace: Whether to duplicate observation
            score on parent trace as well.

    Returns:
        Number of score events created (0 for an unknown scope).
    """
    # Merge metadata; additional_metadata overrides evaluation metadata.
    score_metadata = {
        **(evaluation.metadata or {}),
        **(additional_metadata or {}),
    }

    # Kwargs shared by every score event emitted for this evaluation;
    # hoisted so the three create_score calls cannot drift apart.
    common_kwargs: Dict[str, Any] = dict(
        name=evaluation.name,
        value=evaluation.value,  # type: ignore
        comment=evaluation.comment,
        metadata=score_metadata,
        data_type=evaluation.data_type,  # type: ignore[arg-type]
        config_id=evaluation.config_id,
    )

    if scope == "traces":
        self.client.create_score(trace_id=item_id, **common_kwargs)
        return 1
    elif scope == "observations":
        self.client.create_score(
            observation_id=item_id,
            trace_id=trace_id,
            **common_kwargs,
        )
        score_count = 1

        # Optionally mirror the observation score on its parent trace.
        if add_observation_score_to_trace and trace_id:
            self.client.create_score(trace_id=trace_id, **common_kwargs)
            score_count += 1

        return score_count

    return 0
|
|
1473
|
+
|
|
1474
|
+
def _build_timestamp_filter(
|
|
1475
|
+
self,
|
|
1476
|
+
original_filter: Optional[str],
|
|
1477
|
+
resume_from: Optional[BatchEvaluationResumeToken],
|
|
1478
|
+
) -> Optional[str]:
|
|
1479
|
+
"""Build filter with timestamp constraint for resume capability.
|
|
1480
|
+
|
|
1481
|
+
Args:
|
|
1482
|
+
original_filter: The original JSON filter string.
|
|
1483
|
+
resume_from: Optional resume token with timestamp information.
|
|
1484
|
+
|
|
1485
|
+
Returns:
|
|
1486
|
+
Modified filter string with timestamp constraint, or original filter.
|
|
1487
|
+
"""
|
|
1488
|
+
if not resume_from:
|
|
1489
|
+
return original_filter
|
|
1490
|
+
|
|
1491
|
+
# Parse original filter (should be array) or create empty array
|
|
1492
|
+
try:
|
|
1493
|
+
filter_list = json.loads(original_filter) if original_filter else []
|
|
1494
|
+
if not isinstance(filter_list, list):
|
|
1495
|
+
logger.warning(
|
|
1496
|
+
f"Filter should be a JSON array, got: {type(filter_list).__name__}"
|
|
1497
|
+
)
|
|
1498
|
+
filter_list = []
|
|
1499
|
+
except json.JSONDecodeError:
|
|
1500
|
+
logger.warning(
|
|
1501
|
+
f"Invalid JSON in original filter, ignoring: {original_filter}"
|
|
1502
|
+
)
|
|
1503
|
+
filter_list = []
|
|
1504
|
+
|
|
1505
|
+
# Add timestamp constraint to filter array
|
|
1506
|
+
timestamp_field = self._get_timestamp_field_for_scope(resume_from.scope)
|
|
1507
|
+
timestamp_filter = {
|
|
1508
|
+
"type": "datetime",
|
|
1509
|
+
"column": timestamp_field,
|
|
1510
|
+
"operator": ">",
|
|
1511
|
+
"value": resume_from.last_processed_timestamp,
|
|
1512
|
+
}
|
|
1513
|
+
filter_list.append(timestamp_filter)
|
|
1514
|
+
|
|
1515
|
+
return json.dumps(filter_list)
|
|
1516
|
+
|
|
1517
|
+
@staticmethod
|
|
1518
|
+
def _get_item_id(
|
|
1519
|
+
item: Union[TraceWithFullDetails, ObservationsView],
|
|
1520
|
+
scope: str,
|
|
1521
|
+
) -> str:
|
|
1522
|
+
"""Extract ID from item based on scope.
|
|
1523
|
+
|
|
1524
|
+
Args:
|
|
1525
|
+
item: The API response object.
|
|
1526
|
+
scope: The type of item.
|
|
1527
|
+
|
|
1528
|
+
Returns:
|
|
1529
|
+
The item's ID.
|
|
1530
|
+
"""
|
|
1531
|
+
return item.id
|
|
1532
|
+
|
|
1533
|
+
@staticmethod
|
|
1534
|
+
def _get_item_timestamp(
|
|
1535
|
+
item: Union[TraceWithFullDetails, ObservationsView],
|
|
1536
|
+
scope: str,
|
|
1537
|
+
) -> str:
|
|
1538
|
+
"""Extract timestamp from item based on scope.
|
|
1539
|
+
|
|
1540
|
+
Args:
|
|
1541
|
+
item: The API response object.
|
|
1542
|
+
scope: The type of item.
|
|
1543
|
+
|
|
1544
|
+
Returns:
|
|
1545
|
+
ISO 8601 timestamp string.
|
|
1546
|
+
"""
|
|
1547
|
+
if scope == "traces":
|
|
1548
|
+
# Type narrowing for traces
|
|
1549
|
+
if hasattr(item, "timestamp"):
|
|
1550
|
+
return item.timestamp.isoformat() # type: ignore[attr-defined]
|
|
1551
|
+
elif scope == "observations":
|
|
1552
|
+
# Type narrowing for observations
|
|
1553
|
+
if hasattr(item, "start_time"):
|
|
1554
|
+
return item.start_time.isoformat() # type: ignore[attr-defined]
|
|
1555
|
+
return ""
|
|
1556
|
+
|
|
1557
|
+
@staticmethod
|
|
1558
|
+
def _get_timestamp_field_for_scope(scope: str) -> str:
|
|
1559
|
+
"""Get the timestamp field name for filtering based on scope.
|
|
1560
|
+
|
|
1561
|
+
Args:
|
|
1562
|
+
scope: The type of items.
|
|
1563
|
+
|
|
1564
|
+
Returns:
|
|
1565
|
+
The field name to use in filters.
|
|
1566
|
+
"""
|
|
1567
|
+
if scope == "traces":
|
|
1568
|
+
return "timestamp"
|
|
1569
|
+
elif scope == "observations":
|
|
1570
|
+
return "start_time"
|
|
1571
|
+
return "timestamp" # Default
|
|
1572
|
+
|
|
1573
|
+
@staticmethod
|
|
1574
|
+
def _dedupe_tags(tags: Optional[List[str]]) -> List[str]:
|
|
1575
|
+
"""Deduplicate tags while preserving order."""
|
|
1576
|
+
if tags is None:
|
|
1577
|
+
return []
|
|
1578
|
+
|
|
1579
|
+
deduped: List[str] = []
|
|
1580
|
+
seen = set()
|
|
1581
|
+
for tag in tags:
|
|
1582
|
+
if tag not in seen:
|
|
1583
|
+
deduped.append(tag)
|
|
1584
|
+
seen.add(tag)
|
|
1585
|
+
|
|
1586
|
+
return deduped
|
|
1587
|
+
|
|
1588
|
+
def _build_result(
    self,
    total_items_fetched: int,
    total_items_processed: int,
    total_items_failed: int,
    total_scores_created: int,
    total_composite_scores_created: int,
    total_evaluations_failed: int,
    evaluator_stats_dict: Dict[str, EvaluatorStats],
    resume_token: Optional[BatchEvaluationResumeToken],
    completed: bool,
    start_time: float,
    failed_item_ids: List[str],
    error_summary: Dict[str, int],
    has_more_items: bool,
    item_evaluations: Dict[str, List[Evaluation]],
) -> BatchEvaluationResult:
    """Assemble the final BatchEvaluationResult from accumulated counters.

    Args:
        total_items_fetched: Total items fetched.
        total_items_processed: Items successfully processed.
        total_items_failed: Items that failed.
        total_scores_created: Scores from item evaluators.
        total_composite_scores_created: Scores from composite evaluator.
        total_evaluations_failed: Individual evaluator failures.
        evaluator_stats_dict: Per-evaluator statistics (flattened to a list).
        resume_token: Resume token if incomplete.
        completed: Whether evaluation completed fully.
        start_time: Start time (unix timestamp) used to derive duration.
        failed_item_ids: IDs of failed items.
        error_summary: Error type counts.
        has_more_items: Whether more items exist.
        item_evaluations: Dictionary mapping item IDs to their evaluation results.

    Returns:
        BatchEvaluationResult instance.
    """
    return BatchEvaluationResult(
        total_items_fetched=total_items_fetched,
        total_items_processed=total_items_processed,
        total_items_failed=total_items_failed,
        total_scores_created=total_scores_created,
        total_composite_scores_created=total_composite_scores_created,
        total_evaluations_failed=total_evaluations_failed,
        evaluator_stats=list(evaluator_stats_dict.values()),
        resume_token=resume_token,
        completed=completed,
        # Duration is measured at result-construction time.
        duration_seconds=time.time() - start_time,
        failed_item_ids=failed_item_ids,
        error_summary=error_summary,
        has_more_items=has_more_items,
        item_evaluations=item_evaluations,
    )
|