opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +19 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +64 -4
- opik/api_objects/dataset/rest_operations.py +11 -2
- opik/api_objects/experiment/experiment.py +57 -57
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +65 -5
- opik/api_objects/helpers.py +8 -5
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +600 -108
- opik/api_objects/opik_query_language.py +39 -5
- opik/api_objects/prompt/__init__.py +12 -2
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +189 -47
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +23 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_data.py +35 -25
- opik/api_objects/threads/threads_client.py +39 -5
- opik/api_objects/trace/trace_client.py +52 -2
- opik/api_objects/trace/trace_data.py +15 -24
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +13 -7
- opik/configurator/configure.py +17 -0
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +205 -133
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +25 -6
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +272 -75
- opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
- opik/evaluation/engine/helpers.py +31 -6
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +168 -2
- opik/evaluation/evaluator.py +533 -62
- opik/evaluation/metrics/__init__.py +103 -4
- opik/evaluation/metrics/aggregated_metric.py +35 -6
- opik/evaluation/metrics/base_metric.py +1 -1
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +14 -15
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
- opik/evaluation/metrics/conversation/types.py +4 -5
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +35 -15
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +47 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/rouge.py +26 -9
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/__init__.py +8 -0
- opik/evaluation/models/base_model.py +107 -1
- opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
- opik/evaluation/models/langchain/message_converters.py +97 -15
- opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/evaluator.py +31 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +33 -0
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +12 -9
- opik/id_helpers.py +18 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +14 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +4 -7
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
- opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +42 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +8 -51
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +80 -17
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +3 -7
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +1 -1
- opik/integrations/langchain/opik_tracer.py +474 -229
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +146 -107
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/integrations/sagemaker/auth.py +5 -1
- opik/llm_usage/google_usage.py +3 -1
- opik/llm_usage/opik_usage.py +7 -8
- opik/llm_usage/opik_usage_factory.py +4 -2
- opik/logging_messages.py +6 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +10 -0
- opik/message_processing/batching/batchers.py +59 -27
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +56 -1
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
- opik/message_processing/queue_consumer.py +9 -3
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +43 -10
- opik/opik_context.py +16 -4
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +346 -15
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/automation_rule_evaluators/client.py +34 -2
- opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
- opik/rest_api/client.py +15 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +1310 -44
- opik/rest_api/datasets/raw_client.py +2269 -358
- opik/rest_api/experiments/__init__.py +2 -2
- opik/rest_api/experiments/client.py +191 -5
- opik/rest_api/experiments/raw_client.py +301 -7
- opik/rest_api/experiments/types/__init__.py +4 -1
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/llm_provider_key/client.py +20 -0
- opik/rest_api/llm_provider_key/raw_client.py +20 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/optimizations/client.py +145 -9
- opik/rest_api/optimizations/raw_client.py +237 -13
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +227 -6
- opik/rest_api/prompts/raw_client.py +331 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +238 -76
- opik/rest_api/spans/raw_client.py +307 -95
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +572 -161
- opik/rest_api/traces/raw_client.py +736 -229
- opik/rest_api/types/__init__.py +352 -17
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/annotation_queue_item_ids.py +19 -0
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/annotation_queue_reviewer.py +20 -0
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +62 -2
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +4 -0
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +5 -0
- opik/rest_api/types/dataset_item_page_public.py +5 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +4 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +7 -2
- opik/rest_api/types/experiment_group_response.py +2 -0
- opik/rest_api/types/experiment_public.py +7 -2
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/feedback.py +25 -1
- opik/rest_api/types/feedback_create.py +20 -1
- opik/rest_api/types/feedback_object_public.py +27 -1
- opik/rest_api/types/feedback_public.py +25 -1
- opik/rest_api/types/feedback_score_batch_item.py +2 -1
- opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +20 -1
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +1 -0
- opik/rest_api/types/guardrail_write.py +1 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/optimization.py +4 -2
- opik/rest_api/types/optimization_public.py +4 -2
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +4 -2
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +3 -0
- opik/rest_api/types/prompt_version_detail.py +3 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +1 -0
- opik/rest_api/types/prompt_version_link_public.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +3 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +9 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +9 -0
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +18 -0
- opik/rest_api/types/span.py +1 -2
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_public.py +1 -2
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +1 -2
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +11 -2
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_public.py +11 -2
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +1 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_write.py +1 -2
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +5 -0
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/client.py +14 -2
- opik/rest_api/workspaces/raw_client.py +10 -0
- opik/s3_httpx_client.py +14 -1
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- opik/api_objects/prompt/prompt.py +0 -112
- opik/cli.py +0 -193
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
- opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
- opik-1.8.39.dist-info/METADATA +0 -339
- opik-1.8.39.dist-info/RECORD +0 -790
- /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import logging
|
|
3
|
+
from typing import List, Dict, Any, Optional, Callable, Tuple
|
|
4
|
+
|
|
5
|
+
import opik.exceptions as exceptions
|
|
6
|
+
import opik.logging_messages as logging_messages
|
|
7
|
+
from opik.evaluation.metrics import (
|
|
8
|
+
arguments_helpers,
|
|
9
|
+
base_metric,
|
|
10
|
+
score_result,
|
|
11
|
+
arguments_validator,
|
|
12
|
+
)
|
|
13
|
+
from opik.evaluation.scorers import scorer_wrapper_metric
|
|
14
|
+
from opik.evaluation.types import ScoringKeyMappingType
|
|
15
|
+
from opik.message_processing.emulation import models
|
|
16
|
+
|
|
17
|
+
from . import exception_analyzer
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
LOGGER = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
EVALUATION_SPAN_PARAMETER_NAME = "task_span"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _has_evaluation_span_parameter(func: Callable) -> bool:
|
|
26
|
+
"""Check if a scoring function expects the task_span parameter."""
|
|
27
|
+
try:
|
|
28
|
+
sig = inspect.signature(func)
|
|
29
|
+
return EVALUATION_SPAN_PARAMETER_NAME in sig.parameters
|
|
30
|
+
except (ValueError, TypeError):
|
|
31
|
+
return False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _compute_metric_scores(
    scoring_metrics: List[base_metric.BaseMetric],
    mapped_scoring_inputs: Dict[str, Any],
    scoring_key_mapping: Optional[ScoringKeyMappingType],
    dataset_item_content: Dict[str, Any],
    task_output: Dict[str, Any],
) -> List[score_result.ScoreResult]:
    """
    Compute scores using given metrics.

    Args:
        scoring_metrics: List of metrics to compute
        mapped_scoring_inputs: Scoring inputs after key mapping (will be used for regular metrics)
        scoring_key_mapping: Optional mapping for renaming score arguments
        dataset_item_content: Dataset item content (will be used for ScorerWrapperMetric)
        task_output: Task output (will be used for ScorerWrapperMetric)

    Returns:
        List of computed score results
    """
    results: List[score_result.ScoreResult] = []

    for current_metric in scoring_metrics:
        try:
            LOGGER.debug("Metric %s score started", current_metric.name)

            if isinstance(current_metric, scorer_wrapper_metric.ScorerWrapperMetric):
                # Wrapper metrics receive the raw dataset item and task output,
                # bypassing any scoring-key mapping.
                wrapper_kwargs: Dict[str, Any] = {
                    "dataset_item": dataset_item_content,
                    "task_outputs": task_output,
                }
                task_span = mapped_scoring_inputs.get(EVALUATION_SPAN_PARAMETER_NAME)
                if task_span is not None:
                    # Forward the span only when the caller supplied one.
                    wrapper_kwargs[EVALUATION_SPAN_PARAMETER_NAME] = task_span
                computed = current_metric.score(**wrapper_kwargs)
            else:
                # Regular metrics are validated against the (possibly remapped)
                # scoring inputs before being invoked with them.
                arguments_validator.validate_score_arguments(
                    metric=current_metric,
                    kwargs=mapped_scoring_inputs,
                    scoring_key_mapping=scoring_key_mapping,
                )
                computed = current_metric.score(**mapped_scoring_inputs)

            LOGGER.debug("Metric %s score ended", current_metric.name)

            # A metric may return either a single result or a list of results.
            if isinstance(computed, list):
                results.extend(computed)
            else:
                results.append(computed)

        except exceptions.ScoreMethodMissingArguments:
            # Misconfigured metric arguments are a caller error — propagate.
            raise
        except Exception as exc:
            LOGGER.error(
                "Failed to compute metric %s. Score result will be marked as failed.",
                current_metric.name,
                exc_info=True,
            )

            if exception_analyzer.is_llm_provider_rate_limit_error(exc):
                LOGGER.error(
                    logging_messages.LLM_PROVIDER_RATE_LIMIT_ERROR_DETECTED_IN_EVALUATE_FUNCTION
                )

            # Record the failure as a zero-valued, failed score instead of
            # aborting the whole evaluation run.
            results.append(
                score_result.ScoreResult(
                    name=current_metric.name,
                    value=0.0,
                    reason=str(exc),
                    scoring_failed=True,
                )
            )

    return results
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class MetricsEvaluator:
    """
    Computes metric scores for evaluation runs.

    Metrics are partitioned at construction time into two buckets:
    - Regular metrics: scored from the mapped inputs/outputs.
    - Task span metrics: additionally receive the task's span model
      (LLM call metadata such as tokens, latency, etc).
    """

    def __init__(
        self,
        scoring_metrics: List[base_metric.BaseMetric],
        scoring_key_mapping: Optional[ScoringKeyMappingType],
    ):
        # Stored so both compute_* entry points apply the same key mapping.
        self._scoring_key_mapping = scoring_key_mapping
        self._regular_metrics: List[base_metric.BaseMetric] = []
        self._task_span_metrics: List[base_metric.BaseMetric] = []

        self._analyze_metrics(scoring_metrics)

    @property
    def has_task_span_metrics(self) -> bool:
        """Check if any task span scoring metrics are configured."""
        return len(self._task_span_metrics) > 0

    @property
    def task_span_metrics(self) -> List[base_metric.BaseMetric]:
        """Get list of task span scoring metrics."""
        return self._task_span_metrics

    @property
    def regular_metrics(self) -> List[base_metric.BaseMetric]:
        """Get list of regular scoring metrics."""
        return self._regular_metrics

    def _analyze_metrics(
        self,
        scoring_metrics: List[base_metric.BaseMetric],
    ) -> None:
        """Separate metrics into regular and task-span categories."""
        for candidate in scoring_metrics:
            # A metric whose score() declares the span parameter is span-aware.
            bucket = (
                self._task_span_metrics
                if _has_evaluation_span_parameter(candidate.score)
                else self._regular_metrics
            )
            bucket.append(candidate)

        if self.has_task_span_metrics:
            LOGGER.debug(
                "Detected %d LLM task span scoring metrics.",
                len(self._task_span_metrics),
            )

    def compute_regular_scores(
        self,
        dataset_item_content: Dict[str, Any],
        task_output: Dict[str, Any],
    ) -> Tuple[List[score_result.ScoreResult], Dict[str, Any]]:
        """
        Compute scores using regular metrics.

        Args:
            dataset_item_content: Dataset item content
            task_output: Task output

        Returns:
            Tuple of (score results, mapped scoring inputs used for scoring regular non-wrapper metrics)
        """
        scoring_inputs = arguments_helpers.create_scoring_inputs(
            dataset_item=dataset_item_content,
            task_output=task_output,
            scoring_key_mapping=self._scoring_key_mapping,
        )

        results = _compute_metric_scores(
            scoring_metrics=self._regular_metrics,
            mapped_scoring_inputs=scoring_inputs,
            scoring_key_mapping=self._scoring_key_mapping,
            dataset_item_content=dataset_item_content,
            task_output=task_output,
        )

        return results, scoring_inputs

    def compute_task_span_scores(
        self,
        dataset_item_content: Dict[str, Any],
        task_output: Dict[str, Any],
        task_span: models.SpanModel,
    ) -> Tuple[List[score_result.ScoreResult], Dict[str, Any]]:
        """
        Compute scores using task span metrics.

        Args:
            dataset_item_content: Dataset item content
            task_output: Task output
            task_span: Span model containing task execution metadata

        Returns:
            Tuple of (score results, mapped scoring inputs used for scoring regular non-wrapper metrics)
        """
        scoring_inputs = arguments_helpers.create_scoring_inputs(
            dataset_item=dataset_item_content,
            task_output=task_output,
            scoring_key_mapping=self._scoring_key_mapping,
        )

        # Augment the mapped inputs with the span so span-aware metrics
        # (and wrapper metrics) can pick it up.
        inputs_with_span = {
            **scoring_inputs,
            EVALUATION_SPAN_PARAMETER_NAME: task_span,
        }

        results = _compute_metric_scores(
            scoring_metrics=self._task_span_metrics,
            mapped_scoring_inputs=inputs_with_span,
            scoring_key_mapping=self._scoring_key_mapping,
            dataset_item_content=dataset_item_content,
            task_output=task_output,
        )

        return results, inputs_with_span
|
|
@@ -1,8 +1,67 @@
|
|
|
1
|
-
from typing import List, Optional
|
|
1
|
+
from typing import List, Optional, Dict, TYPE_CHECKING
|
|
2
|
+
from collections import defaultdict
|
|
3
|
+
import logging
|
|
2
4
|
|
|
3
5
|
import dataclasses
|
|
4
6
|
|
|
5
|
-
from . import test_result
|
|
7
|
+
from . import score_statistics, test_result
|
|
8
|
+
from .metrics import score_result
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
LOGGER = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclasses.dataclass
class DatasetItemResults:
    """Results for a single dataset item across all trials."""

    # All test results produced for this dataset item (across trials).
    test_results: List[test_result.TestResult]
    # Aggregated statistics keyed by score name, computed over test_results.
    scores: Dict[str, score_statistics.ScoreStatistics]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclasses.dataclass
class EvaluationResultGroupByDatasetItemsView:
    """View of evaluation results grouped by dataset items."""

    # Unique identifier of the experiment these results belong to.
    experiment_id: str
    # Unique identifier of the dataset that was evaluated.
    dataset_id: str
    # Human-readable experiment name, if one was assigned.
    experiment_name: Optional[str]
    # Direct URL to the experiment, if available.
    experiment_url: Optional[str]
    # Number of trials conducted in the experiment.
    trial_count: int
    # Per-item results (with aggregated scores) keyed by dataset item ID.
    dataset_items: Dict[str, DatasetItemResults]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclasses.dataclass
class EvaluationResultAggregatedScoresView:
    """
    Represents an aggregated view of evaluation results and scores for an
    experiment.

    This class is designed to encapsulate information about an experiment,
    its related dataset, trial counts, test results, and aggregated score
    statistics. It serves as a comprehensive representation useful in
    evaluation pipelines or result summaries.

    Attributes:
        experiment_id: Unique identifier for the experiment.
        dataset_id: Unique identifier for the associated dataset.
        experiment_name: Human-readable name of the experiment.
        experiment_url: URL link to the experiment for easy access.
        trial_count: Number of trials conducted in the experiment.
        test_results: Collection of test results from the experiment.
        aggregated_scores: Aggregated statistical scores for evaluation
            results, keyed by score name.
    """

    experiment_id: str
    dataset_id: str
    experiment_name: Optional[str]
    experiment_url: Optional[str]
    trial_count: int
    test_results: List[test_result.TestResult]
    aggregated_scores: Dict[str, score_statistics.ScoreStatistics]
|
|
6
65
|
|
|
7
66
|
|
|
8
67
|
@dataclasses.dataclass
|
|
@@ -11,3 +70,110 @@ class EvaluationResult:
|
|
|
11
70
|
dataset_id: str
|
|
12
71
|
experiment_name: Optional[str]
|
|
13
72
|
test_results: List[test_result.TestResult]
|
|
73
|
+
    # Direct URL to the experiment, if available.
    experiment_url: Optional[str]
    # Number of trials conducted in the experiment.
    trial_count: int
    # Experiment-level score results; defaults to an empty list.
    experiment_scores: List[score_result.ScoreResult] = dataclasses.field(
        default_factory=list
    )
|
|
78
|
+
|
|
79
|
+
def aggregate_evaluation_scores(self) -> EvaluationResultAggregatedScoresView:
|
|
80
|
+
"""
|
|
81
|
+
Aggregates evaluation scores from test results and returns the aggregated scores view.
|
|
82
|
+
|
|
83
|
+
The method calculates aggregated scores from test results and encapsulates the results
|
|
84
|
+
in an EvaluationResultAggregatedScoresView object, which contains information about
|
|
85
|
+
the experiment and computed aggregated scores.
|
|
86
|
+
|
|
87
|
+
The aggregated scores dictionary has keys for each found score name and values containing
|
|
88
|
+
the statistics for that score.
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
EvaluationResultAggregatedScoresView object containing details about the
|
|
92
|
+
experiment and the aggregated scores calculated from test results.
|
|
93
|
+
"""
|
|
94
|
+
aggregated_scores = score_statistics.calculate_aggregated_statistics(
|
|
95
|
+
self.test_results
|
|
96
|
+
)
|
|
97
|
+
return EvaluationResultAggregatedScoresView(
|
|
98
|
+
experiment_id=self.experiment_id,
|
|
99
|
+
dataset_id=self.dataset_id,
|
|
100
|
+
experiment_name=self.experiment_name,
|
|
101
|
+
experiment_url=self.experiment_url,
|
|
102
|
+
trial_count=self.trial_count,
|
|
103
|
+
test_results=self.test_results,
|
|
104
|
+
aggregated_scores=aggregated_scores,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
def group_by_dataset_item_view(self) -> EvaluationResultGroupByDatasetItemsView:
|
|
108
|
+
"""
|
|
109
|
+
Create a view of evaluation results grouped by dataset items.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
EvaluationResultGroupByDatasetItemsView containing organized results with aggregated score statistics
|
|
113
|
+
"""
|
|
114
|
+
dataset_items = self._build_results_per_dataset_item()
|
|
115
|
+
|
|
116
|
+
return EvaluationResultGroupByDatasetItemsView(
|
|
117
|
+
experiment_id=self.experiment_id,
|
|
118
|
+
dataset_id=self.dataset_id,
|
|
119
|
+
experiment_name=self.experiment_name,
|
|
120
|
+
experiment_url=self.experiment_url,
|
|
121
|
+
trial_count=self.trial_count,
|
|
122
|
+
dataset_items=dataset_items,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
def _build_results_per_dataset_item(self) -> Dict[str, DatasetItemResults]:
|
|
126
|
+
"""
|
|
127
|
+
Build dataset item results with aggregated score statistics.
|
|
128
|
+
|
|
129
|
+
Returns:
|
|
130
|
+
Dict mapping dataset item IDs to their results and aggregated score statistics
|
|
131
|
+
"""
|
|
132
|
+
if not self.test_results:
|
|
133
|
+
LOGGER.debug("No test results available for aggregation")
|
|
134
|
+
return {}
|
|
135
|
+
|
|
136
|
+
results_by_dataset_item = defaultdict(list)
|
|
137
|
+
for test_result_ in self.test_results:
|
|
138
|
+
dataset_item_id = test_result_.test_case.dataset_item_id
|
|
139
|
+
results_by_dataset_item[dataset_item_id].append(test_result_)
|
|
140
|
+
|
|
141
|
+
dataset_items_results = {}
|
|
142
|
+
for dataset_item_id, dataset_item_results in results_by_dataset_item.items():
|
|
143
|
+
dataset_item_results.sort(key=lambda x: x.trial_id)
|
|
144
|
+
aggregated_scores = score_statistics.calculate_aggregated_statistics(
|
|
145
|
+
dataset_item_results
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
dataset_items_results[dataset_item_id] = DatasetItemResults(
|
|
149
|
+
test_results=dataset_item_results, scores=aggregated_scores
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
return dataset_items_results
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@dataclasses.dataclass
class EvaluationResultOnDictItems:
    """
    Evaluation result for dict items evaluation without experiment tracking.

    Offers an interface similar to EvaluationResult, aimed at lightweight
    evaluations that do not involve experiment or dataset management, while
    still supporting score aggregation across test results.

    Attributes:
        test_results: Collection of test results from the evaluation.
    """

    test_results: List[test_result.TestResult]

    def aggregate_evaluation_scores(
        self,
    ) -> Dict[str, score_statistics.ScoreStatistics]:
        """
        Aggregates evaluation scores from test results.

        Returns:
            Dictionary mapping score names to their aggregated statistics.
        """
        statistics = score_statistics.calculate_aggregated_statistics(
            self.test_results
        )
        return statistics
|