opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +19 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +64 -4
- opik/api_objects/dataset/rest_operations.py +11 -2
- opik/api_objects/experiment/experiment.py +57 -57
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +65 -5
- opik/api_objects/helpers.py +8 -5
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +600 -108
- opik/api_objects/opik_query_language.py +39 -5
- opik/api_objects/prompt/__init__.py +12 -2
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +189 -47
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +23 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_data.py +35 -25
- opik/api_objects/threads/threads_client.py +39 -5
- opik/api_objects/trace/trace_client.py +52 -2
- opik/api_objects/trace/trace_data.py +15 -24
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +13 -7
- opik/configurator/configure.py +17 -0
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +205 -133
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +25 -6
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +272 -75
- opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
- opik/evaluation/engine/helpers.py +31 -6
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +168 -2
- opik/evaluation/evaluator.py +533 -62
- opik/evaluation/metrics/__init__.py +103 -4
- opik/evaluation/metrics/aggregated_metric.py +35 -6
- opik/evaluation/metrics/base_metric.py +1 -1
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +14 -15
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
- opik/evaluation/metrics/conversation/types.py +4 -5
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +35 -15
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +47 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/rouge.py +26 -9
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/__init__.py +8 -0
- opik/evaluation/models/base_model.py +107 -1
- opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
- opik/evaluation/models/langchain/message_converters.py +97 -15
- opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/evaluator.py +31 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +33 -0
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +12 -9
- opik/id_helpers.py +18 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +14 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +4 -7
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
- opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +42 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +8 -51
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +80 -17
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +3 -7
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +1 -1
- opik/integrations/langchain/opik_tracer.py +474 -229
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +146 -107
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/integrations/sagemaker/auth.py +5 -1
- opik/llm_usage/google_usage.py +3 -1
- opik/llm_usage/opik_usage.py +7 -8
- opik/llm_usage/opik_usage_factory.py +4 -2
- opik/logging_messages.py +6 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +10 -0
- opik/message_processing/batching/batchers.py +59 -27
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +56 -1
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
- opik/message_processing/queue_consumer.py +9 -3
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +43 -10
- opik/opik_context.py +16 -4
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +346 -15
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/automation_rule_evaluators/client.py +34 -2
- opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
- opik/rest_api/client.py +15 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +1310 -44
- opik/rest_api/datasets/raw_client.py +2269 -358
- opik/rest_api/experiments/__init__.py +2 -2
- opik/rest_api/experiments/client.py +191 -5
- opik/rest_api/experiments/raw_client.py +301 -7
- opik/rest_api/experiments/types/__init__.py +4 -1
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/llm_provider_key/client.py +20 -0
- opik/rest_api/llm_provider_key/raw_client.py +20 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/optimizations/client.py +145 -9
- opik/rest_api/optimizations/raw_client.py +237 -13
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +227 -6
- opik/rest_api/prompts/raw_client.py +331 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +238 -76
- opik/rest_api/spans/raw_client.py +307 -95
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +572 -161
- opik/rest_api/traces/raw_client.py +736 -229
- opik/rest_api/types/__init__.py +352 -17
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/annotation_queue_item_ids.py +19 -0
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/annotation_queue_reviewer.py +20 -0
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +62 -2
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +4 -0
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +5 -0
- opik/rest_api/types/dataset_item_page_public.py +5 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +4 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +7 -2
- opik/rest_api/types/experiment_group_response.py +2 -0
- opik/rest_api/types/experiment_public.py +7 -2
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/feedback.py +25 -1
- opik/rest_api/types/feedback_create.py +20 -1
- opik/rest_api/types/feedback_object_public.py +27 -1
- opik/rest_api/types/feedback_public.py +25 -1
- opik/rest_api/types/feedback_score_batch_item.py +2 -1
- opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +20 -1
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +1 -0
- opik/rest_api/types/guardrail_write.py +1 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/optimization.py +4 -2
- opik/rest_api/types/optimization_public.py +4 -2
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +4 -2
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +3 -0
- opik/rest_api/types/prompt_version_detail.py +3 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +1 -0
- opik/rest_api/types/prompt_version_link_public.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +3 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +9 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +9 -0
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +18 -0
- opik/rest_api/types/span.py +1 -2
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_public.py +1 -2
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +1 -2
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +11 -2
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_public.py +11 -2
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +1 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_write.py +1 -2
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +5 -0
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/client.py +14 -2
- opik/rest_api/workspaces/raw_client.py +10 -0
- opik/s3_httpx_client.py +14 -1
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- opik/api_objects/prompt/prompt.py +0 -112
- opik/cli.py +0 -193
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
- opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
- opik-1.8.39.dist-info/METADATA +0 -339
- opik-1.8.39.dist-info/RECORD +0 -790
- /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
opik/evaluation/evaluator.py
CHANGED
|
@@ -1,37 +1,98 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Callable, Dict, List, Optional, Union
|
|
3
|
+
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
|
4
4
|
|
|
5
|
-
from .. import
|
|
5
|
+
from ..api_objects.prompt import base_prompt
|
|
6
6
|
from ..api_objects import opik_client
|
|
7
7
|
from ..api_objects import dataset, experiment
|
|
8
8
|
from ..api_objects.experiment import helpers as experiment_helpers
|
|
9
|
-
from ..api_objects.prompt import
|
|
10
|
-
from . import
|
|
11
|
-
from .
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
from ..api_objects.prompt.chat import chat_prompt_template
|
|
10
|
+
from ..api_objects.prompt import types as prompt_types
|
|
11
|
+
from . import (
|
|
12
|
+
asyncio_support,
|
|
13
|
+
engine,
|
|
14
|
+
evaluation_result,
|
|
15
|
+
report,
|
|
16
|
+
rest_operations,
|
|
17
|
+
samplers,
|
|
18
|
+
)
|
|
19
|
+
from .metrics import base_metric, score_result
|
|
20
|
+
from .models import ModelCapabilities, base_model, models_factory
|
|
21
|
+
from .scorers import scorer_function, scorer_wrapper_metric
|
|
22
|
+
from . import test_result
|
|
23
|
+
from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
|
|
24
|
+
from .. import url_helpers
|
|
14
25
|
|
|
15
26
|
LOGGER = logging.getLogger(__name__)
|
|
27
|
+
MODALITY_SUPPORT_DOC_URL = (
|
|
28
|
+
"https://www.comet.com/docs/opik/evaluation/evaluate_multimodal"
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _try_notifying_about_experiment_completion(
|
|
33
|
+
experiment: experiment.Experiment,
|
|
34
|
+
) -> None:
|
|
35
|
+
try:
|
|
36
|
+
experiment.experiments_rest_client.finish_experiments(ids=[experiment.id])
|
|
37
|
+
except Exception:
|
|
38
|
+
LOGGER.debug(
|
|
39
|
+
"Failed to notify backend about the experiment completion. Experiment ID: %s",
|
|
40
|
+
experiment.id,
|
|
41
|
+
exc_info=True,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _compute_experiment_scores(
|
|
46
|
+
experiment_scoring_functions: List[ExperimentScoreFunction],
|
|
47
|
+
test_results: List[test_result.TestResult],
|
|
48
|
+
) -> List[score_result.ScoreResult]:
|
|
49
|
+
"""Compute experiment-level scores from test results."""
|
|
50
|
+
if not experiment_scoring_functions or not test_results:
|
|
51
|
+
return []
|
|
52
|
+
|
|
53
|
+
all_scores: List[score_result.ScoreResult] = []
|
|
54
|
+
for score_function in experiment_scoring_functions:
|
|
55
|
+
try:
|
|
56
|
+
scores = score_function(test_results)
|
|
57
|
+
# Handle Union[ScoreResult, List[ScoreResult]]
|
|
58
|
+
if isinstance(scores, list):
|
|
59
|
+
all_scores.extend(scores)
|
|
60
|
+
else:
|
|
61
|
+
all_scores.append(scores)
|
|
62
|
+
except Exception as e:
|
|
63
|
+
LOGGER.warning(
|
|
64
|
+
"Failed to compute experiment score: %s",
|
|
65
|
+
e,
|
|
66
|
+
exc_info=True,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
return all_scores
|
|
16
70
|
|
|
17
71
|
|
|
18
72
|
def evaluate(
|
|
19
73
|
dataset: dataset.Dataset,
|
|
20
74
|
task: LLMTask,
|
|
21
75
|
scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
|
|
76
|
+
scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
|
|
77
|
+
experiment_name_prefix: Optional[str] = None,
|
|
22
78
|
experiment_name: Optional[str] = None,
|
|
23
79
|
project_name: Optional[str] = None,
|
|
24
80
|
experiment_config: Optional[Dict[str, Any]] = None,
|
|
25
81
|
verbose: int = 1,
|
|
26
82
|
nb_samples: Optional[int] = None,
|
|
27
83
|
task_threads: int = 16,
|
|
28
|
-
prompt: Optional[
|
|
29
|
-
prompts: Optional[List[
|
|
84
|
+
prompt: Optional[base_prompt.BasePrompt] = None,
|
|
85
|
+
prompts: Optional[List[base_prompt.BasePrompt]] = None,
|
|
30
86
|
scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
|
|
31
87
|
dataset_item_ids: Optional[List[str]] = None,
|
|
88
|
+
dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
|
|
89
|
+
trial_count: int = 1,
|
|
90
|
+
experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
|
|
32
91
|
) -> evaluation_result.EvaluationResult:
|
|
33
92
|
"""
|
|
34
|
-
Performs task evaluation on a given dataset.
|
|
93
|
+
Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
|
|
94
|
+
evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
|
|
95
|
+
to receive inputs and outputs from the task.
|
|
35
96
|
|
|
36
97
|
Args:
|
|
37
98
|
dataset: An Opik dataset instance
|
|
@@ -39,6 +100,10 @@ def evaluate(
|
|
|
39
100
|
task: A callable object that takes dict with dataset item content
|
|
40
101
|
as input and returns dict which will later be used for scoring.
|
|
41
102
|
|
|
103
|
+
experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
|
|
104
|
+
but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
|
|
105
|
+
the first experiment created will be named `my-experiment-<unique-random-part>`.
|
|
106
|
+
|
|
42
107
|
experiment_name: The name of the experiment associated with evaluation run.
|
|
43
108
|
If None, a generated name will be used.
|
|
44
109
|
|
|
@@ -53,8 +118,16 @@ def evaluate(
|
|
|
53
118
|
are mandatory in `task`-returned dictionary.
|
|
54
119
|
If no value provided, the experiment won't have any scoring metrics.
|
|
55
120
|
|
|
121
|
+
scoring_functions: List of scorer functions to be executed during evaluation.
|
|
122
|
+
Each scorer function includes a scoring method that accepts predefined
|
|
123
|
+
arguments supplied by the evaluation engine:
|
|
124
|
+
• dataset_item — a dictionary containing the dataset item content,
|
|
125
|
+
• task_outputs — a dictionary containing the LLM task output.
|
|
126
|
+
• task_span - the data collected during the LLM task execution [optional].
|
|
127
|
+
|
|
56
128
|
verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
|
|
57
|
-
0 - no outputs, 1 - outputs are enabled (default)
|
|
129
|
+
0 - no outputs, 1 - outputs are enabled (default), 2 - outputs are enabled and detailed statistics
|
|
130
|
+
are displayed.
|
|
58
131
|
|
|
59
132
|
nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
|
|
60
133
|
|
|
@@ -73,9 +146,20 @@ def evaluate(
|
|
|
73
146
|
`{"input": "user_question"}` to map the "user_question" key to "input".
|
|
74
147
|
|
|
75
148
|
dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
|
|
149
|
+
|
|
150
|
+
dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
|
|
151
|
+
If not provided, all samples in the dataset will be evaluated.
|
|
152
|
+
|
|
153
|
+
trial_count: number of times to run the task and evaluate the task output for every dataset item.
|
|
154
|
+
|
|
155
|
+
experiment_scoring_functions: List of callable functions that compute experiment-level scores.
|
|
156
|
+
Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
|
|
157
|
+
These scores are computed after all test results are collected and represent aggregate
|
|
158
|
+
metrics across the entire experiment.
|
|
76
159
|
"""
|
|
77
|
-
|
|
78
|
-
|
|
160
|
+
experiment_scoring_functions = (
|
|
161
|
+
[] if experiment_scoring_functions is None else experiment_scoring_functions
|
|
162
|
+
)
|
|
79
163
|
|
|
80
164
|
checked_prompts = experiment_helpers.handle_prompt_args(
|
|
81
165
|
prompt=prompt,
|
|
@@ -84,6 +168,11 @@ def evaluate(
|
|
|
84
168
|
|
|
85
169
|
client = opik_client.get_client_cached()
|
|
86
170
|
|
|
171
|
+
experiment_name = _use_or_create_experiment_name(
|
|
172
|
+
experiment_name=experiment_name,
|
|
173
|
+
experiment_name_prefix=experiment_name_prefix,
|
|
174
|
+
)
|
|
175
|
+
|
|
87
176
|
experiment = client.create_experiment(
|
|
88
177
|
name=experiment_name,
|
|
89
178
|
dataset_name=dataset.name,
|
|
@@ -91,6 +180,13 @@ def evaluate(
|
|
|
91
180
|
prompts=checked_prompts,
|
|
92
181
|
)
|
|
93
182
|
|
|
183
|
+
# wrap scoring functions if any
|
|
184
|
+
scoring_metrics = _wrap_scoring_functions(
|
|
185
|
+
scoring_functions=scoring_functions,
|
|
186
|
+
scoring_metrics=scoring_metrics,
|
|
187
|
+
project_name=project_name,
|
|
188
|
+
)
|
|
189
|
+
|
|
94
190
|
return _evaluate_task(
|
|
95
191
|
client=client,
|
|
96
192
|
experiment=experiment,
|
|
@@ -103,6 +199,9 @@ def evaluate(
|
|
|
103
199
|
task_threads=task_threads,
|
|
104
200
|
scoring_key_mapping=scoring_key_mapping,
|
|
105
201
|
dataset_item_ids=dataset_item_ids,
|
|
202
|
+
dataset_sampler=dataset_sampler,
|
|
203
|
+
trial_count=trial_count,
|
|
204
|
+
experiment_scoring_functions=experiment_scoring_functions,
|
|
106
205
|
)
|
|
107
206
|
|
|
108
207
|
|
|
@@ -119,6 +218,9 @@ def _evaluate_task(
|
|
|
119
218
|
task_threads: int,
|
|
120
219
|
scoring_key_mapping: Optional[ScoringKeyMappingType],
|
|
121
220
|
dataset_item_ids: Optional[List[str]],
|
|
221
|
+
dataset_sampler: Optional[samplers.BaseDatasetSampler],
|
|
222
|
+
trial_count: int,
|
|
223
|
+
experiment_scoring_functions: List[ExperimentScoreFunction],
|
|
122
224
|
) -> evaluation_result.EvaluationResult:
|
|
123
225
|
start_time = time.time()
|
|
124
226
|
|
|
@@ -126,51 +228,82 @@ def _evaluate_task(
|
|
|
126
228
|
evaluation_engine = engine.EvaluationEngine(
|
|
127
229
|
client=client,
|
|
128
230
|
project_name=project_name,
|
|
129
|
-
experiment_=experiment,
|
|
130
231
|
scoring_metrics=scoring_metrics,
|
|
131
232
|
workers=task_threads,
|
|
132
233
|
verbose=verbose,
|
|
133
234
|
scoring_key_mapping=scoring_key_mapping,
|
|
134
235
|
)
|
|
135
|
-
test_results = evaluation_engine.
|
|
236
|
+
test_results = evaluation_engine.evaluate_llm_task_on_dataset(
|
|
136
237
|
dataset_=dataset,
|
|
137
238
|
task=task,
|
|
138
239
|
nb_samples=nb_samples,
|
|
139
240
|
dataset_item_ids=dataset_item_ids,
|
|
241
|
+
dataset_sampler=dataset_sampler,
|
|
242
|
+
trial_count=trial_count,
|
|
243
|
+
experiment_=experiment,
|
|
140
244
|
)
|
|
141
245
|
|
|
142
246
|
total_time = time.time() - start_time
|
|
143
247
|
|
|
144
|
-
|
|
145
|
-
|
|
248
|
+
# Compute experiment scores
|
|
249
|
+
computed_experiment_scores = _compute_experiment_scores(
|
|
250
|
+
experiment_scoring_functions=experiment_scoring_functions,
|
|
251
|
+
test_results=test_results,
|
|
252
|
+
)
|
|
146
253
|
|
|
147
|
-
|
|
254
|
+
if verbose >= 1:
|
|
255
|
+
report.display_experiment_results(
|
|
256
|
+
dataset.name, total_time, test_results, computed_experiment_scores
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
experiment_url = url_helpers.get_experiment_url_by_id(
|
|
148
260
|
experiment_id=experiment.id,
|
|
149
261
|
dataset_id=dataset.id,
|
|
150
262
|
url_override=client.config.url_override,
|
|
151
263
|
)
|
|
152
264
|
|
|
265
|
+
report.display_experiment_link(experiment_url=experiment_url)
|
|
266
|
+
|
|
153
267
|
client.flush()
|
|
154
268
|
|
|
269
|
+
_try_notifying_about_experiment_completion(experiment)
|
|
270
|
+
|
|
271
|
+
# Log experiment scores to backend
|
|
272
|
+
if computed_experiment_scores:
|
|
273
|
+
experiment.log_experiment_scores(score_results=computed_experiment_scores)
|
|
274
|
+
|
|
155
275
|
evaluation_result_ = evaluation_result.EvaluationResult(
|
|
156
276
|
dataset_id=dataset.id,
|
|
157
277
|
experiment_id=experiment.id,
|
|
158
278
|
experiment_name=experiment.name,
|
|
159
279
|
test_results=test_results,
|
|
280
|
+
experiment_url=experiment_url,
|
|
281
|
+
trial_count=trial_count,
|
|
282
|
+
experiment_scores=computed_experiment_scores,
|
|
160
283
|
)
|
|
161
284
|
|
|
285
|
+
if verbose >= 2:
|
|
286
|
+
report.display_evaluation_scores_statistics(
|
|
287
|
+
dataset_name=dataset.name,
|
|
288
|
+
evaluation_results=evaluation_result_,
|
|
289
|
+
)
|
|
290
|
+
|
|
162
291
|
return evaluation_result_
|
|
163
292
|
|
|
164
293
|
|
|
165
294
|
def evaluate_experiment(
|
|
166
295
|
experiment_name: str,
|
|
167
296
|
scoring_metrics: List[base_metric.BaseMetric],
|
|
297
|
+
scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
|
|
168
298
|
scoring_threads: int = 16,
|
|
169
299
|
verbose: int = 1,
|
|
170
300
|
scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
|
|
171
301
|
experiment_id: Optional[str] = None,
|
|
302
|
+
experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
|
|
172
303
|
) -> evaluation_result.EvaluationResult:
|
|
173
|
-
"""Update existing experiment with new evaluation metrics.
|
|
304
|
+
"""Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
|
|
305
|
+
evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
|
|
306
|
+
to receive inputs and outputs from the task.
|
|
174
307
|
|
|
175
308
|
Args:
|
|
176
309
|
experiment_name: The name of the experiment to update.
|
|
@@ -181,15 +314,32 @@ def evaluate_experiment(
|
|
|
181
314
|
of the `score` method in metrics that you need to find out which keys
|
|
182
315
|
are mandatory in `task`-returned dictionary.
|
|
183
316
|
|
|
317
|
+
scoring_functions: List of scorer functions to be executed during evaluation.
|
|
318
|
+
Each scorer function includes a scoring method that accepts predefined
|
|
319
|
+
arguments supplied by the evaluation engine:
|
|
320
|
+
• dataset_item — a dictionary containing the dataset item content,
|
|
321
|
+
• task_outputs — a dictionary containing the LLM task output.
|
|
322
|
+
• task_span - the data collected during the LLM task execution [optional].
|
|
323
|
+
|
|
184
324
|
scoring_threads: amount of thread workers to run scoring metrics.
|
|
185
325
|
|
|
186
326
|
verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
|
|
187
327
|
|
|
188
328
|
scoring_key_mapping: A dictionary that allows you to rename keys present in either the dataset item or the task output
|
|
189
|
-
so that they match the keys expected by the scoring metrics. For example if you have a dataset item with the following content:
|
|
329
|
+
so that they match the keys expected by the scoring metrics. For example, if you have a dataset item with the following content:
|
|
190
330
|
{"user_question": "What is Opik ?"} and a scoring metric that expects a key "input", you can use scoring_key_mapping
|
|
191
331
|
`{"input": "user_question"}` to map the "user_question" key to "input".
|
|
332
|
+
|
|
333
|
+
experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
|
|
334
|
+
|
|
335
|
+
experiment_scoring_functions: List of callable functions that compute experiment-level scores.
|
|
336
|
+
Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
|
|
337
|
+
These scores are computed after all test results are collected and represent aggregate
|
|
338
|
+
metrics across the entire experiment.
|
|
192
339
|
"""
|
|
340
|
+
experiment_scoring_functions = (
|
|
341
|
+
[] if experiment_scoring_functions is None else experiment_scoring_functions
|
|
342
|
+
)
|
|
193
343
|
start_time = time.time()
|
|
194
344
|
|
|
195
345
|
client = opik_client.get_client_cached()
|
|
@@ -202,10 +352,11 @@ def evaluate_experiment(
|
|
|
202
352
|
client=client, experiment_name=experiment_name
|
|
203
353
|
)
|
|
204
354
|
|
|
355
|
+
dataset_ = client.get_dataset(name=experiment.dataset_name)
|
|
356
|
+
|
|
205
357
|
test_cases = rest_operations.get_experiment_test_cases(
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
dataset_id=experiment.dataset_id,
|
|
358
|
+
experiment_=experiment,
|
|
359
|
+
dataset_=dataset_,
|
|
209
360
|
scoring_key_mapping=scoring_key_mapping,
|
|
210
361
|
)
|
|
211
362
|
first_trace_id = test_cases[0].trace_id
|
|
@@ -213,11 +364,17 @@ def evaluate_experiment(
|
|
|
213
364
|
client=client, trace_id=first_trace_id
|
|
214
365
|
)
|
|
215
366
|
|
|
367
|
+
# wrap scoring functions if any
|
|
368
|
+
scoring_metrics = _wrap_scoring_functions(
|
|
369
|
+
scoring_functions=scoring_functions,
|
|
370
|
+
scoring_metrics=scoring_metrics,
|
|
371
|
+
project_name=project_name,
|
|
372
|
+
)
|
|
373
|
+
|
|
216
374
|
with asyncio_support.async_http_connections_expire_immediately():
|
|
217
375
|
evaluation_engine = engine.EvaluationEngine(
|
|
218
376
|
client=client,
|
|
219
377
|
project_name=project_name,
|
|
220
|
-
experiment_=experiment,
|
|
221
378
|
scoring_metrics=scoring_metrics,
|
|
222
379
|
workers=scoring_threads,
|
|
223
380
|
verbose=verbose,
|
|
@@ -229,50 +386,104 @@ def evaluate_experiment(
|
|
|
229
386
|
|
|
230
387
|
total_time = time.time() - start_time
|
|
231
388
|
|
|
232
|
-
|
|
389
|
+
# Compute experiment scores
|
|
390
|
+
computed_experiment_scores = _compute_experiment_scores(
|
|
391
|
+
experiment_scoring_functions=experiment_scoring_functions,
|
|
392
|
+
test_results=test_results,
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
if verbose >= 1:
|
|
233
396
|
report.display_experiment_results(
|
|
234
|
-
|
|
397
|
+
dataset_.name,
|
|
398
|
+
total_time,
|
|
399
|
+
test_results,
|
|
400
|
+
computed_experiment_scores,
|
|
235
401
|
)
|
|
236
402
|
|
|
237
|
-
|
|
238
|
-
dataset_id=experiment.dataset_id,
|
|
403
|
+
experiment_url = url_helpers.get_experiment_url_by_id(
|
|
239
404
|
experiment_id=experiment.id,
|
|
405
|
+
dataset_id=dataset_.id,
|
|
240
406
|
url_override=client.config.url_override,
|
|
241
407
|
)
|
|
242
408
|
|
|
409
|
+
report.display_experiment_link(experiment_url=experiment_url)
|
|
410
|
+
|
|
411
|
+
_try_notifying_about_experiment_completion(experiment)
|
|
412
|
+
|
|
413
|
+
# Log experiment scores to backend
|
|
414
|
+
if computed_experiment_scores:
|
|
415
|
+
experiment.log_experiment_scores(score_results=computed_experiment_scores)
|
|
416
|
+
|
|
243
417
|
evaluation_result_ = evaluation_result.EvaluationResult(
|
|
244
|
-
dataset_id=
|
|
418
|
+
dataset_id=dataset_.id,
|
|
245
419
|
experiment_id=experiment.id,
|
|
246
420
|
experiment_name=experiment.name,
|
|
247
421
|
test_results=test_results,
|
|
422
|
+
experiment_url=experiment_url,
|
|
423
|
+
trial_count=1,
|
|
424
|
+
experiment_scores=computed_experiment_scores,
|
|
248
425
|
)
|
|
249
426
|
|
|
427
|
+
if verbose >= 2:
|
|
428
|
+
report.display_evaluation_scores_statistics(
|
|
429
|
+
dataset_name=dataset_.name,
|
|
430
|
+
evaluation_results=evaluation_result_,
|
|
431
|
+
)
|
|
432
|
+
|
|
250
433
|
return evaluation_result_
|
|
251
434
|
|
|
252
435
|
|
|
253
436
|
def _build_prompt_evaluation_task(
|
|
254
437
|
model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
|
|
255
438
|
) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
439
|
+
supported_modalities = cast(
|
|
440
|
+
prompt_types.SupportedModalities,
|
|
441
|
+
{
|
|
442
|
+
"vision": ModelCapabilities.supports_vision(
|
|
443
|
+
getattr(model, "model_name", None)
|
|
444
|
+
),
|
|
445
|
+
"video": ModelCapabilities.supports_video(
|
|
446
|
+
getattr(model, "model_name", None)
|
|
447
|
+
),
|
|
448
|
+
},
|
|
449
|
+
)
|
|
450
|
+
# Disable placeholder validation since we pass all dataset item fields to format()
|
|
451
|
+
chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
|
|
452
|
+
messages=messages, validate_placeholders=False
|
|
453
|
+
)
|
|
269
454
|
|
|
270
|
-
|
|
455
|
+
required_modalities = chat_prompt_template_.required_modalities()
|
|
456
|
+
unsupported_modalities = {
|
|
457
|
+
modality
|
|
458
|
+
for modality in required_modalities
|
|
459
|
+
if not supported_modalities.get(modality, False)
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
if unsupported_modalities:
|
|
463
|
+
modalities_list = ", ".join(sorted(unsupported_modalities))
|
|
464
|
+
LOGGER.warning(
|
|
465
|
+
"Model '%s' does not support %s content. Multimedia parts will be flattened "
|
|
466
|
+
"to text placeholders. See %s for supported models and customization options.",
|
|
467
|
+
getattr(model, "model_name", "unknown"),
|
|
468
|
+
modalities_list,
|
|
469
|
+
MODALITY_SUPPORT_DOC_URL,
|
|
470
|
+
)
|
|
271
471
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
472
|
+
def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
|
|
473
|
+
template_type_override = prompt_variables.get("type")
|
|
474
|
+
processed_messages = chat_prompt_template_.format(
|
|
475
|
+
variables=prompt_variables,
|
|
476
|
+
supported_modalities=supported_modalities,
|
|
477
|
+
template_type=template_type_override,
|
|
478
|
+
)
|
|
479
|
+
|
|
480
|
+
with base_model.get_provider_response(
|
|
481
|
+
model_provider=model, messages=processed_messages
|
|
482
|
+
) as llm_output:
|
|
483
|
+
return {
|
|
484
|
+
"input": processed_messages,
|
|
485
|
+
"output": llm_output.choices[0].message.content,
|
|
486
|
+
}
|
|
276
487
|
|
|
277
488
|
return _prompt_evaluation_task
|
|
278
489
|
|
|
@@ -282,14 +493,19 @@ def evaluate_prompt(
|
|
|
282
493
|
messages: List[Dict[str, Any]],
|
|
283
494
|
model: Optional[Union[str, base_model.OpikBaseModel]] = None,
|
|
284
495
|
scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
|
|
496
|
+
scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
|
|
497
|
+
experiment_name_prefix: Optional[str] = None,
|
|
285
498
|
experiment_name: Optional[str] = None,
|
|
286
499
|
project_name: Optional[str] = None,
|
|
287
500
|
experiment_config: Optional[Dict[str, Any]] = None,
|
|
288
501
|
verbose: int = 1,
|
|
289
502
|
nb_samples: Optional[int] = None,
|
|
290
503
|
task_threads: int = 16,
|
|
291
|
-
prompt: Optional[
|
|
504
|
+
prompt: Optional[base_prompt.BasePrompt] = None,
|
|
292
505
|
dataset_item_ids: Optional[List[str]] = None,
|
|
506
|
+
dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
|
|
507
|
+
trial_count: int = 1,
|
|
508
|
+
experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
|
|
293
509
|
) -> evaluation_result.EvaluationResult:
|
|
294
510
|
"""
|
|
295
511
|
Performs prompt evaluation on a given dataset.
|
|
@@ -304,6 +520,17 @@ def evaluate_prompt(
         scoring_metrics: List of metrics to calculate during evaluation.
             The LLM input and output will be passed as arguments to each metric `score(...)` method.

+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: name of the experiment.

         project_name: The name of the project to log data
@@ -319,28 +546,48 @@ def evaluate_prompt(
         prompt: Prompt object to link with experiment.

         dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+        dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+            If not provided, all samples in the dataset will be evaluated.
+
+        trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     if isinstance(model, str):
-
+        opik_model = models_factory.get(model_name=model)
     elif not isinstance(model, base_model.OpikBaseModel):
         raise ValueError("`model` must be either a string or an OpikBaseModel instance")
+    else:
+        opik_model = model

     if experiment_config is None:
-        experiment_config = {
+        experiment_config = {
+            "prompt_template": messages,
+            "model": opik_model.model_name,
+        }
     else:
         if "prompt_template" not in experiment_config:
             experiment_config["prompt_template"] = messages

         if "model" not in experiment_config:
-            experiment_config["model"] =
-
-    if scoring_metrics is None:
-        scoring_metrics = []
+            experiment_config["model"] = opik_model.model_name

     client = opik_client.get_client_cached()

     prompts = [prompt] if prompt else None

+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -348,45 +595,79 @@ def evaluate_prompt(
         prompts=prompts,
     )

+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     start_time = time.time()

     with asyncio_support.async_http_connections_expire_immediately():
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=task_threads,
             verbose=verbose,
             scoring_key_mapping=None,
         )
-        test_results = evaluation_engine.
+        test_results = evaluation_engine.evaluate_llm_task_on_dataset(
             dataset_=dataset,
-            task=_build_prompt_evaluation_task(model=
+            task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
             nb_samples=nb_samples,
             dataset_item_ids=dataset_item_ids,
+            dataset_sampler=dataset_sampler,
+            trial_count=trial_count,
+            experiment_=experiment,
         )

     total_time = time.time() - start_time

-
-
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
+    if verbose >= 1:
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )

-
+    experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
         url_override=client.config.url_override,
     )

+    report.display_experiment_link(experiment_url=experiment_url)
+
     client.flush()

+    _try_notifying_about_experiment_completion(experiment)
+
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
         experiment_name=experiment.name,
         test_results=test_results,
+        experiment_url=experiment_url,
+        trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )

+    if verbose >= 2:
+        report.display_evaluation_scores_statistics(
+            dataset_name=dataset.name,
+            evaluation_results=evaluation_result_,
+        )
+
     return evaluation_result_


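Usage sketch (not part of the diff): one way the extended `evaluate_prompt` signature above might be called. The parameter names come from the diff; the import paths, the dataset name, the model string, the template placeholder syntax, and the `ScoreResult` construction are assumptions and may differ from the released package.

```python
# A minimal sketch, assuming `evaluate_prompt` is exposed from `opik.evaluation`
# and that a dataset named "qa-dataset" already exists in the workspace.
import opik
from opik.evaluation import evaluate_prompt
from opik.evaluation.metrics.score_result import ScoreResult  # assumed import path


def evaluated_items_count(test_results):
    # Experiment-level scorer following the documented contract: receives all
    # TestResult objects after the run and returns a list of ScoreResult objects.
    return [ScoreResult(name="evaluated_items", value=float(len(test_results)))]


client = opik.Opik()
dataset = client.get_dataset(name="qa-dataset")

result = evaluate_prompt(
    dataset=dataset,
    messages=[{"role": "user", "content": "Answer briefly: {{input}}"}],
    model="gpt-4o-mini",                      # resolved via models_factory when given as a string
    experiment_name_prefix="my-experiment",   # produces my-experiment-<unique-random-part>
    trial_count=2,                            # run the prompt twice per dataset item
    experiment_scoring_functions=[evaluated_items_count],
)
print(result.experiment_url)
```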
@@ -395,16 +676,21 @@ def evaluate_optimization_trial(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
-    prompts: Optional[List[
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
+    dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
+    trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset.
@@ -417,6 +703,17 @@ def evaluate_optimization_trial(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.

+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.

@@ -451,7 +748,21 @@ def evaluate_optimization_trial(
             `{"input": "user_question"}` to map the "user_question" key to "input".

         dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+        dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+            If not provided, all samples in the dataset will be evaluated.
+
+        trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     if scoring_metrics is None:
         scoring_metrics = []

@@ -460,8 +771,20 @@ def evaluate_optimization_trial(
         prompts=prompts,
     )

+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     client = opik_client.get_client_cached()

+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -483,4 +806,152 @@ def evaluate_optimization_trial(
         task_threads=task_threads,
         scoring_key_mapping=scoring_key_mapping,
         dataset_item_ids=dataset_item_ids,
+        dataset_sampler=dataset_sampler,
+        trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )
+
+
+def evaluate_on_dict_items(
+    items: List[Dict[str, Any]],
+    task: LLMTask,
+    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    project_name: Optional[str] = None,
+    verbose: int = 0,
+    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+    scoring_threads: int = 16,
+) -> evaluation_result.EvaluationResultOnDictItems:
+    """
+    Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+    without requiring a Dataset object or creating an experiment.
+
+    This function is useful for optimization scenarios where you need to evaluate many
+    candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+    tracking but doesn't require experiment setup or dataset management.
+
+    Args:
+        items: List of dataset item contents (dictionaries with the data to evaluate).
+
+        task: A callable object that takes dict with dataset item content
+            as input and returns dict which will later be used for scoring.
+
+        scoring_metrics: List of metrics to calculate during evaluation.
+            Each metric's `score(...)` method will be called with arguments taken from
+            the dataset item and task output.
+
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function accepts predefined arguments:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+
+        project_name: The name of the project for logging traces.
+
+        verbose: Controls evaluation output logs and progress bars.
+            0 - no outputs (default), 1 - enable outputs.
+
+        scoring_key_mapping: A dictionary that allows you to rename keys present in either
+            the dataset item or the task output to match the keys expected by scoring metrics.
+
+        scoring_threads: Number of thread workers to run scoring metrics.
+
+    Returns:
+        EvaluationResultOnDictItems object containing test results and providing methods
+        to aggregate scores, similar to the regular evaluation result.
+
+    Example:
+        ```python
+        import opik
+        from opik.evaluation.metrics import Equals
+
+        items = [
+            {"input": "What is 2+2?", "expected_output": "4"},
+            {"input": "What is 3+3?", "expected_output": "6"},
+        ]
+
+        def my_task(item):
+            # Your LLM call here
+            question = item["input"]
+            # ... call model ...
+            return {"output": model_output}
+
+        result = opik.evaluate_on_dict_items(
+            items=items,
+            task=my_task,
+            scoring_metrics=[Equals()],
+            scoring_key_mapping={"reference": "expected_output"},
+        )
+
+        # Access individual test results
+        for test_result in result.test_results:
+            print(f"Score: {test_result.score_results[0].value}")
+
+        # Get aggregated statistics
+        aggregated = result.aggregate_evaluation_scores()
+        print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+        ```
+    """
+    # Wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    if not scoring_metrics:
+        LOGGER.warning("No scoring metrics provided for items evaluation")
+        return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+    client = opik_client.get_client_cached()
+
+    # Create evaluation engine
+    with asyncio_support.async_http_connections_expire_immediately():
+        evaluation_engine = engine.EvaluationEngine(
+            client=client,
+            project_name=project_name,
+            scoring_metrics=scoring_metrics,
+            workers=scoring_threads,
+            verbose=verbose,
+            scoring_key_mapping=scoring_key_mapping,
+        )
+
+        # Use the new evaluate_items method
+        test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
+            items=items,
+            task=task,
+        )
+
+    return evaluation_result.EvaluationResultOnDictItems(
+        test_results=test_results,
+    )
+
+
+def _wrap_scoring_functions(
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+    scoring_metrics: Optional[List[base_metric.BaseMetric]],
+    project_name: Optional[str],
+) -> List[base_metric.BaseMetric]:
+    if scoring_functions:
+        function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
+            scoring_functions, project_name=project_name
+        )
+        if scoring_metrics:
+            scoring_metrics.extend(function_metrics)
+        else:
+            scoring_metrics = function_metrics
+
+    return scoring_metrics if scoring_metrics else []
+
+
+def _use_or_create_experiment_name(
+    experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+) -> Optional[str]:
+    if experiment_name:
+        return experiment_name
+
+    if experiment_name_prefix:
+        return experiment_helpers.generate_unique_experiment_name(
+            experiment_name_prefix
+        )
+    else:
+        return None
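Sketch (not part of the diff): the `_wrap_scoring_functions` helper above converts scorer functions into regular metrics via `scorer_wrapper_metric.wrap_scorer_functions`. The docstrings only state that a scorer function receives `dataset_item` and `task_outputs`; the return type shown here (a single `ScoreResult`), the import path, and whether a bare callable satisfies `scorer_function.ScorerFunction` are assumptions.

```python
from opik.evaluation.metrics.score_result import ScoreResult  # assumed import path


def exact_match(dataset_item, task_outputs):
    # Per-item scorer using the documented argument names: compares the dataset
    # item's expected output with the output produced by the LLM task.
    expected = str(dataset_item.get("expected_output", "")).strip()
    actual = str(task_outputs.get("output", "")).strip()
    return ScoreResult(name="exact_match", value=1.0 if expected == actual else 0.0)


# Passed as `scoring_functions=[exact_match]` to evaluate_prompt,
# evaluate_optimization_trial, or evaluate_on_dict_items, such a callable would be
# wrapped into the regular scoring pipeline alongside any `scoring_metrics`.
```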