opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py
@@ -0,0 +1,442 @@
+ """Conversation-level adapters for GEval-based judges."""
+
+ from __future__ import annotations
+
+ from typing import Any, Optional
+
+ import opik.exceptions as exceptions
+
+ from opik.evaluation.metrics import score_result
+ from opik.evaluation.metrics.conversation import types as conversation_types
+ from opik.evaluation.metrics.base_metric import BaseMetric
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+     ConversationThreadMetric,
+ )
+ from opik.evaluation.metrics.llm_judges.g_eval_presets import (
+     compliance_risk as compliance_presets,
+     prompt_uncertainty as prompt_presets,
+     qa_suite as qa_presets,
+ )
+
+
+ class GEvalConversationMetric(ConversationThreadMetric):
+     """
+     Wrap a GEval-style judge so it can evaluate an entire conversation transcript.
+
+     The wrapper extracts the latest assistant turn from the conversation and sends
+     it to the provided judge. Results are normalised into a ``ScoreResult`` so they
+     can plug into the wider Opik evaluation pipeline. Any errors raised by the
+     underlying judge are captured and reported as a failed score computation.
+
+     The conversation input should match :class:`ConversationThreadMetric`
+     semantics—a list of dicts containing ``role`` and ``content`` keys ordered by
+     time.
+
+     Args:
+         judge: A GEval-compatible metric instance that accepts ``output`` as its
+             primary argument.
+         name: Optional override for the metric name used in results. When ``None``
+             the name is derived from the wrapped judge.
+
+     Returns:
+         ScoreResult: Mirrors the wrapped judge's score/value/metadata fields. When
+             the judge fails, ``scoring_failed`` is set and ``value`` is ``0.0``.
+
+     Example:
+         >>> from opik.evaluation.metrics.conversation.llm_judges.g_eval_wrappers import (
+         ...     GEvalConversationMetric,
+         ... )
+         >>> from opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite import DialogueHelpfulnessJudge
+         >>> conversation = [
+         ...     {"role": "user", "content": "Summarise these notes."},
+         ...     {"role": "assistant", "content": "Here is a concise summary..."},
+         ... ]
+         >>> metric = GEvalConversationMetric(judge=DialogueHelpfulnessJudge(model="gpt-4"))
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.83
+     """
+
+     def __init__(
+         self,
+         judge: BaseMetric,
+         name: Optional[str] = None,
+     ) -> None:
+         super().__init__(
+             name=name or f"conversation_{judge.name}",
+             track=getattr(judge, "track", True),
+             project_name=getattr(judge, "project_name", None),
+         )
+         self._judge = judge
+
+     def _normalize_result(self, raw_result: Any) -> score_result.ScoreResult:
+         if isinstance(raw_result, score_result.ScoreResult):
+             return raw_result
+         if isinstance(raw_result, list):
+             if not raw_result:
+                 raise exceptions.MetricComputationError(
+                     "Judge returned an empty list of results."
+                 )
+             first = raw_result[0]
+             if isinstance(first, score_result.ScoreResult):
+                 return first
+         raise exceptions.MetricComputationError(
+             f"Judge {self._judge.name} returned unsupported result type {type(raw_result)!r}"
+         )
+
+     def score(
+         self,
+         conversation: conversation_types.Conversation,
+         **_: Any,
+     ) -> score_result.ScoreResult:
+         """
+         Evaluate the final assistant turn in a conversation.
+
+         Args:
+             conversation: Sequence of dict-like turns containing ``role`` and
+                 ``content`` keys. Only assistant turns with non-empty ``content``
+                 are considered.
+
+         Returns:
+             ScoreResult: Normalised output from the wrapped judge. If no assistant
+                 message is present, the result is marked as failed with ``value=0.0``.
+         """
+         last_assistant = next(
+             (
+                 turn.get("content", "")
+                 for turn in reversed(conversation)
+                 if turn.get("role") == "assistant"
+             ),
+             "",
+         )
+         if not last_assistant.strip():
+             return score_result.ScoreResult(
+                 name=self.name,
+                 value=0.0,
+                 reason="Conversation contains no assistant messages to evaluate.",
+                 scoring_failed=True,
+             )
+
+         try:
+             raw_result = self._judge.score(output=last_assistant)
+         except exceptions.MetricComputationError as error:
+             reason = str(error)
+         except Exception as error:
+             reason = (
+                 f"Judge {self._judge.name} raised {error.__class__.__name__}: {error}"
+             )
+         else:
+             judge_result = self._normalize_result(raw_result)
+             return score_result.ScoreResult(
+                 name=self.name,
+                 value=judge_result.value,
+                 reason=judge_result.reason,
+                 metadata=judge_result.metadata,
+                 scoring_failed=judge_result.scoring_failed,
+             )
+
+         return score_result.ScoreResult(
+             name=self.name,
+             value=0.0,
+             reason=reason,
+             scoring_failed=True,
+         )
+
+
+ class ConversationComplianceRiskMetric(GEvalConversationMetric):
+     """
+     Evaluate the latest assistant response for compliance and risk exposure.
+
+     This metric forwards the final assistant turn to
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.compliance_risk.ComplianceRiskJudge`
+     and returns its assessment as a conversation-level ``ScoreResult``.
+
+     Args:
+         model: Optional model name or identifier understood by the judge.
+         track: Whether to automatically track metric results. Defaults to ``True``.
+         project_name: Optional tracking project name. Defaults to ``None``.
+         temperature: Sampling temperature supplied to the underlying judge model.
+
+     Returns:
+         ScoreResult: Compliance score emitted by the wrapped judge; failed
+             evaluations set ``scoring_failed`` and ``value=0.0``.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationComplianceRiskMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Generate an employment contract."},
+         ...     {"role": "assistant", "content": "Here is a standard contract template..."},
+         ... ]
+         >>> metric = ConversationComplianceRiskMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.12
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=compliance_presets.ComplianceRiskJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_compliance_risk",
+         )
+
+
+ class ConversationDialogueHelpfulnessMetric(GEvalConversationMetric):
+     """
+     Score how helpful the closing assistant message is within the dialogue.
+
+     The metric expects the same conversation shape as
+     :class:`ConversationThreadMetric`. It uses
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.DialogueHelpfulnessJudge`
+     to evaluate usefulness and responsiveness of the final assistant turn.
+
+     Args:
+         model: Optional model name passed to the judge.
+         track: Whether to automatically track results. Defaults to ``True``.
+         project_name: Optional tracking project. Defaults to ``None``.
+         temperature: Temperature fed into the judge's underlying model.
+
+     Returns:
+         ScoreResult: Helpfulness score from the wrapped judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationDialogueHelpfulnessMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "How do I reset my password?"},
+         ...     {"role": "assistant", "content": "Click the reset link and follow the steps."},
+         ... ]
+         >>> metric = ConversationDialogueHelpfulnessMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.88
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.DialogueHelpfulnessJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_dialogue_helpfulness",
+         )
+
+
+ class ConversationQARelevanceMetric(GEvalConversationMetric):
+     """
+     Quantify how relevant the assistant's final answer is to the preceding query.
+
+     This metric expects a conversation sequence compatible with
+     :class:`ConversationThreadMetric` and wraps
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.QARelevanceJudge`;
+     it is useful when the conversation emulates a Q&A exchange.
+
+     Args:
+         model: Optional model name used by the judge backend.
+         track: Whether to automatically track outcomes. Defaults to ``True``.
+         project_name: Optional project for tracked scores. Defaults to ``None``.
+         temperature: Judge sampling temperature.
+
+     Returns:
+         ScoreResult: Relevance score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationQARelevanceMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Who wrote Dune?"},
+         ...     {"role": "assistant", "content": "Frank Herbert wrote Dune."},
+         ... ]
+         >>> metric = ConversationQARelevanceMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         1.0
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.QARelevanceJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_qa_relevance",
+         )
+
+
+ class ConversationSummarizationCoherenceMetric(GEvalConversationMetric):
+     """
+     Assess the coherence of a summary-style assistant response.
+
+     The metric expects the conversation schema defined by
+     :class:`ConversationThreadMetric` and invokes
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.SummarizationCoherenceJudge`
+     to rate whether the summary flows naturally and captures the conversation
+     structure.
+
+     Args:
+         model: Optional model name or identifier for the judge.
+         track: Whether to track metric results automatically. Defaults to ``True``.
+         project_name: Optional project name for tracked scores. Defaults to ``None``.
+         temperature: Sampling temperature passed to the judge model.
+
+     Returns:
+         ScoreResult: Coherence score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationSummarizationCoherenceMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Summarise this chat."},
+         ...     {"role": "assistant", "content": "Summary: we discussed timelines and budgets."},
+         ... ]
+         >>> metric = ConversationSummarizationCoherenceMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.91
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.SummarizationCoherenceJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_summarization_coherence",
+         )
+
+
+ class ConversationSummarizationConsistencyMetric(GEvalConversationMetric):
+     """
+     Check whether a dialogue summary stays faithful to the source turns.
+
+     The metric assumes the standard conversation schema and delegates scoring to
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite.SummarizationConsistencyJudge`
+     and reports the result at the conversation level.
+
+     Args:
+         model: Optional model name passed through to the judge.
+         track: Whether to automatically track results. Defaults to ``True``.
+         project_name: Optional tracking project. Defaults to ``None``.
+         temperature: Temperature parameter supplied to the judge model.
+
+     Returns:
+         ScoreResult: Consistency score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationSummarizationConsistencyMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Give me a summary."},
+         ...     {"role": "assistant", "content": "Summary: project ships next week."},
+         ... ]
+         >>> metric = ConversationSummarizationConsistencyMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.95
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=qa_presets.SummarizationConsistencyJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_summarization_consistency",
+         )
+
+
+ class ConversationPromptUncertaintyMetric(GEvalConversationMetric):
+     """
+     Measure how uncertain the assistant appears about executing the prompt.
+
+     The metric expects the standard conversation schema and pipes the latest
+     assistant reply into
+     :class:`~opik.evaluation.metrics.llm_judges.g_eval_presets.prompt_uncertainty.PromptUncertaintyJudge`
+     and returns the judge's score in a conversation-friendly format.
+
+     Args:
+         model: Optional model name for the judge to use.
+         track: Whether to automatically track the metric. Defaults to ``True``.
+         project_name: Optional tracking project. Defaults to ``None``.
+         temperature: Sampling temperature for the judge model.
+
+     Returns:
+         ScoreResult: Uncertainty score from the judge.
+
+     Example:
+         >>> from opik.evaluation.metrics import ConversationPromptUncertaintyMetric
+         >>> conversation = [
+         ...     {"role": "user", "content": "Follow the brief precisely."},
+         ...     {"role": "assistant", "content": "I'm not fully certain which part to prioritise."},
+         ... ]
+         >>> metric = ConversationPromptUncertaintyMetric(model="gpt-4")
+         >>> result = metric.score(conversation)
+         >>> result.value  # doctest: +SKIP
+         0.42
+     """
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         track: bool = True,
+         project_name: Optional[str] = None,
+         temperature: float = 0.0,
+     ) -> None:
+         super().__init__(
+             judge=prompt_presets.PromptUncertaintyJudge(
+                 model=model,
+                 track=track,
+                 project_name=project_name,
+                 temperature=temperature,
+             ),
+             name="conversation_prompt_uncertainty",
+         )
+
+
+ __all__ = [
+     "GEvalConversationMetric",
+     "ConversationComplianceRiskMetric",
+     "ConversationDialogueHelpfulnessMetric",
+     "ConversationQARelevanceMetric",
+     "ConversationSummarizationCoherenceMetric",
+     "ConversationSummarizationConsistencyMetric",
+     "ConversationPromptUncertaintyMetric",
+ ]
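
The wrappers above compose with the preset judges added under g_eval_presets. A minimal usage sketch, assuming only the imports shown in the module's own docstrings and a configured LLM provider (the model name is illustrative):

from opik.evaluation.metrics.conversation.llm_judges.g_eval_wrappers import (
    GEvalConversationMetric,
)
from opik.evaluation.metrics.llm_judges.g_eval_presets.qa_suite import (
    QARelevanceJudge,
)

conversation = [
    {"role": "user", "content": "Who wrote Dune?"},
    {"role": "assistant", "content": "Frank Herbert wrote Dune."},
]

# Wrap a single-output judge so it scores only the last assistant turn.
metric = GEvalConversationMetric(judge=QARelevanceJudge(model="gpt-4o"))
result = metric.score(conversation)
print(result.value, result.reason)

The same pattern is what the named subclasses (ConversationComplianceRiskMetric and friends) do internally, each with a fixed judge and metric name.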
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py
@@ -3,17 +3,20 @@ import logging
  from typing import Optional, Any, Union, List
  import pydantic

+ from opik.evaluation.metrics.conversation import types as conversation_types
  import opik.exceptions as exceptions
+ from opik.evaluation.metrics import score_result
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+     ConversationThreadMetric,
+ )
+ from opik.evaluation.metrics.llm_judges import parsing_helpers
  from opik.evaluation.models import base_model, models_factory
  from . import schema, templates
- from .. import conversation_thread_metric, types as conversation_types
- from ... import score_result
- from ...llm_judges import parsing_helpers

  LOGGER = logging.getLogger(__name__)


- class SessionCompletenessQuality(conversation_thread_metric.ConversationThreadMetric):
+ class SessionCompletenessQuality(ConversationThreadMetric):
      """
      Represents the Session Completeness Quality metric for a conversation thread.

@@ -34,6 +37,7 @@ class SessionCompletenessQuality(conversation_thread_metric.ConversationThreadMe
          include_reason: Whether to include a reason for the score.
          track: Whether to track the metric. Default is True.
          project_name: The project name to track the metric in.
+         temperature: The temperature to use for the model. Defaults to 1e-8.

      Example:
          >>> from opik.evaluation.metrics import SessionCompletenessQuality
@@ -59,22 +63,24 @@ class SessionCompletenessQuality(conversation_thread_metric.ConversationThreadMe
          include_reason: bool = True,
          track: bool = True,
          project_name: Optional[str] = None,
+         temperature: float = 1e-8,
      ):
          super().__init__(
              name=name,
              track=track,
              project_name=project_name,
          )
-         self._init_model(model)
          self._include_reason = include_reason

+         self._init_model(model, temperature=temperature)
+
      def _init_model(
-         self, model: Optional[Union[str, base_model.OpikBaseModel]]
+         self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
      ) -> None:
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model)
+             self._model = models_factory.get(model_name=model, temperature=temperature)

      def score(
          self,
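
The relocated thread metrics gain an explicit temperature argument that is forwarded to models_factory.get. A minimal sketch of the new knob (model name illustrative; the near-zero default mirrors the previous behaviour):

from opik.evaluation.metrics import SessionCompletenessQuality

# Default stays near-deterministic (1e-8); pass a value to override.
metric = SessionCompletenessQuality(model="gpt-4o", temperature=0.3)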
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py
@@ -1,6 +1,6 @@
  from typing import List

- from .. import types as conversation_types
+ from opik.evaluation.metrics.conversation import types as conversation_types


  def extract_user_goals(conversation: conversation_types.Conversation) -> str:
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py
@@ -4,21 +4,21 @@ from typing import Optional, Union, Any, List, Dict

  import pydantic

+ from opik.evaluation.metrics.conversation import helpers as conversation_helpers
+ from opik.evaluation.metrics.conversation import types as conversation_types
  import opik.exceptions as exceptions
  from opik.evaluation.metrics import score_result
+ from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+     ConversationThreadMetric,
+ )
+ from opik.evaluation.metrics.llm_judges import parsing_helpers
  from opik.evaluation.models import base_model, models_factory
  from . import schema, templates
- from .. import (
-     types as conversation_types,
-     conversation_thread_metric,
-     helpers,
- )
- from ...llm_judges import parsing_helpers

  LOGGER = logging.getLogger(__name__)


- class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric):
+ class UserFrustrationMetric(ConversationThreadMetric):
      """
      A heuristic score estimating the likelihood that the user experienced confusion, annoyance,
      or disengagement during the session — due to repetition, lack of adaptation, ignored
@@ -47,6 +47,7 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          window_size: The window size to use for calculating the score. It defines the
              maximal number of historical turns to include in each window when assessing
              the frustration of the current turn in the conversation. Default is 10.
+         temperature: The temperature to use for the model. Defaults to 1e-8.

      Example:
          >>> from opik.evaluation.metrics import UserFrustrationMetric
@@ -73,23 +74,25 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          track: bool = True,
          project_name: Optional[str] = None,
          window_size: int = 10,
+         temperature: float = 1e-8,
      ):
          super().__init__(
              name=name,
              track=track,
              project_name=project_name,
          )
-         self._init_model(model)
          self._include_reason = include_reason
          self._window_size = window_size

+         self._init_model(model, temperature=temperature)
+
      def _init_model(
-         self, model: Optional[Union[str, base_model.OpikBaseModel]]
+         self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
      ) -> None:
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model)
+             self._model = models_factory.get(model_name=model, temperature=temperature)

      def score(
          self,
@@ -110,8 +113,10 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          conversation: conversation_types.Conversation,
      ) -> score_result.ScoreResult:
          try:
-             turns_windows = helpers.extract_turns_windows_from_conversation(
-                 conversation=conversation, window_size=self._window_size
+             turns_windows = (
+                 conversation_helpers.extract_turns_windows_from_conversation(
+                     conversation=conversation, window_size=self._window_size
+                 )
              )

              verdicts = [
@@ -141,8 +146,10 @@ class UserFrustrationMetric(conversation_thread_metric.ConversationThreadMetric)
          conversation: conversation_types.Conversation,
      ) -> score_result.ScoreResult:
          try:
-             turns_windows = helpers.extract_turns_windows_from_conversation(
-                 conversation=conversation, window_size=self._window_size
+             turns_windows = (
+                 conversation_helpers.extract_turns_windows_from_conversation(
+                     conversation=conversation, window_size=self._window_size
+                 )
              )

              verdicts = await asyncio.gather(
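
The frustration metric keeps its sliding-window design: window_size caps how many historical turns feed each per-turn verdict, and the new temperature argument flows through to the judge model. A minimal sketch (model name illustrative):

from opik.evaluation.metrics import UserFrustrationMetric

# Assess each turn using a window of at most 5 historical turns.
metric = UserFrustrationMetric(model="gpt-4o", window_size=5, temperature=1e-8)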
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py
@@ -1,6 +1,6 @@
  from typing import List, Dict

- from .. import types as conversation_types
+ from opik.evaluation.metrics.conversation import types as conversation_types


  def evaluate_conversation(sliding_window: conversation_types.Conversation) -> str:
opik/evaluation/metrics/conversation/types.py
@@ -1,4 +1,4 @@
- from typing import Dict, Literal, List, Optional
+ from typing import Dict, List, Literal, Optional

  import pydantic

@@ -9,15 +9,12 @@ Conversation = List[ConversationDict]
  class ConversationTurn(pydantic.BaseModel):
      """
      Representation of a single turn in a conversation.
-
      This class defines a model for encapsulating a single conversational
      turn consisting of an input user's message and an output LLM message. It is
      designed to handle the exchange of messages in a structured format.
-
      Args:
          input: The input message of the conversation turn.
          output: The output message of the conversation turn.
-
      Example:
          >>> conversation_turn = ConversationTurn(
          >>>     input={"role": "user", "content": "Hello!"},
@@ -31,5 +28,7 @@ class ConversationTurn(pydantic.BaseModel):
      def as_list(self) -> List[ConversationDict]:
          if self.output is None:
              return [self.input]
-
          return [self.input, self.output]
+
+
+ __all__ = ["ConversationDict", "Conversation", "ConversationTurn"]
opik/evaluation/metrics/conversation_types.py
@@ -0,0 +1,9 @@
+ """Public aliases for conversation type helpers."""
+
+ from .conversation.types import (
+     Conversation,
+     ConversationDict,
+     ConversationTurn,
+ )
+
+ __all__ = ["Conversation", "ConversationDict", "ConversationTurn"]
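
This alias module gives the conversation types a stable import path at the metrics package root. A short sketch of the resulting usage, mirroring the ConversationTurn doctest above:

from opik.evaluation.metrics.conversation_types import ConversationTurn

turn = ConversationTurn(
    input={"role": "user", "content": "Hello!"},
    output={"role": "assistant", "content": "Hi, how can I help?"},
)
print(turn.as_list())  # [input dict, output dict]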