PyPI - opik - Versions diffs - 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl - Mend

opik 1.8.39-py3-none-any.whl → 1.9.71-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (592)

opik/__init__.py +19 -3
opik/anonymizer/__init__.py +5 -0
opik/anonymizer/anonymizer.py +12 -0
opik/anonymizer/factory.py +80 -0
opik/anonymizer/recursive_anonymizer.py +64 -0
opik/anonymizer/rules.py +56 -0
opik/anonymizer/rules_anonymizer.py +35 -0
opik/api_objects/attachment/attachment_context.py +36 -0
opik/api_objects/attachment/attachments_extractor.py +153 -0
opik/api_objects/attachment/client.py +1 -0
opik/api_objects/attachment/converters.py +2 -0
opik/api_objects/attachment/decoder.py +18 -0
opik/api_objects/attachment/decoder_base64.py +83 -0
opik/api_objects/attachment/decoder_helpers.py +137 -0
opik/api_objects/data_helpers.py +79 -0
opik/api_objects/dataset/dataset.py +64 -4
opik/api_objects/dataset/rest_operations.py +11 -2
opik/api_objects/experiment/experiment.py +57 -57
opik/api_objects/experiment/experiment_item.py +2 -1
opik/api_objects/experiment/experiments_client.py +64 -0
opik/api_objects/experiment/helpers.py +35 -11
opik/api_objects/experiment/rest_operations.py +65 -5
opik/api_objects/helpers.py +8 -5
opik/api_objects/local_recording.py +81 -0
opik/api_objects/opik_client.py +600 -108
opik/api_objects/opik_query_language.py +39 -5
opik/api_objects/prompt/__init__.py +12 -2
opik/api_objects/prompt/base_prompt.py +69 -0
opik/api_objects/prompt/base_prompt_template.py +29 -0
opik/api_objects/prompt/chat/__init__.py +1 -0
opik/api_objects/prompt/chat/chat_prompt.py +210 -0
opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
opik/api_objects/prompt/client.py +189 -47
opik/api_objects/prompt/text/__init__.py +1 -0
opik/api_objects/prompt/text/prompt.py +174 -0
opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
opik/api_objects/prompt/types.py +23 -0
opik/api_objects/search_helpers.py +89 -0
opik/api_objects/span/span_data.py +35 -25
opik/api_objects/threads/threads_client.py +39 -5
opik/api_objects/trace/trace_client.py +52 -2
opik/api_objects/trace/trace_data.py +15 -24
opik/api_objects/validation_helpers.py +3 -3
opik/cli/__init__.py +5 -0
opik/cli/__main__.py +6 -0
opik/cli/configure.py +66 -0
opik/cli/exports/__init__.py +131 -0
opik/cli/exports/dataset.py +278 -0
opik/cli/exports/experiment.py +784 -0
opik/cli/exports/project.py +685 -0
opik/cli/exports/prompt.py +578 -0
opik/cli/exports/utils.py +406 -0
opik/cli/harbor.py +39 -0
opik/cli/healthcheck.py +21 -0
opik/cli/imports/__init__.py +439 -0
opik/cli/imports/dataset.py +143 -0
opik/cli/imports/experiment.py +1192 -0
opik/cli/imports/project.py +262 -0
opik/cli/imports/prompt.py +177 -0
opik/cli/imports/utils.py +280 -0
opik/cli/main.py +49 -0
opik/cli/proxy.py +93 -0
opik/cli/usage_report/__init__.py +16 -0
opik/cli/usage_report/charts.py +783 -0
opik/cli/usage_report/cli.py +274 -0
opik/cli/usage_report/constants.py +9 -0
opik/cli/usage_report/extraction.py +749 -0
opik/cli/usage_report/pdf.py +244 -0
opik/cli/usage_report/statistics.py +78 -0
opik/cli/usage_report/utils.py +235 -0
opik/config.py +13 -7
opik/configurator/configure.py +17 -0
opik/datetime_helpers.py +12 -0
opik/decorator/arguments_helpers.py +9 -1
opik/decorator/base_track_decorator.py +205 -133
opik/decorator/context_manager/span_context_manager.py +123 -0
opik/decorator/context_manager/trace_context_manager.py +84 -0
opik/decorator/opik_args/__init__.py +13 -0
opik/decorator/opik_args/api_classes.py +71 -0
opik/decorator/opik_args/helpers.py +120 -0
opik/decorator/span_creation_handler.py +25 -6
opik/dict_utils.py +3 -3
opik/evaluation/__init__.py +13 -2
opik/evaluation/engine/engine.py +272 -75
opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
opik/evaluation/engine/helpers.py +31 -6
opik/evaluation/engine/metrics_evaluator.py +237 -0
opik/evaluation/evaluation_result.py +168 -2
opik/evaluation/evaluator.py +533 -62
opik/evaluation/metrics/__init__.py +103 -4
opik/evaluation/metrics/aggregated_metric.py +35 -6
opik/evaluation/metrics/base_metric.py +1 -1
opik/evaluation/metrics/conversation/__init__.py +48 -0
opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
opik/evaluation/metrics/conversation/helpers.py +14 -15
opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
opik/evaluation/metrics/conversation/types.py +4 -5
opik/evaluation/metrics/conversation_types.py +9 -0
opik/evaluation/metrics/heuristics/bertscore.py +107 -0
opik/evaluation/metrics/heuristics/bleu.py +35 -15
opik/evaluation/metrics/heuristics/chrf.py +127 -0
opik/evaluation/metrics/heuristics/contains.py +47 -11
opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
opik/evaluation/metrics/heuristics/gleu.py +113 -0
opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
opik/evaluation/metrics/heuristics/meteor.py +119 -0
opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
opik/evaluation/metrics/heuristics/readability.py +129 -0
opik/evaluation/metrics/heuristics/rouge.py +26 -9
opik/evaluation/metrics/heuristics/spearman.py +88 -0
opik/evaluation/metrics/heuristics/tone.py +155 -0
opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
opik/evaluation/metrics/ragas_metric.py +43 -23
opik/evaluation/models/__init__.py +8 -0
opik/evaluation/models/base_model.py +107 -1
opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
opik/evaluation/models/langchain/message_converters.py +97 -15
opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
opik/evaluation/models/litellm/util.py +125 -0
opik/evaluation/models/litellm/warning_filters.py +16 -4
opik/evaluation/models/model_capabilities.py +187 -0
opik/evaluation/models/models_factory.py +25 -3
opik/evaluation/preprocessing.py +92 -0
opik/evaluation/report.py +70 -12
opik/evaluation/rest_operations.py +49 -45
opik/evaluation/samplers/__init__.py +4 -0
opik/evaluation/samplers/base_dataset_sampler.py +40 -0
opik/evaluation/samplers/random_dataset_sampler.py +48 -0
opik/evaluation/score_statistics.py +66 -0
opik/evaluation/scorers/__init__.py +4 -0
opik/evaluation/scorers/scorer_function.py +55 -0
opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
opik/evaluation/test_case.py +3 -2
opik/evaluation/test_result.py +1 -0
opik/evaluation/threads/evaluator.py +31 -3
opik/evaluation/threads/helpers.py +3 -2
opik/evaluation/types.py +9 -1
opik/exceptions.py +33 -0
opik/file_upload/file_uploader.py +13 -0
opik/file_upload/upload_options.py +2 -0
opik/hooks/__init__.py +23 -0
opik/hooks/anonymizer_hook.py +36 -0
opik/hooks/httpx_client_hook.py +112 -0
opik/httpx_client.py +12 -9
opik/id_helpers.py +18 -0
opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
opik/integrations/adk/helpers.py +16 -7
opik/integrations/adk/legacy_opik_tracer.py +7 -4
opik/integrations/adk/opik_tracer.py +14 -1
opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
opik/integrations/adk/recursive_callback_injector.py +4 -7
opik/integrations/bedrock/converse/__init__.py +0 -0
opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
opik/integrations/bedrock/invoke_model/__init__.py +0 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
opik/integrations/bedrock/invoke_model/response_types.py +34 -0
opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
opik/integrations/bedrock/opik_tracker.py +42 -4
opik/integrations/bedrock/types.py +19 -0
opik/integrations/crewai/crewai_decorator.py +8 -51
opik/integrations/crewai/opik_tracker.py +31 -10
opik/integrations/crewai/patchers/__init__.py +5 -0
opik/integrations/crewai/patchers/flow.py +118 -0
opik/integrations/crewai/patchers/litellm_completion.py +30 -0
opik/integrations/crewai/patchers/llm_client.py +207 -0
opik/integrations/dspy/callback.py +80 -17
opik/integrations/dspy/parsers.py +168 -0
opik/integrations/harbor/__init__.py +17 -0
opik/integrations/harbor/experiment_service.py +269 -0
opik/integrations/harbor/opik_tracker.py +528 -0
opik/integrations/haystack/opik_connector.py +2 -2
opik/integrations/haystack/opik_tracer.py +3 -7
opik/integrations/langchain/__init__.py +3 -1
opik/integrations/langchain/helpers.py +96 -0
opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
opik/integrations/langchain/opik_encoder_extension.py +1 -1
opik/integrations/langchain/opik_tracer.py +474 -229
opik/integrations/litellm/__init__.py +5 -0
opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
opik/integrations/litellm/litellm_completion_decorator.py +242 -0
opik/integrations/litellm/opik_tracker.py +43 -0
opik/integrations/litellm/stream_patchers.py +151 -0
opik/integrations/llama_index/callback.py +146 -107
opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
opik/integrations/openai/opik_tracker.py +1 -1
opik/integrations/sagemaker/auth.py +5 -1
opik/llm_usage/google_usage.py +3 -1
opik/llm_usage/opik_usage.py +7 -8
opik/llm_usage/opik_usage_factory.py +4 -2
opik/logging_messages.py +6 -0
opik/message_processing/batching/base_batcher.py +14 -21
opik/message_processing/batching/batch_manager.py +22 -10
opik/message_processing/batching/batch_manager_constuctors.py +10 -0
opik/message_processing/batching/batchers.py +59 -27
opik/message_processing/batching/flushing_thread.py +0 -3
opik/message_processing/emulation/__init__.py +0 -0
opik/message_processing/emulation/emulator_message_processor.py +578 -0
opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
opik/message_processing/emulation/models.py +162 -0
opik/message_processing/encoder_helpers.py +79 -0
opik/message_processing/messages.py +56 -1
opik/message_processing/preprocessing/__init__.py +0 -0
opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
opik/message_processing/preprocessing/constants.py +1 -0
opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
opik/message_processing/preprocessing/preprocessor.py +36 -0
opik/message_processing/processors/__init__.py +0 -0
opik/message_processing/processors/attachments_extraction_processor.py +146 -0
opik/message_processing/processors/message_processors.py +92 -0
opik/message_processing/processors/message_processors_chain.py +96 -0
opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
opik/message_processing/queue_consumer.py +9 -3
opik/message_processing/streamer.py +71 -33
opik/message_processing/streamer_constructors.py +43 -10
opik/opik_context.py +16 -4
opik/plugins/pytest/hooks.py +5 -3
opik/rest_api/__init__.py +346 -15
opik/rest_api/alerts/__init__.py +7 -0
opik/rest_api/alerts/client.py +667 -0
opik/rest_api/alerts/raw_client.py +1015 -0
opik/rest_api/alerts/types/__init__.py +7 -0
opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
opik/rest_api/annotation_queues/__init__.py +4 -0
opik/rest_api/annotation_queues/client.py +668 -0
opik/rest_api/annotation_queues/raw_client.py +1019 -0
opik/rest_api/automation_rule_evaluators/client.py +34 -2
opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
opik/rest_api/client.py +15 -0
opik/rest_api/dashboards/__init__.py +4 -0
opik/rest_api/dashboards/client.py +462 -0
opik/rest_api/dashboards/raw_client.py +648 -0
opik/rest_api/datasets/client.py +1310 -44
opik/rest_api/datasets/raw_client.py +2269 -358
opik/rest_api/experiments/__init__.py +2 -2
opik/rest_api/experiments/client.py +191 -5
opik/rest_api/experiments/raw_client.py +301 -7
opik/rest_api/experiments/types/__init__.py +4 -1
opik/rest_api/experiments/types/experiment_update_status.py +5 -0
opik/rest_api/experiments/types/experiment_update_type.py +5 -0
opik/rest_api/experiments/types/experiment_write_status.py +5 -0
opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
opik/rest_api/llm_provider_key/client.py +20 -0
opik/rest_api/llm_provider_key/raw_client.py +20 -0
opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
opik/rest_api/manual_evaluation/__init__.py +4 -0
opik/rest_api/manual_evaluation/client.py +347 -0
opik/rest_api/manual_evaluation/raw_client.py +543 -0
opik/rest_api/optimizations/client.py +145 -9
opik/rest_api/optimizations/raw_client.py +237 -13
opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
opik/rest_api/prompts/__init__.py +2 -2
opik/rest_api/prompts/client.py +227 -6
opik/rest_api/prompts/raw_client.py +331 -2
opik/rest_api/prompts/types/__init__.py +3 -1
opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
opik/rest_api/spans/__init__.py +0 -2
opik/rest_api/spans/client.py +238 -76
opik/rest_api/spans/raw_client.py +307 -95
opik/rest_api/spans/types/__init__.py +0 -2
opik/rest_api/traces/client.py +572 -161
opik/rest_api/traces/raw_client.py +736 -229
opik/rest_api/types/__init__.py +352 -17
opik/rest_api/types/aggregation_data.py +1 -0
opik/rest_api/types/alert.py +33 -0
opik/rest_api/types/alert_alert_type.py +5 -0
opik/rest_api/types/alert_page_public.py +24 -0
opik/rest_api/types/alert_public.py +33 -0
opik/rest_api/types/alert_public_alert_type.py +5 -0
opik/rest_api/types/alert_trigger.py +27 -0
opik/rest_api/types/alert_trigger_config.py +28 -0
opik/rest_api/types/alert_trigger_config_public.py +28 -0
opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
opik/rest_api/types/alert_trigger_config_type.py +10 -0
opik/rest_api/types/alert_trigger_config_write.py +22 -0
opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
opik/rest_api/types/alert_trigger_event_type.py +19 -0
opik/rest_api/types/alert_trigger_public.py +27 -0
opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
opik/rest_api/types/alert_trigger_write.py +23 -0
opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
opik/rest_api/types/alert_write.py +28 -0
opik/rest_api/types/alert_write_alert_type.py +5 -0
opik/rest_api/types/annotation_queue.py +42 -0
opik/rest_api/types/annotation_queue_batch.py +27 -0
opik/rest_api/types/annotation_queue_item_ids.py +19 -0
opik/rest_api/types/annotation_queue_page_public.py +28 -0
opik/rest_api/types/annotation_queue_public.py +38 -0
opik/rest_api/types/annotation_queue_public_scope.py +5 -0
opik/rest_api/types/annotation_queue_reviewer.py +20 -0
opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
opik/rest_api/types/annotation_queue_scope.py +5 -0
opik/rest_api/types/annotation_queue_write.py +31 -0
opik/rest_api/types/annotation_queue_write_scope.py +5 -0
opik/rest_api/types/audio_url.py +19 -0
opik/rest_api/types/audio_url_public.py +19 -0
opik/rest_api/types/audio_url_write.py +19 -0
opik/rest_api/types/automation_rule_evaluator.py +62 -2
opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
opik/rest_api/types/boolean_feedback_definition.py +25 -0
opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
opik/rest_api/types/boolean_feedback_detail.py +29 -0
opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
opik/rest_api/types/dashboard_page_public.py +24 -0
opik/rest_api/types/dashboard_public.py +30 -0
opik/rest_api/types/dataset.py +4 -0
opik/rest_api/types/dataset_expansion.py +42 -0
opik/rest_api/types/dataset_expansion_response.py +39 -0
opik/rest_api/types/dataset_item.py +2 -0
opik/rest_api/types/dataset_item_changes_public.py +5 -0
opik/rest_api/types/dataset_item_compare.py +2 -0
opik/rest_api/types/dataset_item_filter.py +27 -0
opik/rest_api/types/dataset_item_filter_operator.py +21 -0
opik/rest_api/types/dataset_item_page_compare.py +5 -0
opik/rest_api/types/dataset_item_page_public.py +5 -0
opik/rest_api/types/dataset_item_public.py +2 -0
opik/rest_api/types/dataset_item_update.py +39 -0
opik/rest_api/types/dataset_item_write.py +1 -0
opik/rest_api/types/dataset_public.py +4 -0
opik/rest_api/types/dataset_public_status.py +5 -0
opik/rest_api/types/dataset_status.py +5 -0
opik/rest_api/types/dataset_version_diff.py +22 -0
opik/rest_api/types/dataset_version_diff_stats.py +24 -0
opik/rest_api/types/dataset_version_page_public.py +23 -0
opik/rest_api/types/dataset_version_public.py +59 -0
opik/rest_api/types/dataset_version_summary.py +46 -0
opik/rest_api/types/dataset_version_summary_public.py +46 -0
opik/rest_api/types/experiment.py +7 -2
opik/rest_api/types/experiment_group_response.py +2 -0
opik/rest_api/types/experiment_public.py +7 -2
opik/rest_api/types/experiment_public_status.py +5 -0
opik/rest_api/types/experiment_score.py +20 -0
opik/rest_api/types/experiment_score_public.py +20 -0
opik/rest_api/types/experiment_score_write.py +20 -0
opik/rest_api/types/experiment_status.py +5 -0
opik/rest_api/types/feedback.py +25 -1
opik/rest_api/types/feedback_create.py +20 -1
opik/rest_api/types/feedback_object_public.py +27 -1
opik/rest_api/types/feedback_public.py +25 -1
opik/rest_api/types/feedback_score_batch_item.py +2 -1
opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
opik/rest_api/types/feedback_score_public.py +4 -0
opik/rest_api/types/feedback_update.py +20 -1
opik/rest_api/types/group_content_with_aggregations.py +1 -0
opik/rest_api/types/group_detail.py +19 -0
opik/rest_api/types/group_details.py +20 -0
opik/rest_api/types/guardrail.py +1 -0
opik/rest_api/types/guardrail_write.py +1 -0
opik/rest_api/types/ids_holder.py +19 -0
opik/rest_api/types/image_url.py +20 -0
opik/rest_api/types/image_url_public.py +20 -0
opik/rest_api/types/image_url_write.py +20 -0
opik/rest_api/types/llm_as_judge_message.py +5 -1
opik/rest_api/types/llm_as_judge_message_content.py +26 -0
opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
opik/rest_api/types/llm_as_judge_message_public.py +5 -1
opik/rest_api/types/llm_as_judge_message_write.py +5 -1
opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
opik/rest_api/types/manual_evaluation_request.py +38 -0
opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
opik/rest_api/types/manual_evaluation_response.py +27 -0
opik/rest_api/types/optimization.py +4 -2
opik/rest_api/types/optimization_public.py +4 -2
opik/rest_api/types/optimization_public_status.py +3 -1
opik/rest_api/types/optimization_status.py +3 -1
opik/rest_api/types/optimization_studio_config.py +27 -0
opik/rest_api/types/optimization_studio_config_public.py +27 -0
opik/rest_api/types/optimization_studio_config_write.py +27 -0
opik/rest_api/types/optimization_studio_log.py +22 -0
opik/rest_api/types/optimization_write.py +4 -2
opik/rest_api/types/optimization_write_status.py +3 -1
opik/rest_api/types/project.py +1 -0
opik/rest_api/types/project_detailed.py +1 -0
opik/rest_api/types/project_reference.py +31 -0
opik/rest_api/types/project_reference_public.py +31 -0
opik/rest_api/types/project_stats_summary_item.py +1 -0
opik/rest_api/types/prompt.py +6 -0
opik/rest_api/types/prompt_detail.py +6 -0
opik/rest_api/types/prompt_detail_template_structure.py +5 -0
opik/rest_api/types/prompt_public.py +6 -0
opik/rest_api/types/prompt_public_template_structure.py +5 -0
opik/rest_api/types/prompt_template_structure.py +5 -0
opik/rest_api/types/prompt_version.py +3 -0
opik/rest_api/types/prompt_version_detail.py +3 -0
opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
opik/rest_api/types/prompt_version_link.py +1 -0
opik/rest_api/types/prompt_version_link_public.py +1 -0
opik/rest_api/types/prompt_version_page_public.py +5 -0
opik/rest_api/types/prompt_version_public.py +3 -0
opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
opik/rest_api/types/prompt_version_template_structure.py +5 -0
opik/rest_api/types/prompt_version_update.py +33 -0
opik/rest_api/types/provider_api_key.py +9 -0
opik/rest_api/types/provider_api_key_provider.py +1 -1
opik/rest_api/types/provider_api_key_public.py +9 -0
opik/rest_api/types/provider_api_key_public_provider.py +1 -1
opik/rest_api/types/score_name.py +1 -0
opik/rest_api/types/service_toggles_config.py +18 -0
opik/rest_api/types/span.py +1 -2
opik/rest_api/types/span_enrichment_options.py +31 -0
opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
opik/rest_api/types/span_filter.py +23 -0
opik/rest_api/types/span_filter_operator.py +21 -0
opik/rest_api/types/span_filter_write.py +23 -0
opik/rest_api/types/span_filter_write_operator.py +21 -0
opik/rest_api/types/span_llm_as_judge_code.py +27 -0
opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
opik/rest_api/types/span_public.py +1 -2
opik/rest_api/types/span_update.py +46 -0
opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
opik/rest_api/types/span_write.py +1 -2
opik/rest_api/types/studio_evaluation.py +20 -0
opik/rest_api/types/studio_evaluation_public.py +20 -0
opik/rest_api/types/studio_evaluation_write.py +20 -0
opik/rest_api/types/studio_llm_model.py +21 -0
opik/rest_api/types/studio_llm_model_public.py +21 -0
opik/rest_api/types/studio_llm_model_write.py +21 -0
opik/rest_api/types/studio_message.py +20 -0
opik/rest_api/types/studio_message_public.py +20 -0
opik/rest_api/types/studio_message_write.py +20 -0
opik/rest_api/types/studio_metric.py +21 -0
opik/rest_api/types/studio_metric_public.py +21 -0
opik/rest_api/types/studio_metric_write.py +21 -0
opik/rest_api/types/studio_optimizer.py +21 -0
opik/rest_api/types/studio_optimizer_public.py +21 -0
opik/rest_api/types/studio_optimizer_write.py +21 -0
opik/rest_api/types/studio_prompt.py +20 -0
opik/rest_api/types/studio_prompt_public.py +20 -0
opik/rest_api/types/studio_prompt_write.py +20 -0
opik/rest_api/types/trace.py +11 -2
opik/rest_api/types/trace_enrichment_options.py +32 -0
opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
opik/rest_api/types/trace_filter.py +23 -0
opik/rest_api/types/trace_filter_operator.py +21 -0
opik/rest_api/types/trace_filter_write.py +23 -0
opik/rest_api/types/trace_filter_write_operator.py +21 -0
opik/rest_api/types/trace_public.py +11 -2
opik/rest_api/types/trace_thread_filter_write.py +23 -0
opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
opik/rest_api/types/trace_thread_identifier.py +1 -0
opik/rest_api/types/trace_update.py +39 -0
opik/rest_api/types/trace_write.py +1 -2
opik/rest_api/types/value_entry.py +2 -0
opik/rest_api/types/value_entry_compare.py +2 -0
opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
opik/rest_api/types/value_entry_public.py +2 -0
opik/rest_api/types/video_url.py +19 -0
opik/rest_api/types/video_url_public.py +19 -0
opik/rest_api/types/video_url_write.py +19 -0
opik/rest_api/types/webhook.py +28 -0
opik/rest_api/types/webhook_examples.py +19 -0
opik/rest_api/types/webhook_public.py +28 -0
opik/rest_api/types/webhook_test_result.py +23 -0
opik/rest_api/types/webhook_test_result_status.py +5 -0
opik/rest_api/types/webhook_write.py +23 -0
opik/rest_api/types/welcome_wizard_tracking.py +22 -0
opik/rest_api/types/workspace_configuration.py +5 -0
opik/rest_api/welcome_wizard/__init__.py +4 -0
opik/rest_api/welcome_wizard/client.py +195 -0
opik/rest_api/welcome_wizard/raw_client.py +208 -0
opik/rest_api/workspaces/client.py +14 -2
opik/rest_api/workspaces/raw_client.py +10 -0
opik/s3_httpx_client.py +14 -1
opik/simulation/__init__.py +6 -0
opik/simulation/simulated_user.py +99 -0
opik/simulation/simulator.py +108 -0
opik/synchronization.py +5 -6
opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
opik/types.py +36 -0
opik/validation/chat_prompt_messages.py +241 -0
opik/validation/feedback_score.py +3 -3
opik/validation/validator.py +28 -0
opik-1.9.71.dist-info/METADATA +370 -0
opik-1.9.71.dist-info/RECORD +1110 -0
opik/api_objects/prompt/prompt.py +0 -112
opik/cli.py +0 -193
opik/hooks.py +0 -13
opik/integrations/bedrock/chunks_aggregator.py +0 -55
opik/integrations/bedrock/helpers.py +0 -8
opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
opik-1.8.39.dist-info/METADATA +0 -339
opik-1.8.39.dist-info/RECORD +0 -790
/opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
/opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
/opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
/opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
/opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
/opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
/opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
/opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0

opik/evaluation/metrics/__init__.py CHANGED Viewed

@@ -1,23 +1,81 @@
 from .aggregated_metric import AggregatedMetric
-from .conversation.session_completeness.metric import SessionCompletenessQuality
-from .conversation.conversational_coherence.metric import ConversationalCoherenceMetric
-from .conversation.user_frustration.metric import UserFrustrationMetric
+# Import the conversation metrics eagerly from their canonical locations in
+# the new ``conversation`` package layout so they are re-exported from this
+# module.  NOTE: these imports are unconditional; this module defines no lazy
+# fallback getter, so a failing import raises instead of being deferred.
+from .conversation.conversation_thread_metric import ConversationThreadMetric
+from .conversation import types as conversation_types
+from .conversation.heuristics.degeneration.metric import ConversationDegenerationMetric
+from .conversation.heuristics.knowledge_retention.metric import (
+    KnowledgeRetentionMetric,
+)
+from .conversation.llm_judges.conversational_coherence.metric import (
+    ConversationalCoherenceMetric,
+)
+from .conversation.llm_judges.g_eval_wrappers import (
+    GEvalConversationMetric,
+    ConversationComplianceRiskMetric,
+    ConversationDialogueHelpfulnessMetric,
+    ConversationQARelevanceMetric,
+    ConversationSummarizationCoherenceMetric,
+    ConversationSummarizationConsistencyMetric,
+    ConversationPromptUncertaintyMetric,
+)
+from .conversation.llm_judges.session_completeness.metric import (
+    SessionCompletenessQuality,
+)
+from .conversation.llm_judges.user_frustration.metric import UserFrustrationMetric
 from .heuristics.contains import Contains
 from .heuristics.equals import Equals
+from .heuristics.gleu import GLEU
+from .heuristics.chrf import ChrF
 from .heuristics.is_json import IsJson
+from .heuristics.distribution_metrics import (
+    JSDivergence,
+    JSDistance,
+    KLDivergence,
+)
 from .heuristics.levenshtein_ratio import LevenshteinRatio
+from .heuristics.meteor import METEOR
+from .heuristics.bertscore import BERTScore
+from .heuristics.spearman import SpearmanRanking
+from .heuristics.readability import Readability
+from .heuristics.tone import Tone
+from .heuristics.prompt_injection import PromptInjection
+from .heuristics.language_adherence import LanguageAdherenceMetric
 from .heuristics.regex_match import RegexMatch
 from .heuristics.bleu import SentenceBLEU, CorpusBLEU
 from .heuristics.rouge import ROUGE
 from .heuristics.sentiment import Sentiment
+from .heuristics.vader_sentiment import VADERSentiment
 from .llm_judges.answer_relevance.metric import AnswerRelevance
+from .llm_judges.g_eval_presets import (
+    AgentTaskCompletionJudge,
+    AgentToolCorrectnessJudge,
+    ComplianceRiskJudge,
+    DemographicBiasJudge,
+    DialogueHelpfulnessJudge,
+    GenderBiasJudge,
+    PoliticalBiasJudge,
+    PromptUncertaintyJudge,
+    QARelevanceJudge,
+    RegionalBiasJudge,
+    ReligiousBiasJudge,
+    SummarizationCoherenceJudge,
+    SummarizationConsistencyJudge,
+)
 from .llm_judges.context_precision.metric import ContextPrecision
 from .llm_judges.context_recall.metric import ContextRecall
-from .llm_judges.g_eval.metric import GEval
+from .llm_judges.g_eval.metric import GEval, GEvalPreset
 from .llm_judges.hallucination.metric import Hallucination
 from .llm_judges.moderation.metric import Moderation
+from .llm_judges.llm_juries.metric import LLMJuriesJudge
 from .llm_judges.trajectory_accuracy import TrajectoryAccuracy
+from .llm_judges.syc_eval.metric import SycEval
 from .llm_judges.usefulness.metric import Usefulness
+from .llm_judges.structure_output_compliance.metric import StructuredOutputCompliance
 from .base_metric import BaseMetric
 from .ragas_metric import RagasMetricWrapper
 from opik.exceptions import MetricComputationError
@@ -27,17 +85,51 @@ from opik.exceptions import MetricComputationError
 __all__ = [
     "AggregatedMetric",
     "AnswerRelevance",
+    "AgentTaskCompletionJudge",
+    "AgentToolCorrectnessJudge",
     "BaseMetric",
+    "ConversationDegenerationMetric",
+    "KnowledgeRetentionMetric",
+    "GEvalConversationMetric",
+    "ConversationComplianceRiskMetric",
+    "ConversationDialogueHelpfulnessMetric",
+    "ConversationQARelevanceMetric",
+    "ConversationSummarizationCoherenceMetric",
+    "ConversationSummarizationConsistencyMetric",
+    "ConversationPromptUncertaintyMetric",
+    "conversation_types",
+    "ComplianceRiskJudge",
     "Contains",
     "ContextPrecision",
     "ContextRecall",
     "ConversationalCoherenceMetric",
     "CorpusBLEU",
+    "DemographicBiasJudge",
     "Equals",
     "GEval",
+    "GEvalPreset",
+    "GLEU",
+    "GenderBiasJudge",
     "Hallucination",
     "IsJson",
+    "JSDivergence",
+    "JSDistance",
+    "KLDivergence",
     "LevenshteinRatio",
+    "BERTScore",
+    "METEOR",
+    "ChrF",
+    "Readability",
+    "PromptInjection",
+    "LanguageAdherenceMetric",
+    "PoliticalBiasJudge",
+    "PromptUncertaintyJudge",
+    "SpearmanRanking",
+    "ReligiousBiasJudge",
+    "RegionalBiasJudge",
+    "VADERSentiment",
+    "Tone",
+    "StructuredOutputCompliance",
     "MetricComputationError",
     "Moderation",
     "RagasMetricWrapper",
@@ -46,8 +138,15 @@ __all__ = [
     "SentenceBLEU",
     "Sentiment",
     "SessionCompletenessQuality",
+    "SycEval",
     "Usefulness",
     "UserFrustrationMetric",
     "TrajectoryAccuracy",
+    "DialogueHelpfulnessJudge",
+    "QARelevanceJudge",
+    "SummarizationCoherenceJudge",
+    "SummarizationConsistencyJudge",
+    "LLMJuriesJudge",
+    "ConversationThreadMetric",
     # "Factuality",
 ]

opik/evaluation/metrics/aggregated_metric.py CHANGED Viewed

@@ -8,14 +8,43 @@ from . import arguments_helpers, arguments_validator, base_metric, score_result
 class AggregatedMetric(
     base_metric.BaseMetric, arguments_validator.ScoreArgumentsValidator
 ):
-    """A metric that aggregates results obtained from a list of provided metrics using specified aggregation function.
+    """
+    Combine the output of multiple metrics into a single aggregated ``ScoreResult``.
+    Each metric in ``metrics`` is executed with the provided scoring kwargs, then the
+    ``aggregator`` callback decides how to merge the individual results. This is
+    handy for building ensembles such as min/max, weighted averages, or custom
+    pass/fail checks without re-implementing the metrics themselves.
     Args:
-        name: The name of the metric.
-        metrics: A list of concrete metric instances that inherit the `opik.evaluation.base_metric.BaseMetric`.
-        aggregator: The aggregation function to use for evaluation.
-        track: Whether to track the metric. Defaults to True.
-        project_name: Optional project name to track the metric in for the cases when there are no parent span/trace to inherit project name from.
+        name: Display name for the aggregated metric result.
+        metrics: Ordered list of metric instances that should be executed.
+        aggregator: Callable receiving the list of ``ScoreResult`` objects and
+            returning the final aggregated ``ScoreResult``.
+        track: Whether to automatically track the metric in Opik. Defaults to
+            ``True``.
+        project_name: Optional tracking project used when no parent context exists.
+    Example:
+        >>> from opik.evaluation.metrics import AggregatedMetric, Contains, RegexMatch
+        >>> metrics = [Contains(track=False), RegexMatch(pattern=r"\\d+", track=False)]
+        >>> from opik.evaluation.metrics import score_result
+        >>> def combine(results):
+        ...     score = sum(result.value for result in results) / len(results)
+        ...     return score_result.ScoreResult(
+        ...         name="combined_contains_regex",
+        ...         value=score,
+        ...         reason="Average of contains and regex checks",
+        ...     )
+        >>> metric = AggregatedMetric(
+        ...     name="combined_contains_regex",
+        ...     metrics=metrics,
+        ...     aggregator=combine,
+        ... )
+        >>> response = "Order number 12345 confirmed"
+        >>> result = metric.score(output=response, reference="order")
+        >>> float(result.value)  # doctest: +SKIP
+        1.0
     """
     def __init__(

opik/evaluation/metrics/base_metric.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import abc
-from typing import Any, List, Union, Optional
+from typing import Any, List, Optional, Union
 import opik
 import opik.config as opik_config

opik/evaluation/metrics/conversation/__init__.py CHANGED Viewed

@@ -0,0 +1,48 @@
+"""Public conversation metrics API."""
+from .conversation_thread_metric import ConversationThreadMetric
+from .conversation_turns_factory import build_conversation_turns
+from .helpers import (
+    extract_turns_windows_from_conversation,
+    get_turns_in_sliding_window,
+    merge_turns,
+)
+from .types import Conversation, ConversationDict, ConversationTurn
+__all__ = [
+    "ConversationThreadMetric",
+    "Conversation",
+    "ConversationDict",
+    "ConversationTurn",
+    "build_conversation_turns",
+    "extract_turns_windows_from_conversation",
+    "get_turns_in_sliding_window",
+    "merge_turns",
+    "ConversationDegenerationMetric",
+    "KnowledgeRetentionMetric",
+    "ConversationalCoherenceMetric",
+    "SessionCompletenessQuality",
+    "UserFrustrationMetric",
+    "ConversationComplianceRiskMetric",
+    "ConversationDialogueHelpfulnessMetric",
+    "ConversationPromptUncertaintyMetric",
+    "ConversationQARelevanceMetric",
+    "ConversationSummarizationCoherenceMetric",
+    "ConversationSummarizationConsistencyMetric",
+    "GEvalConversationMetric",
+]
+from .heuristics.degeneration.metric import ConversationDegenerationMetric
+from .heuristics.knowledge_retention.metric import KnowledgeRetentionMetric
+from .llm_judges.conversational_coherence.metric import ConversationalCoherenceMetric
+from .llm_judges.g_eval_wrappers import (
+    GEvalConversationMetric,
+    ConversationComplianceRiskMetric,
+    ConversationDialogueHelpfulnessMetric,
+    ConversationQARelevanceMetric,
+    ConversationSummarizationCoherenceMetric,
+    ConversationSummarizationConsistencyMetric,
+    ConversationPromptUncertaintyMetric,
+)
+from .llm_judges.session_completeness.metric import SessionCompletenessQuality
+from .llm_judges.user_frustration.metric import UserFrustrationMetric

opik/evaluation/metrics/conversation/conversation_thread_metric.py CHANGED Viewed

@@ -5,11 +5,53 @@ from .. import base_metric, score_result
 class ConversationThreadMetric(base_metric.BaseMetric):
-    """Abstract base class for all conversation thread metrics."""
+    """
+    Abstract base class for all conversation thread metrics. When creating a custom
+    conversation metric, you should inherit from this class and implement the abstract methods.
+    Conversation metrics are designed to evaluate multi-turn conversations rather than
+    single input-output pairs. They accept a conversation as a list of message dictionaries,
+    where each message has a 'role' (either 'user' or 'assistant') and 'content'.
+    Args:
+        name: The name of the metric. If not provided, uses the class name as default.
+        track: Whether to track the metric. Defaults to True.
+        project_name: Optional project name to track the metric in for the cases when
+            there is no parent span/trace to inherit project name from.
+    Example:
+        >>> from opik.evaluation.metrics.conversation import conversation_thread_metric, types
+        >>> from opik.evaluation.metrics import score_result
+        >>> from typing import Any
+        >>>
+        >>> class ConversationLengthMetric(conversation_thread_metric.ConversationThreadMetric):
+        ...     def __init__(self, name: str = "conversation_length_score"):
+        ...         super().__init__(name)
+        ...
+        ...     def score(self, conversation: types.Conversation, **kwargs: Any):
+        ...         num_turns = sum(1 for msg in conversation if msg["role"] == "assistant")
+        ...         return score_result.ScoreResult(
+        ...             name=self.name,
+        ...             value=num_turns,
+        ...             reason=f"Conversation has {num_turns} turns"
+        ...         )
+    """
     def score(
         self, conversation: types.Conversation, **kwargs: Any
     ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
+        """
+        Evaluate a conversation and return a score.
+        Args:
+            conversation: A list of conversation messages. Each message is a dictionary
+                with 'role' (either 'user' or 'assistant') and 'content' (the message text).
+            **kwargs: Additional keyword arguments that may be used by specific metric implementations.
+        Returns:
+            A ScoreResult object or list of ScoreResult objects containing the evaluation score,
+            metric name, and optional reasoning.
+        """
         raise NotImplementedError(
             "Please use concrete metric classes instead of this one."
         )
@@ -18,7 +60,19 @@ class ConversationThreadMetric(base_metric.BaseMetric):
         self, conversation: types.Conversation, **kwargs: Any
     ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
         """
-        Async public method that can be called independently.
+        Asynchronously evaluate a conversation and return a score.
+        This is the async version of the score method. By default, it calls the
+        synchronous score method, but can be overridden for true async implementations.
+        Args:
+            conversation: A list of conversation messages. Each message is a dictionary
+                with 'role' (either 'user' or 'assistant') and 'content' (the message text).
+            **kwargs: Additional keyword arguments that may be used by specific metric implementations.
+        Returns:
+            A ScoreResult object or list of ScoreResult objects containing the evaluation score,
+            metric name, and optional reasoning.
         """
         raise NotImplementedError(
             "Please use concrete metric classes instead of this one."

opik/evaluation/metrics/conversation/g_eval_wrappers.py ADDED Viewed

@@ -0,0 +1,19 @@
+from .llm_judges.g_eval_wrappers import (
+    ConversationComplianceRiskMetric,
+    ConversationDialogueHelpfulnessMetric,
+    ConversationPromptUncertaintyMetric,
+    ConversationQARelevanceMetric,
+    ConversationSummarizationCoherenceMetric,
+    ConversationSummarizationConsistencyMetric,
+    GEvalConversationMetric,
+)
+__all__ = [
+    "GEvalConversationMetric",
+    "ConversationComplianceRiskMetric",
+    "ConversationDialogueHelpfulnessMetric",
+    "ConversationPromptUncertaintyMetric",
+    "ConversationQARelevanceMetric",
+    "ConversationSummarizationCoherenceMetric",
+    "ConversationSummarizationConsistencyMetric",
+]

opik/evaluation/metrics/conversation/helpers.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from typing import Any, Generator, List
-from . import types, conversation_turns_factory
+from . import types
+from .conversation_turns_factory import build_conversation_turns
 def get_turns_in_sliding_window(
@@ -8,22 +9,20 @@ def get_turns_in_sliding_window(
 ) -> Generator[List[types.ConversationTurn], Any, None]:
     """
     Generates windows of conversation turns of a fixed size from a list of turns.
     This function creates a sliding window over the list of conversation turns.
     Each window includes the current turn and up to `window_size - 1` previous
     conversation turns. If there are fewer turns available than the `window_size`,
     the window will consist of all available turns up to the current turn.
     Args:
         turns: List of conversation turn objects representing the interactions
             in a conversation.
         window_size: Integer specifying the maximum number of turns to include
             in each window.
     Yields:
         A generator that produces lists of conversation turns, where each list
         represents a sliding window of turns.
     """
     for i in range(len(turns)):
         yield turns[max(0, i - window_size + 1) : i + 1]
@@ -31,19 +30,17 @@ def get_turns_in_sliding_window(
 def merge_turns(turns: List[types.ConversationTurn]) -> types.Conversation:
     """
     Merges a list of conversation turns into a single conversation.
     This function takes a list of conversation turns and combines them
     into a single conversation by extending the output list with the data
     from each turn.
     Args:
         turns: A list of conversation turn objects to be combined.
     Returns:
         types.Conversation: A combined conversation object containing all
             the turns from the input list.
     """
-    output = []
+    output: types.Conversation = []
     for turn in turns:
         output.extend(turn.as_list())
     return output
@@ -56,24 +53,20 @@ def extract_turns_windows_from_conversation(
     Extracts a list of conversation windows based on turns using a sliding window
     approach. This function divides a conversation into consecutive overlapping
     windows, where each window contains a specified number of turns.
     Args:
         conversation: The input conversation from which turns will be processed.
         window_size: The number of turns to include in each sliding window.
     Returns:
         A list of conversations, each representing a window of turns specified
         by the given window size.
     Raises:
         ValueError: If the conversation is empty or if it has no turns.
     """
     if len(conversation) == 0:
         raise ValueError("Conversation is empty")
-    turns = conversation_turns_factory.build_conversation_turns(
-        conversation=conversation
-    )
+    turns = build_conversation_turns(conversation=conversation)
     if len(turns) == 0:
         raise ValueError("Conversation has no turns")
@@ -81,5 +74,11 @@ def extract_turns_windows_from_conversation(
         merge_turns(turns_window)
         for turns_window in get_turns_in_sliding_window(turns, window_size)
     ]
     return turns_windows
+__all__ = [
+    "get_turns_in_sliding_window",
+    "merge_turns",
+    "extract_turns_windows_from_conversation",
+]

opik/evaluation/metrics/conversation/heuristics/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Heuristic conversation-level metrics.
+Exposes the reusable conversation-level heuristics under the public namespace
+``opik.evaluation.metrics.conversation.heuristics.*`` so documentation and downstream
+code can import them directly.
+"""
+from .degeneration.metric import ConversationDegenerationMetric
+from .knowledge_retention.metric import KnowledgeRetentionMetric
+__all__ = [
+    "ConversationDegenerationMetric",
+    "KnowledgeRetentionMetric",
+]

opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .metric import ConversationDegenerationMetric
+__all__ = ["ConversationDegenerationMetric"]

opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py ADDED Viewed

@@ -0,0 +1,189 @@
+from __future__ import annotations
+import math
+import re
+from collections import Counter
+from typing import Dict, List, Optional
+from opik.evaluation.metrics.conversation import types as conversation_types
+from opik.evaluation.metrics.conversation.conversation_thread_metric import (
+    ConversationThreadMetric,
+)
+from opik.evaluation.metrics.score_result import ScoreResult
+from opik.exceptions import MetricComputationError
+from .phrases import DEFAULT_FALLBACK_PHRASES
+def _tokenize(text: str) -> List[str]:
+    return re.findall(r"\b\w+\b", text.lower())
+def _ngram_counts(tokens: List[str], n: int) -> Counter:
+    if len(tokens) < n:
+        return Counter()
+    return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
+class ConversationDegenerationMetric(ConversationThreadMetric):
+    """
+    Score how strongly an assistant conversation shows degeneration or repetition.
+    The metric inspects each assistant turn, measuring repeated n-grams, overlap with
+    the previous reply, low lexical diversity, and presence of known fallback
+    phrases (for example, "as an AI language model..."). Each turn receives a
+    degeneration score between `0.0` and `1.0`; the overall metric reports the peak
+    risk observed so you can quickly flag sections where the assistant got stuck or
+    stopped being helpful. Detailed per-turn diagnostics are returned in the
+    ``ScoreResult.metadata`` payload.
+    Args:
+        name: Display name for the metric result. Defaults to
+            ``"conversation_degeneration_metric"``.
+        track: Whether the metric should automatically track to an Opik project.
+            Defaults to ``True``.
+        project_name: Optional project to store tracked results in. Defaults to
+            ``None`` (inherit global setting).
+        ngram_size: Size of the n-grams used to detect repetition within a single
+            response. Must be at least ``2``. Defaults to ``3``.
+        fallback_phrases: Custom list of phrases that should be treated as
+            degeneration signatures. If ``None``, a sensible default list is used.
+    Example:
+        >>> from opik.evaluation.metrics import ConversationDegenerationMetric
+        >>> conversation = [
+        ...     {"role": "user", "content": "Can you draft a short bio for Ada?"},
+        ...     {"role": "assistant", "content": "Sure, here is a short bio for Ada."},
+        ...     {"role": "user", "content": "Could you add more detail?"},
+        ...     {"role": "assistant", "content": "Sure, here is a short bio for Ada."},
+        ... ]
+        >>> metric = ConversationDegenerationMetric(ngram_size=3)
+        >>> result = metric.score(conversation)
+        >>> float(result.value)  # doctest: +SKIP
+        0.25
+    """
+    def __init__(
+        self,
+        name: str = "conversation_degeneration_metric",
+        track: bool = True,
+        project_name: Optional[str] = None,
+        ngram_size: int = 3,
+        fallback_phrases: Optional[List[str]] = None,
+    ) -> None:
+        super().__init__(name=name, track=track, project_name=project_name)
+        if ngram_size < 2:
+            raise MetricComputationError("ngram_size must be >= 2")
+        self._ngram_size = ngram_size
+        phrases = (
+            fallback_phrases
+            if fallback_phrases is not None
+            else DEFAULT_FALLBACK_PHRASES
+        )
+        self._fallback_phrases = [phrase.lower() for phrase in phrases]
+    def score(
+        self,
+        conversation: conversation_types.Conversation,
+        **ignored_kwargs: object,
+    ) -> ScoreResult:
+        assistant_turns = [
+            turn["content"]
+            for turn in conversation
+            if turn.get("role") == "assistant" and turn.get("content")
+        ]
+        if not assistant_turns:
+            raise MetricComputationError("Conversation contains no assistant messages")
+        per_turn_metadata: List[Dict[str, float]] = []
+        degeneracy_scores: List[float] = []
+        prev_tokens: Optional[List[str]] = None
+        for content in assistant_turns:
+            tokens = _tokenize(content)
+            if not tokens:
+                continue
+            entropy_norm = self._token_entropy(tokens)
+            repetition_ratio = self._repetition_ratio(tokens)
+            prev_overlap = self._overlap_with_previous(tokens, prev_tokens)
+            fallback_score = 1.0 if self._contains_fallback_phrase(content) else 0.0
+            normalized_entropy = 1.0 - entropy_norm
+            # Combine all four risk factors with equal weight; this keeps the
+            # heuristic interpretable and matches the legacy scoring behaviour.
+            deg_score = min(
+                1.0,
+                (repetition_ratio + prev_overlap + fallback_score + normalized_entropy)
+                / 4.0,
+            )
+            per_turn_metadata.append(
+                {
+                    "repetition_ratio": repetition_ratio,
+                    "overlap_previous": prev_overlap,
+                    "fallback_hit": fallback_score,
+                    "normalized_entropy": normalized_entropy,
+                    "degeneration_score": deg_score,
+                }
+            )
+            degeneracy_scores.append(deg_score)
+            prev_tokens = tokens
+        if not degeneracy_scores:
+            raise MetricComputationError(
+                "Assistant messages were empty after tokenization"
+            )
+        average_score = sum(degeneracy_scores) / len(degeneracy_scores)
+        peak_score = max(degeneracy_scores)
+        return ScoreResult(
+            value=peak_score,
+            name=self.name,
+            reason=(
+                f"Peak degeneration risk ({len(degeneracy_scores)} turns):"
+                f" {peak_score:.3f}"
+            ),
+            metadata={
+                "per_turn": per_turn_metadata,
+                "average_score": average_score,
+                "peak_score": peak_score,
+            },
+        )
+    def _token_entropy(self, tokens: List[str]) -> float:
+        counts = Counter(tokens)
+        total = float(len(tokens))
+        entropy = 0.0
+        for count in counts.values():
+            prob = count / total
+            entropy -= prob * math.log(prob, 2)
+        max_entropy = math.log(len(counts), 2) if counts else 1.0
+        if max_entropy == 0:
+            return 0.0
+        return min(1.0, entropy / max_entropy)
+    def _repetition_ratio(self, tokens: List[str]) -> float:
+        ngram_counts = _ngram_counts(tokens, self._ngram_size)
+        total = sum(ngram_counts.values())
+        if total == 0:
+            return 0.0
+        repeated = sum(count for count in ngram_counts.values() if count > 1)
+        return repeated / total
+    def _overlap_with_previous(
+        self, tokens: List[str], prev_tokens: Optional[List[str]]
+    ) -> float:
+        if not prev_tokens:
+            return 0.0
+        current_set = set(tokens)
+        prev_set = set(prev_tokens)
+        if not current_set or not prev_set:
+            return 0.0
+        intersection = len(current_set & prev_set)
+        union = len(current_set | prev_set)
+        return intersection / union
+    def _contains_fallback_phrase(self, content: str) -> bool:
+        lowered = content.lower()
+        return any(phrase in lowered for phrase in self._fallback_phrases)

opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Phrase lists used by the conversation degeneration metric."""
+DEFAULT_FALLBACK_PHRASES = [
+    "i'm sorry",
+    "as an ai language model",
+    "i cannot",
+    "i'm unable",
+    "please provide",
+    "i don't have access",
+    "i don't understand",
+    "could you please clarify",
+]

opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .metric import KnowledgeRetentionMetric
+__all__ = ["KnowledgeRetentionMetric"]

opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl