opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +19 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +64 -4
- opik/api_objects/dataset/rest_operations.py +11 -2
- opik/api_objects/experiment/experiment.py +57 -57
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +65 -5
- opik/api_objects/helpers.py +8 -5
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +600 -108
- opik/api_objects/opik_query_language.py +39 -5
- opik/api_objects/prompt/__init__.py +12 -2
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +189 -47
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +23 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_data.py +35 -25
- opik/api_objects/threads/threads_client.py +39 -5
- opik/api_objects/trace/trace_client.py +52 -2
- opik/api_objects/trace/trace_data.py +15 -24
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +13 -7
- opik/configurator/configure.py +17 -0
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +205 -133
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +25 -6
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +272 -75
- opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
- opik/evaluation/engine/helpers.py +31 -6
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +168 -2
- opik/evaluation/evaluator.py +533 -62
- opik/evaluation/metrics/__init__.py +103 -4
- opik/evaluation/metrics/aggregated_metric.py +35 -6
- opik/evaluation/metrics/base_metric.py +1 -1
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +14 -15
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
- opik/evaluation/metrics/conversation/types.py +4 -5
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +35 -15
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +47 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/rouge.py +26 -9
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/__init__.py +8 -0
- opik/evaluation/models/base_model.py +107 -1
- opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
- opik/evaluation/models/langchain/message_converters.py +97 -15
- opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/evaluator.py +31 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +33 -0
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +12 -9
- opik/id_helpers.py +18 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +14 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +4 -7
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
- opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +42 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +8 -51
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +80 -17
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +3 -7
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +1 -1
- opik/integrations/langchain/opik_tracer.py +474 -229
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +146 -107
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/integrations/sagemaker/auth.py +5 -1
- opik/llm_usage/google_usage.py +3 -1
- opik/llm_usage/opik_usage.py +7 -8
- opik/llm_usage/opik_usage_factory.py +4 -2
- opik/logging_messages.py +6 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +10 -0
- opik/message_processing/batching/batchers.py +59 -27
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +56 -1
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
- opik/message_processing/queue_consumer.py +9 -3
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +43 -10
- opik/opik_context.py +16 -4
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +346 -15
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/automation_rule_evaluators/client.py +34 -2
- opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
- opik/rest_api/client.py +15 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +1310 -44
- opik/rest_api/datasets/raw_client.py +2269 -358
- opik/rest_api/experiments/__init__.py +2 -2
- opik/rest_api/experiments/client.py +191 -5
- opik/rest_api/experiments/raw_client.py +301 -7
- opik/rest_api/experiments/types/__init__.py +4 -1
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/llm_provider_key/client.py +20 -0
- opik/rest_api/llm_provider_key/raw_client.py +20 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/optimizations/client.py +145 -9
- opik/rest_api/optimizations/raw_client.py +237 -13
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +227 -6
- opik/rest_api/prompts/raw_client.py +331 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +238 -76
- opik/rest_api/spans/raw_client.py +307 -95
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +572 -161
- opik/rest_api/traces/raw_client.py +736 -229
- opik/rest_api/types/__init__.py +352 -17
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/annotation_queue_item_ids.py +19 -0
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/annotation_queue_reviewer.py +20 -0
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +62 -2
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +4 -0
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +5 -0
- opik/rest_api/types/dataset_item_page_public.py +5 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +4 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +7 -2
- opik/rest_api/types/experiment_group_response.py +2 -0
- opik/rest_api/types/experiment_public.py +7 -2
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/feedback.py +25 -1
- opik/rest_api/types/feedback_create.py +20 -1
- opik/rest_api/types/feedback_object_public.py +27 -1
- opik/rest_api/types/feedback_public.py +25 -1
- opik/rest_api/types/feedback_score_batch_item.py +2 -1
- opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +20 -1
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +1 -0
- opik/rest_api/types/guardrail_write.py +1 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/optimization.py +4 -2
- opik/rest_api/types/optimization_public.py +4 -2
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +4 -2
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +3 -0
- opik/rest_api/types/prompt_version_detail.py +3 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +1 -0
- opik/rest_api/types/prompt_version_link_public.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +3 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +9 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +9 -0
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +18 -0
- opik/rest_api/types/span.py +1 -2
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_public.py +1 -2
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +1 -2
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +11 -2
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_public.py +11 -2
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +1 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_write.py +1 -2
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +5 -0
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/client.py +14 -2
- opik/rest_api/workspaces/raw_client.py +10 -0
- opik/s3_httpx_client.py +14 -1
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- opik/api_objects/prompt/prompt.py +0 -112
- opik/cli.py +0 -193
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
- opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
- opik-1.8.39.dist-info/METADATA +0 -339
- opik-1.8.39.dist-info/RECORD +0 -790
- /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
|
@@ -1,23 +1,81 @@
|
|
|
1
1
|
from .aggregated_metric import AggregatedMetric
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
2
|
+
|
|
3
|
+
# Keep the canonical import first for the new layout while still tolerating
|
|
4
|
+
# older packaging artefacts (some environments import this module before the
|
|
5
|
+
# conversation package is available). If the eager import fails we fall back
|
|
6
|
+
# to the lazy getter below, letting legacy entry-points keep working.
|
|
7
|
+
from .conversation.conversation_thread_metric import ConversationThreadMetric
|
|
8
|
+
|
|
9
|
+
from .conversation import types as conversation_types
|
|
10
|
+
from .conversation.heuristics.degeneration.metric import ConversationDegenerationMetric
|
|
11
|
+
from .conversation.heuristics.knowledge_retention.metric import (
|
|
12
|
+
KnowledgeRetentionMetric,
|
|
13
|
+
)
|
|
14
|
+
from .conversation.llm_judges.conversational_coherence.metric import (
|
|
15
|
+
ConversationalCoherenceMetric,
|
|
16
|
+
)
|
|
17
|
+
from .conversation.llm_judges.g_eval_wrappers import (
|
|
18
|
+
GEvalConversationMetric,
|
|
19
|
+
ConversationComplianceRiskMetric,
|
|
20
|
+
ConversationDialogueHelpfulnessMetric,
|
|
21
|
+
ConversationQARelevanceMetric,
|
|
22
|
+
ConversationSummarizationCoherenceMetric,
|
|
23
|
+
ConversationSummarizationConsistencyMetric,
|
|
24
|
+
ConversationPromptUncertaintyMetric,
|
|
25
|
+
)
|
|
26
|
+
from .conversation.llm_judges.session_completeness.metric import (
|
|
27
|
+
SessionCompletenessQuality,
|
|
28
|
+
)
|
|
29
|
+
from .conversation.llm_judges.user_frustration.metric import UserFrustrationMetric
|
|
5
30
|
from .heuristics.contains import Contains
|
|
6
31
|
from .heuristics.equals import Equals
|
|
32
|
+
from .heuristics.gleu import GLEU
|
|
33
|
+
from .heuristics.chrf import ChrF
|
|
7
34
|
from .heuristics.is_json import IsJson
|
|
35
|
+
from .heuristics.distribution_metrics import (
|
|
36
|
+
JSDivergence,
|
|
37
|
+
JSDistance,
|
|
38
|
+
KLDivergence,
|
|
39
|
+
)
|
|
8
40
|
from .heuristics.levenshtein_ratio import LevenshteinRatio
|
|
41
|
+
from .heuristics.meteor import METEOR
|
|
42
|
+
from .heuristics.bertscore import BERTScore
|
|
43
|
+
from .heuristics.spearman import SpearmanRanking
|
|
44
|
+
from .heuristics.readability import Readability
|
|
45
|
+
from .heuristics.tone import Tone
|
|
46
|
+
from .heuristics.prompt_injection import PromptInjection
|
|
47
|
+
from .heuristics.language_adherence import LanguageAdherenceMetric
|
|
9
48
|
from .heuristics.regex_match import RegexMatch
|
|
10
49
|
from .heuristics.bleu import SentenceBLEU, CorpusBLEU
|
|
11
50
|
from .heuristics.rouge import ROUGE
|
|
12
51
|
from .heuristics.sentiment import Sentiment
|
|
52
|
+
from .heuristics.vader_sentiment import VADERSentiment
|
|
13
53
|
from .llm_judges.answer_relevance.metric import AnswerRelevance
|
|
54
|
+
from .llm_judges.g_eval_presets import (
|
|
55
|
+
AgentTaskCompletionJudge,
|
|
56
|
+
AgentToolCorrectnessJudge,
|
|
57
|
+
ComplianceRiskJudge,
|
|
58
|
+
DemographicBiasJudge,
|
|
59
|
+
DialogueHelpfulnessJudge,
|
|
60
|
+
GenderBiasJudge,
|
|
61
|
+
PoliticalBiasJudge,
|
|
62
|
+
PromptUncertaintyJudge,
|
|
63
|
+
QARelevanceJudge,
|
|
64
|
+
RegionalBiasJudge,
|
|
65
|
+
ReligiousBiasJudge,
|
|
66
|
+
SummarizationCoherenceJudge,
|
|
67
|
+
SummarizationConsistencyJudge,
|
|
68
|
+
)
|
|
14
69
|
from .llm_judges.context_precision.metric import ContextPrecision
|
|
15
70
|
from .llm_judges.context_recall.metric import ContextRecall
|
|
16
|
-
from .llm_judges.g_eval.metric import GEval
|
|
71
|
+
from .llm_judges.g_eval.metric import GEval, GEvalPreset
|
|
17
72
|
from .llm_judges.hallucination.metric import Hallucination
|
|
18
73
|
from .llm_judges.moderation.metric import Moderation
|
|
74
|
+
from .llm_judges.llm_juries.metric import LLMJuriesJudge
|
|
19
75
|
from .llm_judges.trajectory_accuracy import TrajectoryAccuracy
|
|
76
|
+
from .llm_judges.syc_eval.metric import SycEval
|
|
20
77
|
from .llm_judges.usefulness.metric import Usefulness
|
|
78
|
+
from .llm_judges.structure_output_compliance.metric import StructuredOutputCompliance
|
|
21
79
|
from .base_metric import BaseMetric
|
|
22
80
|
from .ragas_metric import RagasMetricWrapper
|
|
23
81
|
from opik.exceptions import MetricComputationError
|
|
@@ -27,17 +85,51 @@ from opik.exceptions import MetricComputationError
|
|
|
27
85
|
__all__ = [
|
|
28
86
|
"AggregatedMetric",
|
|
29
87
|
"AnswerRelevance",
|
|
88
|
+
"AgentTaskCompletionJudge",
|
|
89
|
+
"AgentToolCorrectnessJudge",
|
|
30
90
|
"BaseMetric",
|
|
91
|
+
"ConversationDegenerationMetric",
|
|
92
|
+
"KnowledgeRetentionMetric",
|
|
93
|
+
"GEvalConversationMetric",
|
|
94
|
+
"ConversationComplianceRiskMetric",
|
|
95
|
+
"ConversationDialogueHelpfulnessMetric",
|
|
96
|
+
"ConversationQARelevanceMetric",
|
|
97
|
+
"ConversationSummarizationCoherenceMetric",
|
|
98
|
+
"ConversationSummarizationConsistencyMetric",
|
|
99
|
+
"ConversationPromptUncertaintyMetric",
|
|
100
|
+
"conversation_types",
|
|
101
|
+
"ComplianceRiskJudge",
|
|
31
102
|
"Contains",
|
|
32
103
|
"ContextPrecision",
|
|
33
104
|
"ContextRecall",
|
|
34
105
|
"ConversationalCoherenceMetric",
|
|
35
106
|
"CorpusBLEU",
|
|
107
|
+
"DemographicBiasJudge",
|
|
36
108
|
"Equals",
|
|
37
109
|
"GEval",
|
|
110
|
+
"GEvalPreset",
|
|
111
|
+
"GLEU",
|
|
112
|
+
"GenderBiasJudge",
|
|
38
113
|
"Hallucination",
|
|
39
114
|
"IsJson",
|
|
115
|
+
"JSDivergence",
|
|
116
|
+
"JSDistance",
|
|
117
|
+
"KLDivergence",
|
|
40
118
|
"LevenshteinRatio",
|
|
119
|
+
"BERTScore",
|
|
120
|
+
"METEOR",
|
|
121
|
+
"ChrF",
|
|
122
|
+
"Readability",
|
|
123
|
+
"PromptInjection",
|
|
124
|
+
"LanguageAdherenceMetric",
|
|
125
|
+
"PoliticalBiasJudge",
|
|
126
|
+
"PromptUncertaintyJudge",
|
|
127
|
+
"SpearmanRanking",
|
|
128
|
+
"ReligiousBiasJudge",
|
|
129
|
+
"RegionalBiasJudge",
|
|
130
|
+
"VADERSentiment",
|
|
131
|
+
"Tone",
|
|
132
|
+
"StructuredOutputCompliance",
|
|
41
133
|
"MetricComputationError",
|
|
42
134
|
"Moderation",
|
|
43
135
|
"RagasMetricWrapper",
|
|
@@ -46,8 +138,15 @@ __all__ = [
|
|
|
46
138
|
"SentenceBLEU",
|
|
47
139
|
"Sentiment",
|
|
48
140
|
"SessionCompletenessQuality",
|
|
141
|
+
"SycEval",
|
|
49
142
|
"Usefulness",
|
|
50
143
|
"UserFrustrationMetric",
|
|
51
144
|
"TrajectoryAccuracy",
|
|
145
|
+
"DialogueHelpfulnessJudge",
|
|
146
|
+
"QARelevanceJudge",
|
|
147
|
+
"SummarizationCoherenceJudge",
|
|
148
|
+
"SummarizationConsistencyJudge",
|
|
149
|
+
"LLMJuriesJudge",
|
|
150
|
+
"ConversationThreadMetric",
|
|
52
151
|
# "Factuality",
|
|
53
152
|
]
|
|
@@ -8,14 +8,43 @@ from . import arguments_helpers, arguments_validator, base_metric, score_result
|
|
|
8
8
|
class AggregatedMetric(
|
|
9
9
|
base_metric.BaseMetric, arguments_validator.ScoreArgumentsValidator
|
|
10
10
|
):
|
|
11
|
-
"""
|
|
11
|
+
"""
|
|
12
|
+
Combine the output of multiple metrics into a single aggregated ``ScoreResult``.
|
|
13
|
+
|
|
14
|
+
Each metric in ``metrics`` is executed with the provided scoring kwargs, then the
|
|
15
|
+
``aggregator`` callback decides how to merge the individual results. This is
|
|
16
|
+
handy for building ensembles such as min/max, weighted averages, or custom
|
|
17
|
+
pass/fail checks without re-implementing the metrics themselves.
|
|
12
18
|
|
|
13
19
|
Args:
|
|
14
|
-
name:
|
|
15
|
-
metrics:
|
|
16
|
-
aggregator:
|
|
17
|
-
|
|
18
|
-
|
|
20
|
+
name: Display name for the aggregated metric result.
|
|
21
|
+
metrics: Ordered list of metric instances that should be executed.
|
|
22
|
+
aggregator: Callable receiving the list of ``ScoreResult`` objects and
|
|
23
|
+
returning the final aggregated ``ScoreResult``.
|
|
24
|
+
track: Whether to automatically track the metric in Opik. Defaults to
|
|
25
|
+
``True``.
|
|
26
|
+
project_name: Optional tracking project used when no parent context exists.
|
|
27
|
+
|
|
28
|
+
Example:
|
|
29
|
+
>>> from opik.evaluation.metrics import AggregatedMetric, Contains, RegexMatch
|
|
30
|
+
>>> metrics = [Contains(track=False), RegexMatch(pattern=r"\\d+", track=False)]
|
|
31
|
+
>>> from opik.evaluation.metrics import score_result
|
|
32
|
+
>>> def combine(results):
|
|
33
|
+
... score = sum(result.value for result in results) / len(results)
|
|
34
|
+
... return score_result.ScoreResult(
|
|
35
|
+
... name="combined_contains_regex",
|
|
36
|
+
... value=score,
|
|
37
|
+
... reason="Average of contains and regex checks",
|
|
38
|
+
... )
|
|
39
|
+
>>> metric = AggregatedMetric(
|
|
40
|
+
... name="combined_contains_regex",
|
|
41
|
+
... metrics=metrics,
|
|
42
|
+
... aggregator=combine,
|
|
43
|
+
... )
|
|
44
|
+
>>> response = "Order number 12345 confirmed"
|
|
45
|
+
>>> result = metric.score(output=response, reference="order")
|
|
46
|
+
>>> float(result.value) # doctest: +SKIP
|
|
47
|
+
1.0
|
|
19
48
|
"""
|
|
20
49
|
|
|
21
50
|
def __init__(
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Public conversation metrics API."""
|
|
2
|
+
|
|
3
|
+
from .conversation_thread_metric import ConversationThreadMetric
|
|
4
|
+
from .conversation_turns_factory import build_conversation_turns
|
|
5
|
+
from .helpers import (
|
|
6
|
+
extract_turns_windows_from_conversation,
|
|
7
|
+
get_turns_in_sliding_window,
|
|
8
|
+
merge_turns,
|
|
9
|
+
)
|
|
10
|
+
from .types import Conversation, ConversationDict, ConversationTurn
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"ConversationThreadMetric",
|
|
14
|
+
"Conversation",
|
|
15
|
+
"ConversationDict",
|
|
16
|
+
"ConversationTurn",
|
|
17
|
+
"build_conversation_turns",
|
|
18
|
+
"extract_turns_windows_from_conversation",
|
|
19
|
+
"get_turns_in_sliding_window",
|
|
20
|
+
"merge_turns",
|
|
21
|
+
"ConversationDegenerationMetric",
|
|
22
|
+
"KnowledgeRetentionMetric",
|
|
23
|
+
"ConversationalCoherenceMetric",
|
|
24
|
+
"SessionCompletenessQuality",
|
|
25
|
+
"UserFrustrationMetric",
|
|
26
|
+
"ConversationComplianceRiskMetric",
|
|
27
|
+
"ConversationDialogueHelpfulnessMetric",
|
|
28
|
+
"ConversationPromptUncertaintyMetric",
|
|
29
|
+
"ConversationQARelevanceMetric",
|
|
30
|
+
"ConversationSummarizationCoherenceMetric",
|
|
31
|
+
"ConversationSummarizationConsistencyMetric",
|
|
32
|
+
"GEvalConversationMetric",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
from .heuristics.degeneration.metric import ConversationDegenerationMetric
|
|
36
|
+
from .heuristics.knowledge_retention.metric import KnowledgeRetentionMetric
|
|
37
|
+
from .llm_judges.conversational_coherence.metric import ConversationalCoherenceMetric
|
|
38
|
+
from .llm_judges.g_eval_wrappers import (
|
|
39
|
+
GEvalConversationMetric,
|
|
40
|
+
ConversationComplianceRiskMetric,
|
|
41
|
+
ConversationDialogueHelpfulnessMetric,
|
|
42
|
+
ConversationQARelevanceMetric,
|
|
43
|
+
ConversationSummarizationCoherenceMetric,
|
|
44
|
+
ConversationSummarizationConsistencyMetric,
|
|
45
|
+
ConversationPromptUncertaintyMetric,
|
|
46
|
+
)
|
|
47
|
+
from .llm_judges.session_completeness.metric import SessionCompletenessQuality
|
|
48
|
+
from .llm_judges.user_frustration.metric import UserFrustrationMetric
|
|
@@ -5,11 +5,53 @@ from .. import base_metric, score_result
|
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class ConversationThreadMetric(base_metric.BaseMetric):
|
|
8
|
-
"""
|
|
8
|
+
"""
|
|
9
|
+
Abstract base class for all conversation thread metrics. When creating a custom
|
|
10
|
+
conversation metric, you should inherit from this class and implement the abstract methods.
|
|
11
|
+
|
|
12
|
+
Conversation metrics are designed to evaluate multi-turn conversations rather than
|
|
13
|
+
single input-output pairs. They accept a conversation as a list of message dictionaries,
|
|
14
|
+
where each message has a 'role' (either 'user' or 'assistant') and 'content'.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
name: The name of the metric. If not provided, uses the class name as default.
|
|
18
|
+
track: Whether to track the metric. Defaults to True.
|
|
19
|
+
project_name: Optional project name to track the metric in for the cases when
|
|
20
|
+
there is no parent span/trace to inherit project name from.
|
|
21
|
+
|
|
22
|
+
Example:
|
|
23
|
+
>>> from opik.evaluation.metrics.conversation import conversation_thread_metric, types
|
|
24
|
+
>>> from opik.evaluation.metrics import score_result
|
|
25
|
+
>>> from typing import Any
|
|
26
|
+
>>>
|
|
27
|
+
>>> class ConversationLengthMetric(conversation_thread_metric.ConversationThreadMetric):
|
|
28
|
+
>>> def __init__(self, name: str = "conversation_length_score"):
|
|
29
|
+
>>> super().__init__(name)
|
|
30
|
+
>>>
|
|
31
|
+
>>> def score(self, conversation: types.Conversation, **kwargs: Any):
|
|
32
|
+
>>> num_turns = sum(1 for msg in conversation if msg["role"] == "assistant")
|
|
33
|
+
>>> return score_result.ScoreResult(
|
|
34
|
+
>>> name=self.name,
|
|
35
|
+
>>> value=num_turns,
|
|
36
|
+
>>> reason=f"Conversation has {num_turns} turns"
|
|
37
|
+
>>> )
|
|
38
|
+
"""
|
|
9
39
|
|
|
10
40
|
def score(
|
|
11
41
|
self, conversation: types.Conversation, **kwargs: Any
|
|
12
42
|
) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
|
|
43
|
+
"""
|
|
44
|
+
Evaluate a conversation and return a score.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
conversation: A list of conversation messages. Each message is a dictionary
|
|
48
|
+
with 'role' (either 'user' or 'assistant') and 'content' (the message text).
|
|
49
|
+
**kwargs: Additional keyword arguments that may be used by specific metric implementations.
|
|
50
|
+
|
|
51
|
+
Returns:
|
|
52
|
+
A ScoreResult object or list of ScoreResult objects containing the evaluation score,
|
|
53
|
+
metric name, and optional reasoning.
|
|
54
|
+
"""
|
|
13
55
|
raise NotImplementedError(
|
|
14
56
|
"Please use concrete metric classes instead of this one."
|
|
15
57
|
)
|
|
@@ -18,7 +60,19 @@ class ConversationThreadMetric(base_metric.BaseMetric):
|
|
|
18
60
|
self, conversation: types.Conversation, **kwargs: Any
|
|
19
61
|
) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
|
|
20
62
|
"""
|
|
21
|
-
|
|
63
|
+
Asynchronously evaluate a conversation and return a score.
|
|
64
|
+
|
|
65
|
+
This is the async version of the score method. By default, it calls the
|
|
66
|
+
synchronous score method, but can be overridden for true async implementations.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
conversation: A list of conversation messages. Each message is a dictionary
|
|
70
|
+
with 'role' (either 'user' or 'assistant') and 'content' (the message text).
|
|
71
|
+
**kwargs: Additional keyword arguments that may be used by specific metric implementations.
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
A ScoreResult object or list of ScoreResult objects containing the evaluation score,
|
|
75
|
+
metric name, and optional reasoning.
|
|
22
76
|
"""
|
|
23
77
|
raise NotImplementedError(
|
|
24
78
|
"Please use concrete metric classes instead of this one."
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .llm_judges.g_eval_wrappers import (
|
|
2
|
+
ConversationComplianceRiskMetric,
|
|
3
|
+
ConversationDialogueHelpfulnessMetric,
|
|
4
|
+
ConversationPromptUncertaintyMetric,
|
|
5
|
+
ConversationQARelevanceMetric,
|
|
6
|
+
ConversationSummarizationCoherenceMetric,
|
|
7
|
+
ConversationSummarizationConsistencyMetric,
|
|
8
|
+
GEvalConversationMetric,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"GEvalConversationMetric",
|
|
13
|
+
"ConversationComplianceRiskMetric",
|
|
14
|
+
"ConversationDialogueHelpfulnessMetric",
|
|
15
|
+
"ConversationPromptUncertaintyMetric",
|
|
16
|
+
"ConversationQARelevanceMetric",
|
|
17
|
+
"ConversationSummarizationCoherenceMetric",
|
|
18
|
+
"ConversationSummarizationConsistencyMetric",
|
|
19
|
+
]
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Any, Generator, List
|
|
2
2
|
|
|
3
|
-
from . import types
|
|
3
|
+
from . import types
|
|
4
|
+
from .conversation_turns_factory import build_conversation_turns
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
def get_turns_in_sliding_window(
|
|
@@ -8,22 +9,20 @@ def get_turns_in_sliding_window(
|
|
|
8
9
|
) -> Generator[List[types.ConversationTurn], Any, None]:
|
|
9
10
|
"""
|
|
10
11
|
Generates windows of conversation turns of a fixed size from a list of turns.
|
|
11
|
-
|
|
12
12
|
This function creates a sliding window over the list of conversation turns.
|
|
13
13
|
Each window includes the current turn and up to `window_size - 1` previous
|
|
14
14
|
conversation turns. If there are fewer turns available than the `window_size`,
|
|
15
15
|
the window will consist of all available turns up to the current turn.
|
|
16
|
-
|
|
17
16
|
Args:
|
|
18
17
|
turns: List of conversation turn objects representing the interactions
|
|
19
18
|
in a conversation.
|
|
20
19
|
window_size: Integer specifying the maximum number of turns to include
|
|
21
20
|
in each window.
|
|
22
|
-
|
|
23
21
|
Yields:
|
|
24
22
|
A generator that produces lists of conversation turns, where each list
|
|
25
23
|
represents a sliding window of turns.
|
|
26
24
|
"""
|
|
25
|
+
|
|
27
26
|
for i in range(len(turns)):
|
|
28
27
|
yield turns[max(0, i - window_size + 1) : i + 1]
|
|
29
28
|
|
|
@@ -31,19 +30,17 @@ def get_turns_in_sliding_window(
|
|
|
31
30
|
def merge_turns(turns: List[types.ConversationTurn]) -> types.Conversation:
    """
    Flatten a sequence of conversation turns into one conversation.

    Every turn is asked for its messages via ``as_list()`` and the messages
    are concatenated in the order the turns were supplied.

    Args:
        turns: The conversation turns to combine.

    Returns:
        types.Conversation: A new list holding the messages of all turns,
            in order. An empty input yields an empty list.
    """

    return [message for turn in turns for message in turn.as_list()]
|
|
@@ -56,24 +53,20 @@ def extract_turns_windows_from_conversation(
|
|
|
56
53
|
Extracts a list of conversation windows based on turns using a sliding window
|
|
57
54
|
approach. This function divides a conversation into consecutive overlapping
|
|
58
55
|
windows, where each window contains a specified number of turns.
|
|
59
|
-
|
|
60
56
|
Args:
|
|
61
57
|
conversation: The input conversation from which turns will be processed.
|
|
62
58
|
window_size: The number of turns to include in each sliding window.
|
|
63
|
-
|
|
64
59
|
Returns:
|
|
65
60
|
A list of conversations, each representing a window of turns specified
|
|
66
61
|
by the given window size.
|
|
67
|
-
|
|
68
62
|
Raises:
|
|
69
63
|
ValueError: If the conversation is empty or if it has no turns.
|
|
70
64
|
"""
|
|
65
|
+
|
|
71
66
|
if len(conversation) == 0:
|
|
72
67
|
raise ValueError("Conversation is empty")
|
|
73
68
|
|
|
74
|
-
turns =
|
|
75
|
-
conversation=conversation
|
|
76
|
-
)
|
|
69
|
+
turns = build_conversation_turns(conversation=conversation)
|
|
77
70
|
if len(turns) == 0:
|
|
78
71
|
raise ValueError("Conversation has no turns")
|
|
79
72
|
|
|
@@ -81,5 +74,11 @@ def extract_turns_windows_from_conversation(
|
|
|
81
74
|
merge_turns(turns_window)
|
|
82
75
|
for turns_window in get_turns_in_sliding_window(turns, window_size)
|
|
83
76
|
]
|
|
84
|
-
|
|
85
77
|
return turns_windows
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
__all__ = [
|
|
81
|
+
"get_turns_in_sliding_window",
|
|
82
|
+
"merge_turns",
|
|
83
|
+
"extract_turns_windows_from_conversation",
|
|
84
|
+
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Heuristic conversation-level metrics.
|
|
2
|
+
|
|
3
|
+
Exposes the reusable conversation-level heuristics under the public namespace
|
|
4
|
+
``opik.evaluation.metrics.conversation.heuristics.*`` so documentation and downstream
|
|
5
|
+
code can import them directly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .degeneration.metric import ConversationDegenerationMetric
|
|
9
|
+
from .knowledge_retention.metric import KnowledgeRetentionMetric
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"ConversationDegenerationMetric",
|
|
13
|
+
"KnowledgeRetentionMetric",
|
|
14
|
+
]
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import re
|
|
5
|
+
from collections import Counter
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from opik.evaluation.metrics.conversation import types as conversation_types
|
|
9
|
+
from opik.evaluation.metrics.conversation.conversation_thread_metric import (
|
|
10
|
+
ConversationThreadMetric,
|
|
11
|
+
)
|
|
12
|
+
from opik.evaluation.metrics.score_result import ScoreResult
|
|
13
|
+
from opik.exceptions import MetricComputationError
|
|
14
|
+
from .phrases import DEFAULT_FALLBACK_PHRASES
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _tokenize(text: str) -> List[str]:
|
|
18
|
+
return re.findall(r"\b\w+\b", text.lower())
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _ngram_counts(tokens: List[str], n: int) -> Counter:
|
|
22
|
+
if len(tokens) < n:
|
|
23
|
+
return Counter()
|
|
24
|
+
return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ConversationDegenerationMetric(ConversationThreadMetric):
    """
    Score how strongly an assistant conversation shows degeneration or repetition.

    The metric inspects each assistant turn, measuring repeated n-grams, overlap with
    the previous reply, low lexical diversity, and presence of known fallback
    phrases (for example, "as an AI language model..."). Each turn receives a
    degeneration score between `0.0` and `1.0`; the overall metric reports the peak
    risk observed so you can quickly flag sections where the assistant got stuck or
    stopped being helpful. Detailed per-turn diagnostics are returned in the
    ``ScoreResult.metadata`` payload.

    Args:
        name: Display name for the metric result. Defaults to
            ``"conversation_degeneration_metric"``.
        track: Whether the metric should automatically track to an Opik project.
            Defaults to ``True``.
        project_name: Optional project to store tracked results in. Defaults to
            ``None`` (inherit global setting).
        ngram_size: Size of the n-grams used to detect repetition within a single
            response. Must be at least ``2``. Defaults to ``3``.
        fallback_phrases: Custom list of phrases that should be treated as
            degeneration signatures. If ``None``, a sensible default list is used.

    Raises:
        MetricComputationError: If ``ngram_size`` is smaller than ``2``.

    Example:
        The second assistant reply below repeats the first one verbatim, so only
        the overlap-with-previous factor fires (``1.0``); averaged over the four
        equally weighted factors this yields ``0.25``.

        >>> from opik.evaluation.metrics import ConversationDegenerationMetric
        >>> conversation = [
        ...     {"role": "user", "content": "Can you draft a short bio for Ada?"},
        ...     {"role": "assistant", "content": "Sure, here is a short bio for Ada."},
        ...     {"role": "user", "content": "Could you add more detail?"},
        ...     {"role": "assistant", "content": "Sure, here is a short bio for Ada."},
        ... ]
        >>> metric = ConversationDegenerationMetric(ngram_size=3)
        >>> result = metric.score(conversation)
        >>> float(result.value)  # doctest: +SKIP
        0.25
    """

    def __init__(
        self,
        name: str = "conversation_degeneration_metric",
        track: bool = True,
        project_name: Optional[str] = None,
        ngram_size: int = 3,
        fallback_phrases: Optional[List[str]] = None,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)
        if ngram_size < 2:
            raise MetricComputationError("ngram_size must be >= 2")
        self._ngram_size = ngram_size
        phrases = (
            fallback_phrases
            if fallback_phrases is not None
            else DEFAULT_FALLBACK_PHRASES
        )
        # Stored lowercased so matching against lowercased content is
        # case-insensitive.
        self._fallback_phrases = [phrase.lower() for phrase in phrases]

    def score(
        self,
        conversation: conversation_types.Conversation,
        **ignored_kwargs: object,
    ) -> ScoreResult:
        """
        Compute the peak degeneration score over all assistant turns.

        Args:
            conversation: Messages to evaluate. Only entries whose ``role`` is
                ``"assistant"`` and whose ``content`` is truthy are scored.
            **ignored_kwargs: Accepted for interface compatibility; not used.

        Returns:
            A ``ScoreResult`` whose ``value`` is the highest per-turn score. Its
            ``metadata`` holds the ``per_turn`` factor breakdown together with
            ``average_score`` and ``peak_score``.

        Raises:
            MetricComputationError: If the conversation has no assistant message
                with content, or if none of those messages produces any token.
        """
        assistant_turns = [
            turn["content"]
            for turn in conversation
            if turn.get("role") == "assistant" and turn.get("content")
        ]
        if not assistant_turns:
            raise MetricComputationError("Conversation contains no assistant messages")

        per_turn_metadata: List[Dict[str, float]] = []
        degeneracy_scores: List[float] = []

        prev_tokens: Optional[List[str]] = None
        for content in assistant_turns:
            tokens = _tokenize(content)
            if not tokens:
                # Token-less replies (e.g. punctuation only) are skipped and do
                # not become the "previous" turn for the overlap factor.
                continue

            entropy_norm = self._token_entropy(tokens)
            repetition_ratio = self._repetition_ratio(tokens)
            prev_overlap = self._overlap_with_previous(tokens, prev_tokens)
            fallback_score = 1.0 if self._contains_fallback_phrase(content) else 0.0

            # Inverted so that a higher value means lower lexical diversity,
            # i.e. higher risk, like the other three factors.
            normalized_entropy = 1.0 - entropy_norm
            # Combine all four risk factors with equal weight; this keeps the
            # heuristic interpretable and matches the legacy scoring behaviour.
            deg_score = min(
                1.0,
                (repetition_ratio + prev_overlap + fallback_score + normalized_entropy)
                / 4.0,
            )

            per_turn_metadata.append(
                {
                    "repetition_ratio": repetition_ratio,
                    "overlap_previous": prev_overlap,
                    "fallback_hit": fallback_score,
                    "normalized_entropy": normalized_entropy,
                    "degeneration_score": deg_score,
                }
            )
            degeneracy_scores.append(deg_score)
            prev_tokens = tokens

        if not degeneracy_scores:
            raise MetricComputationError(
                "Assistant messages were empty after tokenization"
            )

        average_score = sum(degeneracy_scores) / len(degeneracy_scores)
        peak_score = max(degeneracy_scores)

        return ScoreResult(
            value=peak_score,
            name=self.name,
            reason=(
                f"Peak degeneration risk ({len(degeneracy_scores)} turns):"
                f" {peak_score:.3f}"
            ),
            metadata={
                "per_turn": per_turn_metadata,
                "average_score": average_score,
                "peak_score": peak_score,
            },
        )

    def _token_entropy(self, tokens: List[str]) -> float:
        """Return the Shannon entropy of *tokens* scaled into ``[0.0, 1.0]``.

        The entropy is divided by ``log2`` of the number of distinct tokens.
        When there is only one distinct token that divisor is zero and ``0.0``
        is returned.
        """
        counts = Counter(tokens)
        total = float(len(tokens))
        entropy = 0.0
        for count in counts.values():
            prob = count / total
            # math.log2 is more accurate than math.log(x, 2), which can leave a
            # rounding residue for exactly uniform distributions.
            entropy -= prob * math.log2(prob)
        max_entropy = math.log2(len(counts)) if counts else 1.0
        if max_entropy == 0:
            return 0.0
        return min(1.0, entropy / max_entropy)

    def _repetition_ratio(self, tokens: List[str]) -> float:
        """Return the share of n-gram occurrences whose n-gram appears more than once.

        Returns ``0.0`` when *tokens* is too short to form a single n-gram.
        """
        ngram_counts = _ngram_counts(tokens, self._ngram_size)
        total = sum(ngram_counts.values())
        if total == 0:
            return 0.0
        repeated = sum(count for count in ngram_counts.values() if count > 1)
        return repeated / total

    def _overlap_with_previous(
        self, tokens: List[str], prev_tokens: Optional[List[str]]
    ) -> float:
        """Return the Jaccard similarity between the two token vocabularies.

        Returns ``0.0`` when there is no previous turn or either side is empty.
        """
        if not prev_tokens:
            return 0.0
        current_set = set(tokens)
        prev_set = set(prev_tokens)
        if not current_set or not prev_set:
            return 0.0
        intersection = len(current_set & prev_set)
        union = len(current_set | prev_set)
        return intersection / union

    def _contains_fallback_phrase(self, content: str) -> bool:
        """Return whether any fallback phrase occurs as a substring of *content*.

        The comparison is case-insensitive.
        """
        lowered = content.lower()
        return any(phrase in lowered for phrase in self._fallback_phrases)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Phrase lists used by the conversation degeneration metric."""
|
|
2
|
+
|
|
3
|
+
# Phrases treated as signs of a canned or unhelpful assistant reply when the
# caller supplies no custom list. Entries are written in lowercase with plain
# ASCII apostrophes.
DEFAULT_FALLBACK_PHRASES = [
    "i'm sorry",
    "as an ai language model",
    "i cannot",
    "i'm unable",
    "please provide",
    "i don't have access",
    "i don't understand",
    "could you please clarify",
]
|