opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,331 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from collections import Counter
5
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Protocol, Sequence
6
+
7
+ from opik.exceptions import MetricComputationError
8
+ from opik.evaluation.metrics import base_metric, score_result
9
+
10
+ TokenizeFn = Callable[[str], Iterable[str]]
11
+
12
+
13
+ class _JSDistanceFn(Protocol):
14
+ def __call__(
15
+ self,
16
+ p: Sequence[float],
17
+ q: Sequence[float],
18
+ base: Optional[
19
+ float
20
+ ] = ..., # matches scipy signature allowing positional or keyword use
21
+ ) -> float: ...
22
+
23
+
24
+ def _load_jensen_shannon_distance() -> _JSDistanceFn:
25
+ try:
26
+ from scipy.spatial.distance import jensenshannon
27
+ except ImportError as error: # pragma: no cover - optional dependency
28
+ raise ImportError(
29
+ "Install scipy via `pip install scipy` to use Jensen-Shannon metrics."
30
+ ) from error
31
+
32
+ return jensenshannon
33
+
34
+
35
+ def _default_tokenizer(text: str) -> Iterable[str]:
36
+ return text.lower().split()
37
+
38
+
39
class _DistributionMetricBase(base_metric.BaseMetric):
    """
    Shared base for metrics that compare token frequency distributions.

    Args:
        tokenizer: Optional tokenizer returning an iterable of tokens given text.
        name: Display name for the metric.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project.
        normalize: When ``True`` the histogram is converted to probabilities.
        smoothing: Optional additive constant applied during KL-like computations.
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn],
        name: str,
        track: bool,
        project_name: Optional[str],
        normalize: bool,
        smoothing: float = 0.0,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)
        self._tokenizer = tokenizer or _default_tokenizer
        self._normalize = normalize
        # Negative smoothing makes no sense; clamp at zero.
        self._smoothing = max(0.0, smoothing)

    def _build_distribution(self, text: str) -> Dict[str, float]:
        """Turn *text* into a token histogram, or a probability table when normalizing.

        Raises:
            MetricComputationError: If tokenization yields no tokens.
        """
        tokens = list(self._tokenizer(text))
        if not tokens:
            raise MetricComputationError(
                "Tokenized text is empty (distribution-based metric)."
            )

        histogram = Counter(tokens)
        if not self._normalize:
            return {token: float(frequency) for token, frequency in histogram.items()}

        total = float(sum(histogram.values()))
        return {token: frequency / total for token, frequency in histogram.items()}

    def _smooth(self, value: float) -> float:
        """Add the configured smoothing constant (a no-op when smoothing is zero)."""
        return value if self._smoothing == 0.0 else value + self._smoothing
84
+
85
+
86
class JSDivergence(_DistributionMetricBase):
    """
    Compute Jensen–Shannon similarity (``1 - JSD``) between two texts.

    Args:
        tokenizer: Optional tokenizer function. Defaults to whitespace split.
        base: Logarithm base used when computing divergence (> ``1.0``).
        normalize: Whether to normalise token counts to probabilities first.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Note:
        Requires :mod:`scipy` to be installed.

    Example:
        >>> from opik.evaluation.metrics import JSDivergence
        >>> metric = JSDivergence()
        >>> result = metric.score(
        ...     output="cat cat sat",
        ...     reference="cat sat on mat",
        ... )
        >>> round(result.value, 3)  # doctest: +SKIP
        0.812
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn] = None,
        base: float = 2.0,
        normalize: bool = True,
        name: str = "js_divergence_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        if base <= 1.0:
            raise ValueError("base must be greater than 1.0")
        super().__init__(
            tokenizer=tokenizer,
            name=name,
            track=track,
            project_name=project_name,
            normalize=normalize,
        )
        self._base = base
        self._js_distance_fn = _load_jensen_shannon_distance()

    def score(
        self,
        output: str,
        reference: str,
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against ``reference``; 1.0 means identical distributions.

        Raises:
            MetricComputationError: If either text is blank.
        """
        if not output.strip():
            raise MetricComputationError(
                "Candidate is empty (Jensen-Shannon divergence)."
            )
        if not reference.strip():
            raise MetricComputationError(
                "Reference is empty (Jensen-Shannon divergence)."
            )

        divergence = self._js_divergence(
            self._build_distribution(output),
            self._build_distribution(reference),
        )
        # Clamp into [0, 1] to guard against tiny floating-point overshoot.
        similarity = min(1.0, max(0.0, 1.0 - divergence))

        return score_result.ScoreResult(
            value=similarity,
            name=self.name,
            reason=(
                f"Jensen-Shannon similarity (base={self._base:g}): {similarity:.4f} "
                f"(divergence={divergence:.4f})"
            ),
            metadata={
                "divergence": divergence,
                "distance": math.sqrt(divergence),
                "base": self._base,
            },
        )

    def _js_divergence(
        self,
        p_dist: Dict[str, float],
        q_dist: Dict[str, float],
    ) -> float:
        """Return the Jensen-Shannon divergence over the joint vocabulary."""
        vocabulary = sorted(p_dist.keys() | q_dist.keys())
        if not vocabulary:
            return 0.0

        p_probs = self._ensure_probability_vector(
            [p_dist.get(token, 0.0) for token in vocabulary]
        )
        q_probs = self._ensure_probability_vector(
            [q_dist.get(token, 0.0) for token in vocabulary]
        )

        # scipy returns the JS *distance* (a square root); square it back into
        # the divergence.
        distance = float(self._js_distance_fn(p_probs, q_probs, base=self._base))
        return distance**2

    def _ensure_probability_vector(self, values: Sequence[float]) -> List[float]:
        """Normalize *values* so they sum to 1, rejecting all-zero vectors."""
        total = sum(values)
        if total <= 0.0:
            raise MetricComputationError(
                "Distribution is empty after tokenisation (Jensen-Shannon metric)."
            )
        return [entry / total for entry in values]
193
+
194
+
195
class JSDistance(JSDivergence):
    """
    Return the raw Jensen–Shannon divergence instead of similarity.

    Args:
        tokenizer: Optional tokenizer function.
        base: Logarithm base used for the divergence calculation.
        normalize: Whether to normalise counts into probabilities.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Example:
        >>> from opik.evaluation.metrics import JSDistance
        >>> metric = JSDistance()
        >>> result = metric.score("a a b", reference="a b b")
        >>> round(result.value, 3)  # doctest: +SKIP
        0.188
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn] = None,
        base: float = 2.0,
        normalize: bool = True,
        name: str = "js_distance_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        super().__init__(
            tokenizer=tokenizer,
            base=base,
            normalize=normalize,
            name=name,
            track=track,
            project_name=project_name,
        )

    def score(
        self,
        output: str,
        reference: str,
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Delegate to the similarity scorer, then surface the divergence itself."""
        similarity_result = super().score(output=output, reference=reference)
        meta = similarity_result.metadata or {}
        divergence = float(meta.get("divergence", 0.0))
        distance = float(meta.get("distance", math.sqrt(divergence)))
        return score_result.ScoreResult(
            value=divergence,
            name=self.name,
            reason=f"Jensen-Shannon divergence (base={self._base:g}): {divergence:.4f}",
            metadata={
                "distance": distance,
                "base": meta.get("base", self._base),
            },
        )
252
+
253
+
254
class KLDivergence(_DistributionMetricBase):
    """
    Compute the (optionally symmetric) KL divergence between token distributions.

    Args:
        tokenizer: Optional tokenizer function. Defaults to whitespace split.
        direction: Direction to compute (``"pq"``, ``"qp"``, or ``"avg"`` for
            symmetric).
        normalize: Whether to normalise token counts to probabilities first.
        smoothing: Additive smoothing constant to avoid divide-by-zero.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Example:
        >>> from opik.evaluation.metrics import KLDivergence
        >>> metric = KLDivergence(direction="avg")
        >>> result = metric.score("hello hello world", reference="hello world")
        >>> round(result.value, 4)  # doctest: +SKIP
        0.0583
    """

    def __init__(
        self,
        tokenizer: Optional[TokenizeFn] = None,
        direction: str = "pq",
        normalize: bool = True,
        smoothing: float = 1e-12,
        name: str = "kl_divergence_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        if direction not in {"pq", "qp", "avg"}:
            raise ValueError("direction must be one of {'pq', 'qp', 'avg'}")
        super().__init__(
            tokenizer=tokenizer,
            name=name,
            track=track,
            project_name=project_name,
            normalize=normalize,
            smoothing=smoothing,
        )
        self._direction = direction

    def score(
        self,
        output: str,
        reference: str,
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` vs ``reference``; 0.0 means identical distributions.

        Raises:
            MetricComputationError: If either text is blank.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (KL divergence metric).")
        if not reference.strip():
            raise MetricComputationError("Reference is empty (KL divergence metric).")

        p_dist = self._build_distribution(output)
        q_dist = self._build_distribution(reference)

        if self._direction == "avg":
            # Symmetrised KL: mean of both one-directional divergences.
            divergence = 0.5 * (self._kl(p_dist, q_dist) + self._kl(q_dist, p_dist))
        elif self._direction == "qp":
            divergence = self._kl(q_dist, p_dist)
        else:  # "pq"
            divergence = self._kl(p_dist, q_dist)

        return score_result.ScoreResult(
            value=divergence,
            name=self.name,
            reason=f"KL divergence ({self._direction}): {divergence:.4f}",
        )

    def _kl(self, p_dist: Dict[str, float], q_dist: Dict[str, float]) -> float:
        """One-directional KL(P||Q) with additive smoothing applied to both sides.

        NOTE(review): if constructed with ``smoothing=0.0`` and the
        distributions have disjoint tokens, the division below raises
        ``ZeroDivisionError``; the default smoothing of ``1e-12`` prevents this.
        """
        total = 0.0
        for token, raw_p in p_dist.items():
            smoothed_p = self._smooth(raw_p)
            smoothed_q = self._smooth(q_dist.get(token, 0.0))
            total += smoothed_p * math.log(smoothed_p / smoothed_q)
        return total
@@ -0,0 +1,113 @@
1
+ from typing import Any, Callable, Optional, Sequence, Union
2
+
3
+ from opik.exceptions import MetricComputationError
4
+ from opik.evaluation.metrics import base_metric, score_result
5
+
6
+ try:
7
+ from nltk.translate import gleu_score as nltk_gleu_score
8
+ except ImportError: # pragma: no cover - optional dependency
9
+ nltk_gleu_score = None
10
+
11
+
12
+ GleuFn = Callable[[Sequence[Sequence[str]], Sequence[str]], float]
13
+
14
+
15
class GLEU(base_metric.BaseMetric):
    """
    Sentence-level GLEU metric powered by ``nltk.translate.gleu_score``.

    References:
        - NLTK Reference Documentation on GLEU
          https://www.nltk.org/api/nltk.translate.gleu_score.html
        - OECD Catalogue of Tools & Metrics for Trustworthy AI
          https://oecd.ai/en/catalogue/metrics/google-bleu-gleu
        - Hugging Face Evaluate: Google BLEU (GLEU) metric overview
          https://huggingface.co/spaces/evaluate-metric/google_bleu

    Args:
        gleu_fn: Optional custom scoring callable compatible with
            ``nltk.translate.gleu_score.sentence_gleu``. Useful for testing.
        min_len: Minimum n-gram size considered.
        max_len: Maximum n-gram size considered.
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.

    Example:
        >>> from opik.evaluation.metrics import GLEU
        >>> metric = GLEU(min_len=1, max_len=4)
        >>> result = metric.score(
        ...     output="The cat sat on the mat",
        ...     reference="The cat is on the mat",
        ... )
        >>> round(result.value, 3)  # doctest: +SKIP
        0.816
    """

    def __init__(
        self,
        gleu_fn: Optional[GleuFn] = None,
        min_len: int = 1,
        max_len: int = 4,
        name: str = "gleu_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        if min_len <= 0 or max_len <= 0:
            raise ValueError("min_len and max_len must be positive integers.")
        if min_len > max_len:
            raise ValueError("min_len cannot exceed max_len.")

        super().__init__(name=name, track=track, project_name=project_name)

        if gleu_fn is not None:
            self._gleu_fn = gleu_fn
            return

        if nltk_gleu_score is None:  # pragma: no cover - optional dependency
            raise ImportError(
                "GLEU metric requires the optional 'nltk' package. Install via"
                " `pip install nltk` or provide `gleu_fn`."
            )

        # Bind the configured n-gram window into a closure with the same
        # interface as a user-supplied gleu_fn.
        def _scorer(
            references: Sequence[Sequence[str]], hypothesis: Sequence[str]
        ) -> float:
            return float(
                nltk_gleu_score.sentence_gleu(
                    references,
                    hypothesis,
                    min_len=min_len,
                    max_len=max_len,
                )
            )

        self._gleu_fn = _scorer

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str]],
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against one reference string or several.

        Raises:
            MetricComputationError: If the candidate or any reference is empty.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (GLEU metric).")
        hypothesis_tokens = output.split()

        if isinstance(reference, str):
            reference_token_lists = [reference.split()]
        else:
            materialized = list(reference)
            if not materialized:
                raise MetricComputationError("Reference is empty (GLEU metric).")
            reference_token_lists = [segment.split() for segment in materialized]

        # An empty token list is falsy, so all(...) flags empty segments.
        if not all(reference_token_lists):
            raise MetricComputationError(
                "Reference contains empty segment (GLEU metric)."
            )

        gleu_value = float(self._gleu_fn(reference_token_lists, hypothesis_tokens))
        return score_result.ScoreResult(
            value=gleu_value,
            name=self.name,
            reason=f"GLEU score: {gleu_value:.4f}",
        )
@@ -0,0 +1,123 @@
1
+ """Language adherence metric leveraging fastText-style language identification."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Callable, Optional, Tuple
6
+
7
+ from opik.exceptions import MetricComputationError
8
+ from opik.evaluation.metrics.base_metric import BaseMetric
9
+ from opik.evaluation.metrics.score_result import ScoreResult
10
+
11
+ try: # optional dependency
12
+ import fasttext
13
+ except ImportError: # pragma: no cover
14
+ fasttext = None # type: ignore
15
+
16
+
17
+ DetectorFn = Callable[[str], Tuple[str, float]]
18
+
19
+
20
class LanguageAdherenceMetric(BaseMetric):
    """
    Check whether text is written in the expected language.

    The metric relies on a fastText language identification model (or a
    user-supplied detector callable) to predict the language of the evaluated text
    and compares it with ``expected_language``. It outputs ``1.0`` when the detected
    language matches and ``0.0`` otherwise, along with the detected label and
    confidence score in ``metadata``.

    References:
        - fastText language identification models
          https://fasttext.cc/docs/en/language-identification.html
        - Joulin et al., "Bag of Tricks for Efficient Text Classification" (EACL 2017)
          https://aclanthology.org/E17-2068/

    Args:
        expected_language: Language code the text should conform to, e.g. ``"en"``.
        model_path: Path to a fastText language identification model. Required unless
            ``detector`` is provided.
        name: Display name for the metric result. Defaults to
            ``"language_adherence_metric"``.
        track: Whether to automatically track metric results. Defaults to ``True``.
        project_name: Optional tracking project name. Defaults to ``None``.
        detector: Optional callable accepting text and returning a
            ``(language, confidence)`` tuple. When provided, ``model_path`` is not
            needed.

    Example:
        >>> from opik.evaluation.metrics import LanguageAdherenceMetric
        >>> # Assuming `lid.176.ftz` is available locally for fastText
        >>> metric = LanguageAdherenceMetric(expected_language="en", model_path="lid.176.ftz")
        >>> result = metric.score("This response is written in English.")  # doctest: +SKIP
        >>> result.value  # doctest: +SKIP
        1.0
    """

    def __init__(
        self,
        expected_language: str,
        model_path: Optional[str] = None,
        name: str = "language_adherence_metric",
        track: bool = True,
        project_name: Optional[str] = None,
        detector: Optional[DetectorFn] = None,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)
        self._expected_language = expected_language
        self._detector_fn: DetectorFn
        self._model_path = model_path

        self._fasttext_model: Optional[Any]

        # A custom detector takes precedence; no fastText model is loaded then.
        if detector is not None:
            self._detector_fn = detector
            self._fasttext_model = None
            return

        if fasttext is None:
            raise ImportError(
                "Install fasttext via `pip install fasttext` and provide a fastText language"
                " model (e.g., lid.176.ftz) or supply a custom detector callable."
            )
        if model_path is None:
            raise ValueError(
                "model_path is required when using the fastText-based detector."
            )
        self._fasttext_model = fasttext.load_model(model_path)
        self._detector_fn = self._predict_with_fasttext

    def score(self, output: str, **ignored_kwargs: Any) -> ScoreResult:
        """Return 1.0 when the detected language equals ``expected_language``, else 0.0.

        Raises:
            MetricComputationError: If ``output`` is blank.
        """
        processed = output
        if not processed.strip():
            raise MetricComputationError("Text is empty for language adherence check.")

        language, confidence = self._detector_fn(processed)
        adherence = 1.0 if language == self._expected_language else 0.0

        metadata = {
            "detected_language": language,
            "confidence": confidence,
            "expected_language": self._expected_language,
        }

        reason = (
            "Language adheres to expectation"
            if adherence == 1.0
            else f"Detected language '{language}' differs from expected '{self._expected_language}'"
        )

        return ScoreResult(
            value=adherence, name=self.name, reason=reason, metadata=metadata
        )

    def _predict_with_fasttext(self, text: str) -> tuple[str, float]:
        """Predict ``(language, confidence)`` for *text* with the loaded fastText model."""
        if self._fasttext_model is None:
            raise MetricComputationError(
                "fastText model is not loaded. Ensure that LanguageAdherenceMetric was initialized with a valid model_path and fastText is installed."
            )
        # fastText's predict() processes a single line and raises ValueError when
        # the input contains '\n'; flatten newlines to spaces before predicting.
        prediction = self._fasttext_model.predict(text.replace("\n", " "))
        label = prediction[0][0] if prediction[0] else ""
        # Labels come back as '__label__<code>'; strip the prefix to get the code.
        language = label.replace("__label__", "")
        confidence = float(prediction[1][0]) if prediction[1] else 0.0
        return language, confidence
@@ -0,0 +1,119 @@
1
+ from typing import Any, Callable, Optional, Sequence, Union
2
+
3
+ try:
4
+ import nltk # type: ignore
5
+ from nltk.corpus import wordnet # type: ignore
6
+ except ImportError: # pragma: no cover - optional dependency
7
+ nltk = None
8
+ wordnet = None
9
+
10
+ from opik.exceptions import MetricComputationError
11
+ from opik.evaluation.metrics import base_metric, score_result
12
+
13
+ try:
14
+ from nltk.translate import meteor_score as nltk_meteor_score
15
+ except ImportError: # pragma: no cover - optional dependency
16
+ nltk_meteor_score = None
17
+
18
+
19
+ MeteorFn = Callable[[Sequence[str], str], float]
20
+
21
+
22
class METEOR(base_metric.BaseMetric):
    """Computes the METEOR score between output and reference text.

    This implementation wraps ``nltk.translate.meteor_score.meteor_score`` while
    allowing a custom scoring function to be injected (useful for testing).

    References:
        - Banerjee & Lavie, "METEOR: An Automatic Metric for MT Evaluation with Improved
          Correlation with Human Judgments" (ACL Workshop 2005)
          https://aclanthology.org/W05-0909/
        - Hugging Face Evaluate: METEOR metric overview
          https://huggingface.co/spaces/evaluate-metric/meteor

    Args:
        meteor_fn: Optional callable taking ``(references, hypothesis)`` as plain
            strings and returning a float. When omitted, NLTK's implementation
            is used.
        alpha: Precision weight.
        beta: Penalty exponent.
        gamma: Fragmentation penalty weight.
        name: Optional metric name.
        track: Whether Opik should track the metric automatically.
        project_name: Optional project name used when tracking.
    """

    def __init__(
        self,
        meteor_fn: Optional[MeteorFn] = None,
        alpha: float = 0.9,
        beta: float = 3.0,
        gamma: float = 0.5,
        name: str = "meteor_metric",
        track: bool = True,
        project_name: Optional[str] = None,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)

        if meteor_fn is not None:
            self._meteor_fn = meteor_fn
            return

        if nltk_meteor_score is None:  # pragma: no cover - optional dependency
            raise ImportError(
                "METEOR metric requires the optional 'nltk' package. Install via"
                " `pip install nltk` or provide `meteor_fn`."
            )

        # Make sure the WordNet corpora METEOR depends on are present, trying a
        # quiet one-off download before giving up.
        if nltk is not None and wordnet is not None:
            try:
                wordnet.ensure_loaded()  # type: ignore[attr-defined]
            except (
                LookupError
            ):  # pragma: no cover - download path relies on network access
                try:
                    nltk.download("wordnet", quiet=True)
                    nltk.download("omw-1.4", quiet=True)
                    wordnet.ensure_loaded()  # type: ignore[attr-defined]
                except Exception as download_error:
                    raise ImportError(
                        "METEOR metric requires the NLTK corpora 'wordnet' and 'omw-1.4'. "
                        "Install manually via `python -m nltk.downloader wordnet omw-1.4`."
                    ) from download_error

        def _scorer(references: Sequence[str], hypothesis: str) -> float:
            # NLTK >= 3.6.4 requires pre-tokenized input (iterables of tokens);
            # passing raw strings raises TypeError. Tokenize on whitespace here
            # so the public MeteorFn interface keeps accepting plain strings.
            try:
                return float(
                    nltk_meteor_score.meteor_score(
                        [ref.split() for ref in references],
                        hypothesis.split(),
                        alpha=alpha,
                        beta=beta,
                        gamma=gamma,
                    )
                )
            except LookupError as error:
                raise MetricComputationError(
                    "NLTK resource requirement for METEOR not satisfied. "
                    "Download WordNet via `nltk.download('wordnet')`."
                ) from error

        self._meteor_fn = _scorer

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str]],
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against one or more reference strings.

        Raises:
            MetricComputationError: If the candidate or any reference is empty.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (METEOR metric).")
        if isinstance(reference, str):
            references: Sequence[str] = [reference]
        else:
            references = list(reference)
        if not references or any(not ref.strip() for ref in references):
            raise MetricComputationError("Reference is empty (METEOR metric).")

        score = self._meteor_fn(references, output)
        return score_result.ScoreResult(
            value=float(score),
            name=self.name,
            reason=f"METEOR score: {float(score):.4f}",
        )