opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +19 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +64 -4
- opik/api_objects/dataset/rest_operations.py +11 -2
- opik/api_objects/experiment/experiment.py +57 -57
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +65 -5
- opik/api_objects/helpers.py +8 -5
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +600 -108
- opik/api_objects/opik_query_language.py +39 -5
- opik/api_objects/prompt/__init__.py +12 -2
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +189 -47
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +23 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_data.py +35 -25
- opik/api_objects/threads/threads_client.py +39 -5
- opik/api_objects/trace/trace_client.py +52 -2
- opik/api_objects/trace/trace_data.py +15 -24
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +13 -7
- opik/configurator/configure.py +17 -0
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +205 -133
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +25 -6
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +272 -75
- opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
- opik/evaluation/engine/helpers.py +31 -6
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +168 -2
- opik/evaluation/evaluator.py +533 -62
- opik/evaluation/metrics/__init__.py +103 -4
- opik/evaluation/metrics/aggregated_metric.py +35 -6
- opik/evaluation/metrics/base_metric.py +1 -1
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +14 -15
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
- opik/evaluation/metrics/conversation/types.py +4 -5
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +35 -15
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +47 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/rouge.py +26 -9
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/__init__.py +8 -0
- opik/evaluation/models/base_model.py +107 -1
- opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
- opik/evaluation/models/langchain/message_converters.py +97 -15
- opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/evaluator.py +31 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +33 -0
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +12 -9
- opik/id_helpers.py +18 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +14 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +4 -7
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
- opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +42 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +8 -51
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +80 -17
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +3 -7
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +1 -1
- opik/integrations/langchain/opik_tracer.py +474 -229
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +146 -107
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/integrations/sagemaker/auth.py +5 -1
- opik/llm_usage/google_usage.py +3 -1
- opik/llm_usage/opik_usage.py +7 -8
- opik/llm_usage/opik_usage_factory.py +4 -2
- opik/logging_messages.py +6 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +10 -0
- opik/message_processing/batching/batchers.py +59 -27
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +56 -1
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
- opik/message_processing/queue_consumer.py +9 -3
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +43 -10
- opik/opik_context.py +16 -4
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +346 -15
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/automation_rule_evaluators/client.py +34 -2
- opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
- opik/rest_api/client.py +15 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +1310 -44
- opik/rest_api/datasets/raw_client.py +2269 -358
- opik/rest_api/experiments/__init__.py +2 -2
- opik/rest_api/experiments/client.py +191 -5
- opik/rest_api/experiments/raw_client.py +301 -7
- opik/rest_api/experiments/types/__init__.py +4 -1
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/llm_provider_key/client.py +20 -0
- opik/rest_api/llm_provider_key/raw_client.py +20 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/optimizations/client.py +145 -9
- opik/rest_api/optimizations/raw_client.py +237 -13
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +227 -6
- opik/rest_api/prompts/raw_client.py +331 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +238 -76
- opik/rest_api/spans/raw_client.py +307 -95
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +572 -161
- opik/rest_api/traces/raw_client.py +736 -229
- opik/rest_api/types/__init__.py +352 -17
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/annotation_queue_item_ids.py +19 -0
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/annotation_queue_reviewer.py +20 -0
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +62 -2
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +4 -0
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +5 -0
- opik/rest_api/types/dataset_item_page_public.py +5 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +4 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +7 -2
- opik/rest_api/types/experiment_group_response.py +2 -0
- opik/rest_api/types/experiment_public.py +7 -2
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/feedback.py +25 -1
- opik/rest_api/types/feedback_create.py +20 -1
- opik/rest_api/types/feedback_object_public.py +27 -1
- opik/rest_api/types/feedback_public.py +25 -1
- opik/rest_api/types/feedback_score_batch_item.py +2 -1
- opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +20 -1
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +1 -0
- opik/rest_api/types/guardrail_write.py +1 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/optimization.py +4 -2
- opik/rest_api/types/optimization_public.py +4 -2
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +4 -2
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +3 -0
- opik/rest_api/types/prompt_version_detail.py +3 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +1 -0
- opik/rest_api/types/prompt_version_link_public.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +3 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +9 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +9 -0
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +18 -0
- opik/rest_api/types/span.py +1 -2
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_public.py +1 -2
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +1 -2
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +11 -2
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_public.py +11 -2
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +1 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_write.py +1 -2
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +5 -0
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/client.py +14 -2
- opik/rest_api/workspaces/raw_client.py +10 -0
- opik/s3_httpx_client.py +14 -1
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- opik/api_objects/prompt/prompt.py +0 -112
- opik/cli.py +0 -193
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
- opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
- opik-1.8.39.dist-info/METADATA +0 -339
- opik-1.8.39.dist-info/RECORD +0 -790
- /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Optional, Sequence, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from opik.exceptions import MetricComputationError
|
|
6
|
+
from opik.evaluation.metrics import base_metric, score_result
|
|
7
|
+
|
|
8
|
+
try: # pragma: no cover - optional dependency
|
|
9
|
+
from bert_score import score as bert_score_fn
|
|
10
|
+
except ImportError: # pragma: no cover - optional dependency
|
|
11
|
+
bert_score_fn = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
BertScoreFn = Callable[
|
|
15
|
+
[Sequence[str], Union[Sequence[str], Sequence[Sequence[str]]]], Tuple[Any, Any, Any]
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BERTScore(base_metric.BaseMetric):
    """Wrapper around the `bert-score` library.

    Computes BERTScore precision/recall/F1 between a candidate string and one
    or more references; ``score()`` reports F1 as the main value and exposes
    precision/recall through the result metadata.

    Args:
        scorer_fn: Optional callable compatible with ``bert_score.score`` for
            dependency injection or advanced usage.
        model_type: Model checkpoint to use when loading the scorer. Ignored when
            ``scorer_fn`` is provided.
        lang: Two-letter language code used by the default scorer.
        rescale_with_baseline: Whether to rescale the score using the provided
            baseline statistics.
        device: Optional device string forwarded to ``bert_score`` (e.g., "cpu",
            "cuda").
        name: Display name for the metric result.
        track: Whether to automatically track metric results.
        project_name: Optional tracking project name.
        **scorer_kwargs: Extra keyword arguments forwarded to the default scorer.

    Raises:
        ImportError: If the optional ``bert-score`` package is missing and no
            ``scorer_fn`` was supplied.
    """

    def __init__(
        self,
        scorer_fn: Optional[BertScoreFn] = None,
        model_type: Optional[str] = "bert-base-uncased",
        lang: Optional[str] = "en",
        rescale_with_baseline: bool = False,
        device: Optional[str] = None,
        name: str = "bertscore_metric",
        track: bool = True,
        project_name: Optional[str] = None,
        **scorer_kwargs: Any,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)

        if scorer_fn is not None:
            self._scorer_fn = scorer_fn
        else:
            if bert_score_fn is None:  # pragma: no cover - optional dependency
                raise ImportError(
                    "BERTScore metric requires the optional 'bert-score' package. "
                    "Install via `pip install bert-score` or provide `scorer_fn`."
                )

            def _score(
                candidates: Sequence[str],
                references: Union[Sequence[str], Sequence[Sequence[str]]],
            ) -> Tuple[Any, Any, Any]:
                # Closure over the constructor arguments so the stored callable
                # keeps the same (candidates, references) shape as `scorer_fn`.
                return bert_score_fn(
                    candidates,
                    references,
                    model_type=model_type,
                    lang=lang,
                    rescale_with_baseline=rescale_with_baseline,
                    device=device,
                    **scorer_kwargs,
                )

            self._scorer_fn = _score

    @staticmethod
    def _as_float(value: Any) -> float:
        # bert-score returns tensor-like objects; `.item()` unwraps them, while
        # plain numbers (e.g. from an injected scorer_fn) convert directly.
        return float(value.item() if hasattr(value, "item") else value)

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str], Sequence[Sequence[str]]],
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against ``reference`` and return the F1 result.

        Args:
            output: Candidate text to evaluate. Must be non-blank.
            reference: A single reference string, a list of reference strings,
                or a list of reference lists (multi-reference input).
            **ignored_kwargs: Additional keyword arguments that are ignored.

        Returns:
            score_result.ScoreResult: F1 as the value, with precision and
            recall stored in ``metadata``.

        Raises:
            MetricComputationError: If the candidate or reference is empty.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (BERTScore metric).")

        references: Union[Sequence[str], Sequence[Sequence[str]]]
        if isinstance(reference, str):
            # Fix: an all-whitespace reference string previously slipped past
            # validation, which only covered empty sequence inputs.
            if not reference.strip():
                raise MetricComputationError("Reference is empty (BERTScore metric).")
            references = [reference]
        else:
            references = list(reference)
            if len(references) == 0:
                raise MetricComputationError("Reference is empty (BERTScore metric).")

        precision, recall, f1 = self._scorer_fn([output], references)

        score_value = self._as_float(f1[0])
        metadata = {
            "precision": self._as_float(precision[0]),
            "recall": self._as_float(recall[0]),
        }

        return score_result.ScoreResult(
            value=score_value,
            name=self.name,
            reason=f"BERTScore F1: {score_value:.4f}",
            metadata=metadata,
        )
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
import warnings
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
from typing import Any, Iterator, List, Optional, Tuple, Union
|
|
2
4
|
|
|
3
5
|
from opik.exceptions import MetricComputationError
|
|
4
6
|
from opik.evaluation.metrics import base_metric, score_result
|
|
@@ -15,8 +17,12 @@ class BaseBLEU(base_metric.BaseMetric):
|
|
|
15
17
|
and weights initialization. This class is not intended to be used directly.
|
|
16
18
|
|
|
17
19
|
References:
|
|
18
|
-
-
|
|
19
|
-
https://
|
|
20
|
+
- BLEU: Papineni et al., "BLEU: a Method for Automatic Evaluation of Machine Translation" (ACL 2002)
|
|
21
|
+
https://aclanthology.org/P02-1040/
|
|
22
|
+
- NLTK BLEU documentation
|
|
23
|
+
https://www.nltk.org/api/nltk.translate.bleu_score.html
|
|
24
|
+
- Hugging Face Evaluate: BLEU metric overview
|
|
25
|
+
https://huggingface.co/spaces/evaluate-metric/bleu
|
|
20
26
|
|
|
21
27
|
Args:
|
|
22
28
|
name: The name of the metric (e.g. "sentence_bleu_metric" or "corpus_bleu_metric").
|
|
@@ -73,6 +79,18 @@ class BaseBLEU(base_metric.BaseMetric):
|
|
|
73
79
|
return tuple(normalized)
|
|
74
80
|
|
|
75
81
|
|
|
82
|
+
@contextmanager
|
|
83
|
+
def _suppress_bleu_warnings() -> Iterator[None]:
|
|
84
|
+
with warnings.catch_warnings():
|
|
85
|
+
warnings.filterwarnings(
|
|
86
|
+
"ignore",
|
|
87
|
+
message=r"The hypothesis contains 0 counts of 2-gram overlaps\.",
|
|
88
|
+
category=UserWarning,
|
|
89
|
+
module="nltk\\.translate\\.bleu_score",
|
|
90
|
+
)
|
|
91
|
+
yield
|
|
92
|
+
|
|
93
|
+
|
|
76
94
|
class SentenceBLEU(BaseBLEU):
|
|
77
95
|
"""
|
|
78
96
|
Computes sentence-level BLEU for a single candidate string vs. one or more references.
|
|
@@ -151,12 +169,13 @@ class SentenceBLEU(BaseBLEU):
|
|
|
151
169
|
smoothing_func = self._get_smoothing_func()
|
|
152
170
|
|
|
153
171
|
try:
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
172
|
+
with _suppress_bleu_warnings():
|
|
173
|
+
bleu_val = nltk_bleu_score.sentence_bleu(
|
|
174
|
+
ref_lists,
|
|
175
|
+
candidate_tokens,
|
|
176
|
+
weights=used_weights,
|
|
177
|
+
smoothing_function=smoothing_func,
|
|
178
|
+
)
|
|
160
179
|
except ZeroDivisionError:
|
|
161
180
|
bleu_val = 0.0
|
|
162
181
|
|
|
@@ -268,12 +287,13 @@ class CorpusBLEU(BaseBLEU):
|
|
|
268
287
|
smoothing_func = self._get_smoothing_func()
|
|
269
288
|
|
|
270
289
|
try:
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
290
|
+
with _suppress_bleu_warnings():
|
|
291
|
+
bleu_val = nltk_bleu_score.corpus_bleu(
|
|
292
|
+
all_references,
|
|
293
|
+
all_candidates,
|
|
294
|
+
weights=used_weights,
|
|
295
|
+
smoothing_function=smoothing_func,
|
|
296
|
+
)
|
|
277
297
|
except ZeroDivisionError:
|
|
278
298
|
bleu_val = 0.0
|
|
279
299
|
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Character n-gram F-score (chrF/chrF++) metric wrapper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Callable, Optional, Sequence, Union
|
|
6
|
+
|
|
7
|
+
from opik.evaluation.metrics.base_metric import BaseMetric
|
|
8
|
+
from opik.evaluation.metrics.score_result import ScoreResult
|
|
9
|
+
from opik.exceptions import MetricComputationError
|
|
10
|
+
|
|
11
|
+
try: # pragma: no cover - optional dependency
|
|
12
|
+
from nltk.translate import chrf_score as nltk_chrf_score
|
|
13
|
+
except ImportError: # pragma: no cover - optional dependency
|
|
14
|
+
nltk_chrf_score = None
|
|
15
|
+
|
|
16
|
+
ChrFFn = Callable[[Sequence[str], Sequence[str]], float]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChrF(BaseMetric):
    """
    Compute chrF / chrF++ scores between a candidate string and references.

    By default the implementation delegates to ``nltk.translate.chrf_score``.
    Scores range from `0.0` (no overlap) to `1.0` (perfect match). With
    multiple references the best (maximum) per-reference score is returned.

    References:
        - Popović, "chrF: character n-gram F-score for automatic MT evaluation" (WMT 2015)
          https://aclanthology.org/W15-3049/
        - NLTK chrf_score module documentation
          https://www.nltk.org/api/nltk.translate.chrf_score.html
        - Hugging Face Evaluate: chrF metric overview
          https://huggingface.co/spaces/evaluate-metric/chrf

    Args:
        name: Display name for the metric result. Defaults to ``"chrf_metric"``.
        track: Whether to automatically track metric results. Defaults to ``True``.
        project_name: Optional tracking project name. Defaults to ``None``.
        beta: Weighting between precision and recall (``beta = 2`` is standard).
        ignore_whitespace: Whether whitespace is ignored before scoring.
        char_order: Maximum character n-gram order.
        word_order: Maximum word n-gram order. NOTE(review): NLTK's
            ``sentence_chrf`` has no word-order parameter, so this value is only
            honored by a custom ``chrf_fn`` backend.
        lowercase: Whether to lowercase candidate and references prior to scoring.
        chrf_fn: Optional custom scoring callable for testing or offline usage.

    Raises:
        ImportError: If ``nltk`` is not installed and no ``chrf_fn`` was given.

    Example:
        >>> from opik.evaluation.metrics import ChrF
        >>> metric = ChrF(beta=2.0, char_order=6, lowercase=True)
        >>> result = metric.score(
        ...     output="The quick brown fox",
        ...     reference="The quick brown fox jumps",
        ... )
        >>> round(result.value, 4)  # doctest: +SKIP
        0.8795
    """

    def __init__(
        self,
        name: str = "chrf_metric",
        track: bool = True,
        project_name: Optional[str] = None,
        beta: float = 2.0,
        ignore_whitespace: bool = False,
        char_order: int = 6,
        word_order: int = 0,
        lowercase: bool = False,
        chrf_fn: Optional[ChrFFn] = None,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)
        self._beta = beta
        self._ignore_whitespace = ignore_whitespace
        self._char_order = char_order
        self._word_order = word_order
        self._lowercase = lowercase

        if chrf_fn is not None:
            self._chrf_fn = chrf_fn
        else:
            if nltk_chrf_score is None:  # pragma: no cover - optional dependency
                raise ImportError(
                    "chrF metric requires the optional 'nltk' package. Install via"
                    " `pip install nltk` or provide `chrf_fn`."
                )

            def _compute(candidate: Sequence[str], references: Sequence[str]) -> float:
                # Fix 1: score each reference individually and keep the best
                # match. NLTK's sentence_chrf expects a single reference; passing
                # the whole list would be preprocessed into one joined string.
                # Fix 2: forward `char_order` and `ignore_whitespace`, which
                # were previously accepted but silently ignored.
                per_reference_scores = []
                for ref in references:
                    try:
                        per_reference_scores.append(
                            float(
                                nltk_chrf_score.sentence_chrf(
                                    ref,
                                    candidate,
                                    min_len=1,
                                    max_len=self._char_order,
                                    beta=self._beta,
                                    ignore_whitespace=self._ignore_whitespace,
                                )
                            )
                        )
                    except TypeError:
                        # Older NLTK versions expose the helper with fewer keyword arguments.
                        per_reference_scores.append(
                            float(nltk_chrf_score.sentence_chrf(ref, candidate))
                        )
                return max(per_reference_scores)

            self._chrf_fn = _compute

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str]],
        **ignored_kwargs: Any,
    ) -> ScoreResult:
        """Score ``output`` against one or more references.

        Args:
            output: Candidate text to evaluate. Must be non-blank.
            reference: A single reference string or a sequence of references.
            **ignored_kwargs: Additional keyword arguments that are ignored.

        Returns:
            ScoreResult: The chrF score in ``[0.0, 1.0]``.

        Raises:
            MetricComputationError: If the candidate or any reference is empty.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (chrF metric).")
        if isinstance(reference, str):
            references = [reference]
        else:
            references = list(reference)
        if not references or any(not ref.strip() for ref in references):
            raise MetricComputationError("Reference is empty (chrF metric).")

        if self._lowercase:
            output_text = output.lower()
            references = [ref.lower() for ref in references]
        else:
            output_text = output

        value = self._chrf_fn(output_text, references)

        return ScoreResult(
            value=float(value),
            name=self.name,
            reason=f"chrF score: {float(value):.4f}",
        )
|
|
@@ -12,24 +12,44 @@ class Contains(base_metric.BaseMetric):
|
|
|
12
12
|
|
|
13
13
|
Args:
|
|
14
14
|
case_sensitive: Whether the comparison should be case-sensitive. Defaults to False.
|
|
15
|
+
reference: Optional default reference string. If provided, it will be used unless
|
|
16
|
+
a reference is explicitly passed to `score()`.
|
|
15
17
|
name: The name of the metric. Defaults to "contains_metric".
|
|
16
18
|
track: Whether to track the metric. Defaults to True.
|
|
17
|
-
project_name: Optional project name to track the metric in for the cases when there are
|
|
19
|
+
project_name: Optional project name to track the metric in for the cases when there are
|
|
20
|
+
no parent span/trace to inherit project name from.
|
|
18
21
|
|
|
19
|
-
|
|
20
|
-
>>>
|
|
21
|
-
>>> contains_metric = Contains(
|
|
22
|
-
>>> result = contains_metric.score("Hello, World!"
|
|
22
|
+
Examples:
|
|
23
|
+
>>> # Using a default reference at initialization
|
|
24
|
+
>>> contains_metric = Contains(reference="world")
|
|
25
|
+
>>> result = contains_metric.score("Hello, World!")
|
|
23
26
|
>>> print(result.value)
|
|
24
27
|
1.0
|
|
25
|
-
|
|
28
|
+
|
|
29
|
+
>>> # Overriding the default reference at score time
|
|
30
|
+
>>> result = contains_metric.score("Hello, World!", reference="there")
|
|
26
31
|
>>> print(result.value)
|
|
27
32
|
0.0
|
|
33
|
+
|
|
34
|
+
>>> # If no reference is set at all, score() raises an error
|
|
35
|
+
>>> contains_metric = Contains()
|
|
36
|
+
>>> contains_metric.score("Hello")
|
|
37
|
+
Traceback (most recent call last):
|
|
38
|
+
...
|
|
39
|
+
ValueError: No reference string provided. Either pass `reference` to `score()` or set a default reference when creating the metric.
|
|
40
|
+
|
|
41
|
+
>>> # Empty reference string is invalid
|
|
42
|
+
>>> contains_metric = Contains(reference="")
|
|
43
|
+
>>> contains_metric.score("Hello")
|
|
44
|
+
Traceback (most recent call last):
|
|
45
|
+
...
|
|
46
|
+
ValueError: Invalid reference string provided. Reference must be a non-empty string.
|
|
28
47
|
"""
|
|
29
48
|
|
|
30
49
|
def __init__(
|
|
31
50
|
self,
|
|
32
51
|
case_sensitive: bool = False,
|
|
52
|
+
reference: Optional[str] = None,
|
|
33
53
|
name: str = "contains_metric",
|
|
34
54
|
track: bool = True,
|
|
35
55
|
project_name: Optional[str] = None,
|
|
@@ -39,28 +59,44 @@ class Contains(base_metric.BaseMetric):
|
|
|
39
59
|
track=track,
|
|
40
60
|
project_name=project_name,
|
|
41
61
|
)
|
|
42
|
-
|
|
43
62
|
self._case_sensitive = case_sensitive
|
|
63
|
+
self._default_reference = reference
|
|
44
64
|
|
|
45
65
|
def score(
|
|
46
|
-
self, output: str, reference: str, **ignored_kwargs: Any
|
|
66
|
+
self, output: str, reference: Optional[str] = None, **ignored_kwargs: Any
|
|
47
67
|
) -> score_result.ScoreResult:
|
|
48
68
|
"""
|
|
49
69
|
Calculate the score based on whether the reference string is contained in the output string.
|
|
50
70
|
|
|
51
71
|
Args:
|
|
52
72
|
output: The output string to check.
|
|
53
|
-
reference: The reference string to look for in the output.
|
|
73
|
+
reference: The reference string to look for in the output. If None, falls back to the
|
|
74
|
+
default reference provided at initialization.
|
|
54
75
|
**ignored_kwargs: Additional keyword arguments that are ignored.
|
|
55
76
|
|
|
56
77
|
Returns:
|
|
57
78
|
score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the reference
|
|
58
79
|
is found in the output, 0.0 otherwise.
|
|
59
80
|
"""
|
|
81
|
+
# Use provided reference, else fall back to default
|
|
82
|
+
ref = reference if reference is not None else self._default_reference
|
|
83
|
+
|
|
84
|
+
# Handle missing reference (None) separately
|
|
85
|
+
if ref is None:
|
|
86
|
+
raise ValueError(
|
|
87
|
+
"No reference string provided. Either pass `reference` to `score()` or set a default reference when creating the metric."
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Handle empty string separately
|
|
91
|
+
if ref == "":
|
|
92
|
+
raise ValueError(
|
|
93
|
+
"Invalid reference string provided. Reference must be a non-empty string."
|
|
94
|
+
)
|
|
95
|
+
|
|
60
96
|
value = output if self._case_sensitive else output.lower()
|
|
61
|
-
|
|
97
|
+
ref = ref if self._case_sensitive else ref.lower()
|
|
62
98
|
|
|
63
|
-
if
|
|
99
|
+
if ref in value:
|
|
64
100
|
return score_result.ScoreResult(value=1.0, name=self.name)
|
|
65
101
|
|
|
66
102
|
return score_result.ScoreResult(value=0.0, name=self.name)
|