opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- opik/__init__.py +19 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +64 -4
- opik/api_objects/dataset/rest_operations.py +11 -2
- opik/api_objects/experiment/experiment.py +57 -57
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +65 -5
- opik/api_objects/helpers.py +8 -5
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +600 -108
- opik/api_objects/opik_query_language.py +39 -5
- opik/api_objects/prompt/__init__.py +12 -2
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +189 -47
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +23 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_data.py +35 -25
- opik/api_objects/threads/threads_client.py +39 -5
- opik/api_objects/trace/trace_client.py +52 -2
- opik/api_objects/trace/trace_data.py +15 -24
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +13 -7
- opik/configurator/configure.py +17 -0
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +205 -133
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +25 -6
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +272 -75
- opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
- opik/evaluation/engine/helpers.py +31 -6
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +168 -2
- opik/evaluation/evaluator.py +533 -62
- opik/evaluation/metrics/__init__.py +103 -4
- opik/evaluation/metrics/aggregated_metric.py +35 -6
- opik/evaluation/metrics/base_metric.py +1 -1
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +14 -15
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
- opik/evaluation/metrics/conversation/types.py +4 -5
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +35 -15
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +47 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/rouge.py +26 -9
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/__init__.py +8 -0
- opik/evaluation/models/base_model.py +107 -1
- opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
- opik/evaluation/models/langchain/message_converters.py +97 -15
- opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/evaluator.py +31 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +33 -0
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +12 -9
- opik/id_helpers.py +18 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +14 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +4 -7
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
- opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +42 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +8 -51
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +80 -17
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +3 -7
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +1 -1
- opik/integrations/langchain/opik_tracer.py +474 -229
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +146 -107
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/integrations/sagemaker/auth.py +5 -1
- opik/llm_usage/google_usage.py +3 -1
- opik/llm_usage/opik_usage.py +7 -8
- opik/llm_usage/opik_usage_factory.py +4 -2
- opik/logging_messages.py +6 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +10 -0
- opik/message_processing/batching/batchers.py +59 -27
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +56 -1
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
- opik/message_processing/queue_consumer.py +9 -3
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +43 -10
- opik/opik_context.py +16 -4
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +346 -15
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/automation_rule_evaluators/client.py +34 -2
- opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
- opik/rest_api/client.py +15 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +1310 -44
- opik/rest_api/datasets/raw_client.py +2269 -358
- opik/rest_api/experiments/__init__.py +2 -2
- opik/rest_api/experiments/client.py +191 -5
- opik/rest_api/experiments/raw_client.py +301 -7
- opik/rest_api/experiments/types/__init__.py +4 -1
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/llm_provider_key/client.py +20 -0
- opik/rest_api/llm_provider_key/raw_client.py +20 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/optimizations/client.py +145 -9
- opik/rest_api/optimizations/raw_client.py +237 -13
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +227 -6
- opik/rest_api/prompts/raw_client.py +331 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +238 -76
- opik/rest_api/spans/raw_client.py +307 -95
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +572 -161
- opik/rest_api/traces/raw_client.py +736 -229
- opik/rest_api/types/__init__.py +352 -17
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/annotation_queue_item_ids.py +19 -0
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/annotation_queue_reviewer.py +20 -0
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +62 -2
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +4 -0
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +5 -0
- opik/rest_api/types/dataset_item_page_public.py +5 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +4 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +7 -2
- opik/rest_api/types/experiment_group_response.py +2 -0
- opik/rest_api/types/experiment_public.py +7 -2
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/feedback.py +25 -1
- opik/rest_api/types/feedback_create.py +20 -1
- opik/rest_api/types/feedback_object_public.py +27 -1
- opik/rest_api/types/feedback_public.py +25 -1
- opik/rest_api/types/feedback_score_batch_item.py +2 -1
- opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +20 -1
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +1 -0
- opik/rest_api/types/guardrail_write.py +1 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/optimization.py +4 -2
- opik/rest_api/types/optimization_public.py +4 -2
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +4 -2
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +3 -0
- opik/rest_api/types/prompt_version_detail.py +3 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +1 -0
- opik/rest_api/types/prompt_version_link_public.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +3 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +9 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +9 -0
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +18 -0
- opik/rest_api/types/span.py +1 -2
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_public.py +1 -2
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +1 -2
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +11 -2
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_public.py +11 -2
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +1 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_write.py +1 -2
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +5 -0
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/client.py +14 -2
- opik/rest_api/workspaces/raw_client.py +10 -0
- opik/s3_httpx_client.py +14 -1
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- opik/api_objects/prompt/prompt.py +0 -112
- opik/cli.py +0 -193
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
- opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
- opik-1.8.39.dist-info/METADATA +0 -339
- opik-1.8.39.dist-info/RECORD +0 -790
- /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
Selected hunks from the diff body follow, reconstructed from the viewer output.

New file (181 added lines, matching `opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py` in the file list above):

```diff
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class DemographicBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Score demographic stereotyping or bias in a response.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import DemographicBiasJudge
+        >>> judge = DemographicBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="People from X group are always late.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.95
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_demographic",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="demographic_bias_judge",
+        )
+
+
+class PoliticalBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Detect partisan or ideological bias in a response.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import PoliticalBiasJudge
+        >>> judge = PoliticalBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="Vote for candidate X because Y is corrupt")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.87
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_political",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="political_bias_judge",
+        )
+
+
+class GenderBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Detect gender stereotyping or exclusion in generated text.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import GenderBiasJudge
+        >>> judge = GenderBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="Women are naturally worse at math.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.93
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_gender",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="gender_bias_judge",
+        )
+
+
+class ReligiousBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Evaluate responses for religious bias or disrespectful language.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import ReligiousBiasJudge
+        >>> judge = ReligiousBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="Believers of X are all foolish.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.9
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_religion",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="religious_bias_judge",
+        )
+
+
+class RegionalBiasJudge(g_eval_metric.GEvalPreset):
+    """
+    Assess geographic or cultural bias in responses.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the model.
+
+    Example:
+        >>> from opik.evaluation.metrics import RegionalBiasJudge
+        >>> judge = RegionalBiasJudge(model="gpt-4")
+        >>> result = judge.score(output="People from region Z are lazy.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.88
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="bias_regional",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="regional_bias_judge",
+        )
```
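All five bias judges share one constructor signature and differ only in their `preset` and `name`, so they can be swapped or batched freely. A minimal usage sketch based on the doctests above; "gpt-4" is an illustrative model name and a configured LLM provider key is assumed:

```python
# Sketch: run every bias preset over one response and print the scores.
# Assumes the public exports shown in the doctests above.
from opik.evaluation.metrics import (
    DemographicBiasJudge,
    GenderBiasJudge,
    PoliticalBiasJudge,
    RegionalBiasJudge,
    ReligiousBiasJudge,
)

judges = [
    DemographicBiasJudge(model="gpt-4", track=False),
    PoliticalBiasJudge(model="gpt-4", track=False),
    GenderBiasJudge(model="gpt-4", track=False),
    ReligiousBiasJudge(model="gpt-4", track=False),
    RegionalBiasJudge(model="gpt-4", track=False),
]

output = "People from X group are always late."
for judge in judges:
    result = judge.score(output=output)  # each judge returns a ScoreResult
    print(f"{result.name}: {result.value:.2f}")
```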
New file (41 added lines, matching `opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py` in the file list above):

```diff
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class ComplianceRiskJudge(g_eval_metric.GEvalPreset):
+    """
+    Evaluate responses for non-compliant or misleading claims in regulated sectors.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import ComplianceRiskJudge
+        >>> judge = ComplianceRiskJudge(model="gpt-4")
+        >>> result = judge.score(output="This pill cures diabetes in a week.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.97
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="compliance_regulated_truthfulness",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="compliance_risk_judge",
+        )
```

New file (41 added lines, matching `opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py` in the file list above):

```diff
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class PromptUncertaintyJudge(g_eval_metric.GEvalPreset):
+    """
+    Rate how ambiguous or underspecified a prompt feels to the model.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import PromptUncertaintyJudge
+        >>> judge = PromptUncertaintyJudge(model="gpt-4")
+        >>> result = judge.score(output="Do the right thing in the best way possible.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.8
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="prompt_uncertainty",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="prompt_uncertainty_judge",
+        )
```
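Both single-class files follow the same thin-wrapper pattern: pin a `preset` key and a display `name`, and forward everything else to `GEvalPreset`. A hedged sketch of a project-local judge in the same style; the `tone_risk` preset string here is hypothetical and is not shipped in this diff, so it would have to exist in the G-Eval presets registry for this to work:

```python
from __future__ import annotations

from typing import Optional, Union

from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
from opik.evaluation.models import base_model


class ToneRiskJudge(g_eval_metric.GEvalPreset):
    """Hypothetical judge; 'tone_risk' is NOT a preset added by this release."""

    def __init__(
        self,
        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
        track: bool = True,
        project_name: Optional[str] = None,
        temperature: float = 0.0,
    ) -> None:
        super().__init__(
            preset="tone_risk",  # assumption: would need to be registered
            model=model,
            track=track,
            project_name=project_name,
            temperature=temperature,
            name="tone_risk_judge",
        )
```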
New file (146 added lines, matching `opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py` in the file list above):

```diff
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from typing import Optional, Union
+
+from opik.evaluation.metrics.llm_judges.g_eval import metric as g_eval_metric
+from opik.evaluation.models import base_model
+
+
+class SummarizationConsistencyJudge(g_eval_metric.GEvalPreset):
+    """
+    Score how faithful a summary is to its source content.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import SummarizationConsistencyJudge
+        >>> judge = SummarizationConsistencyJudge(model="gpt-4")
+        >>> result = judge.score(output="Summary omits key fact.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.4
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="summarization_consistency",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="summarization_consistency_judge",
+        )
+
+
+class SummarizationCoherenceJudge(g_eval_metric.GEvalPreset):
+    """
+    Evaluate the coherence and structure of generated summaries.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import SummarizationCoherenceJudge
+        >>> judge = SummarizationCoherenceJudge(model="gpt-4")
+        >>> result = judge.score(output="Summary jumps between unrelated topics.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.5
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="summarization_coherence",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="summarization_coherence_judge",
+        )
+
+
+class DialogueHelpfulnessJudge(g_eval_metric.GEvalPreset):
+    """
+    Judge how helpful an assistant reply is within a dialogue.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import DialogueHelpfulnessJudge
+        >>> judge = DialogueHelpfulnessJudge(model="gpt-4")
+        >>> result = judge.score(output="Assistant politely refuses without help.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.3
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="dialogue_helpfulness",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="dialogue_helpfulness_judge",
+        )
+
+
+class QARelevanceJudge(g_eval_metric.GEvalPreset):
+    """
+    Check whether an answer directly addresses the user question.
+
+    Args:
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track judge outputs. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the underlying model.
+
+    Example:
+        >>> from opik.evaluation.metrics import QARelevanceJudge
+        >>> judge = QARelevanceJudge(model="gpt-4")
+        >>> result = judge.score(output="Answer rambles without addressing the ask.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.2
+    """
+
+    def __init__(
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+    ) -> None:
+        super().__init__(
+            preset="qa_relevance",
+            model=model,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+            name="qa_relevance_judge",
+        )
```
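Each suite judge is an ordinary metric with a `score(output=...)` interface, so summarization and QA checks can be grouped and run side by side. A short sketch under the same assumptions as the earlier example:

```python
from opik.evaluation.metrics import (
    DialogueHelpfulnessJudge,
    QARelevanceJudge,
    SummarizationCoherenceJudge,
    SummarizationConsistencyJudge,
)

summary = "The report says revenue doubled; this summary claims it fell."
suite = [
    SummarizationConsistencyJudge(model="gpt-4", track=False),
    SummarizationCoherenceJudge(model="gpt-4", track=False),
]
for judge in suite:
    print(judge.name, judge.score(output=summary).value)

# The dialogue/QA judges use the same call shape:
answer = "Answer rambles without addressing the ask."
print(QARelevanceJudge(model="gpt-4", track=False).score(output=answer).value)
print(DialogueHelpfulnessJudge(model="gpt-4", track=False).score(output=answer).value)
```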
Changes to `opik/evaluation/metrics/llm_judges/hallucination/metric.py`. The viewer truncated the three removed lines; they are reconstructed here from the surrounding code and the parallel `Moderation` hunk below:

```diff
@@ -27,6 +27,8 @@ class Hallucination(base_metric.BaseMetric):
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when
             there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
 
     Example:
         >>> from opik.evaluation.metrics import Hallucination
@@ -49,18 +51,29 @@ class Hallucination(base_metric.BaseMetric):
         few_shot_examples: Optional[List[template.FewShotExampleHallucination]] = None,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(name=name, track=track, project_name=project_name)
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self.few_shot_examples = few_shot_examples
 
     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)
 
     def score(
         self,
```
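The net effect on `Hallucination` is two new optional constructor arguments; each is forwarded to `models_factory.get` only when set, so existing call sites keep the old factory defaults. A sketch pinning both for more repeatable judging; the `score(input=..., output=...)` call shape matches the released metric, and determinism still depends on the backing provider:

```python
from opik.evaluation.metrics import Hallucination

# Fixed seed and zero temperature for more reproducible judge outputs.
metric = Hallucination(seed=42, temperature=0.0, track=False)
result = metric.score(
    input="What is the capital of France?",
    output="The capital of France is Lyon.",
)
print(result.value, result.reason)
```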
New file (76 added lines, matching `opik/evaluation/metrics/llm_judges/llm_juries/metric.py` in the file list above):

```diff
@@ -0,0 +1,76 @@
+"""LLM Juries: aggregate heterogeneous judges into a consensus score."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable, List, Optional
+
+from opik.evaluation.metrics.base_metric import BaseMetric
+from opik.evaluation.metrics.score_result import ScoreResult
+import opik.exceptions as exceptions
+
+
+class LLMJuriesJudge(BaseMetric):
+    """
+    Aggregate multiple judge metrics into a consensus score.
+
+    Args:
+        judges: Iterable of judge metrics to execute for consensus.
+        name: Display name for the aggregated result. Defaults to
+            ``"llm_juries_judge"``.
+        track: Whether to automatically track results. Defaults to ``True``.
+        project_name: Optional tracking project name. Defaults to ``None``.
+
+    Example:
+        >>> from opik.evaluation.metrics import LLMJuriesJudge, ComplianceRiskJudge
+        >>> juries = LLMJuriesJudge(judges=[ComplianceRiskJudge(model="gpt-4")])
+        >>> result = juries.score(output="Financial guarantees provided.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.6
+    """
+
+    def __init__(
+        self,
+        judges: Iterable[BaseMetric],
+        name: str = "llm_juries_judge",
+        track: bool = True,
+        project_name: Optional[str] = None,
+    ) -> None:
+        super().__init__(name=name, track=track, project_name=project_name)
+        self._judges = list(judges)
+        if not self._judges:
+            raise ValueError("LLMJuriesJudge requires at least one judge metric.")
+
+    def score(self, *args: Any, **kwargs: Any) -> ScoreResult:
+        precomputed: Optional[Dict[BaseMetric, ScoreResult]] = kwargs.pop(
+            "precomputed", None
+        )
+        scores: List[ScoreResult] = []
+        for judge in self._judges:
+            if precomputed is not None and judge in precomputed:
+                raw_result: Any = precomputed[judge]
+            else:
+                raw_result = judge.score(*args, **kwargs)
+            judge_results = raw_result if isinstance(raw_result, list) else [raw_result]
+
+            for result in judge_results:
+                if not isinstance(result, ScoreResult):
+                    raise exceptions.MetricComputationError(
+                        f"Judge {judge.name} returned unexpected result type {type(result)!r}"
+                    )
+                if result.value < 0 or result.value > 1:
+                    raise exceptions.MetricComputationError(
+                        f"Judge {judge.name} returned out-of-range score {result.value}"
+                    )
+                scores.append(result)
+
+        if not scores:
+            raise exceptions.MetricComputationError("No judge scores produced")
+
+        average = sum(res.value for res in scores) / len(scores)
+        metadata = {
+            "judge_scores": {res.name: res.value for res in scores},
+        }
+        reason = f"Averaged {len(scores)} judge scores"
+        return ScoreResult(
+            value=average, name=self.name, reason=reason, metadata=metadata
+        )
```
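`LLMJuriesJudge` averages each judge's `ScoreResult.value`, rejects anything outside `[0, 1]`, and exposes the per-judge values under `metadata["judge_scores"]`. A composition sketch using presets added earlier in this diff, under the same model-name and provider-key assumptions:

```python
from opik.evaluation.metrics import (
    ComplianceRiskJudge,
    LLMJuriesJudge,
    QARelevanceJudge,
)

jury = LLMJuriesJudge(
    judges=[
        ComplianceRiskJudge(model="gpt-4", track=False),
        QARelevanceJudge(model="gpt-4", track=False),
    ],
    track=False,
)
result = jury.score(output="This pill cures diabetes in a week.")
print(result.value)                     # mean of the per-judge values
print(result.metadata["judge_scores"])  # per-judge breakdown keyed by judge name
```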
Changes to `opik/evaluation/metrics/llm_judges/moderation/metric.py`. Two removed lines were truncated by the viewer and are reconstructed from context:

```diff
@@ -26,6 +26,8 @@ class Moderation(base_metric.BaseMetric):
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when
             there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
 
     Example:
         >>> from opik.evaluation.metrics import Moderation
@@ -42,23 +44,33 @@
         few_shot_examples: Optional[List[template.FewShotExampleModeration]] = None,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(
             name=name,
             track=track,
             project_name=project_name,
         )
-
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self.few_shot_examples = [] if few_shot_examples is None else few_shot_examples
 
     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)
 
     def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
         """
```
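`Moderation` gains the identical `seed`/`temperature` plumbing: the kwargs are attached only when provided, leaving the previous factory behavior untouched when both are omitted. A quick stability sketch with a fixed seed; as with `Hallucination`, strict determinism is ultimately up to the backing model:

```python
from opik.evaluation.metrics import Moderation

metric = Moderation(seed=7, temperature=0.0, track=False)
text = "Some borderline text to moderate."
first = metric.score(output=text)
second = metric.score(output=text)
# With a fixed seed most providers return matching values across runs.
print(first.value, second.value)
```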