opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +19 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +64 -4
- opik/api_objects/dataset/rest_operations.py +11 -2
- opik/api_objects/experiment/experiment.py +57 -57
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +65 -5
- opik/api_objects/helpers.py +8 -5
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +600 -108
- opik/api_objects/opik_query_language.py +39 -5
- opik/api_objects/prompt/__init__.py +12 -2
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +189 -47
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +23 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_data.py +35 -25
- opik/api_objects/threads/threads_client.py +39 -5
- opik/api_objects/trace/trace_client.py +52 -2
- opik/api_objects/trace/trace_data.py +15 -24
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +13 -7
- opik/configurator/configure.py +17 -0
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +205 -133
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +25 -6
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +272 -75
- opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
- opik/evaluation/engine/helpers.py +31 -6
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +168 -2
- opik/evaluation/evaluator.py +533 -62
- opik/evaluation/metrics/__init__.py +103 -4
- opik/evaluation/metrics/aggregated_metric.py +35 -6
- opik/evaluation/metrics/base_metric.py +1 -1
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +14 -15
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
- opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
- opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
- opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
- opik/evaluation/metrics/conversation/types.py +4 -5
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +35 -15
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +47 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/rouge.py +26 -9
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/__init__.py +8 -0
- opik/evaluation/models/base_model.py +107 -1
- opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
- opik/evaluation/models/langchain/message_converters.py +97 -15
- opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/evaluator.py +31 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +33 -0
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +12 -9
- opik/id_helpers.py +18 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +14 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +4 -7
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
- opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +42 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +8 -51
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +80 -17
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +3 -7
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +1 -1
- opik/integrations/langchain/opik_tracer.py +474 -229
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +146 -107
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/integrations/sagemaker/auth.py +5 -1
- opik/llm_usage/google_usage.py +3 -1
- opik/llm_usage/opik_usage.py +7 -8
- opik/llm_usage/opik_usage_factory.py +4 -2
- opik/logging_messages.py +6 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +10 -0
- opik/message_processing/batching/batchers.py +59 -27
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +56 -1
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
- opik/message_processing/queue_consumer.py +9 -3
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +43 -10
- opik/opik_context.py +16 -4
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +346 -15
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/automation_rule_evaluators/client.py +34 -2
- opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
- opik/rest_api/client.py +15 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +1310 -44
- opik/rest_api/datasets/raw_client.py +2269 -358
- opik/rest_api/experiments/__init__.py +2 -2
- opik/rest_api/experiments/client.py +191 -5
- opik/rest_api/experiments/raw_client.py +301 -7
- opik/rest_api/experiments/types/__init__.py +4 -1
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/llm_provider_key/client.py +20 -0
- opik/rest_api/llm_provider_key/raw_client.py +20 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/optimizations/client.py +145 -9
- opik/rest_api/optimizations/raw_client.py +237 -13
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +227 -6
- opik/rest_api/prompts/raw_client.py +331 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +238 -76
- opik/rest_api/spans/raw_client.py +307 -95
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +572 -161
- opik/rest_api/traces/raw_client.py +736 -229
- opik/rest_api/types/__init__.py +352 -17
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/annotation_queue_item_ids.py +19 -0
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/annotation_queue_reviewer.py +20 -0
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +62 -2
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +4 -0
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +5 -0
- opik/rest_api/types/dataset_item_page_public.py +5 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +4 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +7 -2
- opik/rest_api/types/experiment_group_response.py +2 -0
- opik/rest_api/types/experiment_public.py +7 -2
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/feedback.py +25 -1
- opik/rest_api/types/feedback_create.py +20 -1
- opik/rest_api/types/feedback_object_public.py +27 -1
- opik/rest_api/types/feedback_public.py +25 -1
- opik/rest_api/types/feedback_score_batch_item.py +2 -1
- opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +20 -1
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +1 -0
- opik/rest_api/types/guardrail_write.py +1 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/optimization.py +4 -2
- opik/rest_api/types/optimization_public.py +4 -2
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +4 -2
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +3 -0
- opik/rest_api/types/prompt_version_detail.py +3 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +1 -0
- opik/rest_api/types/prompt_version_link_public.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +3 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +9 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +9 -0
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +18 -0
- opik/rest_api/types/span.py +1 -2
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_public.py +1 -2
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +1 -2
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +11 -2
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_public.py +11 -2
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +1 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_write.py +1 -2
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +5 -0
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/client.py +14 -2
- opik/rest_api/workspaces/raw_client.py +10 -0
- opik/s3_httpx_client.py +14 -1
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- opik/api_objects/prompt/prompt.py +0 -112
- opik/cli.py +0 -193
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
- opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
- opik-1.8.39.dist-info/METADATA +0 -339
- opik-1.8.39.dist-info/RECORD +0 -790
- /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
- /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
- /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
- {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
```diff
--- a/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
+++ b/opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
@@ -32,6 +32,8 @@ class AnswerRelevance(base_metric.BaseMetric):
         require_context: if set to False, execution in no-context mode is allowed. Default is True.
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.

     Example:
         >>> from opik.evaluation.metrics import AnswerRelevance
@@ -56,6 +58,8 @@ class AnswerRelevance(base_metric.BaseMetric):
         require_context: bool = True,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(
             name=name,
@@ -63,19 +67,28 @@ class AnswerRelevance(base_metric.BaseMetric):
             project_name=project_name,
         )
         self._require_context = require_context
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self._init_few_shot_examples(
             few_shot_examples_with_context=few_shot_examples,
             few_shot_examples_no_context=few_shot_examples_no_context,
         )

     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)

     def _init_few_shot_examples(
         self,
@@ -124,7 +137,8 @@ class AnswerRelevance(base_metric.BaseMetric):
         )

         model_output = self._model.generate_string(
-            input=llm_query,
+            input=llm_query,
+            response_format=AnswerRelevanceResponseFormat,
         )
         return parser.parse_model_output(content=model_output, name=self.name)

@@ -154,7 +168,8 @@ class AnswerRelevance(base_metric.BaseMetric):
             input=input, output=output, context=context
         )
         model_output = await self._model.agenerate_string(
-            input=llm_query,
+            input=llm_query,
+            response_format=AnswerRelevanceResponseFormat,
         )

         return parser.parse_model_output(content=model_output, name=self.name)
```
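A minimal usage sketch for the new constructor arguments (the model name below is illustrative, not mandated by the diff; the same `seed`/`temperature` arguments are added to ContextPrecision and ContextRecall in the hunks that follow):

```python
from opik.evaluation.metrics import AnswerRelevance

metric = AnswerRelevance(
    model="gpt-4o-mini",  # illustrative model name
    seed=42,              # forwarded to the judge model for reproducible outputs
    temperature=0.0,      # overrides the model's default sampling temperature
)

result = metric.score(
    input="What is the capital of France?",
    output="Paris is the capital of France.",
    context=["France is a country in Europe. Its capital is Paris."],
)
print(result.value, result.reason)
```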
```diff
--- a/opik/evaluation/metrics/llm_judges/context_precision/metric.py
+++ b/opik/evaluation/metrics/llm_judges/context_precision/metric.py
@@ -28,6 +28,8 @@ class ContextPrecision(base_metric.BaseMetric):
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when
             there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.

     Example:
         >>> from opik.evaluation.metrics import ContextPrecision
@@ -48,23 +50,33 @@ class ContextPrecision(base_metric.BaseMetric):
         ] = None,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(
             name=name,
             track=track,
             project_name=project_name,
         )
-
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self.few_shot_examples = few_shot_examples or template.FEW_SHOT_EXAMPLES

     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)

     def score(
         self,
@@ -96,7 +108,8 @@ class ContextPrecision(base_metric.BaseMetric):
             few_shot_examples=self.few_shot_examples,
         )
         model_output = self._model.generate_string(
-            input=llm_query,
+            input=llm_query,
+            response_format=ContextPrecisionResponseFormat,
         )

         return parser.parse_model_output(content=model_output, name=self.name)
@@ -133,7 +146,8 @@ class ContextPrecision(base_metric.BaseMetric):
             few_shot_examples=self.few_shot_examples,
         )
         model_output = await self._model.agenerate_string(
-            input=llm_query,
+            input=llm_query,
+            response_format=ContextPrecisionResponseFormat,
         )

         return parser.parse_model_output(content=model_output, name=self.name)
```
```diff
--- a/opik/evaluation/metrics/llm_judges/context_recall/metric.py
+++ b/opik/evaluation/metrics/llm_judges/context_recall/metric.py
@@ -28,6 +28,8 @@ class ContextRecall(base_metric.BaseMetric):
         track: Whether to track the metric. Defaults to True.
         project_name: Optional project name to track the metric in for the cases when
             there are no parent span/trace to inherit project name from.
+        seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
+        temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.

     Example:
         >>> from opik.evaluation.metrics import ContextRecall
@@ -46,23 +48,33 @@ class ContextRecall(base_metric.BaseMetric):
         few_shot_examples: Optional[List[template.FewShotExampleContextRecall]] = None,
         track: bool = True,
         project_name: Optional[str] = None,
+        seed: Optional[int] = None,
+        temperature: Optional[float] = None,
     ):
         super().__init__(
             name=name,
             track=track,
             project_name=project_name,
         )
-
-        self._init_model(model)
+        self._seed = seed
+        self._init_model(model, temperature=temperature)
         self.few_shot_examples = few_shot_examples or template.FEW_SHOT_EXAMPLES

     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self,
+        model: Optional[Union[str, base_model.OpikBaseModel]],
+        temperature: Optional[float],
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {}
+            if temperature is not None:
+                model_kwargs["temperature"] = temperature
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)

     def score(
         self,
@@ -94,7 +106,8 @@ class ContextRecall(base_metric.BaseMetric):
             few_shot_examples=self.few_shot_examples,
         )
         model_output = self._model.generate_string(
-            input=llm_query,
+            input=llm_query,
+            response_format=ContextRecallResponseFormat,
         )

         return parser.parse_model_output(content=model_output, name=self.name)
@@ -131,7 +144,8 @@ class ContextRecall(base_metric.BaseMetric):
             few_shot_examples=self.few_shot_examples,
        )
         model_output = await self._model.agenerate_string(
-            input=llm_query,
+            input=llm_query,
+            response_format=ContextRecallResponseFormat,
         )

         return parser.parse_model_output(content=model_output, name=self.name)
```
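All three judge metrics above share the same initialization pattern: optional generation knobs are forwarded to `models_factory.get` only when explicitly set, so unset values never override the model's own defaults. A standalone restatement of that logic (`build_model_kwargs` is a hypothetical name used purely for illustration):

```python
from typing import Any, Dict, Optional

def build_model_kwargs(seed: Optional[int], temperature: Optional[float]) -> Dict[str, Any]:
    # Mirrors the _init_model bodies above: unset values are omitted entirely
    # rather than passed as None, leaving the model's defaults in effect.
    kwargs: Dict[str, Any] = {}
    if temperature is not None:
        kwargs["temperature"] = temperature
    if seed is not None:
        kwargs["seed"] = seed
    return kwargs

assert build_model_kwargs(seed=None, temperature=None) == {}
assert build_model_kwargs(seed=42, temperature=0.0) == {"temperature": 0.0, "seed": 42}
```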
```diff
--- a/opik/evaluation/metrics/llm_judges/g_eval/metric.py
+++ b/opik/evaluation/metrics/llm_judges/g_eval/metric.py
@@ -1,10 +1,13 @@
-from typing import Any, Optional, Union
+from collections import OrderedDict
+from threading import Lock
+from typing import Any, Dict, Optional, Tuple, Union
 import pydantic

 from opik.evaluation.metrics import base_metric, score_result
 from opik.evaluation.models import base_model, models_factory
 from opik.evaluation import models
 from . import template, parser
+from .presets import GEVAL_PRESETS


 class GEvalScoreFormat(pydantic.BaseModel):
@@ -12,7 +15,56 @@ class GEvalScoreFormat(pydantic.BaseModel):
     reason: str


+def _freeze_for_cache(value: Any) -> Any:
+    """Convert nested structures into hashable representations for caching."""
+
+    if isinstance(value, dict):
+        return tuple(
+            sorted((key, _freeze_for_cache(val)) for key, val in value.items())
+        )
+    if isinstance(value, (list, tuple)):
+        return tuple(_freeze_for_cache(item) for item in value)
+    if isinstance(value, set):
+        return tuple(sorted(_freeze_for_cache(item) for item in value))
+    return value
+
+
 class GEval(base_metric.BaseMetric):
+    """
+    Generalised evaluation metric that prompts an LLM to grade another LLM output.
+
+    GEval builds a reusable chain-of-thought using the provided
+    ``task_introduction`` and ``evaluation_criteria`` prompts, then requests a
+    final score and rationale for each evaluated output.
+
+    Args:
+        task_introduction: Instruction describing the evaluator's persona/purpose.
+        evaluation_criteria: Detailed rubric presented to the evaluator.
+        model: Optional model identifier or ``OpikBaseModel`` for the judge.
+        name: Display name for the metric result. Defaults to ``"g_eval_metric"``.
+        track: Whether to automatically track metric results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the judge model.
+        seed: Optional seed for reproducible generation (if supported by the model).
+
+    Example:
+        >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEval
+        >>> metric = GEval(
+        ...     task_introduction="You evaluate politeness of responses.",
+        ...     evaluation_criteria="Score from 1 (rude) to 5 (very polite).",
+        ...     model="gpt-4",
+        ... )
+        >>> result = metric.score(output="Thanks so much for your help!")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.9
+    """
+
+    _CHAIN_OF_THOUGHT_CACHE: "OrderedDict[Tuple[str, str, str, Any], str]" = (
+        OrderedDict()
+    )
+    _CHAIN_OF_THOUGHT_LOCK: Lock = Lock()
+    _MAX_CHAIN_OF_THOUGHT_CACHE = 128
+
     def __init__(
         self,
         task_introduction: str,
@@ -21,65 +73,61 @@ class GEval(base_metric.BaseMetric):
         name: str = "g_eval_metric",
         track: bool = True,
         project_name: Optional[str] = None,
+        temperature: float = 0.0,
+        seed: Optional[int] = None,
     ):
-        """
-        A metric that evaluates an LLM output based on chain-of-thought built with the evaluation criteria provided
-        by the user.
-
-        For more details see the original paper: https://arxiv.org/pdf/2303.16634
-
-        Args:
-            task_introduction: An instruction for LLM used to generate an evaluation chain-of-thought and in evaluation call itself.
-                `opik.evaluation.models.LiteLLMChatModel` is used by default.
-            evaluation_criteria: The main task for G-Eval metric written in human language.
-            model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
-            name: The name of the metric.
-            track: Whether to track the metric. Defaults to True.
-            project_name: Optional project name to track the metric in for the cases when
-                there are no parent span/trace to inherit project name from.
-        """
         super().__init__(
             name=name,
             track=track,
             project_name=project_name,
         )
-        self._init_model(model)
-
         self.task_introduction = task_introduction
         self.evaluation_criteria = evaluation_criteria
+        self._seed = seed
+
         self._log_probs_supported = False

-        self._chain_of_thought_response: Optional[str] = None
+        self._init_model(model, temperature=temperature)

     def llm_chain_of_thought(self) -> str:
-        if self._chain_of_thought_response is None:
-            prompt = template.G_EVAL_COT_TEMPLATE.format(
-                task_introduction=self.task_introduction,
-                evaluation_criteria=self.evaluation_criteria,
-            )
-            self._chain_of_thought_response = self._model.generate_string(input=prompt)
+        cache_key = self._chain_of_thought_cache_key()
+        cached = self._get_cached_chain_of_thought(cache_key)
+        if cached is not None:
+            return cached

-        return self._chain_of_thought_response
+        prompt = template.G_EVAL_COT_TEMPLATE.format(
+            task_introduction=self.task_introduction,
+            evaluation_criteria=self.evaluation_criteria,
+        )
+        generated = self._model.generate_string(input=prompt)
+        self._store_chain_of_thought(cache_key, generated)
+        return generated

     async def allm_chain_of_thought(self) -> str:
-        if self._chain_of_thought_response is None:
-            prompt = template.G_EVAL_COT_TEMPLATE.format(
-                task_introduction=self.task_introduction,
-                evaluation_criteria=self.evaluation_criteria,
-            )
-            self._chain_of_thought_response = await self._model.agenerate_string(
-                input=prompt
-            )
+        cache_key = self._chain_of_thought_cache_key()
+        cached = self._get_cached_chain_of_thought(cache_key)
+        if cached is not None:
+            return cached

-        return self._chain_of_thought_response
+        prompt = template.G_EVAL_COT_TEMPLATE.format(
+            task_introduction=self.task_introduction,
+            evaluation_criteria=self.evaluation_criteria,
+        )
+        generated = await self._model.agenerate_string(input=prompt)
+        self._store_chain_of_thought(cache_key, generated)
+        return generated

     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            model_kwargs = {"temperature": temperature}
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+
+            self._model = models_factory.get(model_name=model, **model_kwargs)

         if (
             hasattr(self._model, "supported_params")
@@ -88,6 +136,55 @@ class GEval(base_metric.BaseMetric):
         ):
             self._log_probs_supported = True

+    @classmethod
+    def _get_cached_chain_of_thought(
+        cls, cache_key: Tuple[str, str, str, Any]
+    ) -> Optional[str]:
+        with cls._CHAIN_OF_THOUGHT_LOCK:
+            value = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
+            if value is not None:
+                cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
+            return value
+
+    @classmethod
+    def _store_chain_of_thought(
+        cls, cache_key: Tuple[str, str, str, Any], value: str
+    ) -> None:
+        with cls._CHAIN_OF_THOUGHT_LOCK:
+            existing = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
+            if existing is not None:
+                cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
+                return
+            cls._CHAIN_OF_THOUGHT_CACHE[cache_key] = value
+            cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
+            while len(cls._CHAIN_OF_THOUGHT_CACHE) > cls._MAX_CHAIN_OF_THOUGHT_CACHE:
+                cls._CHAIN_OF_THOUGHT_CACHE.popitem(last=False)
+
+    def _chain_of_thought_cache_key(self) -> Tuple[str, str, str, Any]:
+        model_name = getattr(self._model, "model_name", "unknown")
+        return (
+            self.task_introduction,
+            self.evaluation_criteria,
+            model_name,
+            self._model_cache_fingerprint(),
+        )
+
+    def _model_cache_fingerprint(self) -> Any:
+        fingerprint_candidate = getattr(self._model, "cache_fingerprint", None)
+        if callable(fingerprint_candidate):
+            try:
+                fingerprint = fingerprint_candidate()
+            except Exception:
+                fingerprint = None
+            else:
+                return _freeze_for_cache(fingerprint)
+
+        completion_kwargs = getattr(self._model, "_completion_kwargs", None)
+        if isinstance(completion_kwargs, dict):
+            return _freeze_for_cache(completion_kwargs)
+
+        return id(self._model)
+
     def score(
         self,
         output: str,
```
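The chain-of-thought cache key must be hashable, so `_freeze_for_cache` canonicalizes arbitrary nested kwargs before they enter the key. A self-contained sketch of that behavior (the helper body is copied from the hunk above purely for illustration):

```python
from typing import Any

def _freeze_for_cache(value: Any) -> Any:
    # Copied from the diff above: dicts become sorted (key, value) tuples,
    # lists/tuples become tuples, and sets become sorted tuples.
    if isinstance(value, dict):
        return tuple(sorted((key, _freeze_for_cache(val)) for key, val in value.items()))
    if isinstance(value, (list, tuple)):
        return tuple(_freeze_for_cache(item) for item in value)
    if isinstance(value, set):
        return tuple(sorted(_freeze_for_cache(item) for item in value))
    return value

frozen = _freeze_for_cache({"temperature": 0.0, "stop": ["\n"], "tags": {"b", "a"}})
print(frozen)  # (('stop', ('\n',)), ('tags', ('a', 'b')), ('temperature', 0.0))
hash(frozen)   # hashable, so it can participate in the OrderedDict cache key
```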
```diff
--- a/opik/evaluation/metrics/llm_judges/g_eval/metric.py
+++ b/opik/evaluation/metrics/llm_judges/g_eval/metric.py
@@ -119,17 +216,23 @@ class GEval(base_metric.BaseMetric):
         ]

         if isinstance(self._model, models.LiteLLMChatModel):
-            model_output = self._model.generate_provider_response(
+            provider_kwargs: Dict[str, Any] = {
+                "response_format": GEvalScoreFormat,
+            }
+            if self._log_probs_supported:
+                provider_kwargs["logprobs"] = True
+                provider_kwargs["top_logprobs"] = 20
+
+            with base_model.get_provider_response(
+                model_provider=self._model,
                 messages=request,
-                response_format=GEvalScoreFormat,
-                logprobs=self._log_probs_supported,
-                top_logprobs=20,
-            )
-            return parser.parse_litellm_model_output(
-                content=model_output,
-                name=self.name,
-                log_probs_supported=self._log_probs_supported,
-            )
+                **provider_kwargs,
+            ) as model_output:
+                return parser.parse_litellm_model_output(
+                    content=model_output,
+                    name=self.name,
+                    log_probs_supported=self._log_probs_supported,
+                )

         model_output_string = self._model.generate_string(
             input=llm_query, response_format=GEvalScoreFormat
@@ -138,18 +241,13 @@ class GEval(base_metric.BaseMetric):
         return parser.parse_model_output_string(model_output_string, self.name)

     async def ascore(
-        self, output: str, **ignored_kwargs: Any
+        self,
+        output: str,
+        **ignored_kwargs: Any,
     ) -> score_result.ScoreResult:
         """
-        Asynchronously calculates the G-Eval score for the provided LLM output.
-
-        Args:
-            output: The LLM's output to evaluate.
-            **ignored_kwargs: Additional keyword arguments that are ignored.
-
-        Returns:
-            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
-                (between 0.0 and 1.0) and a reason for the score.
+        Async variant of :meth:`score`, evaluating the provided LLM output using
+        the configured judge model and returning a ``ScoreResult``.
         """
         llm_query = template.G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
@@ -166,20 +264,73 @@ class GEval(base_metric.BaseMetric):
         ]

         if isinstance(self._model, models.LiteLLMChatModel):
-            model_output = await self._model.agenerate_provider_response(
+            provider_kwargs: Dict[str, Any] = {
+                "response_format": GEvalScoreFormat,
+            }
+            if self._log_probs_supported:
+                provider_kwargs["logprobs"] = True
+                provider_kwargs["top_logprobs"] = 20
+
+            async with base_model.aget_provider_response(
+                model_provider=self._model,
                 messages=request,
-                response_format=GEvalScoreFormat,
-                logprobs=self._log_probs_supported,
-                top_logprobs=20,
-            )
-            return parser.parse_litellm_model_output(
-                content=model_output,
-                name=self.name,
-                log_probs_supported=self._log_probs_supported,
-            )
+                **provider_kwargs,
+            ) as model_output:
+                return parser.parse_litellm_model_output(
+                    content=model_output,
+                    name=self.name,
+                    log_probs_supported=self._log_probs_supported,
+                )

         model_output_string = await self._model.agenerate_string(
             input=llm_query, response_format=GEvalScoreFormat
         )

         return parser.parse_model_output_string(model_output_string, self.name)
+
+
+class GEvalPreset(GEval):
+    """
+    Pre-configured GEval variant with author-provided prompt templates.
+
+    Args:
+        preset: Key name from ``GEVAL_PRESETS`` describing the evaluation rubric.
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track metric results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the judge model.
+        name: Optional override for the metric name (defaults to preset name).
+
+    Example:
+        >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEvalPreset
+        >>> metric = GEvalPreset(preset="qa_relevance", model="gpt-4")
+        >>> result = metric.score(output="Answer addresses the user's question.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.85
+    """
+
+    def __init__(
+        self,
+        preset: str,
+        model: Optional[Union[str, models.base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+        name: Optional[str] = None,
+    ):
+        try:
+            definition = GEVAL_PRESETS[preset]
+        except KeyError as error:
+            raise ValueError(
+                f"Unknown GEval preset '{preset}'. Available presets: {list(GEVAL_PRESETS)}"
+            ) from error
+
+        super().__init__(
+            task_introduction=definition.task_introduction,
+            evaluation_criteria=definition.evaluation_criteria,
+            model=model,
+            name=name or definition.name,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+        )
```