opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -32,6 +32,8 @@ class AnswerRelevance(base_metric.BaseMetric):
32
32
  require_context: if set to False, execution in no-context mode is allowed. Default is True.
33
33
  track: Whether to track the metric. Defaults to True.
34
34
  project_name: Optional project name to track the metric in for the cases when there are no parent span/trace to inherit project name from.
35
+ seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
36
+ temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
35
37
 
36
38
  Example:
37
39
  >>> from opik.evaluation.metrics import AnswerRelevance
@@ -56,6 +58,8 @@ class AnswerRelevance(base_metric.BaseMetric):
56
58
  require_context: bool = True,
57
59
  track: bool = True,
58
60
  project_name: Optional[str] = None,
61
+ seed: Optional[int] = None,
62
+ temperature: Optional[float] = None,
59
63
  ):
60
64
  super().__init__(
61
65
  name=name,
@@ -63,19 +67,28 @@ class AnswerRelevance(base_metric.BaseMetric):
63
67
  project_name=project_name,
64
68
  )
65
69
  self._require_context = require_context
66
- self._init_model(model)
70
+ self._seed = seed
71
+ self._init_model(model, temperature=temperature)
67
72
  self._init_few_shot_examples(
68
73
  few_shot_examples_with_context=few_shot_examples,
69
74
  few_shot_examples_no_context=few_shot_examples_no_context,
70
75
  )
71
76
 
72
77
  def _init_model(
73
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
78
+ self,
79
+ model: Optional[Union[str, base_model.OpikBaseModel]],
80
+ temperature: Optional[float],
74
81
  ) -> None:
75
82
  if isinstance(model, base_model.OpikBaseModel):
76
83
  self._model = model
77
84
  else:
78
- self._model = models_factory.get(model_name=model)
85
+ model_kwargs = {}
86
+ if temperature is not None:
87
+ model_kwargs["temperature"] = temperature
88
+ if self._seed is not None:
89
+ model_kwargs["seed"] = self._seed
90
+
91
+ self._model = models_factory.get(model_name=model, **model_kwargs)
79
92
 
80
93
  def _init_few_shot_examples(
81
94
  self,
@@ -124,7 +137,8 @@ class AnswerRelevance(base_metric.BaseMetric):
124
137
  )
125
138
 
126
139
  model_output = self._model.generate_string(
127
- input=llm_query, response_format=AnswerRelevanceResponseFormat
140
+ input=llm_query,
141
+ response_format=AnswerRelevanceResponseFormat,
128
142
  )
129
143
  return parser.parse_model_output(content=model_output, name=self.name)
130
144
 
@@ -154,7 +168,8 @@ class AnswerRelevance(base_metric.BaseMetric):
154
168
  input=input, output=output, context=context
155
169
  )
156
170
  model_output = await self._model.agenerate_string(
157
- input=llm_query, response_format=AnswerRelevanceResponseFormat
171
+ input=llm_query,
172
+ response_format=AnswerRelevanceResponseFormat,
158
173
  )
159
174
 
160
175
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -28,6 +28,8 @@ class ContextPrecision(base_metric.BaseMetric):
28
28
  track: Whether to track the metric. Defaults to True.
29
29
  project_name: Optional project name to track the metric in for the cases when
30
30
  there are no parent span/trace to inherit project name from.
31
+ seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
32
+ temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
31
33
 
32
34
  Example:
33
35
  >>> from opik.evaluation.metrics import ContextPrecision
@@ -48,23 +50,33 @@ class ContextPrecision(base_metric.BaseMetric):
48
50
  ] = None,
49
51
  track: bool = True,
50
52
  project_name: Optional[str] = None,
53
+ seed: Optional[int] = None,
54
+ temperature: Optional[float] = None,
51
55
  ):
52
56
  super().__init__(
53
57
  name=name,
54
58
  track=track,
55
59
  project_name=project_name,
56
60
  )
57
-
58
- self._init_model(model)
61
+ self._seed = seed
62
+ self._init_model(model, temperature=temperature)
59
63
  self.few_shot_examples = few_shot_examples or template.FEW_SHOT_EXAMPLES
60
64
 
61
65
  def _init_model(
62
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
66
+ self,
67
+ model: Optional[Union[str, base_model.OpikBaseModel]],
68
+ temperature: Optional[float],
63
69
  ) -> None:
64
70
  if isinstance(model, base_model.OpikBaseModel):
65
71
  self._model = model
66
72
  else:
67
- self._model = models_factory.get(model_name=model)
73
+ model_kwargs = {}
74
+ if temperature is not None:
75
+ model_kwargs["temperature"] = temperature
76
+ if self._seed is not None:
77
+ model_kwargs["seed"] = self._seed
78
+
79
+ self._model = models_factory.get(model_name=model, **model_kwargs)
68
80
 
69
81
  def score(
70
82
  self,
@@ -96,7 +108,8 @@ class ContextPrecision(base_metric.BaseMetric):
96
108
  few_shot_examples=self.few_shot_examples,
97
109
  )
98
110
  model_output = self._model.generate_string(
99
- input=llm_query, response_format=ContextPrecisionResponseFormat
111
+ input=llm_query,
112
+ response_format=ContextPrecisionResponseFormat,
100
113
  )
101
114
 
102
115
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -133,7 +146,8 @@ class ContextPrecision(base_metric.BaseMetric):
133
146
  few_shot_examples=self.few_shot_examples,
134
147
  )
135
148
  model_output = await self._model.agenerate_string(
136
- input=llm_query, response_format=ContextPrecisionResponseFormat
149
+ input=llm_query,
150
+ response_format=ContextPrecisionResponseFormat,
137
151
  )
138
152
 
139
153
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -28,6 +28,8 @@ class ContextRecall(base_metric.BaseMetric):
28
28
  track: Whether to track the metric. Defaults to True.
29
29
  project_name: Optional project name to track the metric in for the cases when
30
30
  there are no parent span/trace to inherit project name from.
31
+ seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
32
+ temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
31
33
 
32
34
  Example:
33
35
  >>> from opik.evaluation.metrics import ContextRecall
@@ -46,23 +48,33 @@ class ContextRecall(base_metric.BaseMetric):
46
48
  few_shot_examples: Optional[List[template.FewShotExampleContextRecall]] = None,
47
49
  track: bool = True,
48
50
  project_name: Optional[str] = None,
51
+ seed: Optional[int] = None,
52
+ temperature: Optional[float] = None,
49
53
  ):
50
54
  super().__init__(
51
55
  name=name,
52
56
  track=track,
53
57
  project_name=project_name,
54
58
  )
55
-
56
- self._init_model(model)
59
+ self._seed = seed
60
+ self._init_model(model, temperature=temperature)
57
61
  self.few_shot_examples = few_shot_examples or template.FEW_SHOT_EXAMPLES
58
62
 
59
63
  def _init_model(
60
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
64
+ self,
65
+ model: Optional[Union[str, base_model.OpikBaseModel]],
66
+ temperature: Optional[float],
61
67
  ) -> None:
62
68
  if isinstance(model, base_model.OpikBaseModel):
63
69
  self._model = model
64
70
  else:
65
- self._model = models_factory.get(model_name=model)
71
+ model_kwargs = {}
72
+ if temperature is not None:
73
+ model_kwargs["temperature"] = temperature
74
+ if self._seed is not None:
75
+ model_kwargs["seed"] = self._seed
76
+
77
+ self._model = models_factory.get(model_name=model, **model_kwargs)
66
78
 
67
79
  def score(
68
80
  self,
@@ -94,7 +106,8 @@ class ContextRecall(base_metric.BaseMetric):
94
106
  few_shot_examples=self.few_shot_examples,
95
107
  )
96
108
  model_output = self._model.generate_string(
97
- input=llm_query, response_format=ContextRecallResponseFormat
109
+ input=llm_query,
110
+ response_format=ContextRecallResponseFormat,
98
111
  )
99
112
 
100
113
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -131,7 +144,8 @@ class ContextRecall(base_metric.BaseMetric):
131
144
  few_shot_examples=self.few_shot_examples,
132
145
  )
133
146
  model_output = await self._model.agenerate_string(
134
- input=llm_query, response_format=ContextRecallResponseFormat
147
+ input=llm_query,
148
+ response_format=ContextRecallResponseFormat,
135
149
  )
136
150
 
137
151
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -0,0 +1,5 @@
1
+ """Public exports for the GEval metric package."""
2
+
3
+ from .metric import GEval
4
+
5
+ __all__ = ["GEval"]
@@ -1,10 +1,13 @@
1
- from typing import Any, Optional, Union
1
+ from collections import OrderedDict
2
+ from threading import Lock
3
+ from typing import Any, Dict, Optional, Tuple, Union
2
4
  import pydantic
3
5
 
4
6
  from opik.evaluation.metrics import base_metric, score_result
5
7
  from opik.evaluation.models import base_model, models_factory
6
8
  from opik.evaluation import models
7
9
  from . import template, parser
10
+ from .presets import GEVAL_PRESETS
8
11
 
9
12
 
10
13
  class GEvalScoreFormat(pydantic.BaseModel):
@@ -12,7 +15,56 @@ class GEvalScoreFormat(pydantic.BaseModel):
12
15
  reason: str
13
16
 
14
17
 
18
+ def _freeze_for_cache(value: Any) -> Any:
19
+ """Convert nested structures into hashable representations for caching."""
20
+
21
+ if isinstance(value, dict):
22
+ return tuple(
23
+ sorted((key, _freeze_for_cache(val)) for key, val in value.items())
24
+ )
25
+ if isinstance(value, (list, tuple)):
26
+ return tuple(_freeze_for_cache(item) for item in value)
27
+ if isinstance(value, set):
28
+ return tuple(sorted(_freeze_for_cache(item) for item in value))
29
+ return value
30
+
31
+
15
32
  class GEval(base_metric.BaseMetric):
33
+ """
34
+ Generalised evaluation metric that prompts an LLM to grade another LLM output.
35
+
36
+ GEval builds a reusable chain-of-thought using the provided
37
+ ``task_introduction`` and ``evaluation_criteria`` prompts, then requests a
38
+ final score and rationale for each evaluated output.
39
+
40
+ Args:
41
+ task_introduction: Instruction describing the evaluator's persona/purpose.
42
+ evaluation_criteria: Detailed rubric presented to the evaluator.
43
+ model: Optional model identifier or ``OpikBaseModel`` for the judge.
44
+ name: Display name for the metric result. Defaults to ``"g_eval_metric"``.
45
+ track: Whether to automatically track metric results. Defaults to ``True``.
46
+ project_name: Optional tracking project name.
47
+ temperature: Sampling temperature forwarded to the judge model.
48
+ seed: Optional seed for reproducible generation (if supported by the model).
49
+
50
+ Example:
51
+ >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEval
52
+ >>> metric = GEval(
53
+ ... task_introduction="You evaluate politeness of responses.",
54
+ ... evaluation_criteria="Score from 1 (rude) to 5 (very polite).",
55
+ ... model="gpt-4",
56
+ ... )
57
+ >>> result = metric.score(output="Thanks so much for your help!") # doctest: +SKIP
58
+ >>> result.value # doctest: +SKIP
59
+ 0.9
60
+ """
61
+
62
+ _CHAIN_OF_THOUGHT_CACHE: "OrderedDict[Tuple[str, str, str, Any], str]" = (
63
+ OrderedDict()
64
+ )
65
+ _CHAIN_OF_THOUGHT_LOCK: Lock = Lock()
66
+ _MAX_CHAIN_OF_THOUGHT_CACHE = 128
67
+
16
68
  def __init__(
17
69
  self,
18
70
  task_introduction: str,
@@ -21,65 +73,61 @@ class GEval(base_metric.BaseMetric):
21
73
  name: str = "g_eval_metric",
22
74
  track: bool = True,
23
75
  project_name: Optional[str] = None,
76
+ temperature: float = 0.0,
77
+ seed: Optional[int] = None,
24
78
  ):
25
- """
26
- A metric that evaluates an LLM output based on chain-of-thought built with the evaluation criteria provided
27
- by the user.
28
-
29
- For more details see the original paper: https://arxiv.org/pdf/2303.16634
30
-
31
- Args:
32
- task_introduction: An instruction for LLM used to generate an evaluation chain-of-thought and in evaluation call itself.
33
- `opik.evaluation.models.LiteLLMChatModel` is used by default.
34
- evaluation_criteria: The main task for G-Eval metric written in human language.
35
- model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
36
- name: The name of the metric.
37
- track: Whether to track the metric. Defaults to True.
38
- project_name: Optional project name to track the metric in for the cases when
39
- there are no parent span/trace to inherit project name from.
40
- """
41
79
  super().__init__(
42
80
  name=name,
43
81
  track=track,
44
82
  project_name=project_name,
45
83
  )
46
- self._init_model(model)
47
-
48
84
  self.task_introduction = task_introduction
49
85
  self.evaluation_criteria = evaluation_criteria
86
+ self._seed = seed
87
+
50
88
  self._log_probs_supported = False
51
89
 
52
- self._chain_of_thought_response: Optional[str] = None
90
+ self._init_model(model, temperature=temperature)
53
91
 
54
92
  def llm_chain_of_thought(self) -> str:
55
- if self._chain_of_thought_response is None:
56
- prompt = template.G_EVAL_COT_TEMPLATE.format(
57
- task_introduction=self.task_introduction,
58
- evaluation_criteria=self.evaluation_criteria,
59
- )
60
- self._chain_of_thought_response = self._model.generate_string(input=prompt)
93
+ cache_key = self._chain_of_thought_cache_key()
94
+ cached = self._get_cached_chain_of_thought(cache_key)
95
+ if cached is not None:
96
+ return cached
61
97
 
62
- return self._chain_of_thought_response
98
+ prompt = template.G_EVAL_COT_TEMPLATE.format(
99
+ task_introduction=self.task_introduction,
100
+ evaluation_criteria=self.evaluation_criteria,
101
+ )
102
+ generated = self._model.generate_string(input=prompt)
103
+ self._store_chain_of_thought(cache_key, generated)
104
+ return generated
63
105
 
64
106
  async def allm_chain_of_thought(self) -> str:
65
- if not self._chain_of_thought_response:
66
- prompt = template.G_EVAL_COT_TEMPLATE.format(
67
- task_introduction=self.task_introduction,
68
- evaluation_criteria=self.evaluation_criteria,
69
- )
70
- self._chain_of_thought_response = await self._model.agenerate_string(
71
- input=prompt
72
- )
107
+ cache_key = self._chain_of_thought_cache_key()
108
+ cached = self._get_cached_chain_of_thought(cache_key)
109
+ if cached is not None:
110
+ return cached
73
111
 
74
- return self._chain_of_thought_response
112
+ prompt = template.G_EVAL_COT_TEMPLATE.format(
113
+ task_introduction=self.task_introduction,
114
+ evaluation_criteria=self.evaluation_criteria,
115
+ )
116
+ generated = await self._model.agenerate_string(input=prompt)
117
+ self._store_chain_of_thought(cache_key, generated)
118
+ return generated
75
119
 
76
120
  def _init_model(
77
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
121
+ self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
78
122
  ) -> None:
79
123
  if isinstance(model, base_model.OpikBaseModel):
80
124
  self._model = model
81
125
  else:
82
- self._model = models_factory.get(model_name=model)
126
+ model_kwargs = {"temperature": temperature}
127
+ if self._seed is not None:
128
+ model_kwargs["seed"] = self._seed
129
+
130
+ self._model = models_factory.get(model_name=model, **model_kwargs)
83
131
 
84
132
  if (
85
133
  hasattr(self._model, "supported_params")
@@ -88,6 +136,55 @@ class GEval(base_metric.BaseMetric):
88
136
  ):
89
137
  self._log_probs_supported = True
90
138
 
139
+ @classmethod
140
+ def _get_cached_chain_of_thought(
141
+ cls, cache_key: Tuple[str, str, str, Any]
142
+ ) -> Optional[str]:
143
+ with cls._CHAIN_OF_THOUGHT_LOCK:
144
+ value = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
145
+ if value is not None:
146
+ cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
147
+ return value
148
+
149
+ @classmethod
150
+ def _store_chain_of_thought(
151
+ cls, cache_key: Tuple[str, str, str, Any], value: str
152
+ ) -> None:
153
+ with cls._CHAIN_OF_THOUGHT_LOCK:
154
+ existing = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
155
+ if existing is not None:
156
+ cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
157
+ return
158
+ cls._CHAIN_OF_THOUGHT_CACHE[cache_key] = value
159
+ cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
160
+ while len(cls._CHAIN_OF_THOUGHT_CACHE) > cls._MAX_CHAIN_OF_THOUGHT_CACHE:
161
+ cls._CHAIN_OF_THOUGHT_CACHE.popitem(last=False)
162
+
163
+ def _chain_of_thought_cache_key(self) -> Tuple[str, str, str, Any]:
164
+ model_name = getattr(self._model, "model_name", "unknown")
165
+ return (
166
+ self.task_introduction,
167
+ self.evaluation_criteria,
168
+ model_name,
169
+ self._model_cache_fingerprint(),
170
+ )
171
+
172
+ def _model_cache_fingerprint(self) -> Any:
173
+ fingerprint_candidate = getattr(self._model, "cache_fingerprint", None)
174
+ if callable(fingerprint_candidate):
175
+ try:
176
+ fingerprint = fingerprint_candidate()
177
+ except Exception:
178
+ fingerprint = None
179
+ else:
180
+ return _freeze_for_cache(fingerprint)
181
+
182
+ completion_kwargs = getattr(self._model, "_completion_kwargs", None)
183
+ if isinstance(completion_kwargs, dict):
184
+ return _freeze_for_cache(completion_kwargs)
185
+
186
+ return id(self._model)
187
+
91
188
  def score(
92
189
  self,
93
190
  output: str,
@@ -119,17 +216,23 @@ class GEval(base_metric.BaseMetric):
119
216
  ]
120
217
 
121
218
  if isinstance(self._model, models.LiteLLMChatModel):
122
- model_output = self._model.generate_provider_response(
219
+ provider_kwargs: Dict[str, Any] = {
220
+ "response_format": GEvalScoreFormat,
221
+ }
222
+ if self._log_probs_supported:
223
+ provider_kwargs["logprobs"] = True
224
+ provider_kwargs["top_logprobs"] = 20
225
+
226
+ with base_model.get_provider_response(
227
+ model_provider=self._model,
123
228
  messages=request,
124
- logprobs=self._log_probs_supported,
125
- top_logprobs=20 if self._log_probs_supported else None,
126
- response_format=GEvalScoreFormat,
127
- )
128
- return parser.parse_litellm_model_output(
129
- content=model_output,
130
- name=self.name,
131
- log_probs_supported=self._log_probs_supported,
132
- )
229
+ **provider_kwargs,
230
+ ) as model_output:
231
+ return parser.parse_litellm_model_output(
232
+ content=model_output,
233
+ name=self.name,
234
+ log_probs_supported=self._log_probs_supported,
235
+ )
133
236
 
134
237
  model_output_string = self._model.generate_string(
135
238
  input=llm_query, response_format=GEvalScoreFormat
@@ -138,18 +241,13 @@ class GEval(base_metric.BaseMetric):
138
241
  return parser.parse_model_output_string(model_output_string, self.name)
139
242
 
140
243
  async def ascore(
141
- self, output: str, **ignored_kwargs: Any
244
+ self,
245
+ output: str,
246
+ **ignored_kwargs: Any,
142
247
  ) -> score_result.ScoreResult:
143
248
  """
144
- Calculate the G-Eval score for the given LLM's output.
145
-
146
- Args:
147
- output: The LLM's output to evaluate.
148
- **ignored_kwargs: Additional keyword arguments that are ignored.
149
-
150
- Returns:
151
- score_result.ScoreResult: A ScoreResult object containing the G-Eval score
152
- (between 0.0 and 1.0) and a reason for the score.
249
+ Async variant of :meth:`score`, evaluating the provided LLM output using
250
+ the configured judge model and returning a ``ScoreResult``.
153
251
  """
154
252
  llm_query = template.G_EVAL_QUERY_TEMPLATE.format(
155
253
  task_introduction=self.task_introduction,
@@ -166,20 +264,73 @@ class GEval(base_metric.BaseMetric):
166
264
  ]
167
265
 
168
266
  if isinstance(self._model, models.LiteLLMChatModel):
169
- model_output = await self._model.agenerate_provider_response(
267
+ provider_kwargs: Dict[str, Any] = {
268
+ "response_format": GEvalScoreFormat,
269
+ }
270
+ if self._log_probs_supported:
271
+ provider_kwargs["logprobs"] = True
272
+ provider_kwargs["top_logprobs"] = 20
273
+
274
+ async with base_model.aget_provider_response(
275
+ model_provider=self._model,
170
276
  messages=request,
171
- logprobs=self._log_probs_supported,
172
- top_logprobs=20 if self._log_probs_supported else None,
173
- response_format=GEvalScoreFormat,
174
- )
175
- return parser.parse_litellm_model_output(
176
- content=model_output,
177
- name=self.name,
178
- log_probs_supported=self._log_probs_supported,
179
- )
277
+ **provider_kwargs,
278
+ ) as model_output:
279
+ return parser.parse_litellm_model_output(
280
+ content=model_output,
281
+ name=self.name,
282
+ log_probs_supported=self._log_probs_supported,
283
+ )
180
284
 
181
285
  model_output_string = await self._model.agenerate_string(
182
286
  input=llm_query, response_format=GEvalScoreFormat
183
287
  )
184
288
 
185
289
  return parser.parse_model_output_string(model_output_string, self.name)
290
+
291
+
292
+ class GEvalPreset(GEval):
293
+ """
294
+ Pre-configured GEval variant with author-provided prompt templates.
295
+
296
+ Args:
297
+ preset: Key name from ``GEVAL_PRESETS`` describing the evaluation rubric.
298
+ model: Optional model identifier or ``OpikBaseModel`` instance.
299
+ track: Whether to automatically track metric results. Defaults to ``True``.
300
+ project_name: Optional tracking project name.
301
+ temperature: Sampling temperature forwarded to the judge model.
302
+ name: Optional override for the metric name (defaults to preset name).
303
+
304
+ Example:
305
+ >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEvalPreset
306
+ >>> metric = GEvalPreset(preset="qa_relevance", model="gpt-4")
307
+ >>> result = metric.score(output="Answer addresses the user's question.") # doctest: +SKIP
308
+ >>> result.value # doctest: +SKIP
309
+ 0.85
310
+ """
311
+
312
+ def __init__(
313
+ self,
314
+ preset: str,
315
+ model: Optional[Union[str, models.base_model.OpikBaseModel]] = None,
316
+ track: bool = True,
317
+ project_name: Optional[str] = None,
318
+ temperature: float = 0.0,
319
+ name: Optional[str] = None,
320
+ ):
321
+ try:
322
+ definition = GEVAL_PRESETS[preset]
323
+ except KeyError as error:
324
+ raise ValueError(
325
+ f"Unknown GEval preset '{preset}'. Available presets: {list(GEVAL_PRESETS)}"
326
+ ) from error
327
+
328
+ super().__init__(
329
+ task_introduction=definition.task_introduction,
330
+ evaluation_criteria=definition.evaluation_criteria,
331
+ model=model,
332
+ name=name or definition.name,
333
+ track=track,
334
+ project_name=project_name,
335
+ temperature=temperature,
336
+ )