opik 1.9.5__py3-none-any.whl → 1.9.39__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- opik/__init__.py +10 -3
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/dataset/rest_operations.py +5 -0
- opik/api_objects/experiment/experiment.py +46 -49
- opik/api_objects/experiment/helpers.py +34 -10
- opik/api_objects/local_recording.py +8 -3
- opik/api_objects/opik_client.py +230 -48
- opik/api_objects/opik_query_language.py +9 -0
- opik/api_objects/prompt/__init__.py +11 -3
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +193 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/{chat_content_renderer_registry.py → chat/content_renderer_registry.py} +37 -35
- opik/api_objects/prompt/client.py +101 -30
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
- opik/api_objects/prompt/types.py +1 -1
- opik/cli/export.py +6 -2
- opik/cli/usage_report/charts.py +39 -10
- opik/cli/usage_report/cli.py +164 -45
- opik/cli/usage_report/pdf.py +14 -1
- opik/config.py +0 -5
- opik/decorator/base_track_decorator.py +37 -40
- opik/decorator/context_manager/span_context_manager.py +9 -0
- opik/decorator/context_manager/trace_context_manager.py +5 -0
- opik/dict_utils.py +3 -3
- opik/evaluation/__init__.py +13 -2
- opik/evaluation/engine/engine.py +195 -223
- opik/evaluation/engine/helpers.py +8 -7
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/evaluation_result.py +35 -1
- opik/evaluation/evaluator.py +318 -30
- opik/evaluation/models/litellm/util.py +78 -6
- opik/evaluation/models/model_capabilities.py +33 -0
- opik/evaluation/report.py +14 -2
- opik/evaluation/rest_operations.py +36 -33
- opik/evaluation/test_case.py +2 -2
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +17 -0
- opik/hooks/__init__.py +17 -1
- opik/hooks/anonymizer_hook.py +36 -0
- opik/id_helpers.py +18 -0
- opik/integrations/adk/helpers.py +16 -7
- opik/integrations/adk/legacy_opik_tracer.py +7 -4
- opik/integrations/adk/opik_tracer.py +3 -1
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
- opik/integrations/adk/recursive_callback_injector.py +1 -6
- opik/integrations/dspy/callback.py +1 -4
- opik/integrations/haystack/opik_connector.py +2 -2
- opik/integrations/haystack/opik_tracer.py +2 -4
- opik/integrations/langchain/opik_tracer.py +273 -82
- opik/integrations/llama_index/callback.py +110 -108
- opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
- opik/integrations/openai/opik_tracker.py +1 -1
- opik/message_processing/batching/batchers.py +11 -7
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/messages.py +25 -1
- opik/message_processing/online_message_processor.py +23 -8
- opik/opik_context.py +7 -7
- opik/rest_api/__init__.py +188 -12
- opik/rest_api/client.py +3 -0
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/client.py +893 -89
- opik/rest_api/datasets/raw_client.py +1328 -87
- opik/rest_api/experiments/client.py +30 -2
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
- opik/rest_api/optimizations/client.py +302 -0
- opik/rest_api/optimizations/raw_client.py +463 -0
- opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
- opik/rest_api/prompts/__init__.py +2 -2
- opik/rest_api/prompts/client.py +34 -4
- opik/rest_api/prompts/raw_client.py +32 -2
- opik/rest_api/prompts/types/__init__.py +3 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/spans/__init__.py +0 -2
- opik/rest_api/spans/client.py +148 -64
- opik/rest_api/spans/raw_client.py +210 -83
- opik/rest_api/spans/types/__init__.py +0 -2
- opik/rest_api/traces/client.py +241 -73
- opik/rest_api/traces/raw_client.py +344 -90
- opik/rest_api/types/__init__.py +200 -15
- opik/rest_api/types/aggregation_data.py +1 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_type.py +6 -1
- opik/rest_api/types/alert_trigger_config_write_type.py +6 -1
- opik/rest_api/types/automation_rule_evaluator.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
- opik/rest_api/types/{automation_rule_evaluator_object_public.py → automation_rule_evaluator_object_object_public.py} +32 -10
- opik/rest_api/types/automation_rule_evaluator_page_public.py +2 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +23 -1
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +23 -1
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +2 -0
- opik/rest_api/types/dataset_item_compare.py +2 -0
- opik/rest_api/types/dataset_item_filter.py +23 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +1 -0
- opik/rest_api/types/dataset_item_page_public.py +1 -0
- opik/rest_api/types/dataset_item_public.py +2 -0
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +1 -0
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +49 -0
- opik/rest_api/types/experiment.py +2 -0
- opik/rest_api/types/experiment_public.py +2 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/feedback.py +20 -1
- opik/rest_api/types/feedback_create.py +16 -1
- opik/rest_api/types/feedback_object_public.py +22 -1
- opik/rest_api/types/feedback_public.py +20 -1
- opik/rest_api/types/feedback_score_public.py +4 -0
- opik/rest_api/types/feedback_update.py +16 -1
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/llm_as_judge_message.py +5 -1
- opik/rest_api/types/llm_as_judge_message_content.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +24 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +24 -0
- opik/rest_api/types/llm_as_judge_message_public.py +5 -1
- opik/rest_api/types/llm_as_judge_message_write.py +5 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +2 -0
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +2 -0
- opik/rest_api/types/optimization.py +2 -0
- opik/rest_api/types/optimization_public.py +2 -0
- opik/rest_api/types/optimization_public_status.py +3 -1
- opik/rest_api/types/optimization_status.py +3 -1
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +2 -0
- opik/rest_api/types/optimization_write_status.py +3 -1
- opik/rest_api/types/prompt.py +6 -0
- opik/rest_api/types/prompt_detail.py +6 -0
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_public.py +6 -0
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_version.py +2 -0
- opik/rest_api/types/prompt_version_detail.py +2 -0
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_public.py +2 -0
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/score_name.py +1 -0
- opik/rest_api/types/service_toggles_config.py +6 -0
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/trace.py +6 -0
- opik/rest_api/types/trace_public.py +6 -0
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_update.py +19 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/value_entry.py +2 -0
- opik/rest_api/types/value_entry_compare.py +2 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
- opik/rest_api/types/value_entry_public.py +2 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/synchronization.py +5 -6
- opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/METADATA +5 -4
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/RECORD +246 -151
- opik/api_objects/prompt/chat_prompt_template.py +0 -164
- opik/api_objects/prompt/prompt.py +0 -131
- /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/WHEEL +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/entry_points.txt +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.5.dist-info → opik-1.9.39.dist-info}/top_level.txt +0 -0
opik/evaluation/evaluator.py
CHANGED
@@ -2,10 +2,12 @@ import logging
 import time
 from typing import Any, Callable, Dict, List, Optional, Union, cast
 
-from .. import
+from ..api_objects.prompt import base_prompt
 from ..api_objects import opik_client
 from ..api_objects import dataset, experiment
 from ..api_objects.experiment import helpers as experiment_helpers
+from ..api_objects.prompt.chat import chat_prompt_template
+from ..api_objects.prompt import types as prompt_types
 from . import (
     asyncio_support,
     engine,
@@ -14,13 +16,12 @@ from . import (
     rest_operations,
     samplers,
 )
-from .metrics import base_metric
+from .metrics import base_metric, score_result
 from .models import ModelCapabilities, base_model, models_factory
 from .scorers import scorer_function, scorer_wrapper_metric
-from .
+from . import test_result
+from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
 from .. import url_helpers
-from opik.api_objects.prompt.chat_prompt_template import ChatPromptTemplate
-from opik.api_objects.prompt.types import SupportedModalities
 
 LOGGER = logging.getLogger(__name__)
 MODALITY_SUPPORT_DOC_URL = (
@@ -41,23 +42,52 @@ def _try_notifying_about_experiment_completion(
     )
 
 
+def _compute_experiment_scores(
+    experiment_scoring_functions: List[ExperimentScoreFunction],
+    test_results: List[test_result.TestResult],
+) -> List[score_result.ScoreResult]:
+    """Compute experiment-level scores from test results."""
+    if not experiment_scoring_functions or not test_results:
+        return []
+
+    all_scores: List[score_result.ScoreResult] = []
+    for score_function in experiment_scoring_functions:
+        try:
+            scores = score_function(test_results)
+            # Handle Union[ScoreResult, List[ScoreResult]]
+            if isinstance(scores, list):
+                all_scores.extend(scores)
+            else:
+                all_scores.append(scores)
+        except Exception as e:
+            LOGGER.warning(
+                "Failed to compute experiment score: %s",
+                e,
+                exc_info=True,
+            )
+
+    return all_scores
+
+
 def evaluate(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
-    prompts: Optional[List[
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
@@ -70,6 +100,10 @@ def evaluate(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.
 
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.
 
@@ -117,7 +151,16 @@ def evaluate(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to run the task and evaluate the task output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     checked_prompts = experiment_helpers.handle_prompt_args(
         prompt=prompt,
         prompts=prompts,
@@ -125,6 +168,11 @@ def evaluate(
 
     client = opik_client.get_client_cached()
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -153,6 +201,7 @@ def evaluate(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )
 
 
@@ -171,6 +220,7 @@ def _evaluate_task(
     dataset_item_ids: Optional[List[str]],
     dataset_sampler: Optional[samplers.BaseDatasetSampler],
     trial_count: int,
+    experiment_scoring_functions: List[ExperimentScoreFunction],
 ) -> evaluation_result.EvaluationResult:
     start_time = time.time()
 
@@ -178,25 +228,33 @@ def _evaluate_task(
     evaluation_engine = engine.EvaluationEngine(
         client=client,
         project_name=project_name,
-        experiment_=experiment,
         scoring_metrics=scoring_metrics,
         workers=task_threads,
         verbose=verbose,
         scoring_key_mapping=scoring_key_mapping,
     )
-    test_results = evaluation_engine.
+    test_results = evaluation_engine.evaluate_llm_task_on_dataset(
         dataset_=dataset,
         task=task,
         nb_samples=nb_samples,
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_=experiment,
     )
 
     total_time = time.time() - start_time
 
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
-        report.display_experiment_results(
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )
 
     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
@@ -210,6 +268,10 @@ def _evaluate_task(
 
     _try_notifying_about_experiment_completion(experiment)
 
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         dataset_id=dataset.id,
         experiment_id=experiment.id,
@@ -217,6 +279,7 @@ def _evaluate_task(
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )
 
     if verbose >= 2:
@@ -236,6 +299,7 @@ def evaluate_experiment(
     verbose: int = 1,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     experiment_id: Optional[str] = None,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
     evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
@@ -267,7 +331,15 @@ def evaluate_experiment(
             `{"input": "user_question"}` to map the "user_question" key to "input".
 
         experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     start_time = time.time()
 
     client = opik_client.get_client_cached()
@@ -280,10 +352,11 @@ def evaluate_experiment(
         client=client, experiment_name=experiment_name
     )
 
+    dataset_ = client.get_dataset(name=experiment.dataset_name)
+
     test_cases = rest_operations.get_experiment_test_cases(
-
-
-        dataset_id=experiment.dataset_id,
+        experiment_=experiment,
+        dataset_=dataset_,
         scoring_key_mapping=scoring_key_mapping,
     )
     first_trace_id = test_cases[0].trace_id
@@ -302,7 +375,6 @@ def evaluate_experiment(
     evaluation_engine = engine.EvaluationEngine(
         client=client,
         project_name=project_name,
-        experiment_=experiment,
         scoring_metrics=scoring_metrics,
         workers=scoring_threads,
         verbose=verbose,
@@ -314,14 +386,23 @@ def evaluate_experiment(
 
     total_time = time.time() - start_time
 
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
         report.display_experiment_results(
-
+            dataset_.name,
+            total_time,
+            test_results,
+            computed_experiment_scores,
         )
 
     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
-        dataset_id=
+        dataset_id=dataset_.id,
         url_override=client.config.url_override,
     )
 
@@ -329,18 +410,23 @@ def evaluate_experiment(
 
     _try_notifying_about_experiment_completion(experiment)
 
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
-        dataset_id=
+        dataset_id=dataset_.id,
         experiment_id=experiment.id,
         experiment_name=experiment.name,
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=1,
+        experiment_scores=computed_experiment_scores,
     )
 
     if verbose >= 2:
         report.display_evaluation_scores_statistics(
-            dataset_name=
+            dataset_name=dataset_.name,
             evaluation_results=evaluation_result_,
         )
 
@@ -351,16 +437,22 @@ def _build_prompt_evaluation_task(
     model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
 ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
     supported_modalities = cast(
-        SupportedModalities,
+        prompt_types.SupportedModalities,
         {
             "vision": ModelCapabilities.supports_vision(
                 getattr(model, "model_name", None)
-            )
+            ),
+            "video": ModelCapabilities.supports_video(
+                getattr(model, "model_name", None)
+            ),
         },
     )
-
+    # Disable placeholder validation since we pass all dataset item fields to format()
+    chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
+        messages=messages, validate_placeholders=False
+    )
 
-    required_modalities =
+    required_modalities = chat_prompt_template_.required_modalities()
     unsupported_modalities = {
         modality
         for modality in required_modalities
@@ -379,7 +471,7 @@ def _build_prompt_evaluation_task(
 
     def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
         template_type_override = prompt_variables.get("type")
-        processed_messages =
+        processed_messages = chat_prompt_template_.format(
             variables=prompt_variables,
             supported_modalities=supported_modalities,
             template_type=template_type_override,
@@ -402,16 +494,18 @@ def evaluate_prompt(
     model: Optional[Union[str, base_model.OpikBaseModel]] = None,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
+    prompt: Optional[base_prompt.BasePrompt] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs prompt evaluation on a given dataset.
@@ -433,6 +527,10 @@ def evaluate_prompt(
             • task_outputs — a dictionary containing the LLM task output.
             • task_span - the data collected during the LLM task execution [optional].
 
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: name of the experiment.
 
         project_name: The name of the project to log data
@@ -453,7 +551,15 @@ def evaluate_prompt(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     if isinstance(model, str):
         opik_model = models_factory.get(model_name=model)
     elif not isinstance(model, base_model.OpikBaseModel):
@@ -477,6 +583,11 @@ def evaluate_prompt(
 
     prompts = [prompt] if prompt else None
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -497,25 +608,33 @@ def evaluate_prompt(
     evaluation_engine = engine.EvaluationEngine(
         client=client,
         project_name=project_name,
-        experiment_=experiment,
         scoring_metrics=scoring_metrics,
         workers=task_threads,
         verbose=verbose,
         scoring_key_mapping=None,
     )
-    test_results = evaluation_engine.
+    test_results = evaluation_engine.evaluate_llm_task_on_dataset(
        dataset_=dataset,
         task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
         nb_samples=nb_samples,
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_=experiment,
     )
 
     total_time = time.time() - start_time
 
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
     if verbose >= 1:
-        report.display_experiment_results(
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )
 
     experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
@@ -529,6 +648,10 @@ def evaluate_prompt(
 
     _try_notifying_about_experiment_completion(experiment)
 
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
@@ -536,6 +659,7 @@ def evaluate_prompt(
         test_results=test_results,
         experiment_url=experiment_url,
         trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )
 
     if verbose >= 2:
@@ -552,18 +676,21 @@ def evaluate_optimization_trial(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
-    prompts: Optional[List[
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset.
@@ -576,6 +703,17 @@ def evaluate_optimization_trial(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.
 
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.
 
@@ -615,7 +753,16 @@ def evaluate_optimization_trial(
             If not provided, all samples in the dataset will be evaluated.
 
         trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
     if scoring_metrics is None:
         scoring_metrics = []
 
@@ -624,8 +771,20 @@ def evaluate_optimization_trial(
         prompts=prompts,
     )
 
+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     client = opik_client.get_client_cached()
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -649,13 +808,128 @@ def evaluate_optimization_trial(
         dataset_item_ids=dataset_item_ids,
         dataset_sampler=dataset_sampler,
         trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
     )
 
 
-def
+def evaluate_on_dict_items(
+    items: List[Dict[str, Any]],
+    task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
     scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
     project_name: Optional[str] = None,
+    verbose: int = 0,
+    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+    scoring_threads: int = 16,
+) -> evaluation_result.EvaluationResultOnDictItems:
+    """
+    Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+    without requiring a Dataset object or creating an experiment.
+
+    This function is useful for optimization scenarios where you need to evaluate many
+    candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+    tracking but doesn't require experiment setup or dataset management.
+
+    Args:
+        items: List of dataset item contents (dictionaries with the data to evaluate).
+
+        task: A callable object that takes dict with dataset item content
+            as input and returns dict which will later be used for scoring.
+
+        scoring_metrics: List of metrics to calculate during evaluation.
+            Each metric's `score(...)` method will be called with arguments taken from
+            the dataset item and task output.
+
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function accepts predefined arguments:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+
+        project_name: The name of the project for logging traces.
+
+        verbose: Controls evaluation output logs and progress bars.
+            0 - no outputs (default), 1 - enable outputs.
+
+        scoring_key_mapping: A dictionary that allows you to rename keys present in either
+            the dataset item or the task output to match the keys expected by scoring metrics.
+
+        scoring_threads: Number of thread workers to run scoring metrics.
+
+    Returns:
+        EvaluationResultOnDictItems object containing test results and providing methods
+        to aggregate scores, similar to the regular evaluation result.
+
+    Example:
+        ```python
+        import opik
+        from opik.evaluation.metrics import Equals
+
+        items = [
+            {"input": "What is 2+2?", "expected_output": "4"},
+            {"input": "What is 3+3?", "expected_output": "6"},
+        ]
+
+        def my_task(item):
+            # Your LLM call here
+            question = item["input"]
+            # ... call model ...
+            return {"output": model_output}
+
+        result = opik.evaluate_on_dict_items(
+            items=items,
+            task=my_task,
+            scoring_metrics=[Equals()],
+            scoring_key_mapping={"reference": "expected_output"},
+        )
+
+        # Access individual test results
+        for test_result in result.test_results:
+            print(f"Score: {test_result.score_results[0].value}")
+
+        # Get aggregated statistics
+        aggregated = result.aggregate_evaluation_scores()
+        print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+        ```
+    """
+    # Wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    if not scoring_metrics:
+        LOGGER.warning("No scoring metrics provided for items evaluation")
+        return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+    client = opik_client.get_client_cached()
+
+    # Create evaluation engine
+    with asyncio_support.async_http_connections_expire_immediately():
+        evaluation_engine = engine.EvaluationEngine(
+            client=client,
+            project_name=project_name,
+            scoring_metrics=scoring_metrics,
+            workers=scoring_threads,
+            verbose=verbose,
+            scoring_key_mapping=scoring_key_mapping,
+        )
+
+        # Use the new evaluate_items method
+        test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
            items=items,
            task=task,
        )
+
+    return evaluation_result.EvaluationResultOnDictItems(
+        test_results=test_results,
+    )
+
+
+def _wrap_scoring_functions(
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+    scoring_metrics: Optional[List[base_metric.BaseMetric]],
+    project_name: Optional[str],
 ) -> List[base_metric.BaseMetric]:
     if scoring_functions:
         function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
@@ -667,3 +941,17 @@ def _wrap_scoring_functions(
         scoring_metrics = function_metrics
 
     return scoring_metrics if scoring_metrics else []
+
+
+def _use_or_create_experiment_name(
+    experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+) -> Optional[str]:
+    if experiment_name:
+        return experiment_name
+
+    if experiment_name_prefix:
+        return experiment_helpers.generate_unique_experiment_name(
+            experiment_name_prefix
+        )
+    else:
+        return None
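The main addition in this file is the `experiment_scoring_functions` hook on `evaluate`, `evaluate_experiment`, `evaluate_prompt`, and `evaluate_optimization_trial`: each callable receives the full list of `TestResult` objects, returns one `ScoreResult` (or a list of them), and the results are shown in the report and sent to the backend via `experiment.log_experiment_scores`. Below is a minimal usage sketch based only on the signatures and docstrings in this diff; the dataset name, the echo task, and the aggregate scorer are illustrative assumptions, not part of the release.

```python
from typing import List

import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals
from opik.evaluation.metrics.score_result import ScoreResult
from opik.evaluation.test_result import TestResult


def fraction_fully_correct(test_results: List[TestResult]) -> ScoreResult:
    # Experiment-level aggregate: share of items whose per-item scores are all 1.0.
    passed = sum(
        1
        for result in test_results
        if result.score_results and all(s.value == 1.0 for s in result.score_results)
    )
    return ScoreResult(name="fraction_fully_correct", value=passed / len(test_results))


def task(item):
    # Placeholder task for illustration only; a real task would call an LLM here.
    return {"output": item["expected_output"]}


client = opik.Opik()
dataset = client.get_or_create_dataset(name="my-dataset")  # assumed to already contain items

result = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Equals()],
    scoring_key_mapping={"reference": "expected_output"},
    experiment_name_prefix="scored-run",                    # new in 1.9.39: unique name under a shared prefix
    experiment_scoring_functions=[fraction_fully_correct],  # new in 1.9.39: experiment-level scores
)
print(result.experiment_scores)
```

Per the added `_compute_experiment_scores` helper, a scorer may return either a single `ScoreResult` or a list of them, and a scorer that raises only logs a warning instead of failing the run.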