opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/constants.py +2 -0
- opik/api_objects/dataset/dataset.py +133 -40
- opik/api_objects/dataset/rest_operations.py +2 -0
- opik/api_objects/experiment/experiment.py +6 -0
- opik/api_objects/helpers.py +8 -4
- opik/api_objects/local_recording.py +6 -5
- opik/api_objects/observation_data.py +101 -0
- opik/api_objects/opik_client.py +78 -45
- opik/api_objects/opik_query_language.py +9 -3
- opik/api_objects/prompt/chat/chat_prompt.py +18 -1
- opik/api_objects/prompt/client.py +8 -1
- opik/api_objects/span/span_data.py +3 -88
- opik/api_objects/threads/threads_client.py +7 -4
- opik/api_objects/trace/trace_data.py +3 -74
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +14 -12
- opik/config.py +12 -1
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +4 -1
- opik/decorator/base_track_decorator.py +111 -37
- opik/decorator/context_manager/span_context_manager.py +5 -1
- opik/decorator/generator_wrappers.py +5 -4
- opik/decorator/span_creation_handler.py +13 -4
- opik/evaluation/engine/engine.py +111 -28
- opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
- opik/evaluation/evaluator.py +12 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
- opik/evaluation/metrics/heuristics/equals.py +11 -7
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
- opik/evaluation/models/litellm/util.py +4 -20
- opik/evaluation/models/models_factory.py +19 -5
- opik/evaluation/rest_operations.py +3 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/integrations/adk/legacy_opik_tracer.py +9 -11
- opik/integrations/adk/opik_tracer.py +2 -2
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
- opik/integrations/dspy/callback.py +100 -14
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_tracer.py +2 -2
- opik/integrations/langchain/__init__.py +15 -2
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_tracer.py +258 -160
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
- opik/integrations/llama_index/callback.py +43 -6
- opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
- opik/integrations/openai/opik_tracker.py +99 -4
- opik/integrations/openai/videos/__init__.py +9 -0
- opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
- opik/integrations/openai/videos/videos_create_decorator.py +159 -0
- opik/integrations/openai/videos/videos_download_decorator.py +110 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batchers.py +32 -40
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/emulator_message_processor.py +36 -1
- opik/message_processing/emulation/models.py +21 -0
- opik/message_processing/messages.py +9 -0
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
- opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
- opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
- opik/message_processing/queue_consumer.py +4 -2
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +36 -8
- opik/plugins/pytest/experiment_runner.py +1 -1
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +42 -0
- opik/rest_api/datasets/client.py +321 -123
- opik/rest_api/datasets/raw_client.py +470 -145
- opik/rest_api/experiments/client.py +26 -0
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/llm_provider_key/client.py +4 -4
- opik/rest_api/llm_provider_key/raw_client.py +4 -4
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
- opik/rest_api/manual_evaluation/client.py +101 -0
- opik/rest_api/manual_evaluation/raw_client.py +172 -0
- opik/rest_api/optimizations/client.py +0 -166
- opik/rest_api/optimizations/raw_client.py +0 -248
- opik/rest_api/projects/client.py +9 -0
- opik/rest_api/projects/raw_client.py +13 -0
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
- opik/rest_api/prompts/client.py +130 -2
- opik/rest_api/prompts/raw_client.py +175 -0
- opik/rest_api/traces/client.py +101 -0
- opik/rest_api/traces/raw_client.py +120 -0
- opik/rest_api/types/__init__.py +50 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +38 -2
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +1 -1
- opik/rest_api/types/dataset_item_batch.py +4 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +1 -1
- opik/rest_api/types/dataset_item_filter.py +4 -0
- opik/rest_api/types/dataset_item_page_compare.py +0 -1
- opik/rest_api/types/dataset_item_page_public.py +0 -1
- opik/rest_api/types/dataset_item_public.py +1 -1
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_version_public.py +10 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +9 -0
- opik/rest_api/types/experiment_public.py +9 -0
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/llm_as_judge_message_content.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt_version.py +1 -0
- opik/rest_api/types/prompt_version_detail.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +1 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +5 -1
- opik/rest_api/types/provider_api_key_provider.py +2 -1
- opik/rest_api/types/provider_api_key_public.py +5 -1
- opik/rest_api/types/provider_api_key_public_provider.py +2 -1
- opik/rest_api/types/service_toggles_config.py +11 -1
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
- opik/cli/export.py +0 -791
- opik/cli/import_command.py +0 -575
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
opik/evaluation/engine/evaluation_tasks_executor.py
CHANGED
@@ -1,5 +1,5 @@
 from concurrent import futures
-from typing import List, TypeVar
+from typing import Any, List, Optional, TypeVar, Generic

 from ...environment import get_tqdm_for_current_environment
 from .types import EvaluationTask
@@ -9,6 +9,70 @@ _tqdm = get_tqdm_for_current_environment()
 T = TypeVar("T")


+class StreamingExecutor(Generic[T]):
+    """
+    Executor that accepts and processes evaluation tasks incrementally using a thread pool.
+
+    Tasks can be submitted one at a time and will begin executing immediately, allowing
+    for streaming behavior regardless of the number of workers configured.
+    """
+
+    def __init__(
+        self,
+        workers: int,
+        verbose: int,
+        desc: str = "Evaluation",
+        total: Optional[int] = None,
+    ):
+        self._workers = workers
+        self._verbose = verbose
+        self._desc = desc
+        self._total = total
+        self._task_count = 0
+        self._pool: futures.ThreadPoolExecutor
+        self._submitted_futures: List[futures.Future[T]] = []
+        self._progress_bar: Optional[Any] = None
+
+    def __enter__(self) -> "StreamingExecutor[T]":
+        self._pool = futures.ThreadPoolExecutor(max_workers=self._workers)
+        self._pool.__enter__()
+        # Initialize progress bar on enter
+        self._progress_bar = _tqdm(
+            disable=(self._verbose < 1),
+            desc=self._desc,
+            total=self._total,
+        )
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        # Close progress bar if it exists
+        if self._progress_bar is not None:
+            self._progress_bar.close()
+        self._pool.__exit__(exc_type, exc_val, exc_tb)
+
+    def submit(self, task: EvaluationTask[T]) -> None:
+        """Submit a task to the thread pool for execution."""
+        self._task_count += 1
+        future = self._pool.submit(task)
+        self._submitted_futures.append(future)
+
+    def get_results(self) -> List[T]:
+        """Collect results from futures as they complete with progress bar."""
+        results: List[T] = []
+
+        # Update total if it wasn't known initially
+        if self._progress_bar is not None and self._total is None:
+            self._progress_bar.total = self._task_count
+
+        # Process futures as they complete and update progress bar
+        for future in futures.as_completed(self._submitted_futures):
+            results.append(future.result())
+            if self._progress_bar is not None:
+                self._progress_bar.update(1)
+
+        return results
+
+
 def execute(
     evaluation_tasks: List[EvaluationTask[T]],
     workers: int,
@@ -28,21 +92,9 @@ def execute(

         return test_results

-    with
-
-
-
-
-
-            test_result_future.result()
-            for test_result_future in _tqdm(
-                futures.as_completed(
-                    test_result_futures,
-                ),
-                disable=(verbose < 1),
-                desc=desc,
-                total=len(test_result_futures),
-            )
-        ]
-
-        return test_results
+    with StreamingExecutor[T](
+        workers=workers, verbose=verbose, desc=desc, total=len(evaluation_tasks)
+    ) as executor:
+        for evaluation_task in evaluation_tasks:
+            executor.submit(evaluation_task)
+        return executor.get_results()
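The new `StreamingExecutor` replaces the previous submit-all-then-collect loop in `execute`. A minimal usage sketch mirroring the new `execute()` body above; the lambda tasks are stand-ins for real `EvaluationTask[T]` callables and are illustrative only:

```python
from opik.evaluation.engine.evaluation_tasks_executor import StreamingExecutor

# Zero-argument callables standing in for EvaluationTask[T] objects.
tasks = [lambda i=i: i * i for i in range(10)]

# Entering the context starts a thread pool and a tqdm progress bar;
# submit() schedules each task immediately; get_results() collects
# results as futures complete (completion order, not submission order).
with StreamingExecutor[int](
    workers=4, verbose=1, desc="Evaluation", total=len(tasks)
) as executor:
    for task in tasks:
        executor.submit(task)
    results = executor.get_results()

print(sorted(results))
```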
opik/evaluation/evaluator.py
CHANGED
@@ -88,6 +88,7 @@ def evaluate(
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+    experiment_tags: Optional[List[str]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
@@ -156,6 +157,8 @@ def evaluate(
             Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
             These scores are computed after all test results are collected and represent aggregate
             metrics across the entire experiment.
+
+        experiment_tags: Optional list of tags to associate with the experiment.
     """
     experiment_scoring_functions = (
         [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -178,6 +181,7 @@ def evaluate(
         dataset_name=dataset.name,
         experiment_config=experiment_config,
         prompts=checked_prompts,
+        tags=experiment_tags,
     )

     # wrap scoring functions if any
@@ -506,6 +510,7 @@ def evaluate_prompt(
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+    experiment_tags: Optional[List[str]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs prompt evaluation on a given dataset.
@@ -556,6 +561,8 @@ def evaluate_prompt(
             Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
             These scores are computed after all test results are collected and represent aggregate
             metrics across the entire experiment.
+
+        experiment_tags: List of tags to be associated with the experiment.
     """
     experiment_scoring_functions = (
         [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -593,6 +600,7 @@ def evaluate_prompt(
         dataset_name=dataset.name,
         experiment_config=experiment_config,
         prompts=prompts,
+        tags=experiment_tags,
     )

     # wrap scoring functions if any
@@ -691,6 +699,7 @@ def evaluate_optimization_trial(
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+    experiment_tags: Optional[List[str]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset.
@@ -758,6 +767,8 @@ def evaluate_optimization_trial(
             Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
             These scores are computed after all test results are collected and represent aggregate
             metrics across the entire experiment.
+
+        experiment_tags: A list of tags to associate with the experiment.
     """
     experiment_scoring_functions = (
         [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -792,6 +803,7 @@ def evaluate_optimization_trial(
         prompts=checked_prompts,
         type="trial",
         optimization_id=optimization_id,
+        tags=experiment_tags,
     )

     return _evaluate_task(
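The user-visible change in `evaluator.py` is the new optional `experiment_tags` argument on `evaluate`, `evaluate_prompt`, and `evaluate_optimization_trial`, forwarded as `tags=` when the experiment is created. A hedged sketch of how it would be passed; the dataset name, task body, and metric choice are placeholders, not from the diff:

```python
import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals

client = opik.Opik()
dataset = client.get_dataset(name="my-dataset")  # assumes this dataset already exists

def task(item: dict) -> dict:
    # Placeholder task: echo the item's reference so the example is self-contained.
    return {"output": item.get("reference", "")}

result = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Equals()],
    experiment_tags=["nightly", "prompt-v2"],  # new parameter per the diff above
)
```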
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py
CHANGED
@@ -93,7 +93,9 @@ class ConversationalCoherenceMetric(ConversationThreadMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
+            self._model = models_factory.get(
+                model_name=model, track=self.track, temperature=temperature
+            )

     def score(
         self,
opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py
CHANGED
@@ -80,7 +80,9 @@ class SessionCompletenessQuality(ConversationThreadMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
+            self._model = models_factory.get(
+                model_name=model, track=self.track, temperature=temperature
+            )

     def score(
         self,
opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py
CHANGED
@@ -92,7 +92,9 @@ class UserFrustrationMetric(ConversationThreadMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
+            self._model = models_factory.get(
+                model_name=model, track=self.track, temperature=temperature
+            )

     def score(
         self,
opik/evaluation/metrics/heuristics/equals.py
CHANGED
@@ -42,22 +42,26 @@ class Equals(base_metric.BaseMetric):
         self._case_sensitive = case_sensitive

     def score(
-        self, output:
+        self, output: Any, reference: Any, **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
         """
-        Calculate the score based on whether the output
+        Calculate the score based on whether the output exactly matches the expected output.

         Args:
-            output: The output
-            reference: The expected output
+            output: The output to check. Will be converted to string for comparison.
+            reference: The expected output to compare against. Will be converted to string for comparison.
             **ignored_kwargs: Additional keyword arguments that are ignored.

         Returns:
-            score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the
+            score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the values match,
                 0.0 otherwise.
         """
-
-
+        # Convert to string to handle numeric and other types
+        output_str = str(output)
+        reference_str = str(reference)
+
+        value_left = output_str if self._case_sensitive else output_str.lower()
+        value_right = reference_str if self._case_sensitive else reference_str.lower()

         if value_left == value_right:
             return score_result.ScoreResult(value=1.0, name=self.name)
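`Equals` now coerces both values to `str` before comparing, so numeric outputs no longer fail against string references. A small illustration; the expected scores follow from the new code above and are not separately verified:

```python
from opik.evaluation.metrics import Equals

strict = Equals(case_sensitive=True)
relaxed = Equals(case_sensitive=False)

print(strict.score(output=42, reference="42").value)           # 1.0 - both sides become "42"
print(strict.score(output="Hello", reference="hello").value)   # 0.0 - case differs
print(relaxed.score(output="Hello", reference="hello").value)  # 1.0 - lowercased before comparing
```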
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
CHANGED
@@ -88,7 +88,9 @@ class AnswerRelevance(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def _init_few_shot_examples(
         self,
opik/evaluation/metrics/llm_judges/context_precision/metric.py
CHANGED
@@ -76,7 +76,9 @@ class ContextPrecision(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/context_recall/metric.py
CHANGED
@@ -74,7 +74,9 @@ class ContextRecall(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/factuality/metric.py
CHANGED
@@ -63,7 +63,7 @@ class Factuality(base_metric.BaseMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            self._model = models_factory.get(model_name=model, track=self.track)

     def score(
         self, input: str, output: str, context: List[str], **ignored_kwargs: Any
opik/evaluation/metrics/llm_judges/g_eval/metric.py
CHANGED
@@ -127,7 +127,9 @@ class GEval(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

         if (
             hasattr(self._model, "supported_params")
opik/evaluation/metrics/llm_judges/hallucination/metric.py
CHANGED
@@ -73,7 +73,9 @@ class Hallucination(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/moderation/metric.py
CHANGED
@@ -70,7 +70,9 @@ class Moderation(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
         """
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py
CHANGED
@@ -69,7 +69,9 @@ class StructuredOutputCompliance(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/syc_eval/metric.py
CHANGED
@@ -93,7 +93,7 @@ class SycEval(base_metric.BaseMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            self._model = models_factory.get(model_name=model, track=self.track)

     def _init_rebuttal_model(
         self, rebuttal_model: Optional[Union[str, base_model.OpikBaseModel]]
@@ -101,7 +101,9 @@ class SycEval(base_metric.BaseMetric):
         if isinstance(rebuttal_model, base_model.OpikBaseModel):
             self._rebuttal_model = rebuttal_model
         else:
-            self._rebuttal_model = models_factory.get(
+            self._rebuttal_model = models_factory.get(
+                model_name=rebuttal_model, track=self.track
+            )

     def score(
         self,
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py
CHANGED
@@ -84,7 +84,9 @@ class TrajectoryAccuracy(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/usefulness/metric.py
CHANGED
@@ -68,7 +68,9 @@ class Usefulness(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self, input: str, output: str, **ignored_kwargs: Any
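All of the LLM-judge metrics above now forward their own `track` flag into `models_factory.get`, so disabling tracking on a metric also disables tracing of its judge-model calls (see the `models_factory.py` and `litellm_chat_model.py` hunks further down). A hedged sketch; constructor arguments other than `model` and `track` are assumed from the existing metric APIs:

```python
from opik.evaluation.metrics import Hallucination

# With track=False the metric's judge LLM calls are no longer wrapped by the
# LiteLLM tracking integration, instead of inheriting the global behavior.
metric = Hallucination(model="gpt-4o-mini", track=False)
result = metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris.",
)
print(result.value, result.reason)
```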
opik/evaluation/metrics/ragas_metric.py
CHANGED
@@ -1,20 +1,13 @@
-import asyncio
-
 from opik.evaluation.metrics import base_metric, score_result
 import opik.exceptions as exceptions

 from typing import Dict, Any, Optional, TYPE_CHECKING
+import opik.opik_context as opik_context

 if TYPE_CHECKING:
     from ragas import metrics as ragas_metrics
     from ragas import dataset_schema as ragas_dataset_schema
-
-
-def get_or_create_asyncio_loop() -> asyncio.AbstractEventLoop:
-    try:
-        return asyncio.get_running_loop()
-    except RuntimeError:
-        return asyncio.new_event_loop()
+    from opik.integrations.langchain import OpikTracer


 class RagasMetricWrapper(base_metric.BaseMetric):
@@ -37,16 +30,6 @@ class RagasMetricWrapper(base_metric.BaseMetric):
             ragas_metrics.MetricType.SINGLE_TURN.name
         ]

-        self._opik_tracer = None
-        if self.track:
-            from opik.integrations.langchain import OpikTracer
-
-            self._opik_tracer = OpikTracer()
-
-            self.callbacks = [self._opik_tracer]
-        else:
-            self.callbacks = []
-
     def _create_ragas_single_turn_sample(
         self, input_dict: Dict[str, Any]
     ) -> "ragas_dataset_schema.SingleTurnSample":
@@ -80,13 +63,50 @@ class RagasMetricWrapper(base_metric.BaseMetric):
     async def ascore(self, **kwargs: Any) -> score_result.ScoreResult:
         sample = self._create_ragas_single_turn_sample(kwargs)

-
-
-        )
+        callbacks = [_get_opik_tracer_instance()] if self.track else []
+
+        score = await self.ragas_metric.single_turn_ascore(sample, callbacks=callbacks)
         return score_result.ScoreResult(value=score, name=self.name)

     def score(self, **kwargs: Any) -> score_result.ScoreResult:
         sample = self._create_ragas_single_turn_sample(kwargs)

-
+        callbacks = [_get_opik_tracer_instance()] if self.track else []
+
+        score = self.ragas_metric.single_turn_score(sample, callbacks=callbacks)
         return score_result.ScoreResult(value=score, name=self.name)
+
+
+def _get_opik_tracer_instance() -> "OpikTracer":
+    from opik.integrations.langchain import OpikTracer
+
+    current_span_data = opik_context.get_current_span_data()
+    current_trace_data = opik_context.get_current_trace_data()
+    project_name = None
+
+    if current_span_data is not None:
+        project_name = (
+            current_trace_data.project_name
+            if current_trace_data is not None
+            else current_span_data.project_name
+        )
+
+    # OPIK-3505: Why opik_context_read_only_mode=True?
+    #
+    # Problem: Ragas runs metrics concurrently under the hood with a manual management
+    # of the event loop. It was discovered that these metrics share the same context and so
+    # ContextVar used in Opik context storage can't be modified safely by them because concurrent
+    # operations share the same span stack.
+    #
+    # Solution: Disable context modification (opik_context_read_only_mode=True).
+    # OpikTracer will still create spans/traces and track parent-child relationships
+    # using LangChain's Run IDs, but won't modify the shared ContextVar storage.
+    #
+    # Trade-off: @track-decorated functions called within Ragas won't be attached
+    # to the Ragas spans. This is acceptable since Ragas metrics are self-contained
+    # and don't typically call user-defined tracked functions.
+    opik_tracer = OpikTracer(
+        opik_context_read_only_mode=True,
+        project_name=project_name,
+    )
+    return opik_tracer
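The wrapper no longer caches an `OpikTracer` on the instance; each `score`/`ascore` call builds a fresh tracer in `opik_context_read_only_mode=True` and inherits the project name from the current span or trace. A hedged usage sketch: the wrapper's constructor arguments and the Ragas field names below are assumptions, and the wrapped Ragas metric still needs an evaluator LLM configured to actually produce a score:

```python
from ragas.metrics import AnswerRelevancy

from opik.evaluation.metrics.ragas_metric import RagasMetricWrapper

# Assumed constructor: the wrapper stores the ragas metric and a track flag.
metric = RagasMetricWrapper(ragas_metric=AnswerRelevancy(), track=True)

# Keyword names depend on the wrapped metric's required columns (assumed here).
result = metric.score(
    user_input="What is the capital of France?",
    response="Paris is the capital of France.",
    retrieved_contexts=["Paris is the capital and largest city of France."],
)
print(result.value)  # traced via a per-call OpikTracer(opik_context_read_only_mode=True)
```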
opik/evaluation/models/litellm/litellm_chat_model.py
CHANGED
@@ -59,6 +59,7 @@ class LiteLLMChatModel(base_model.OpikBaseModel):
         self,
         model_name: str = "gpt-5-nano",
         must_support_arguments: Optional[List[str]] = None,
+        track: bool = True,
         **completion_kwargs: Any,
     ) -> None:
         import litellm
@@ -75,7 +76,8 @@ class LiteLLMChatModel(base_model.OpikBaseModel):
                 `litellm.get_supported_openai_params(model_name)` call is used to get
                 supported arguments. If any is missing, ValueError is raised.
                 You can pass the arguments from the table: https://docs.litellm.ai/docs/completion/input#translated-openai-params
-
+            track: Whether to track the model calls. When False, disables tracing for this model instance.
+                Defaults to True.
             completion_kwargs: key-value arguments to always pass additionally into `litellm.completion` function.
         """
         super().__init__(model_name=model_name)
@@ -100,7 +102,10 @@ class LiteLLMChatModel(base_model.OpikBaseModel):

         config = opik_config.OpikConfig()

-        if config
+        # Enable tracking only if both track parameter is True and config allows it
+        enable_tracking = track and config.enable_litellm_models_monitoring
+
+        if enable_tracking:
             self._litellm_completion = litellm_integration.track_completion()(
                 litellm.completion
             )
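`LiteLLMChatModel` gains a `track` constructor flag; tracking now requires both `track=True` and the `enable_litellm_models_monitoring` config setting. A minimal sketch, assuming the documented `LiteLLMChatModel` import path and `generate_string` interface:

```python
from opik.evaluation.models import LiteLLMChatModel

# track=False skips wrapping litellm.completion with the Opik tracking decorator,
# regardless of the enable_litellm_models_monitoring config flag.
untracked_model = LiteLLMChatModel(model_name="gpt-4o-mini", track=False, temperature=0.0)
answer = untracked_model.generate_string(input="Reply with the single word: pong")
print(answer)
```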
opik/evaluation/models/litellm/util.py
CHANGED
@@ -93,30 +93,14 @@ def _apply_qwen_dashscope_filters(
 ) -> None:
     """Apply Qwen/DashScope specific parameter filters.

-
-    in [0, 5]. When logprobs is false, drops top_logprobs; when logprobs is
-    true, clamps top_logprobs into [0, 5].
+    Does not return log probabilities.
     """

     unsupported: list[tuple[str, Any]] = []

-
-
-
-        unsupported.append(("top_logprobs", params["top_logprobs"]))
-    else:
-        if "top_logprobs" in params:
-            raw_top_logprobs = params["top_logprobs"]
-            try:
-                top_logprobs = int(raw_top_logprobs)
-            except (TypeError, ValueError):
-                unsupported.append(("top_logprobs", raw_top_logprobs))
-            else:
-                if top_logprobs < 0:
-                    top_logprobs = 0
-                elif top_logprobs > 5:
-                    top_logprobs = 5
-                params["top_logprobs"] = top_logprobs
+    for param in ("logprobs", "top_logprobs"):
+        if param in params:
+            unsupported.append((param, params[param]))

     _drop_unsupported_params_with_warning(
         params,
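The Qwen/DashScope filter no longer tries to clamp `top_logprobs`; both `logprobs` and `top_logprobs` are now simply collected and dropped with a warning. A standalone sketch of the new behavior; `_drop_unsupported_params_with_warning` is an internal helper, so the re-implementation below is illustrative only:

```python
import warnings
from typing import Any, Dict

def drop_qwen_unsupported(params: Dict[str, Any]) -> None:
    """Illustrative stand-in for the simplified filter in litellm/util.py."""
    unsupported = []
    for param in ("logprobs", "top_logprobs"):
        if param in params:
            unsupported.append((param, params.pop(param)))
    if unsupported:
        warnings.warn(f"Dropping unsupported Qwen/DashScope params: {unsupported}")

params = {"temperature": 0.2, "logprobs": True, "top_logprobs": 3}
drop_qwen_unsupported(params)
print(params)  # {'temperature': 0.2}
```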
opik/evaluation/models/models_factory.py
CHANGED
@@ -18,18 +18,32 @@ def _freeze(value: Any) -> Any:
     return value


-def _make_cache_key(model_name: str, model_kwargs: Dict[str, Any]) -> Any:
+def _make_cache_key(model_name: str, track: bool, model_kwargs: Dict[str, Any]) -> Any:
     frozen_kwargs = frozenset((k, _freeze(v)) for k, v in model_kwargs.items())
-    return (model_name, frozen_kwargs)
+    return (model_name, track, frozen_kwargs)


-def get(
+def get(
+    model_name: Optional[str], track: bool = True, **model_kwargs: Any
+) -> base_model.OpikBaseModel:
+    """
+    Get or create a cached LiteLLM chat model instance.
+
+    Args:
+        model_name: The name of the model to use. Defaults to DEFAULT_GPT_MODEL_NAME if None.
+        track: Whether to track the model calls. When False, disables tracing for this model instance.
+            Defaults to True.
+        **model_kwargs: Additional keyword arguments to pass to the model constructor.
+
+    Returns:
+        A cached or newly created OpikBaseModel instance.
+    """
     if model_name is None:
         model_name = DEFAULT_GPT_MODEL_NAME

-    cache_key = _make_cache_key(model_name, model_kwargs)
+    cache_key = _make_cache_key(model_name, track, model_kwargs)
     if cache_key not in _MODEL_CACHE:
         _MODEL_CACHE[cache_key] = litellm_chat_model.LiteLLMChatModel(
-            model_name=model_name, **model_kwargs
+            model_name=model_name, track=track, **model_kwargs
         )
     return _MODEL_CACHE[cache_key]
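The model cache key now includes the `track` flag, so tracked and untracked instances of the same model no longer collide. A hedged sketch of calling the factory directly; this is an internal module that most users reach indirectly through metric constructors:

```python
from opik.evaluation.models import models_factory

tracked = models_factory.get(model_name="gpt-4o-mini", temperature=0.0)
untracked = models_factory.get(model_name="gpt-4o-mini", temperature=0.0, track=False)

# Same model name and kwargs, but separate cache entries because the key
# is now (model_name, track, frozen_kwargs).
assert tracked is not untracked
assert models_factory.get(model_name="gpt-4o-mini", temperature=0.0) is tracked
```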
opik/evaluation/rest_operations.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 from typing import List, Optional

 from opik.api_objects import dataset, experiment, opik_client
-from opik.types import
+from opik.types import BatchFeedbackScoreDict
 from . import test_case
 from .metrics import score_result
 from .types import ScoringKeyMappingType
@@ -80,13 +80,13 @@ def log_test_result_feedback_scores(
     trace_id: str,
     project_name: Optional[str],
 ) -> None:
-    all_trace_scores: List[
+    all_trace_scores: List[BatchFeedbackScoreDict] = []

     for score_result_ in score_results:
         if score_result_.scoring_failed:
             continue

-        trace_score =
+        trace_score = BatchFeedbackScoreDict(
             id=trace_id,
             name=score_result_.name,
             value=score_result_.value,
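`rest_operations.py` (above) and `threads/helpers.py` (below) now build feedback scores as `BatchFeedbackScoreDict` entries from `opik.types` in place of the previous type, which is truncated in this view. A small sketch of the shape being constructed, based only on the fields visible in these hunks; the id value is a placeholder:

```python
from opik.types import BatchFeedbackScoreDict

# Fields shown in the diff: id, name, value. The TypedDict may allow more.
score: BatchFeedbackScoreDict = BatchFeedbackScoreDict(
    id="trace-or-thread-id-placeholder",
    name="equals_metric",
    value=1.0,
)
print(score)
```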
opik/evaluation/threads/helpers.py
CHANGED
@@ -4,7 +4,7 @@ from . import evaluation_result
 from ...api_objects import opik_client
 from ...api_objects.conversation import conversation_thread, conversation_factory
 from ...rest_api import TraceThread, JsonListStringPublic
-from ...types import
+from ...types import BatchFeedbackScoreDict
 from ...api_objects.threads import threads_client


@@ -15,7 +15,7 @@ def log_feedback_scores(
 ) -> None:
     for result in results:
         feedback_scores = [
-
+            BatchFeedbackScoreDict(
                 id=result.thread_id,
                 name=score.name,
                 value=score.value,
@@ -42,6 +42,7 @@ def load_conversation_thread(
         project_name=project_name,
         filter_string=f'thread_id = "{thread.id}"',
         max_results=max_results,
+        truncate=False,
     )
     return conversation_factory.create_conversation_from_traces(
         traces=traces,