opik 1.9.41__py3-none-any.whl → 1.9.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. opik/api_objects/attachment/attachment_context.py +36 -0
  2. opik/api_objects/attachment/attachments_extractor.py +153 -0
  3. opik/api_objects/attachment/client.py +1 -0
  4. opik/api_objects/attachment/converters.py +2 -0
  5. opik/api_objects/attachment/decoder.py +18 -0
  6. opik/api_objects/attachment/decoder_base64.py +83 -0
  7. opik/api_objects/attachment/decoder_helpers.py +137 -0
  8. opik/api_objects/constants.py +2 -0
  9. opik/api_objects/dataset/dataset.py +133 -40
  10. opik/api_objects/dataset/rest_operations.py +2 -0
  11. opik/api_objects/experiment/experiment.py +6 -0
  12. opik/api_objects/helpers.py +8 -4
  13. opik/api_objects/local_recording.py +6 -5
  14. opik/api_objects/observation_data.py +101 -0
  15. opik/api_objects/opik_client.py +78 -45
  16. opik/api_objects/opik_query_language.py +9 -3
  17. opik/api_objects/prompt/chat/chat_prompt.py +18 -1
  18. opik/api_objects/prompt/client.py +8 -1
  19. opik/api_objects/span/span_data.py +3 -88
  20. opik/api_objects/threads/threads_client.py +7 -4
  21. opik/api_objects/trace/trace_data.py +3 -74
  22. opik/api_objects/validation_helpers.py +3 -3
  23. opik/cli/exports/__init__.py +131 -0
  24. opik/cli/exports/dataset.py +278 -0
  25. opik/cli/exports/experiment.py +784 -0
  26. opik/cli/exports/project.py +685 -0
  27. opik/cli/exports/prompt.py +578 -0
  28. opik/cli/exports/utils.py +406 -0
  29. opik/cli/harbor.py +39 -0
  30. opik/cli/imports/__init__.py +439 -0
  31. opik/cli/imports/dataset.py +143 -0
  32. opik/cli/imports/experiment.py +1192 -0
  33. opik/cli/imports/project.py +262 -0
  34. opik/cli/imports/prompt.py +177 -0
  35. opik/cli/imports/utils.py +280 -0
  36. opik/cli/main.py +14 -12
  37. opik/config.py +12 -1
  38. opik/datetime_helpers.py +12 -0
  39. opik/decorator/arguments_helpers.py +4 -1
  40. opik/decorator/base_track_decorator.py +111 -37
  41. opik/decorator/context_manager/span_context_manager.py +5 -1
  42. opik/decorator/generator_wrappers.py +5 -4
  43. opik/decorator/span_creation_handler.py +13 -4
  44. opik/evaluation/engine/engine.py +111 -28
  45. opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
  46. opik/evaluation/evaluator.py +12 -0
  47. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
  48. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
  49. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
  50. opik/evaluation/metrics/heuristics/equals.py +11 -7
  51. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
  52. opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
  53. opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
  54. opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
  55. opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
  56. opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
  57. opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
  58. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
  59. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
  60. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
  61. opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
  62. opik/evaluation/metrics/ragas_metric.py +43 -23
  63. opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
  64. opik/evaluation/models/litellm/util.py +4 -20
  65. opik/evaluation/models/models_factory.py +19 -5
  66. opik/evaluation/rest_operations.py +3 -3
  67. opik/evaluation/threads/helpers.py +3 -2
  68. opik/file_upload/file_uploader.py +13 -0
  69. opik/file_upload/upload_options.py +2 -0
  70. opik/integrations/adk/legacy_opik_tracer.py +9 -11
  71. opik/integrations/adk/opik_tracer.py +2 -2
  72. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
  73. opik/integrations/dspy/callback.py +100 -14
  74. opik/integrations/dspy/parsers.py +168 -0
  75. opik/integrations/harbor/__init__.py +17 -0
  76. opik/integrations/harbor/experiment_service.py +269 -0
  77. opik/integrations/harbor/opik_tracker.py +528 -0
  78. opik/integrations/haystack/opik_tracer.py +2 -2
  79. opik/integrations/langchain/__init__.py +15 -2
  80. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  81. opik/integrations/langchain/opik_tracer.py +258 -160
  82. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
  83. opik/integrations/llama_index/callback.py +43 -6
  84. opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
  85. opik/integrations/openai/opik_tracker.py +99 -4
  86. opik/integrations/openai/videos/__init__.py +9 -0
  87. opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
  88. opik/integrations/openai/videos/videos_create_decorator.py +159 -0
  89. opik/integrations/openai/videos/videos_download_decorator.py +110 -0
  90. opik/message_processing/batching/base_batcher.py +14 -21
  91. opik/message_processing/batching/batch_manager.py +22 -10
  92. opik/message_processing/batching/batchers.py +32 -40
  93. opik/message_processing/batching/flushing_thread.py +0 -3
  94. opik/message_processing/emulation/emulator_message_processor.py +36 -1
  95. opik/message_processing/emulation/models.py +21 -0
  96. opik/message_processing/messages.py +9 -0
  97. opik/message_processing/preprocessing/__init__.py +0 -0
  98. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  99. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  100. opik/message_processing/preprocessing/constants.py +1 -0
  101. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  102. opik/message_processing/preprocessing/preprocessor.py +36 -0
  103. opik/message_processing/processors/__init__.py +0 -0
  104. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  105. opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
  106. opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
  107. opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
  108. opik/message_processing/queue_consumer.py +4 -2
  109. opik/message_processing/streamer.py +71 -33
  110. opik/message_processing/streamer_constructors.py +36 -8
  111. opik/plugins/pytest/experiment_runner.py +1 -1
  112. opik/plugins/pytest/hooks.py +5 -3
  113. opik/rest_api/__init__.py +38 -0
  114. opik/rest_api/datasets/client.py +249 -148
  115. opik/rest_api/datasets/raw_client.py +356 -217
  116. opik/rest_api/experiments/client.py +26 -0
  117. opik/rest_api/experiments/raw_client.py +26 -0
  118. opik/rest_api/llm_provider_key/client.py +4 -4
  119. opik/rest_api/llm_provider_key/raw_client.py +4 -4
  120. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
  121. opik/rest_api/manual_evaluation/client.py +101 -0
  122. opik/rest_api/manual_evaluation/raw_client.py +172 -0
  123. opik/rest_api/optimizations/client.py +0 -166
  124. opik/rest_api/optimizations/raw_client.py +0 -248
  125. opik/rest_api/projects/client.py +9 -0
  126. opik/rest_api/projects/raw_client.py +13 -0
  127. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
  128. opik/rest_api/prompts/client.py +130 -2
  129. opik/rest_api/prompts/raw_client.py +175 -0
  130. opik/rest_api/traces/client.py +101 -0
  131. opik/rest_api/traces/raw_client.py +120 -0
  132. opik/rest_api/types/__init__.py +46 -0
  133. opik/rest_api/types/audio_url.py +19 -0
  134. opik/rest_api/types/audio_url_public.py +19 -0
  135. opik/rest_api/types/audio_url_write.py +19 -0
  136. opik/rest_api/types/automation_rule_evaluator.py +38 -2
  137. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
  138. opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
  139. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  140. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  141. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  142. opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
  143. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  144. opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
  145. opik/rest_api/types/dataset_item.py +1 -1
  146. opik/rest_api/types/dataset_item_batch.py +4 -0
  147. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  148. opik/rest_api/types/dataset_item_compare.py +1 -1
  149. opik/rest_api/types/dataset_item_filter.py +4 -0
  150. opik/rest_api/types/dataset_item_page_compare.py +0 -1
  151. opik/rest_api/types/dataset_item_page_public.py +0 -1
  152. opik/rest_api/types/dataset_item_public.py +1 -1
  153. opik/rest_api/types/dataset_version_public.py +5 -0
  154. opik/rest_api/types/dataset_version_summary.py +5 -0
  155. opik/rest_api/types/dataset_version_summary_public.py +5 -0
  156. opik/rest_api/types/experiment.py +9 -0
  157. opik/rest_api/types/experiment_public.py +9 -0
  158. opik/rest_api/types/llm_as_judge_message_content.py +2 -0
  159. opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
  160. opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
  161. opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
  162. opik/rest_api/types/project.py +1 -0
  163. opik/rest_api/types/project_detailed.py +1 -0
  164. opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
  165. opik/rest_api/types/project_reference.py +31 -0
  166. opik/rest_api/types/project_reference_public.py +31 -0
  167. opik/rest_api/types/project_stats_summary_item.py +1 -0
  168. opik/rest_api/types/prompt_version.py +1 -0
  169. opik/rest_api/types/prompt_version_detail.py +1 -0
  170. opik/rest_api/types/prompt_version_page_public.py +5 -0
  171. opik/rest_api/types/prompt_version_public.py +1 -0
  172. opik/rest_api/types/prompt_version_update.py +33 -0
  173. opik/rest_api/types/provider_api_key.py +5 -1
  174. opik/rest_api/types/provider_api_key_provider.py +2 -1
  175. opik/rest_api/types/provider_api_key_public.py +5 -1
  176. opik/rest_api/types/provider_api_key_public_provider.py +2 -1
  177. opik/rest_api/types/service_toggles_config.py +11 -1
  178. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  179. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  180. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  181. opik/types.py +36 -0
  182. opik/validation/chat_prompt_messages.py +241 -0
  183. opik/validation/feedback_score.py +3 -3
  184. opik/validation/validator.py +28 -0
  185. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/METADATA +5 -5
  186. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/RECORD +190 -141
  187. opik/cli/export.py +0 -791
  188. opik/cli/import_command.py +0 -575
  189. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
  190. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
  191. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
  192. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
  from concurrent import futures
- from typing import List, TypeVar
+ from typing import Any, List, Optional, TypeVar, Generic

  from ...environment import get_tqdm_for_current_environment
  from .types import EvaluationTask
@@ -9,6 +9,70 @@ _tqdm = get_tqdm_for_current_environment()
  T = TypeVar("T")


+ class StreamingExecutor(Generic[T]):
+     """
+     Executor that accepts and processes evaluation tasks incrementally using a thread pool.
+
+     Tasks can be submitted one at a time and will begin executing immediately, allowing
+     for streaming behavior regardless of the number of workers configured.
+     """
+
+     def __init__(
+         self,
+         workers: int,
+         verbose: int,
+         desc: str = "Evaluation",
+         total: Optional[int] = None,
+     ):
+         self._workers = workers
+         self._verbose = verbose
+         self._desc = desc
+         self._total = total
+         self._task_count = 0
+         self._pool: futures.ThreadPoolExecutor
+         self._submitted_futures: List[futures.Future[T]] = []
+         self._progress_bar: Optional[Any] = None
+
+     def __enter__(self) -> "StreamingExecutor[T]":
+         self._pool = futures.ThreadPoolExecutor(max_workers=self._workers)
+         self._pool.__enter__()
+         # Initialize progress bar on enter
+         self._progress_bar = _tqdm(
+             disable=(self._verbose < 1),
+             desc=self._desc,
+             total=self._total,
+         )
+         return self
+
+     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         # Close progress bar if it exists
+         if self._progress_bar is not None:
+             self._progress_bar.close()
+         self._pool.__exit__(exc_type, exc_val, exc_tb)
+
+     def submit(self, task: EvaluationTask[T]) -> None:
+         """Submit a task to the thread pool for execution."""
+         self._task_count += 1
+         future = self._pool.submit(task)
+         self._submitted_futures.append(future)
+
+     def get_results(self) -> List[T]:
+         """Collect results from futures as they complete with progress bar."""
+         results: List[T] = []
+
+         # Update total if it wasn't known initially
+         if self._progress_bar is not None and self._total is None:
+             self._progress_bar.total = self._task_count
+
+         # Process futures as they complete and update progress bar
+         for future in futures.as_completed(self._submitted_futures):
+             results.append(future.result())
+             if self._progress_bar is not None:
+                 self._progress_bar.update(1)
+
+         return results
+
+
  def execute(
      evaluation_tasks: List[EvaluationTask[T]],
      workers: int,
@@ -28,21 +92,9 @@ def execute(

          return test_results

-     with futures.ThreadPoolExecutor(max_workers=workers) as pool:
-         test_result_futures = [
-             pool.submit(evaluation_task) for evaluation_task in evaluation_tasks
-         ]
-
-         test_results = [
-             test_result_future.result()
-             for test_result_future in _tqdm(
-                 futures.as_completed(
-                     test_result_futures,
-                 ),
-                 disable=(verbose < 1),
-                 desc=desc,
-                 total=len(test_result_futures),
-             )
-         ]
-
-     return test_results
+     with StreamingExecutor[T](
+         workers=workers, verbose=verbose, desc=desc, total=len(evaluation_tasks)
+     ) as executor:
+         for evaluation_task in evaluation_tasks:
+             executor.submit(evaluation_task)
+         return executor.get_results()
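The hunks above (opik/evaluation/engine/evaluation_tasks_executor.py in the file list) replace the one-shot ThreadPoolExecutor block in execute() with the new StreamingExecutor, which accepts tasks incrementally and reports progress as futures complete. A minimal usage sketch, assuming the class can be imported from that internal module path; the squaring task below is a hypothetical stand-in for a real EvaluationTask:

# Hedged sketch: exercising StreamingExecutor as defined in the hunk above.
from opik.evaluation.engine.evaluation_tasks_executor import StreamingExecutor

def make_task(i: int):
    def task() -> int:
        # Stand-in for an EvaluationTask[int]: any zero-argument callable works,
        # because submit() hands the task straight to ThreadPoolExecutor.submit.
        return i * i
    return task

with StreamingExecutor[int](workers=4, verbose=0) as executor:
    for i in range(10):
        executor.submit(make_task(i))   # tasks start running as soon as they are submitted
    results = executor.get_results()    # collected in completion order, one progress tick per future

print(sorted(results))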
@@ -88,6 +88,7 @@ def evaluate(
      dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
      trial_count: int = 1,
      experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+     experiment_tags: Optional[List[str]] = None,
  ) -> evaluation_result.EvaluationResult:
      """
      Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
@@ -156,6 +157,8 @@
          Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
              These scores are computed after all test results are collected and represent aggregate
              metrics across the entire experiment.
+
+         experiment_tags: Optional list of tags to associate with the experiment.
      """
      experiment_scoring_functions = (
          [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -178,6 +181,7 @@
          dataset_name=dataset.name,
          experiment_config=experiment_config,
          prompts=checked_prompts,
+         tags=experiment_tags,
      )

      # wrap scoring functions if any
@@ -506,6 +510,7 @@ def evaluate_prompt(
      dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
      trial_count: int = 1,
      experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+     experiment_tags: Optional[List[str]] = None,
  ) -> evaluation_result.EvaluationResult:
      """
      Performs prompt evaluation on a given dataset.
@@ -556,6 +561,8 @@
          Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
              These scores are computed after all test results are collected and represent aggregate
              metrics across the entire experiment.
+
+         experiment_tags: List of tags to be associated with the experiment.
      """
      experiment_scoring_functions = (
          [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -593,6 +600,7 @@
          dataset_name=dataset.name,
          experiment_config=experiment_config,
          prompts=prompts,
+         tags=experiment_tags,
      )

      # wrap scoring functions if any
@@ -691,6 +699,7 @@ def evaluate_optimization_trial(
      dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
      trial_count: int = 1,
      experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+     experiment_tags: Optional[List[str]] = None,
  ) -> evaluation_result.EvaluationResult:
      """
      Performs task evaluation on a given dataset.
@@ -758,6 +767,8 @@
          Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
              These scores are computed after all test results are collected and represent aggregate
              metrics across the entire experiment.
+
+         experiment_tags: A list of tags to associate with the experiment.
      """
      experiment_scoring_functions = (
          [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -792,6 +803,7 @@
          prompts=checked_prompts,
          type="trial",
          optimization_id=optimization_id,
+         tags=experiment_tags,
      )

      return _evaluate_task(
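These hunks (opik/evaluation/evaluator.py) thread a new optional experiment_tags argument from evaluate(), evaluate_prompt(), and evaluate_optimization_trial() into experiment creation (tags=experiment_tags). A hedged sketch of a caller passing it; the dataset contents, task body, and metric choice are illustrative, and the surrounding evaluate() arguments follow the public API as I understand it:

# Hedged sketch: tagging the experiment created by evaluate().
import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals

client = opik.Opik()
dataset = client.get_or_create_dataset(name="demo-dataset")
dataset.insert([{"input": "2 + 2 = ?", "reference": "4"}])

def task(item):
    # Stand-in for a real model call; returns the key the metric expects.
    return {"output": "4"}

result = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Equals()],
    experiment_tags=["nightly", "docs-example"],  # new in 1.9.86 per the hunks above
)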
@@ -93,7 +93,9 @@ class ConversationalCoherenceMetric(ConversationThreadMetric):
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model, temperature=temperature)
+             self._model = models_factory.get(
+                 model_name=model, track=self.track, temperature=temperature
+             )

      def score(
          self,
@@ -80,7 +80,9 @@ class SessionCompletenessQuality(ConversationThreadMetric):
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model, temperature=temperature)
+             self._model = models_factory.get(
+                 model_name=model, track=self.track, temperature=temperature
+             )

      def score(
          self,
@@ -92,7 +92,9 @@ class UserFrustrationMetric(ConversationThreadMetric):
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model, temperature=temperature)
+             self._model = models_factory.get(
+                 model_name=model, track=self.track, temperature=temperature
+             )

      def score(
          self,
@@ -42,22 +42,26 @@ class Equals(base_metric.BaseMetric):
          self._case_sensitive = case_sensitive

      def score(
-         self, output: str, reference: str, **ignored_kwargs: Any
+         self, output: Any, reference: Any, **ignored_kwargs: Any
      ) -> score_result.ScoreResult:
          """
-         Calculate the score based on whether the output string exactly matches the expected output.
+         Calculate the score based on whether the output exactly matches the expected output.

          Args:
-             output: The output string to check.
-             reference: The expected output string to compare against.
+             output: The output to check. Will be converted to string for comparison.
+             reference: The expected output to compare against. Will be converted to string for comparison.
              **ignored_kwargs: Additional keyword arguments that are ignored.

          Returns:
-             score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the strings match,
+             score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the values match,
              0.0 otherwise.
          """
-         value_left = output if self._case_sensitive else output.lower()
-         value_right = reference if self._case_sensitive else reference.lower()
+         # Convert to string to handle numeric and other types
+         output_str = str(output)
+         reference_str = str(reference)
+
+         value_left = output_str if self._case_sensitive else output_str.lower()
+         value_right = reference_str if self._case_sensitive else reference_str.lower()

          if value_left == value_right:
              return score_result.ScoreResult(value=1.0, name=self.name)
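The Equals hunk above widens score() from str to Any and stringifies both values before comparing. A short illustration of the resulting behavior:

# Hedged sketch: non-string inputs are now stringified before comparison.
from opik.evaluation.metrics import Equals

metric = Equals(case_sensitive=False)
print(metric.score(output=42, reference="42").value)        # 1.0 - str(42) == "42"
print(metric.score(output="Yes", reference="yes").value)    # 1.0 - lowercased when case_sensitive=False
print(metric.score(output=3.14, reference="3.1400").value)  # 0.0 - "3.14" != "3.1400"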
@@ -88,7 +88,9 @@ class AnswerRelevance(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def _init_few_shot_examples(
          self,
@@ -76,7 +76,9 @@ class ContextPrecision(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(
          self,
@@ -74,7 +74,9 @@ class ContextRecall(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(
          self,
@@ -63,7 +63,7 @@ class Factuality(base_metric.BaseMetric):
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model)
+             self._model = models_factory.get(model_name=model, track=self.track)

      def score(
          self, input: str, output: str, context: List[str], **ignored_kwargs: Any
@@ -127,7 +127,9 @@ class GEval(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

          if (
              hasattr(self._model, "supported_params")
@@ -73,7 +73,9 @@ class Hallucination(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(
          self,
@@ -70,7 +70,9 @@ class Moderation(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
          """
@@ -69,7 +69,9 @@ class StructuredOutputCompliance(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(
          self,
@@ -93,7 +93,7 @@ class SycEval(base_metric.BaseMetric):
          if isinstance(model, base_model.OpikBaseModel):
              self._model = model
          else:
-             self._model = models_factory.get(model_name=model)
+             self._model = models_factory.get(model_name=model, track=self.track)

      def _init_rebuttal_model(
          self, rebuttal_model: Optional[Union[str, base_model.OpikBaseModel]]
@@ -101,7 +101,9 @@ class SycEval(base_metric.BaseMetric):
          if isinstance(rebuttal_model, base_model.OpikBaseModel):
              self._rebuttal_model = rebuttal_model
          else:
-             self._rebuttal_model = models_factory.get(model_name=rebuttal_model)
+             self._rebuttal_model = models_factory.get(
+                 model_name=rebuttal_model, track=self.track
+             )

      def score(
          self,
@@ -84,7 +84,9 @@ class TrajectoryAccuracy(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(
          self,
@@ -68,7 +68,9 @@ class Usefulness(base_metric.BaseMetric):
          if self._seed is not None:
              model_kwargs["seed"] = self._seed

-         self._model = models_factory.get(model_name=model, **model_kwargs)
+         self._model = models_factory.get(
+             model_name=model, track=self.track, **model_kwargs
+         )

      def score(
          self, input: str, output: str, **ignored_kwargs: Any
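Every metric hunk above (the conversation-thread metrics as well as the LLM-judge metrics) now forwards the metric's own track flag into models_factory.get, so a metric built with track=False no longer traces its underlying LiteLLM judge calls either. A hedged sketch of the intended effect; the model name and inputs are placeholders, and an LLM provider key is assumed to be configured:

# Hedged sketch: track=False now propagates to the underlying judge model.
from opik.evaluation.metrics import Hallucination

metric = Hallucination(model="gpt-4o-mini", track=False)
result = metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris.",
)
print(result.value, result.reason)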
@@ -1,20 +1,13 @@
- import asyncio
-
  from opik.evaluation.metrics import base_metric, score_result
  import opik.exceptions as exceptions

  from typing import Dict, Any, Optional, TYPE_CHECKING
+ import opik.opik_context as opik_context

  if TYPE_CHECKING:
      from ragas import metrics as ragas_metrics
      from ragas import dataset_schema as ragas_dataset_schema
-
-
- def get_or_create_asyncio_loop() -> asyncio.AbstractEventLoop:
-     try:
-         return asyncio.get_running_loop()
-     except RuntimeError:
-         return asyncio.new_event_loop()
+     from opik.integrations.langchain import OpikTracer


  class RagasMetricWrapper(base_metric.BaseMetric):
@@ -37,16 +30,6 @@ class RagasMetricWrapper(base_metric.BaseMetric):
              ragas_metrics.MetricType.SINGLE_TURN.name
          ]

-         self._opik_tracer = None
-         if self.track:
-             from opik.integrations.langchain import OpikTracer
-
-             self._opik_tracer = OpikTracer()
-
-             self.callbacks = [self._opik_tracer]
-         else:
-             self.callbacks = []
-
      def _create_ragas_single_turn_sample(
          self, input_dict: Dict[str, Any]
      ) -> "ragas_dataset_schema.SingleTurnSample":
@@ -80,13 +63,50 @@ class RagasMetricWrapper(base_metric.BaseMetric):
      async def ascore(self, **kwargs: Any) -> score_result.ScoreResult:
          sample = self._create_ragas_single_turn_sample(kwargs)

-         score = await self.ragas_metric.single_turn_ascore(
-             sample, callbacks=self.callbacks
-         )
+         callbacks = [_get_opik_tracer_instance()] if self.track else []
+
+         score = await self.ragas_metric.single_turn_ascore(sample, callbacks=callbacks)
          return score_result.ScoreResult(value=score, name=self.name)

      def score(self, **kwargs: Any) -> score_result.ScoreResult:
          sample = self._create_ragas_single_turn_sample(kwargs)

-         score = self.ragas_metric.single_turn_score(sample, callbacks=self.callbacks)
+         callbacks = [_get_opik_tracer_instance()] if self.track else []
+
+         score = self.ragas_metric.single_turn_score(sample, callbacks=callbacks)
          return score_result.ScoreResult(value=score, name=self.name)
+
+
+ def _get_opik_tracer_instance() -> "OpikTracer":
+     from opik.integrations.langchain import OpikTracer
+
+     current_span_data = opik_context.get_current_span_data()
+     current_trace_data = opik_context.get_current_trace_data()
+     project_name = None
+
+     if current_span_data is not None:
+         project_name = (
+             current_trace_data.project_name
+             if current_trace_data is not None
+             else current_span_data.project_name
+         )
+
+     # OPIK-3505: Why opik_context_read_only_mode=True?
+     #
+     # Problem: Ragas runs metrics concurrently under the hood with a manual management
+     # of the event loop. It was discovered that these metrics share the same context and so
+     # ContextVar used in Opik context storage can't be modified safely by them because concurrent
+     # operations share the same span stack.
+     #
+     # Solution: Disable context modification (opik_context_read_only_mode=True).
+     # OpikTracer will still create spans/traces and track parent-child relationships
+     # using LangChain's Run IDs, but won't modify the shared ContextVar storage.
+     #
+     # Trade-off: @track-decorated functions called within Ragas won't be attached
+     # to the Ragas spans. This is acceptable since Ragas metrics are self-contained
+     # and don't typically call user-defined tracked functions.
+     opik_tracer = OpikTracer(
+         opik_context_read_only_mode=True,
+         project_name=project_name,
+     )
+     return opik_tracer
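The ragas_metric.py hunks drop the tracer that used to live on the wrapper instance and instead build a fresh OpikTracer per scoring call, with opik_context_read_only_mode=True so Ragas' concurrent execution cannot corrupt the shared ContextVar span stack (see the OPIK-3505 comment). The same pattern applies to any LangChain runnable; the invoke call below is illustrative, while the two OpikTracer keyword arguments are confirmed by the hunk itself:

# Hedged sketch: a read-only-context OpikTracer, as constructed in the hunk above,
# is an ordinary LangChain callback handler.
from opik.integrations.langchain import OpikTracer

tracer = OpikTracer(
    opik_context_read_only_mode=True,  # never mutate the shared span-stack ContextVar
    project_name="ragas-evaluation",   # placeholder project name
)

# e.g. some_chain.invoke({"question": "..."}, config={"callbacks": [tracer]})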
@@ -59,6 +59,7 @@ class LiteLLMChatModel(base_model.OpikBaseModel):
          self,
          model_name: str = "gpt-5-nano",
          must_support_arguments: Optional[List[str]] = None,
+         track: bool = True,
          **completion_kwargs: Any,
      ) -> None:
          import litellm
@@ -75,7 +76,8 @@ class LiteLLMChatModel(base_model.OpikBaseModel):
                  `litellm.get_supported_openai_params(model_name)` call is used to get
                  supported arguments. If any is missing, ValueError is raised.
                  You can pass the arguments from the table: https://docs.litellm.ai/docs/completion/input#translated-openai-params
-
+             track: Whether to track the model calls. When False, disables tracing for this model instance.
+                 Defaults to True.
              completion_kwargs: key-value arguments to always pass additionally into `litellm.completion` function.
          """
          super().__init__(model_name=model_name)
@@ -100,7 +102,10 @@ class LiteLLMChatModel(base_model.OpikBaseModel):

          config = opik_config.OpikConfig()

-         if config.enable_litellm_models_monitoring:
+         # Enable tracking only if both track parameter is True and config allows it
+         enable_tracking = track and config.enable_litellm_models_monitoring
+
+         if enable_tracking:
              self._litellm_completion = litellm_integration.track_completion()(
                  litellm.completion
              )
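LiteLLMChatModel gains a track flag, and the litellm.completion tracking wrapper is only installed when both the flag and the enable_litellm_models_monitoring config setting allow it. A hedged sketch; the module path comes from the file list, and generate_string is the OpikBaseModel method I believe this class exposes:

# Hedged sketch: an untracked judge model. track=False short-circuits the
# litellm tracking wrapper regardless of enable_litellm_models_monitoring.
from opik.evaluation.models.litellm.litellm_chat_model import LiteLLMChatModel

model = LiteLLMChatModel(model_name="gpt-4o-mini", track=False)
print(model.generate_string(input="Reply with the single word: pong"))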
@@ -93,30 +93,14 @@
  ) -> None:
      """Apply Qwen/DashScope specific parameter filters.

-     top_logprobs is only meaningful if logprobs is true and must be an int
-     in [0, 5]. When logprobs is false, drops top_logprobs; when logprobs is
-     true, clamps top_logprobs into [0, 5].
+     Does not return log probabilities.
      """

      unsupported: list[tuple[str, Any]] = []

-     logprobs_value = params.get("logprobs")
-     if not logprobs_value:
-         if "top_logprobs" in params:
-             unsupported.append(("top_logprobs", params["top_logprobs"]))
-     else:
-         if "top_logprobs" in params:
-             raw_top_logprobs = params["top_logprobs"]
-             try:
-                 top_logprobs = int(raw_top_logprobs)
-             except (TypeError, ValueError):
-                 unsupported.append(("top_logprobs", raw_top_logprobs))
-             else:
-                 if top_logprobs < 0:
-                     top_logprobs = 0
-                 elif top_logprobs > 5:
-                     top_logprobs = 5
-                 params["top_logprobs"] = top_logprobs
+     for param in ("logprobs", "top_logprobs"):
+         if param in params:
+             unsupported.append((param, params[param]))

      _drop_unsupported_params_with_warning(
          params,
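The util.py hunk replaces the old clamp-top_logprobs-into-[0, 5] logic with a blanket drop of both logprobs and top_logprobs for Qwen/DashScope models. A standalone restatement of the new behavior (the real code only collects the pairs and delegates removal and the warning to _drop_unsupported_params_with_warning, whose full signature the hunk does not show):

# Hedged, standalone restatement of the filtering added above.
params = {"temperature": 0.0, "logprobs": True, "top_logprobs": 3}

unsupported = []
for param in ("logprobs", "top_logprobs"):
    if param in params:
        unsupported.append((param, params.pop(param)))

print(params)       # {'temperature': 0.0}
print(unsupported)  # [('logprobs', True), ('top_logprobs', 3)]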
@@ -18,18 +18,32 @@ def _freeze(value: Any) -> Any:
      return value


- def _make_cache_key(model_name: str, model_kwargs: Dict[str, Any]) -> Any:
+ def _make_cache_key(model_name: str, track: bool, model_kwargs: Dict[str, Any]) -> Any:
      frozen_kwargs = frozenset((k, _freeze(v)) for k, v in model_kwargs.items())
-     return (model_name, frozen_kwargs)
+     return (model_name, track, frozen_kwargs)


- def get(model_name: Optional[str], **model_kwargs: Any) -> base_model.OpikBaseModel:
+ def get(
+     model_name: Optional[str], track: bool = True, **model_kwargs: Any
+ ) -> base_model.OpikBaseModel:
+     """
+     Get or create a cached LiteLLM chat model instance.
+
+     Args:
+         model_name: The name of the model to use. Defaults to DEFAULT_GPT_MODEL_NAME if None.
+         track: Whether to track the model calls. When False, disables tracing for this model instance.
+             Defaults to True.
+         **model_kwargs: Additional keyword arguments to pass to the model constructor.
+
+     Returns:
+         A cached or newly created OpikBaseModel instance.
+     """
      if model_name is None:
          model_name = DEFAULT_GPT_MODEL_NAME

-     cache_key = _make_cache_key(model_name, model_kwargs)
+     cache_key = _make_cache_key(model_name, track, model_kwargs)
      if cache_key not in _MODEL_CACHE:
          _MODEL_CACHE[cache_key] = litellm_chat_model.LiteLLMChatModel(
-             model_name=model_name, **model_kwargs
+             model_name=model_name, track=track, **model_kwargs
          )
      return _MODEL_CACHE[cache_key]
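models_factory.get now takes the track flag and folds it into the cache key, so tracked and untracked instances of the same model no longer collide in _MODEL_CACHE. A hedged sketch (module path taken from the file list; litellm must be installed for model construction):

# Hedged sketch: track participates in the cache key.
from opik.evaluation.models import models_factory

tracked = models_factory.get("gpt-4o-mini", track=True, temperature=0.0)
untracked = models_factory.get("gpt-4o-mini", track=False, temperature=0.0)
assert tracked is not untracked  # different cache keys, different instances

# An identical call is served from the cache.
assert models_factory.get("gpt-4o-mini", track=False, temperature=0.0) is untracked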
@@ -2,7 +2,7 @@ import logging
  from typing import List, Optional

  from opik.api_objects import dataset, experiment, opik_client
- from opik.types import FeedbackScoreDict
+ from opik.types import BatchFeedbackScoreDict
  from . import test_case
  from .metrics import score_result
  from .types import ScoringKeyMappingType
@@ -80,13 +80,13 @@ def log_test_result_feedback_scores(
      trace_id: str,
      project_name: Optional[str],
  ) -> None:
-     all_trace_scores: List[FeedbackScoreDict] = []
+     all_trace_scores: List[BatchFeedbackScoreDict] = []

      for score_result_ in score_results:
          if score_result_.scoring_failed:
              continue

-         trace_score = FeedbackScoreDict(
+         trace_score = BatchFeedbackScoreDict(
              id=trace_id,
              name=score_result_.name,
              value=score_result_.value,
@@ -4,7 +4,7 @@ from . import evaluation_result
  from ...api_objects import opik_client
  from ...api_objects.conversation import conversation_thread, conversation_factory
  from ...rest_api import TraceThread, JsonListStringPublic
- from ...types import FeedbackScoreDict
+ from ...types import BatchFeedbackScoreDict
  from ...api_objects.threads import threads_client


@@ -15,7 +15,7 @@ def log_feedback_scores(
  ) -> None:
      for result in results:
          feedback_scores = [
-             FeedbackScoreDict(
+             BatchFeedbackScoreDict(
                  id=result.thread_id,
                  name=score.name,
                  value=score.value,
@@ -42,6 +42,7 @@ def load_conversation_thread(
          project_name=project_name,
          filter_string=f'thread_id = "{thread.id}"',
          max_results=max_results,
+         truncate=False,
      )
      return conversation_factory.create_conversation_from_traces(
          traces=traces,
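The final hunks (opik/evaluation/rest_operations.py and opik/evaluation/threads/helpers.py per the file list) switch bulk score logging from FeedbackScoreDict to the new BatchFeedbackScoreDict and pass truncate=False when fetching traces for conversation threads. A minimal hedged sketch of the batch-score shape as it is used above; only the id/name/value keys visible in the hunks are shown, and the full TypedDict definition lives in opik/types.py:

# Hedged sketch: the batch score shape used by the evaluation code above.
from typing import List
from opik.types import BatchFeedbackScoreDict

scores: List[BatchFeedbackScoreDict] = [
    {"id": "trace-or-thread-id", "name": "equals_metric", "value": 1.0},
]
print(scores[0]["name"], scores[0]["value"])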