opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. opik/api_objects/attachment/attachment_context.py +36 -0
  2. opik/api_objects/attachment/attachments_extractor.py +153 -0
  3. opik/api_objects/attachment/client.py +1 -0
  4. opik/api_objects/attachment/converters.py +2 -0
  5. opik/api_objects/attachment/decoder.py +18 -0
  6. opik/api_objects/attachment/decoder_base64.py +83 -0
  7. opik/api_objects/attachment/decoder_helpers.py +137 -0
  8. opik/api_objects/constants.py +2 -0
  9. opik/api_objects/dataset/dataset.py +133 -40
  10. opik/api_objects/dataset/rest_operations.py +2 -0
  11. opik/api_objects/experiment/experiment.py +6 -0
  12. opik/api_objects/helpers.py +8 -4
  13. opik/api_objects/local_recording.py +6 -5
  14. opik/api_objects/observation_data.py +101 -0
  15. opik/api_objects/opik_client.py +78 -45
  16. opik/api_objects/opik_query_language.py +9 -3
  17. opik/api_objects/prompt/chat/chat_prompt.py +18 -1
  18. opik/api_objects/prompt/client.py +8 -1
  19. opik/api_objects/span/span_data.py +3 -88
  20. opik/api_objects/threads/threads_client.py +7 -4
  21. opik/api_objects/trace/trace_data.py +3 -74
  22. opik/api_objects/validation_helpers.py +3 -3
  23. opik/cli/exports/__init__.py +131 -0
  24. opik/cli/exports/dataset.py +278 -0
  25. opik/cli/exports/experiment.py +784 -0
  26. opik/cli/exports/project.py +685 -0
  27. opik/cli/exports/prompt.py +578 -0
  28. opik/cli/exports/utils.py +406 -0
  29. opik/cli/harbor.py +39 -0
  30. opik/cli/imports/__init__.py +439 -0
  31. opik/cli/imports/dataset.py +143 -0
  32. opik/cli/imports/experiment.py +1192 -0
  33. opik/cli/imports/project.py +262 -0
  34. opik/cli/imports/prompt.py +177 -0
  35. opik/cli/imports/utils.py +280 -0
  36. opik/cli/main.py +14 -12
  37. opik/config.py +12 -1
  38. opik/datetime_helpers.py +12 -0
  39. opik/decorator/arguments_helpers.py +4 -1
  40. opik/decorator/base_track_decorator.py +111 -37
  41. opik/decorator/context_manager/span_context_manager.py +5 -1
  42. opik/decorator/generator_wrappers.py +5 -4
  43. opik/decorator/span_creation_handler.py +13 -4
  44. opik/evaluation/engine/engine.py +111 -28
  45. opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
  46. opik/evaluation/evaluator.py +12 -0
  47. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
  48. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
  49. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
  50. opik/evaluation/metrics/heuristics/equals.py +11 -7
  51. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
  52. opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
  53. opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
  54. opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
  55. opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
  56. opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
  57. opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
  58. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
  59. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
  60. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
  61. opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
  62. opik/evaluation/metrics/ragas_metric.py +43 -23
  63. opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
  64. opik/evaluation/models/litellm/util.py +4 -20
  65. opik/evaluation/models/models_factory.py +19 -5
  66. opik/evaluation/rest_operations.py +3 -3
  67. opik/evaluation/threads/helpers.py +3 -2
  68. opik/file_upload/file_uploader.py +13 -0
  69. opik/file_upload/upload_options.py +2 -0
  70. opik/integrations/adk/legacy_opik_tracer.py +9 -11
  71. opik/integrations/adk/opik_tracer.py +2 -2
  72. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
  73. opik/integrations/dspy/callback.py +100 -14
  74. opik/integrations/dspy/parsers.py +168 -0
  75. opik/integrations/harbor/__init__.py +17 -0
  76. opik/integrations/harbor/experiment_service.py +269 -0
  77. opik/integrations/harbor/opik_tracker.py +528 -0
  78. opik/integrations/haystack/opik_tracer.py +2 -2
  79. opik/integrations/langchain/__init__.py +15 -2
  80. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  81. opik/integrations/langchain/opik_tracer.py +258 -160
  82. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
  83. opik/integrations/llama_index/callback.py +43 -6
  84. opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
  85. opik/integrations/openai/opik_tracker.py +99 -4
  86. opik/integrations/openai/videos/__init__.py +9 -0
  87. opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
  88. opik/integrations/openai/videos/videos_create_decorator.py +159 -0
  89. opik/integrations/openai/videos/videos_download_decorator.py +110 -0
  90. opik/message_processing/batching/base_batcher.py +14 -21
  91. opik/message_processing/batching/batch_manager.py +22 -10
  92. opik/message_processing/batching/batchers.py +32 -40
  93. opik/message_processing/batching/flushing_thread.py +0 -3
  94. opik/message_processing/emulation/emulator_message_processor.py +36 -1
  95. opik/message_processing/emulation/models.py +21 -0
  96. opik/message_processing/messages.py +9 -0
  97. opik/message_processing/preprocessing/__init__.py +0 -0
  98. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  99. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  100. opik/message_processing/preprocessing/constants.py +1 -0
  101. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  102. opik/message_processing/preprocessing/preprocessor.py +36 -0
  103. opik/message_processing/processors/__init__.py +0 -0
  104. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  105. opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
  106. opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
  107. opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
  108. opik/message_processing/queue_consumer.py +4 -2
  109. opik/message_processing/streamer.py +71 -33
  110. opik/message_processing/streamer_constructors.py +36 -8
  111. opik/plugins/pytest/experiment_runner.py +1 -1
  112. opik/plugins/pytest/hooks.py +5 -3
  113. opik/rest_api/__init__.py +42 -0
  114. opik/rest_api/datasets/client.py +321 -123
  115. opik/rest_api/datasets/raw_client.py +470 -145
  116. opik/rest_api/experiments/client.py +26 -0
  117. opik/rest_api/experiments/raw_client.py +26 -0
  118. opik/rest_api/llm_provider_key/client.py +4 -4
  119. opik/rest_api/llm_provider_key/raw_client.py +4 -4
  120. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
  121. opik/rest_api/manual_evaluation/client.py +101 -0
  122. opik/rest_api/manual_evaluation/raw_client.py +172 -0
  123. opik/rest_api/optimizations/client.py +0 -166
  124. opik/rest_api/optimizations/raw_client.py +0 -248
  125. opik/rest_api/projects/client.py +9 -0
  126. opik/rest_api/projects/raw_client.py +13 -0
  127. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
  128. opik/rest_api/prompts/client.py +130 -2
  129. opik/rest_api/prompts/raw_client.py +175 -0
  130. opik/rest_api/traces/client.py +101 -0
  131. opik/rest_api/traces/raw_client.py +120 -0
  132. opik/rest_api/types/__init__.py +50 -0
  133. opik/rest_api/types/audio_url.py +19 -0
  134. opik/rest_api/types/audio_url_public.py +19 -0
  135. opik/rest_api/types/audio_url_write.py +19 -0
  136. opik/rest_api/types/automation_rule_evaluator.py +38 -2
  137. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
  138. opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
  139. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  140. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  141. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  142. opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
  143. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  144. opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
  145. opik/rest_api/types/dataset.py +2 -0
  146. opik/rest_api/types/dataset_item.py +1 -1
  147. opik/rest_api/types/dataset_item_batch.py +4 -0
  148. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  149. opik/rest_api/types/dataset_item_compare.py +1 -1
  150. opik/rest_api/types/dataset_item_filter.py +4 -0
  151. opik/rest_api/types/dataset_item_page_compare.py +0 -1
  152. opik/rest_api/types/dataset_item_page_public.py +0 -1
  153. opik/rest_api/types/dataset_item_public.py +1 -1
  154. opik/rest_api/types/dataset_public.py +2 -0
  155. opik/rest_api/types/dataset_version_public.py +10 -0
  156. opik/rest_api/types/dataset_version_summary.py +46 -0
  157. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  158. opik/rest_api/types/experiment.py +9 -0
  159. opik/rest_api/types/experiment_public.py +9 -0
  160. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  161. opik/rest_api/types/llm_as_judge_message_content.py +2 -0
  162. opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
  163. opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
  164. opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
  165. opik/rest_api/types/project.py +1 -0
  166. opik/rest_api/types/project_detailed.py +1 -0
  167. opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
  168. opik/rest_api/types/project_reference.py +31 -0
  169. opik/rest_api/types/project_reference_public.py +31 -0
  170. opik/rest_api/types/project_stats_summary_item.py +1 -0
  171. opik/rest_api/types/prompt_version.py +1 -0
  172. opik/rest_api/types/prompt_version_detail.py +1 -0
  173. opik/rest_api/types/prompt_version_page_public.py +5 -0
  174. opik/rest_api/types/prompt_version_public.py +1 -0
  175. opik/rest_api/types/prompt_version_update.py +33 -0
  176. opik/rest_api/types/provider_api_key.py +5 -1
  177. opik/rest_api/types/provider_api_key_provider.py +2 -1
  178. opik/rest_api/types/provider_api_key_public.py +5 -1
  179. opik/rest_api/types/provider_api_key_public_provider.py +2 -1
  180. opik/rest_api/types/service_toggles_config.py +11 -1
  181. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  182. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  183. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  184. opik/types.py +36 -0
  185. opik/validation/chat_prompt_messages.py +241 -0
  186. opik/validation/feedback_score.py +3 -3
  187. opik/validation/validator.py +28 -0
  188. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
  189. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
  190. opik/cli/export.py +0 -791
  191. opik/cli/import_command.py +0 -575
  192. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
  193. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
  194. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
  195. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,269 @@
1
+ """
2
+ Experiment service for Harbor integration.
3
+
4
+ This module manages the connection between Harbor benchmark runs and Opik experiments,
5
+ enabling evaluation tracking and result visualization.
6
+
7
+ Harbor Terminology Mapping to Opik:
8
+ -----------------------------------
9
+ - **Harbor Job**: A benchmark run that evaluates one or more agents on a dataset.
10
+ Maps to an Opik Experiment.
11
+
12
+ - **Harbor Trial**: A single agent run on a single task within a job.
13
+ Each trial produces one Opik Trace (capturing the agent's execution).
14
+
15
+ - **Harbor Source**: The benchmark dataset being used (e.g., "terminal-bench", "swe-bench").
16
+ Maps to an Opik Dataset. Each source gets its own dataset.
17
+
18
+ - **Harbor Task**: A specific problem/challenge within a dataset (e.g., "fix-git" task).
19
+ Maps to an Opik Dataset Item.
20
+
21
+ Flow Overview:
22
+ --------------
23
+ 1. When a Harbor job starts, this service is initialized with an experiment name.
24
+ 2. For each unique source (benchmark dataset), we create/get an Opik Dataset and Experiment.
25
+ 3. For each trial (agent run on a task), we:
26
+ a. Create a dataset item for the task (or reuse existing one if task was run before)
27
+ b. Link the trial's trace to the experiment via ExperimentItemReferences
28
+ 4. This allows viewing all trial results in Opik's experiment comparison UI.
29
+ """
30
+
31
+ import logging
32
+ from datetime import datetime
33
+ from typing import Any, Dict, Optional, Set, TYPE_CHECKING
34
+
35
+ from opik.api_objects import opik_client
36
+ from opik.api_objects.experiment import experiment_item
37
+
38
+ if TYPE_CHECKING:
39
+ from opik.api_objects.experiment.experiment import Experiment
40
+ from opik.api_objects.dataset.dataset import Dataset
41
+
42
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# Process-wide service singleton (one per Harbor job). It stays None until
# setup_lazy() creates it, and reset() sets it back to None.
_SERVICE: Optional["HarborExperimentService"] = None
46
+
47
+
48
class HarborExperimentService:
    """
    Manages Opik datasets and experiments for Harbor benchmark jobs.

    This service handles the mapping between Harbor's evaluation structure and Opik's
    experiment tracking:

    - Each Harbor source (benchmark dataset) → One Opik Dataset + One Opik Experiment
    - Each Harbor task → One Opik Dataset Item
    - Each Harbor trial → One Opik Trace, linked to the experiment

    The service uses lazy initialization - datasets and experiments are created
    on-demand when the first trial for a source is linked.

    Attributes:
        _experiment_name: Name for experiments created by this service.
        _experiment_config: Config dict stored on experiments (agent/model info).
            Always a private copy that carries ``"created_from": "harbor"``.
        _client: Opik client obtained from ``opik_client.get_client_cached()``.
        _datasets: Map of source name → Opik Dataset.
        _experiments: Map of source name → Opik Experiment.
        _dataset_item_ids: Map of source name → {task name → dataset item ID}
            for tasks whose dataset item has already been resolved.
        _linked_trials: Set of trial names already linked (prevents duplicates).
    """

    def __init__(
        self,
        experiment_name: str,
        experiment_config: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize the experiment service.

        Args:
            experiment_name: Name for experiments. Typically includes job_id
                for uniqueness (e.g., "harbor-job-abc123").
            experiment_config: Optional config dict to store on experiments.
                Typically contains agent/model info (e.g., {"agent_name": "terminus",
                "model_name": "gpt-4o"}). The dict is copied, never modified.
        """
        self._experiment_name = experiment_name
        self._experiment_config = self._build_experiment_config(experiment_config)
        self._client = opik_client.get_client_cached()

        # Lazy-initialized per source (benchmark dataset)
        self._datasets: Dict[str, "Dataset"] = {}
        self._experiments: Dict[str, "Experiment"] = {}

        # Dataset item IDs already resolved, keyed by source and then task name,
        # so repeated trials of one task skip the insert + full item listing.
        self._dataset_item_ids: Dict[str, Dict[Optional[str], str]] = {}

        # Track which trials have been linked to avoid duplicates
        self._linked_trials: Set[str] = set()

    @staticmethod
    def _build_experiment_config(
        experiment_config: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """
        Return a new config dict tagged with ``"created_from": "harbor"``.

        A fresh dict is built so the caller's dict is left untouched. Any
        caller-supplied "created_from" value is overridden.

        Args:
            experiment_config: Caller-provided config, or None.

        Returns:
            A shallow copy of the config with the "created_from" marker set.
        """
        return {**(experiment_config or {}), "created_from": "harbor"}

    @staticmethod
    def _find_dataset_item_id(items: Any, task_name: Optional[str]) -> Optional[str]:
        """
        Return the "id" of the first item whose "task_name" equals ``task_name``.

        Args:
            items: Iterable of dict-like dataset items (anything with ``.get``).
            task_name: Task name to look for.

        Returns:
            The first matching item's "id" value (which may itself be None),
            or None when no item matches.
        """
        return next(
            (item.get("id") for item in items if item.get("task_name") == task_name),
            None,
        )

    def _ensure_dataset_and_experiment(self, source: str) -> None:
        """
        Ensure an Opik Dataset and Experiment exist for the given source.

        Creates them lazily on first access. Each Harbor source (benchmark dataset)
        gets its own Opik Dataset and Experiment pair. Failures are logged as a
        warning and not raised; in that case no experiment is recorded for the
        source, so the next call tries again.

        Args:
            source: The Harbor source/benchmark name (e.g., "terminal-bench").
        """
        if source in self._experiments:
            return

        try:
            # Create or get the dataset for this benchmark source
            dataset = self._client.get_or_create_dataset(
                name=source,
                description=f"Harbor benchmark dataset: {source}",
            )
            self._datasets[source] = dataset
            LOGGER.info("Using dataset '%s' for Harbor source", source)

            # Create a new experiment for this job run
            experiment = self._client.create_experiment(
                dataset_name=source,
                name=self._experiment_name,
                experiment_config=self._experiment_config,
            )
            self._experiments[source] = experiment
            LOGGER.info(
                "Created experiment '%s' for dataset '%s'",
                self._experiment_name,
                source,
            )
        except Exception as e:
            # Best-effort: tracking problems must not break the caller.
            LOGGER.warning(
                "Failed to create dataset/experiment for source '%s': %s",
                source,
                e,
            )

    def _resolve_dataset_item_id(
        self,
        source: str,
        dataset: "Dataset",
        task_name: Optional[str],
    ) -> Optional[str]:
        """
        Return the dataset item ID for ``task_name``, inserting the item if needed.

        The first resolution for a (source, task) pair inserts the task into the
        dataset and scans the dataset's items for it. The resulting ID is cached
        so later trials of the same task need no further dataset calls.

        Args:
            source: The Harbor source the dataset belongs to (cache key).
            dataset: The Opik Dataset for that source.
            task_name: The task name within the benchmark.

        Returns:
            The dataset item ID, or None if no matching item could be found.
        """
        known_ids = self._dataset_item_ids.setdefault(source, {})
        cached_id = known_ids.get(task_name)
        if cached_id is not None:
            return cached_id

        # Insert the task as a dataset item (idempotent - duplicates are handled)
        dataset.insert([{"task_name": task_name}])

        # Search the items because the same task may have been inserted in a
        # previous run, and its existing item ID must be reused for linking.
        dataset_item_id = self._find_dataset_item_id(dataset.get_items(), task_name)
        if dataset_item_id is not None:
            known_ids[task_name] = dataset_item_id
        return dataset_item_id

    def link_trial_to_experiment(
        self,
        trial_name: str,
        trace_id: str,
        source: Optional[str] = None,
        task_name: Optional[str] = None,
    ) -> None:
        """
        Link a Harbor trial's trace to the Opik experiment.

        This creates the connection between a trial's execution trace and the
        experiment, enabling the trial to appear in Opik's experiment comparison UI.

        The flow:
        1. Ensure dataset and experiment exist for the source
        2. Create or find the dataset item for this task
        3. Link the trace to the experiment via the dataset item

        Errors are logged as warnings and never raised. A trial is only marked
        as linked after the experiment insert succeeded, so a failed attempt
        can be retried by calling this method again.

        Args:
            trial_name: Unique identifier for the trial (e.g., "task__abc123").
                Used to prevent duplicate linking.
            trace_id: The Opik trace ID for this trial's execution.
            source: The Harbor source/benchmark name. Defaults to "harbor-default".
            task_name: The task name within the benchmark (e.g., "fix-git").
                Used to create/find the dataset item.
        """
        source = source or "harbor-default"

        # Prevent duplicate linking of the same trial
        if trial_name in self._linked_trials:
            return

        # Ensure we have a dataset and experiment for this source
        self._ensure_dataset_and_experiment(source)

        experiment = self._experiments.get(source)
        dataset = self._datasets.get(source)

        if experiment is None or dataset is None:
            LOGGER.warning(
                "Failed to create experiment/dataset for source '%s', "
                "trial '%s' will not be linked",
                source,
                trial_name,
            )
            return

        try:
            dataset_item_id = self._resolve_dataset_item_id(source, dataset, task_name)

            if dataset_item_id is None:
                LOGGER.warning("Could not find dataset item for task '%s'", task_name)
                return

            # Link the trace to the experiment via the dataset item
            experiment.insert(
                [
                    experiment_item.ExperimentItemReferences(
                        dataset_item_id=dataset_item_id,
                        trace_id=trace_id,
                    )
                ]
            )

            self._linked_trials.add(trial_name)
            LOGGER.debug(
                "Linked trial '%s' (trace %s) to experiment '%s'",
                trial_name,
                trace_id,
                self._experiment_name,
            )
        except Exception as e:
            # Best-effort: a linking failure must not propagate to the caller.
            LOGGER.warning("Failed to link trial '%s' to experiment: %s", trial_name, e)
223
+
224
+
225
def setup_lazy(
    experiment_name: Optional[str] = None,
    experiment_config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Create the global experiment service if it does not exist yet.

    Intended to be called when the first Harbor trial runs. The instance created
    here is the one returned by ``get_service()`` for all later trial linking.
    Calling this again while a service exists only logs a debug message.
    Datasets and experiments are not created here; the service creates them
    on demand when trials are linked.

    Args:
        experiment_name: Name for the experiment. If None, a timestamped name
            such as "harbor-20241209-143000" is generated from the local time.
        experiment_config: Optional config dict to store on experiments.
            Typically contains agent/model info (e.g., {"agent_name": "terminus",
            "model_name": "gpt-4o"}).
    """
    global _SERVICE

    if _SERVICE is None:
        resolved_name = experiment_name
        if resolved_name is None:
            # No explicit name given: derive one from the current local time.
            stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            resolved_name = f"harbor-{stamp}"

        _SERVICE = HarborExperimentService(
            experiment_name=resolved_name,
            experiment_config=experiment_config,
        )
        LOGGER.info("Experiment service setup for '%s'", resolved_name)
    else:
        LOGGER.debug("Experiment service already setup, skipping")
259
+
260
+
261
def get_service() -> Optional[HarborExperimentService]:
    """
    Return the module-level experiment service.

    Returns:
        The current ``_SERVICE`` value: the instance created by ``setup_lazy()``,
        or None if no service has been set up (or ``reset()`` cleared it).
    """
    current_service = _SERVICE
    return current_service
264
+
265
+
266
def reset() -> None:
    """
    Drop the module-level experiment service. Used for testing.

    After this call ``get_service()`` returns None, and the next
    ``setup_lazy()`` call creates a fresh service instance.
    """
    global _SERVICE
    _SERVICE = None