opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. opik/api_objects/attachment/attachment_context.py +36 -0
  2. opik/api_objects/attachment/attachments_extractor.py +153 -0
  3. opik/api_objects/attachment/client.py +1 -0
  4. opik/api_objects/attachment/converters.py +2 -0
  5. opik/api_objects/attachment/decoder.py +18 -0
  6. opik/api_objects/attachment/decoder_base64.py +83 -0
  7. opik/api_objects/attachment/decoder_helpers.py +137 -0
  8. opik/api_objects/constants.py +2 -0
  9. opik/api_objects/dataset/dataset.py +133 -40
  10. opik/api_objects/dataset/rest_operations.py +2 -0
  11. opik/api_objects/experiment/experiment.py +6 -0
  12. opik/api_objects/helpers.py +8 -4
  13. opik/api_objects/local_recording.py +6 -5
  14. opik/api_objects/observation_data.py +101 -0
  15. opik/api_objects/opik_client.py +78 -45
  16. opik/api_objects/opik_query_language.py +9 -3
  17. opik/api_objects/prompt/chat/chat_prompt.py +18 -1
  18. opik/api_objects/prompt/client.py +8 -1
  19. opik/api_objects/span/span_data.py +3 -88
  20. opik/api_objects/threads/threads_client.py +7 -4
  21. opik/api_objects/trace/trace_data.py +3 -74
  22. opik/api_objects/validation_helpers.py +3 -3
  23. opik/cli/exports/__init__.py +131 -0
  24. opik/cli/exports/dataset.py +278 -0
  25. opik/cli/exports/experiment.py +784 -0
  26. opik/cli/exports/project.py +685 -0
  27. opik/cli/exports/prompt.py +578 -0
  28. opik/cli/exports/utils.py +406 -0
  29. opik/cli/harbor.py +39 -0
  30. opik/cli/imports/__init__.py +439 -0
  31. opik/cli/imports/dataset.py +143 -0
  32. opik/cli/imports/experiment.py +1192 -0
  33. opik/cli/imports/project.py +262 -0
  34. opik/cli/imports/prompt.py +177 -0
  35. opik/cli/imports/utils.py +280 -0
  36. opik/cli/main.py +14 -12
  37. opik/config.py +12 -1
  38. opik/datetime_helpers.py +12 -0
  39. opik/decorator/arguments_helpers.py +4 -1
  40. opik/decorator/base_track_decorator.py +111 -37
  41. opik/decorator/context_manager/span_context_manager.py +5 -1
  42. opik/decorator/generator_wrappers.py +5 -4
  43. opik/decorator/span_creation_handler.py +13 -4
  44. opik/evaluation/engine/engine.py +111 -28
  45. opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
  46. opik/evaluation/evaluator.py +12 -0
  47. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
  48. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
  49. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
  50. opik/evaluation/metrics/heuristics/equals.py +11 -7
  51. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
  52. opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
  53. opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
  54. opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
  55. opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
  56. opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
  57. opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
  58. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
  59. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
  60. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
  61. opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
  62. opik/evaluation/metrics/ragas_metric.py +43 -23
  63. opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
  64. opik/evaluation/models/litellm/util.py +4 -20
  65. opik/evaluation/models/models_factory.py +19 -5
  66. opik/evaluation/rest_operations.py +3 -3
  67. opik/evaluation/threads/helpers.py +3 -2
  68. opik/file_upload/file_uploader.py +13 -0
  69. opik/file_upload/upload_options.py +2 -0
  70. opik/integrations/adk/legacy_opik_tracer.py +9 -11
  71. opik/integrations/adk/opik_tracer.py +2 -2
  72. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
  73. opik/integrations/dspy/callback.py +100 -14
  74. opik/integrations/dspy/parsers.py +168 -0
  75. opik/integrations/harbor/__init__.py +17 -0
  76. opik/integrations/harbor/experiment_service.py +269 -0
  77. opik/integrations/harbor/opik_tracker.py +528 -0
  78. opik/integrations/haystack/opik_tracer.py +2 -2
  79. opik/integrations/langchain/__init__.py +15 -2
  80. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  81. opik/integrations/langchain/opik_tracer.py +258 -160
  82. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
  83. opik/integrations/llama_index/callback.py +43 -6
  84. opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
  85. opik/integrations/openai/opik_tracker.py +99 -4
  86. opik/integrations/openai/videos/__init__.py +9 -0
  87. opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
  88. opik/integrations/openai/videos/videos_create_decorator.py +159 -0
  89. opik/integrations/openai/videos/videos_download_decorator.py +110 -0
  90. opik/message_processing/batching/base_batcher.py +14 -21
  91. opik/message_processing/batching/batch_manager.py +22 -10
  92. opik/message_processing/batching/batchers.py +32 -40
  93. opik/message_processing/batching/flushing_thread.py +0 -3
  94. opik/message_processing/emulation/emulator_message_processor.py +36 -1
  95. opik/message_processing/emulation/models.py +21 -0
  96. opik/message_processing/messages.py +9 -0
  97. opik/message_processing/preprocessing/__init__.py +0 -0
  98. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  99. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  100. opik/message_processing/preprocessing/constants.py +1 -0
  101. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  102. opik/message_processing/preprocessing/preprocessor.py +36 -0
  103. opik/message_processing/processors/__init__.py +0 -0
  104. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  105. opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
  106. opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
  107. opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
  108. opik/message_processing/queue_consumer.py +4 -2
  109. opik/message_processing/streamer.py +71 -33
  110. opik/message_processing/streamer_constructors.py +36 -8
  111. opik/plugins/pytest/experiment_runner.py +1 -1
  112. opik/plugins/pytest/hooks.py +5 -3
  113. opik/rest_api/__init__.py +42 -0
  114. opik/rest_api/datasets/client.py +321 -123
  115. opik/rest_api/datasets/raw_client.py +470 -145
  116. opik/rest_api/experiments/client.py +26 -0
  117. opik/rest_api/experiments/raw_client.py +26 -0
  118. opik/rest_api/llm_provider_key/client.py +4 -4
  119. opik/rest_api/llm_provider_key/raw_client.py +4 -4
  120. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
  121. opik/rest_api/manual_evaluation/client.py +101 -0
  122. opik/rest_api/manual_evaluation/raw_client.py +172 -0
  123. opik/rest_api/optimizations/client.py +0 -166
  124. opik/rest_api/optimizations/raw_client.py +0 -248
  125. opik/rest_api/projects/client.py +9 -0
  126. opik/rest_api/projects/raw_client.py +13 -0
  127. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
  128. opik/rest_api/prompts/client.py +130 -2
  129. opik/rest_api/prompts/raw_client.py +175 -0
  130. opik/rest_api/traces/client.py +101 -0
  131. opik/rest_api/traces/raw_client.py +120 -0
  132. opik/rest_api/types/__init__.py +50 -0
  133. opik/rest_api/types/audio_url.py +19 -0
  134. opik/rest_api/types/audio_url_public.py +19 -0
  135. opik/rest_api/types/audio_url_write.py +19 -0
  136. opik/rest_api/types/automation_rule_evaluator.py +38 -2
  137. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
  138. opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
  139. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  140. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  141. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  142. opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
  143. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  144. opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
  145. opik/rest_api/types/dataset.py +2 -0
  146. opik/rest_api/types/dataset_item.py +1 -1
  147. opik/rest_api/types/dataset_item_batch.py +4 -0
  148. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  149. opik/rest_api/types/dataset_item_compare.py +1 -1
  150. opik/rest_api/types/dataset_item_filter.py +4 -0
  151. opik/rest_api/types/dataset_item_page_compare.py +0 -1
  152. opik/rest_api/types/dataset_item_page_public.py +0 -1
  153. opik/rest_api/types/dataset_item_public.py +1 -1
  154. opik/rest_api/types/dataset_public.py +2 -0
  155. opik/rest_api/types/dataset_version_public.py +10 -0
  156. opik/rest_api/types/dataset_version_summary.py +46 -0
  157. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  158. opik/rest_api/types/experiment.py +9 -0
  159. opik/rest_api/types/experiment_public.py +9 -0
  160. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  161. opik/rest_api/types/llm_as_judge_message_content.py +2 -0
  162. opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
  163. opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
  164. opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
  165. opik/rest_api/types/project.py +1 -0
  166. opik/rest_api/types/project_detailed.py +1 -0
  167. opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
  168. opik/rest_api/types/project_reference.py +31 -0
  169. opik/rest_api/types/project_reference_public.py +31 -0
  170. opik/rest_api/types/project_stats_summary_item.py +1 -0
  171. opik/rest_api/types/prompt_version.py +1 -0
  172. opik/rest_api/types/prompt_version_detail.py +1 -0
  173. opik/rest_api/types/prompt_version_page_public.py +5 -0
  174. opik/rest_api/types/prompt_version_public.py +1 -0
  175. opik/rest_api/types/prompt_version_update.py +33 -0
  176. opik/rest_api/types/provider_api_key.py +5 -1
  177. opik/rest_api/types/provider_api_key_provider.py +2 -1
  178. opik/rest_api/types/provider_api_key_public.py +5 -1
  179. opik/rest_api/types/provider_api_key_public_provider.py +2 -1
  180. opik/rest_api/types/service_toggles_config.py +11 -1
  181. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  182. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  183. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  184. opik/types.py +36 -0
  185. opik/validation/chat_prompt_messages.py +241 -0
  186. opik/validation/feedback_score.py +3 -3
  187. opik/validation/validator.py +28 -0
  188. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
  189. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
  190. opik/cli/export.py +0 -791
  191. opik/cli/import_command.py +0 -575
  192. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
  193. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
  194. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
  195. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,528 @@
1
+ """
2
+ Opik tracking integration for Harbor benchmark evaluation framework.
3
+
4
+ This module provides the `track_harbor` function to add Opik tracing to Harbor Jobs,
5
+ enabling real-time streaming of trial results to Opik for visualization and analysis.
6
+
7
+ Example:
8
+ >>> from opik.integrations.harbor import track_harbor
9
+ >>> from harbor.job import Job
10
+ >>> import os
11
+ >>>
12
+ >>> os.environ["OPIK_PROJECT_NAME"] = "swebench-evaluation"
13
+ >>>
14
+ >>> job = Job(config)
15
+ >>> tracked_job = track_harbor(job)
16
+ >>> result = await tracked_job.run()
17
+
18
+ Or enable tracking globally (for CLI usage):
19
+ >>> from opik.integrations.harbor import track_harbor
20
+ >>> track_harbor()
21
+ >>> # Now run Harbor code - it will be traced
22
+ """
23
+
24
+ import functools
25
+ import logging
26
+ from typing import Any, Callable, Dict, List, Optional, Tuple
27
+ from typing_extensions import override
28
+
29
+ from harbor.job import Job
30
+ from harbor.models.trajectories.step import Step
31
+ from harbor.models.trial.result import TrialResult
32
+ from harbor.models.verifier.result import VerifierResult
33
+ from harbor.trial.trial import Trial
34
+ from harbor.verifier.verifier import Verifier
35
+
36
+ from opik import datetime_helpers, id_helpers, opik_context, track
37
+ from opik.api_objects import opik_client, span
38
+ from opik.decorator import arguments_helpers, base_track_decorator
39
+ from opik.types import FeedbackScoreDict, SpanType
40
+
41
+ from . import experiment_service
42
+
43
+ LOGGER = logging.getLogger(__name__)
44
+
45
+
46
class HarborTrialRunDecorator(base_track_decorator.BaseTrackDecorator):
    """
    Track decorator specialised for Harbor's ``Trial.run``.

    The start-of-span hook derives the trace name, input, tags and metadata
    from the trial configuration; the end-of-span hook records no output
    because the wrapper around ``Trial.run`` writes it to the trace itself.
    """

    @override
    def _start_span_inputs_preprocessor(
        self,
        func: Callable,
        track_options: arguments_helpers.TrackOptions,
        args: Tuple,
        kwargs: Dict[str, Any],
    ) -> arguments_helpers.StartSpanParameters:
        """Build the start-span parameters from the Trial found in ``args[0]``.

        Args:
            func: The function being tracked; its ``__name__`` is the fallback
                span name when no positional arguments are present.
            track_options: Options given to the ``track`` decorator.
            args: Positional call arguments; the first one is read as the Trial.
            kwargs: Keyword call arguments (not used here).

        Returns:
            Parameters carrying the ``<agent>/<trial_name>`` name, a summary
            input dict, the tags (with ``"harbor"`` and the agent name added)
            and the metadata (with ``created_from="harbor"`` added).
        """
        if not args:
            # Not expected for an instance method; fall back to plain options.
            fallback_name = (
                func.__name__ if track_options.name is None else track_options.name
            )
            return arguments_helpers.StartSpanParameters(
                name=fallback_name,
                input=None,
                type=track_options.type,
                tags=track_options.tags,
                metadata=track_options.metadata,
                project_name=track_options.project_name,
            )

        trial: Trial = args[0]
        trial_config = trial.config
        agent_name = trial_config.agent.name
        task = trial_config.task

        # Tasks without a ``name`` attribute are identified by their path.
        if hasattr(task, "name"):
            task_name = task.name
        else:
            task_name = str(task.path)

        span_input: Dict[str, Any] = {
            "trial_name": trial_config.trial_name,
            "task": {
                "name": task_name,
                "source": getattr(task, "source", None),
            },
            "agent": {
                "name": agent_name,
                "model": getattr(trial_config.agent, "model_name", None),
            },
        }

        # Work on copies so the caller's track options are never mutated.
        if track_options.metadata is None:
            span_metadata = {}
        else:
            span_metadata = track_options.metadata.copy()
        span_metadata["created_from"] = "harbor"

        span_tags = list(track_options.tags) if track_options.tags is not None else []
        for required_tag in ("harbor", agent_name):
            if required_tag not in span_tags:
                span_tags.append(required_tag)

        return arguments_helpers.StartSpanParameters(
            name=f"{agent_name}/{trial_config.trial_name}",
            input=span_input,
            type=track_options.type,
            tags=span_tags,
            metadata=span_metadata,
            project_name=track_options.project_name,
        )

    @override
    def _end_span_inputs_preprocessor(
        self,
        output: Any,
        capture_output: bool,
        current_span_data: span.SpanData,
    ) -> arguments_helpers.EndSpanParameters:
        """Return end-span parameters with no output.

        The trial output is attached by the ``Trial.run`` wrapper through
        ``opik_context.update_current_trace``, so nothing is recorded here.
        """
        return arguments_helpers.EndSpanParameters(output=None)

    @override
    def _streams_handler(
        self,
        output: Any,
        capture_output: bool,
        generations_aggregator: Optional[Callable[[List[Any]], Any]],
    ) -> Optional[Any]:
        """Always return None: ``Trial.run`` output is not treated as a stream."""
        return None
143
+
144
+
145
+ def _rewards_to_feedback_scores(
146
+ rewards: Optional[Dict[str, Any]],
147
+ error: Optional[str] = None,
148
+ ) -> List[FeedbackScoreDict]:
149
+ """Convert Harbor verifier rewards to Opik feedback scores."""
150
+ if rewards is None:
151
+ return []
152
+
153
+ feedback_scores: List[FeedbackScoreDict] = []
154
+ for name, value in rewards.items():
155
+ try:
156
+ float_value = float(value)
157
+
158
+ score = FeedbackScoreDict(name=name, value=float_value, reason=error)
159
+
160
+ feedback_scores.append(score)
161
+ except (ValueError, TypeError):
162
+ LOGGER.warning(
163
+ "Could not convert reward value to float: %s=%s", name, value
164
+ )
165
+
166
+ return feedback_scores
167
+
168
+
169
+ def _source_to_span_type(source: str) -> SpanType:
170
+ """Convert ATIF step source to Opik span type."""
171
+ if source == "agent":
172
+ return "llm"
173
+ return "general"
174
+
175
+
176
def _patch_step_class() -> None:
    """Patch Harbor's ``Step.__init__`` so each new Step emits an Opik span.

    The patch is applied once per process; a ``_patched`` attribute on this
    function records that it has been installed. After the original
    ``__init__`` runs, the patched version creates a span only when a trace
    is present in the current Opik context. Span creation is best-effort:
    any exception is logged at debug level and never reaches Harbor code.
    """
    # Installing the patch twice would emit duplicate spans for every step.
    if hasattr(_patch_step_class, "_patched"):
        return

    original_init = Step.__init__

    @functools.wraps(original_init)
    def patched_init(self: Step, *args: Any, **kwargs: Any) -> None:
        original_init(self, *args, **kwargs)

        trace_data = opik_context.get_current_trace_data()
        if trace_data is None:
            # Step created outside of a tracked trial: nothing to attach to.
            return

        parent_span = opik_context.get_current_span_data()
        parent_span_id = parent_span.id if parent_span else None

        try:
            client = opik_client.get_client_cached()

            input_dict: Dict[str, Any] = {}
            if self.message:
                input_dict["message"] = self.message
            if self.tool_calls:
                input_dict["tool_calls"] = [
                    {
                        "tool_call_id": tc.tool_call_id,
                        "function_name": tc.function_name,
                        "arguments": tc.arguments,
                    }
                    for tc in self.tool_calls
                ]

            output_dict: Optional[Dict[str, Any]] = None
            if self.observation and self.observation.results:
                output_dict = {
                    "results": [
                        {"content": r.content} for r in self.observation.results
                    ]
                }

            metadata: Dict[str, Any] = {
                "source": self.source,
                "created_from": "harbor",
            }
            if self.reasoning_content:
                metadata["reasoning"] = self.reasoning_content

            usage: Optional[Dict[str, Any]] = None
            total_cost: Optional[float] = None
            if self.metrics:
                prompt_tokens = self.metrics.prompt_tokens
                completion_tokens = self.metrics.completion_tokens

                usage = {}
                if prompt_tokens is not None:
                    usage["prompt_tokens"] = prompt_tokens
                if completion_tokens is not None:
                    usage["completion_tokens"] = completion_tokens
                # Compare against None rather than truthiness: a count of 0 is
                # a valid value and must still produce a total.
                if prompt_tokens is not None and completion_tokens is not None:
                    usage["total_tokens"] = prompt_tokens + completion_tokens
                if not usage:
                    usage = None
                total_cost = getattr(self.metrics, "cost_usd", None)

            client.span(
                id=id_helpers.generate_id(),
                trace_id=trace_data.id,
                parent_span_id=parent_span_id,
                name=f"step_{self.step_id}",
                type=_source_to_span_type(self.source),
                start_time=datetime_helpers.parse_iso_timestamp(self.timestamp),
                input=input_dict if input_dict else None,
                output=output_dict,
                metadata=metadata,
                usage=usage,
                total_cost=total_cost,
                # The model name is only reported for agent-sourced steps.
                model=self.model_name if self.source == "agent" else None,
                tags=["harbor", self.source],
            )

        except Exception as e:
            # Tracing must never break Step construction.
            LOGGER.debug("Failed to create span for step: %s", e)

    Step.__init__ = patched_init  # type: ignore
    setattr(_patch_step_class, "_patched", True)
263
+
264
+
265
def _enable_harbor_tracking(project_name: Optional[str] = None) -> None:
    """Internal: install the Opik tracing patches on Harbor classes.

    Wraps the Trial lifecycle methods and ``Verifier.verify`` and patches the
    Step class. A method that already carries an ``opik_tracked`` attribute
    is left untouched, so calling this repeatedly does not stack wrappers.

    Args:
        project_name: Opik project name passed to each wrapper. If None, uses
            OPIK_PROJECT_NAME env var.
    """
    # (Trial attribute, wrapper factory) pairs, applied in this order.
    trial_patches = (
        ("run", _wrap_trial_run),
        ("_setup_environment", _wrap_setup_environment),
        ("_setup_agent", _wrap_setup_agent),
        ("_execute_agent", _wrap_execute_agent),
        ("_run_verification", _wrap_run_verification),
    )
    for method_name, make_wrapper in trial_patches:
        current_method = getattr(Trial, method_name)
        if not hasattr(current_method, "opik_tracked"):
            setattr(Trial, method_name, make_wrapper(current_method, project_name))

    verify_method = Verifier.verify
    if not hasattr(verify_method, "opik_tracked"):
        Verifier.verify = _wrap_verify(verify_method, project_name)

    # Step instances are turned into spans as they are created.
    _patch_step_class()

    LOGGER.info("Opik tracking enabled for Harbor")
301
+
302
+
303
def track_harbor(
    job: Optional["Job"] = None,
    project_name: Optional[str] = None,
) -> Optional["Job"]:
    """Turn on Opik tracking for Harbor.

    Two calling styles are supported:
      - ``track_harbor()`` enables tracking globally (for CLI usage).
      - ``track_harbor(job)`` enables tracking and hands the job back
        (for SDK usage).

    Args:
        job: Optional Harbor Job instance; returned unchanged.
        project_name: Opik project name. If None, uses OPIK_PROJECT_NAME env var.

    Returns:
        The ``job`` argument as given (None when no job was passed).

    Example:
        >>> from opik.integrations.harbor import track_harbor
        >>> job = Job(config)
        >>> tracked_job = track_harbor(job)
        >>> result = await tracked_job.run()
    """
    _enable_harbor_tracking(project_name)
    return job
328
+
329
+
330
def _wrap_trial_run(original: Callable, project_name: Optional[str]) -> Callable:
    """Wrap ``Trial.run`` with tracing, feedback scores, and experiment linking.

    Args:
        original: The unwrapped ``Trial.run`` coroutine function.
        project_name: Opik project name passed to the track decorator.

    Returns:
        An async replacement for ``Trial.run`` that returns the original
        ``TrialResult``. All Opik bookkeeping performed before and after the
        trial is best-effort: failures are logged at debug level and never
        replace the trial's own result. Exceptions raised by the trial itself
        still propagate.
    """

    decorator = HarborTrialRunDecorator()

    @decorator.track(
        tags=["harbor"],
        project_name=project_name,
        capture_output=False,
    )
    @functools.wraps(original)
    async def wrapped(self: Trial) -> TrialResult:
        config = self.config

        # Lazily set up the experiment service so experiment tracking works
        # for both SDK and CLI modes.
        if experiment_service.get_service() is None:
            try:
                # Use job_id for consistent experiment naming.
                experiment_name = (
                    f"harbor-job-{str(config.job_id)[:8]}" if config.job_id else None
                )
                experiment_config: Dict[str, Any] = {
                    "agent_name": config.agent.name,
                }
                model_name = getattr(config.agent, "model_name", None)
                if model_name:
                    experiment_config["model_name"] = model_name

                LOGGER.debug(
                    "Lazily setting up experiment service: experiment_name=%s",
                    experiment_name,
                )
                experiment_service.setup_lazy(
                    experiment_name=experiment_name,
                    experiment_config=experiment_config,
                )
            except Exception as e:
                LOGGER.debug("Failed to lazily setup experiment service: %s", e)

        result: TrialResult = await original(self)

        # From here on the trial has already succeeded. A tracing failure
        # must not discard its result, so each step is guarded separately.
        try:
            output_dict: Dict[str, Any] = {
                "trial_name": result.trial_name,
                "task_name": result.task_name,
            }
            feedback_scores = None
            verifier_result = result.verifier_result
            if verifier_result and verifier_result.rewards:
                output_dict["rewards"] = verifier_result.rewards
                # Prefer the verifier's error message, then the trial's.
                error_msg = getattr(verifier_result, "error", None) or getattr(
                    result, "error", None
                )
                feedback_scores = _rewards_to_feedback_scores(
                    verifier_result.rewards, error=error_msg
                )

            opik_context.update_current_trace(
                output=output_dict,
                feedback_scores=feedback_scores,
            )
        except Exception as e:
            LOGGER.debug("Failed to update trace with trial result: %s", e)

        try:
            trace_data = opik_context.get_current_trace_data()
            if trace_data is not None:
                service = experiment_service.get_service()
                LOGGER.debug(
                    "Linking trial to experiment: trial=%s, trace_id=%s, service=%s",
                    config.trial_name,
                    trace_data.id,
                    service,
                )
                if service is not None:
                    source = getattr(config.task, "source", None)
                    task_name = (
                        config.task.name
                        if hasattr(config.task, "name")
                        else str(config.task.path)
                    )
                    service.link_trial_to_experiment(
                        trial_name=config.trial_name,
                        trace_id=trace_data.id,
                        source=source,
                        task_name=task_name,
                    )
                else:
                    LOGGER.debug(
                        "No experiment service available, skipping experiment linking"
                    )
        except Exception as e:
            LOGGER.debug("Failed to link trial to experiment: %s", e)

        return result

    return wrapped
427
+
428
+
429
def _wrap_setup_environment(
    original: Callable, project_name: Optional[str]
) -> Callable:
    """Return a traced replacement for ``Trial._setup_environment``.

    The replacement runs inside a ``setup_environment`` span tagged
    ``harbor``, records the phase as span input before delegating to
    ``original``, and marks the span output as completed afterwards.
    """

    async def traced_setup_environment(self: Trial) -> None:
        opik_context.update_current_span(
            metadata={"created_from": "harbor"},
            input={"phase": "environment_setup"},
        )
        await original(self)
        opik_context.update_current_span(output={"status": "completed"})

    span_tracker = track(
        name="setup_environment", tags=["harbor"], project_name=project_name
    )
    # Copy the original's metadata first, then add tracing on top.
    return span_tracker(functools.wraps(original)(traced_setup_environment))
445
+
446
+
447
def _wrap_setup_agent(original: Callable, project_name: Optional[str]) -> Callable:
    """Return a traced replacement for ``Trial._setup_agent``.

    The replacement runs inside a ``setup_agent`` span tagged ``harbor``,
    records the phase as span input before delegating to ``original``, and
    marks the span output as completed afterwards.
    """

    async def traced_setup_agent(self: Trial) -> None:
        opik_context.update_current_span(
            metadata={"created_from": "harbor"},
            input={"phase": "agent_setup"},
        )
        await original(self)
        opik_context.update_current_span(output={"status": "completed"})

    span_tracker = track(name="setup_agent", tags=["harbor"], project_name=project_name)
    # Copy the original's metadata first, then add tracing on top.
    return span_tracker(functools.wraps(original)(traced_setup_agent))
461
+
462
+
463
def _wrap_execute_agent(original: Callable, project_name: Optional[str]) -> Callable:
    """Return a traced replacement for ``Trial._execute_agent``.

    The replacement runs inside an ``execute_agent`` span tagged ``harbor``.
    When the trial has a truthy ``_task`` attribute, its instruction is
    recorded as span input; otherwise the input is an empty dict.
    """

    async def traced_execute_agent(self: Trial) -> None:
        span_input = {}
        task = getattr(self, "_task", None)
        if task:
            span_input["instruction"] = task.instruction
        opik_context.update_current_span(
            metadata={"created_from": "harbor"},
            input=span_input,
        )
        await original(self)
        opik_context.update_current_span(output={"status": "completed"})

    span_tracker = track(
        name="execute_agent", tags=["harbor"], project_name=project_name
    )
    # Copy the original's metadata first, then add tracing on top.
    return span_tracker(functools.wraps(original)(traced_execute_agent))
480
+
481
+
482
def _wrap_run_verification(original: Callable, project_name: Optional[str]) -> Callable:
    """Return a traced replacement for ``Trial._run_verification``.

    The replacement runs inside a ``run_verification`` span tagged
    ``harbor``, records the phase as span input before delegating to
    ``original``, and marks the span output as completed afterwards.
    """

    async def traced_run_verification(self: Trial) -> None:
        opik_context.update_current_span(
            metadata={"created_from": "harbor"},
            input={"phase": "verification"},
        )
        await original(self)
        opik_context.update_current_span(output={"status": "completed"})

    span_tracker = track(
        name="run_verification", tags=["harbor"], project_name=project_name
    )
    # Copy the original's metadata first, then add tracing on top.
    return span_tracker(functools.wraps(original)(traced_run_verification))
496
+
497
+
498
def _wrap_verify(original: Callable, project_name: Optional[str]) -> Callable:
    """Return a traced replacement for ``Verifier.verify``.

    The replacement runs inside a ``verify`` span tagged ``harbor``. The span
    output is the verifier's rewards when they are non-empty, otherwise a
    completed-status marker. The verifier result is returned unchanged.
    """

    async def traced_verify(self: Verifier) -> VerifierResult:
        opik_context.update_current_span(
            metadata={"created_from": "harbor"},
            input={"phase": "verify"},
        )
        verifier_result: VerifierResult = await original(self)

        span_output: Dict[str, Any]
        if verifier_result.rewards:
            span_output = {"rewards": verifier_result.rewards}
        else:
            span_output = {"status": "completed"}
        opik_context.update_current_span(output=span_output)

        return verifier_result

    span_tracker = track(name="verify", tags=["harbor"], project_name=project_name)
    # Copy the original's metadata first, then add tracing on top.
    return span_tracker(functools.wraps(original)(traced_verify))
520
+
521
+
522
def reset_harbor_tracking() -> None:
    """Reset Harbor tracking state; intended for tests.

    Only the experiment service is reset. Patches already installed on
    Harbor methods are not removed by this function.
    """
    experiment_service.reset()
@@ -91,12 +91,12 @@ class OpikTracer(tracing.Tracer):
91
91
  project_name=self._project_name,
92
92
  )
93
93
 
94
- trace_data, span_data = span_creation_handler.create_span_respecting_context(
94
+ result = span_creation_handler.create_span_respecting_context(
95
95
  start_span_arguments=start_span_parameters,
96
96
  distributed_trace_headers=None,
97
97
  )
98
98
  final_span_or_trace_data: Union[opik_span.SpanData, opik_trace.TraceData] = (
99
- trace_data if trace_data is not None else span_data
99
+ result.trace_data if result.trace_data is not None else result.span_data
100
100
  )
101
101
 
102
102
  return opik_span_bridge.OpikSpanBridge(final_span_or_trace_data)
@@ -1,4 +1,17 @@
1
- from .opik_tracer import OpikTracer
1
+ from .opik_tracer import (
2
+ OpikTracer,
3
+ LANGGRAPH_INTERRUPT_OUTPUT_KEY,
4
+ LANGGRAPH_RESUME_INPUT_KEY,
5
+ LANGGRAPH_INTERRUPT_METADATA_KEY,
6
+ )
2
7
  from .langgraph_async_context_bridge import extract_current_langgraph_span_data
8
+ from .langgraph_tracer_injector import track_langgraph
3
9
 
4
- __all__ = ["OpikTracer", "extract_current_langgraph_span_data"]
10
+ __all__ = [
11
+ "OpikTracer",
12
+ "extract_current_langgraph_span_data",
13
+ "track_langgraph",
14
+ "LANGGRAPH_INTERRUPT_OUTPUT_KEY",
15
+ "LANGGRAPH_RESUME_INPUT_KEY",
16
+ "LANGGRAPH_INTERRUPT_METADATA_KEY",
17
+ ]
@@ -0,0 +1,88 @@
1
+ import logging
2
+ from typing import Any, Dict, List, TypeVar
3
+
4
+ from langchain_core.runnables import base as runnables_base
5
+
6
+ from . import opik_tracer as opik_tracer_module
7
+
8
+ LOGGER = logging.getLogger(__name__)
9
+
10
+ CompiledGraphType = TypeVar("CompiledGraphType", bound=runnables_base.Runnable)
11
+
12
+
13
def track_langgraph(
    graph: CompiledGraphType,
    opik_tracer: opik_tracer_module.OpikTracer,
) -> CompiledGraphType:
    """
    Adds Opik tracking to a compiled LangGraph graph by injecting OpikTracer into its default config.

    After calling this function, subsequent invocations of the graph use the
    injected tracer without needing to pass it in the ``config`` parameter.

    The graph structure is obtained with ``graph.get_graph(xray=True)`` and
    handed to ``opik_tracer.set_graph`` before the tracer is injected.

    Args:
        graph: A compiled LangGraph graph (result of StateGraph.compile()).
        opik_tracer: An OpikTracer instance to use for tracking the graph.

    Returns:
        The same graph object, modified in place.

    Example:
        ```python
        from langgraph.graph import StateGraph, START, END
        from opik.integrations.langchain import OpikTracer, track_langgraph

        # Build your graph
        builder = StateGraph(State)
        builder.add_node("my_node", my_node_function)
        builder.add_edge(START, "my_node")
        builder.add_edge("my_node", END)

        # Compile the graph
        graph = builder.compile()

        # Create OpikTracer and track the graph once
        opik_tracer = OpikTracer(
            tags=["production"],
            metadata={"version": "1.0"}
        )
        graph = track_langgraph(graph, opik_tracer)

        # Now all invocations are tracked automatically
        result = graph.invoke({"message": "Hello"})
        # No need to pass config={"callbacks": [opik_tracer]} anymore!
        ```

    Note:
        - If the graph's default callbacks already contain an OpikTracer, a
          warning is logged and no second tracer is added.
        - The default ``callbacks`` entry may be missing, ``None``, a list of
          handlers, or a callback manager exposing ``add_handler``; all of
          these are supported.
        - The graph object is modified in-place and also returned for convenience.
        - For async invocations using `ainvoke()`, you may still need to use
          `extract_current_langgraph_span_data()` to propagate context to @track-decorated
          functions within async nodes.
    """
    graph_structure = graph.get_graph(xray=True)
    opik_tracer.set_graph(graph_structure)

    # Inject the callback into the graph's default config
    config: Dict[str, Any] = getattr(graph, "config", None) or {}
    graph.config = config  # type: ignore[attr-defined]

    callbacks: Any = config.get("callbacks")
    if callbacks is None:
        # Covers both a missing key and an explicit ``"callbacks": None``.
        callbacks = config["callbacks"] = []

    if hasattr(callbacks, "add_handler"):
        # Callback-manager style object: handlers live on ``.handlers`` and
        # are registered through ``add_handler`` rather than ``append``.
        existing_handlers = list(getattr(callbacks, "handlers", None) or [])
        register_handler = callbacks.add_handler
    else:
        if not isinstance(callbacks, list):
            # e.g. a tuple of handlers: switch to a mutable list.
            callbacks = config["callbacks"] = list(callbacks)
        existing_handlers = callbacks
        register_handler = callbacks.append

    if any(
        isinstance(cb, opik_tracer_module.OpikTracer) for cb in existing_handlers
    ):
        LOGGER.warning(
            "Graph already has an OpikTracer callback injected. "
            "Skipping re-tracking to avoid duplicate callbacks."
        )
        return graph

    register_handler(opik_tracer)

    return graph