opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. opik/api_objects/attachment/attachment_context.py +36 -0
  2. opik/api_objects/attachment/attachments_extractor.py +153 -0
  3. opik/api_objects/attachment/client.py +1 -0
  4. opik/api_objects/attachment/converters.py +2 -0
  5. opik/api_objects/attachment/decoder.py +18 -0
  6. opik/api_objects/attachment/decoder_base64.py +83 -0
  7. opik/api_objects/attachment/decoder_helpers.py +137 -0
  8. opik/api_objects/constants.py +2 -0
  9. opik/api_objects/dataset/dataset.py +133 -40
  10. opik/api_objects/dataset/rest_operations.py +2 -0
  11. opik/api_objects/experiment/experiment.py +6 -0
  12. opik/api_objects/helpers.py +8 -4
  13. opik/api_objects/local_recording.py +6 -5
  14. opik/api_objects/observation_data.py +101 -0
  15. opik/api_objects/opik_client.py +78 -45
  16. opik/api_objects/opik_query_language.py +9 -3
  17. opik/api_objects/prompt/chat/chat_prompt.py +18 -1
  18. opik/api_objects/prompt/client.py +8 -1
  19. opik/api_objects/span/span_data.py +3 -88
  20. opik/api_objects/threads/threads_client.py +7 -4
  21. opik/api_objects/trace/trace_data.py +3 -74
  22. opik/api_objects/validation_helpers.py +3 -3
  23. opik/cli/exports/__init__.py +131 -0
  24. opik/cli/exports/dataset.py +278 -0
  25. opik/cli/exports/experiment.py +784 -0
  26. opik/cli/exports/project.py +685 -0
  27. opik/cli/exports/prompt.py +578 -0
  28. opik/cli/exports/utils.py +406 -0
  29. opik/cli/harbor.py +39 -0
  30. opik/cli/imports/__init__.py +439 -0
  31. opik/cli/imports/dataset.py +143 -0
  32. opik/cli/imports/experiment.py +1192 -0
  33. opik/cli/imports/project.py +262 -0
  34. opik/cli/imports/prompt.py +177 -0
  35. opik/cli/imports/utils.py +280 -0
  36. opik/cli/main.py +14 -12
  37. opik/config.py +12 -1
  38. opik/datetime_helpers.py +12 -0
  39. opik/decorator/arguments_helpers.py +4 -1
  40. opik/decorator/base_track_decorator.py +111 -37
  41. opik/decorator/context_manager/span_context_manager.py +5 -1
  42. opik/decorator/generator_wrappers.py +5 -4
  43. opik/decorator/span_creation_handler.py +13 -4
  44. opik/evaluation/engine/engine.py +111 -28
  45. opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
  46. opik/evaluation/evaluator.py +12 -0
  47. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
  48. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
  49. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
  50. opik/evaluation/metrics/heuristics/equals.py +11 -7
  51. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
  52. opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
  53. opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
  54. opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
  55. opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
  56. opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
  57. opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
  58. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
  59. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
  60. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
  61. opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
  62. opik/evaluation/metrics/ragas_metric.py +43 -23
  63. opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
  64. opik/evaluation/models/litellm/util.py +4 -20
  65. opik/evaluation/models/models_factory.py +19 -5
  66. opik/evaluation/rest_operations.py +3 -3
  67. opik/evaluation/threads/helpers.py +3 -2
  68. opik/file_upload/file_uploader.py +13 -0
  69. opik/file_upload/upload_options.py +2 -0
  70. opik/integrations/adk/legacy_opik_tracer.py +9 -11
  71. opik/integrations/adk/opik_tracer.py +2 -2
  72. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
  73. opik/integrations/dspy/callback.py +100 -14
  74. opik/integrations/dspy/parsers.py +168 -0
  75. opik/integrations/harbor/__init__.py +17 -0
  76. opik/integrations/harbor/experiment_service.py +269 -0
  77. opik/integrations/harbor/opik_tracker.py +528 -0
  78. opik/integrations/haystack/opik_tracer.py +2 -2
  79. opik/integrations/langchain/__init__.py +15 -2
  80. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  81. opik/integrations/langchain/opik_tracer.py +258 -160
  82. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
  83. opik/integrations/llama_index/callback.py +43 -6
  84. opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
  85. opik/integrations/openai/opik_tracker.py +99 -4
  86. opik/integrations/openai/videos/__init__.py +9 -0
  87. opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
  88. opik/integrations/openai/videos/videos_create_decorator.py +159 -0
  89. opik/integrations/openai/videos/videos_download_decorator.py +110 -0
  90. opik/message_processing/batching/base_batcher.py +14 -21
  91. opik/message_processing/batching/batch_manager.py +22 -10
  92. opik/message_processing/batching/batchers.py +32 -40
  93. opik/message_processing/batching/flushing_thread.py +0 -3
  94. opik/message_processing/emulation/emulator_message_processor.py +36 -1
  95. opik/message_processing/emulation/models.py +21 -0
  96. opik/message_processing/messages.py +9 -0
  97. opik/message_processing/preprocessing/__init__.py +0 -0
  98. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  99. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  100. opik/message_processing/preprocessing/constants.py +1 -0
  101. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  102. opik/message_processing/preprocessing/preprocessor.py +36 -0
  103. opik/message_processing/processors/__init__.py +0 -0
  104. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  105. opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
  106. opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
  107. opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
  108. opik/message_processing/queue_consumer.py +4 -2
  109. opik/message_processing/streamer.py +71 -33
  110. opik/message_processing/streamer_constructors.py +36 -8
  111. opik/plugins/pytest/experiment_runner.py +1 -1
  112. opik/plugins/pytest/hooks.py +5 -3
  113. opik/rest_api/__init__.py +42 -0
  114. opik/rest_api/datasets/client.py +321 -123
  115. opik/rest_api/datasets/raw_client.py +470 -145
  116. opik/rest_api/experiments/client.py +26 -0
  117. opik/rest_api/experiments/raw_client.py +26 -0
  118. opik/rest_api/llm_provider_key/client.py +4 -4
  119. opik/rest_api/llm_provider_key/raw_client.py +4 -4
  120. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
  121. opik/rest_api/manual_evaluation/client.py +101 -0
  122. opik/rest_api/manual_evaluation/raw_client.py +172 -0
  123. opik/rest_api/optimizations/client.py +0 -166
  124. opik/rest_api/optimizations/raw_client.py +0 -248
  125. opik/rest_api/projects/client.py +9 -0
  126. opik/rest_api/projects/raw_client.py +13 -0
  127. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
  128. opik/rest_api/prompts/client.py +130 -2
  129. opik/rest_api/prompts/raw_client.py +175 -0
  130. opik/rest_api/traces/client.py +101 -0
  131. opik/rest_api/traces/raw_client.py +120 -0
  132. opik/rest_api/types/__init__.py +50 -0
  133. opik/rest_api/types/audio_url.py +19 -0
  134. opik/rest_api/types/audio_url_public.py +19 -0
  135. opik/rest_api/types/audio_url_write.py +19 -0
  136. opik/rest_api/types/automation_rule_evaluator.py +38 -2
  137. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
  138. opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
  139. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  140. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  141. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  142. opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
  143. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  144. opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
  145. opik/rest_api/types/dataset.py +2 -0
  146. opik/rest_api/types/dataset_item.py +1 -1
  147. opik/rest_api/types/dataset_item_batch.py +4 -0
  148. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  149. opik/rest_api/types/dataset_item_compare.py +1 -1
  150. opik/rest_api/types/dataset_item_filter.py +4 -0
  151. opik/rest_api/types/dataset_item_page_compare.py +0 -1
  152. opik/rest_api/types/dataset_item_page_public.py +0 -1
  153. opik/rest_api/types/dataset_item_public.py +1 -1
  154. opik/rest_api/types/dataset_public.py +2 -0
  155. opik/rest_api/types/dataset_version_public.py +10 -0
  156. opik/rest_api/types/dataset_version_summary.py +46 -0
  157. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  158. opik/rest_api/types/experiment.py +9 -0
  159. opik/rest_api/types/experiment_public.py +9 -0
  160. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  161. opik/rest_api/types/llm_as_judge_message_content.py +2 -0
  162. opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
  163. opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
  164. opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
  165. opik/rest_api/types/project.py +1 -0
  166. opik/rest_api/types/project_detailed.py +1 -0
  167. opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
  168. opik/rest_api/types/project_reference.py +31 -0
  169. opik/rest_api/types/project_reference_public.py +31 -0
  170. opik/rest_api/types/project_stats_summary_item.py +1 -0
  171. opik/rest_api/types/prompt_version.py +1 -0
  172. opik/rest_api/types/prompt_version_detail.py +1 -0
  173. opik/rest_api/types/prompt_version_page_public.py +5 -0
  174. opik/rest_api/types/prompt_version_public.py +1 -0
  175. opik/rest_api/types/prompt_version_update.py +33 -0
  176. opik/rest_api/types/provider_api_key.py +5 -1
  177. opik/rest_api/types/provider_api_key_provider.py +2 -1
  178. opik/rest_api/types/provider_api_key_public.py +5 -1
  179. opik/rest_api/types/provider_api_key_public_provider.py +2 -1
  180. opik/rest_api/types/service_toggles_config.py +11 -1
  181. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  182. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  183. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  184. opik/types.py +36 -0
  185. opik/validation/chat_prompt_messages.py +241 -0
  186. opik/validation/feedback_score.py +3 -3
  187. opik/validation/validator.py +28 -0
  188. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
  189. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
  190. opik/cli/export.py +0 -791
  191. opik/cli/import_command.py +0 -575
  192. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
  193. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
  194. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
  195. {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import os
2
3
  from typing import Optional
3
4
 
4
5
  import httpx
@@ -28,6 +29,10 @@ def upload_attachment(
28
29
  httpx_client=upload_httpx_client,
29
30
  monitor=monitor,
30
31
  )
32
+
33
+ # delete the file after upload if requested
34
+ if upload_options.delete_after_upload:
35
+ _delete_attachment_file(upload_options.file_path)
31
36
  except Exception as e:
32
37
  LOGGER.error(
33
38
  "Failed to upload attachment: '%s' from file: [%s] with size: [%s]. Error: %s",
@@ -40,6 +45,14 @@ def upload_attachment(
40
45
  raise
41
46
 
42
47
 
48
+ def _delete_attachment_file(file_path: str) -> None:
49
+ try:
50
+ os.unlink(file_path)
51
+ except OSError as e:
52
+ LOGGER.info(f"Failed to delete attachment file: '{file_path}'. Reason: {e}.")
53
+ pass
54
+
55
+
43
56
  def _do_upload_attachment(
44
57
  upload_options: file_upload_options.FileUploadOptions,
45
58
  rest_client: rest_api_client.OpikApi,
@@ -16,6 +16,7 @@ class FileUploadOptions:
16
16
  entity_id: str
17
17
  project_name: str
18
18
  encoded_url_override: str
19
+ delete_after_upload: bool
19
20
 
20
21
 
21
22
  def file_upload_options_from_attachment(
@@ -32,4 +33,5 @@ def file_upload_options_from_attachment(
32
33
  entity_id=attachment.entity_id,
33
34
  project_name=attachment.project_name,
34
35
  encoded_url_override=attachment.encoded_url_override,
36
+ delete_after_upload=attachment.delete_after_upload,
35
37
  )
@@ -158,15 +158,13 @@ class LegacyOpikTracer:
158
158
  input=user_input,
159
159
  type="general",
160
160
  )
161
- _, opik_span_data = (
162
- span_creation_handler.create_span_respecting_context(
163
- start_span_arguments=start_span_arguments,
164
- distributed_trace_headers=None,
165
- opik_context_storage=self._context_storage,
166
- )
161
+ result = span_creation_handler.create_span_respecting_context(
162
+ start_span_arguments=start_span_arguments,
163
+ distributed_trace_headers=None,
164
+ opik_context_storage=self._context_storage,
167
165
  )
168
166
 
169
- self._start_span(span_data=opik_span_data)
167
+ self._start_span(span_data=result.span_data)
170
168
  except Exception as e:
171
169
  LOGGER.error(f"Failed during before_agent_callback(): {e}", exc_info=True)
172
170
 
@@ -212,7 +210,7 @@ class LegacyOpikTracer:
212
210
  if provider is None:
213
211
  provider = adk_helpers.get_adk_provider()
214
212
 
215
- _, span_data = span_creation_handler.create_span_respecting_context(
213
+ result = span_creation_handler.create_span_respecting_context(
216
214
  start_span_arguments=arguments_helpers.StartSpanParameters(
217
215
  name=llm_request.model,
218
216
  project_name=self.project_name,
@@ -226,7 +224,7 @@ class LegacyOpikTracer:
226
224
  opik_context_storage=self._context_storage,
227
225
  )
228
226
 
229
- self._start_span(span_data=span_data)
227
+ self._start_span(span_data=result.span_data)
230
228
 
231
229
  except Exception as e:
232
230
  LOGGER.error(f"Failed during before_model_callback(): {e}", exc_info=True)
@@ -300,7 +298,7 @@ class LegacyOpikTracer:
300
298
  **self.metadata,
301
299
  }
302
300
 
303
- _, span_data = span_creation_handler.create_span_respecting_context(
301
+ result = span_creation_handler.create_span_respecting_context(
304
302
  start_span_arguments=arguments_helpers.StartSpanParameters(
305
303
  name=tool.name,
306
304
  project_name=self.project_name,
@@ -312,7 +310,7 @@ class LegacyOpikTracer:
312
310
  opik_context_storage=self._context_storage,
313
311
  )
314
312
 
315
- self._start_span(span_data=span_data)
313
+ self._start_span(span_data=result.span_data)
316
314
 
317
315
  except Exception as e:
318
316
  LOGGER.error(f"Failed during before_tool_callback(): {e}", exc_info=True)
@@ -173,7 +173,7 @@ class OpikTracer:
173
173
  # ADK runs `before_model_callback` before running `start_as_current_span` function for the LLM call,
174
174
  # which makes it impossible to update the Opik span from this method.
175
175
  # So we create a span manually here. This flow is handled inside ADKTracerWrapper.
176
- _, span_data = span_creation_handler.create_span_respecting_context(
176
+ result = span_creation_handler.create_span_respecting_context(
177
177
  start_span_arguments=arguments_helpers.StartSpanParameters(
178
178
  name=model,
179
179
  project_name=self.project_name,
@@ -189,7 +189,7 @@ class OpikTracer:
189
189
  distributed_trace_headers=None,
190
190
  )
191
191
 
192
- context_storage.add_span_data(span_data)
192
+ context_storage.add_span_data(result.span_data)
193
193
  except Exception as e:
194
194
  LOGGER.error(f"Failed during before_model_callback(): {e}", exc_info=True)
195
195
 
@@ -190,11 +190,11 @@ def _prepare_trace_and_span_to_be_finalized(
190
190
  type="general",
191
191
  )
192
192
 
193
- _, span_to_close_in_finally_block = (
193
+ span_to_close_in_finally_block = (
194
194
  span_creation_handler.create_span_respecting_context(
195
195
  start_span_arguments=start_span_arguments,
196
196
  distributed_trace_headers=None,
197
- )
197
+ ).span_data
198
198
  )
199
199
  opik.context_storage.add_span_data(span_to_close_in_finally_block)
200
200
 
@@ -1,14 +1,16 @@
1
- from typing import Any, Dict, Optional, Union
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
2
  import logging
3
3
 
4
4
  import dspy
5
5
  from dspy.utils import callback as dspy_callback
6
6
 
7
- from opik import context_storage, opik_context, tracing_runtime_config, types
7
+ from opik import context_storage, opik_context, tracing_runtime_config
8
+ from opik import llm_usage
8
9
  from opik.api_objects import helpers, span, trace, opik_client
9
10
  from opik.decorator import error_info_collector
10
11
 
11
12
  from .graph import build_mermaid_graph_from_module
13
+ from .parsers import LMHistoryInfo, extract_lm_info_from_history, get_span_type
12
14
 
13
15
  LOGGER = logging.getLogger(__name__)
14
16
 
@@ -32,6 +34,8 @@ class OpikCallback(dspy_callback.BaseCallback):
32
34
  ):
33
35
  self._map_call_id_to_span_data: Dict[str, span.SpanData] = {}
34
36
  self._map_call_id_to_trace_data: Dict[str, trace.TraceData] = {}
37
+ # Store (lm_instance, expected_messages) for extracting usage and verifying correct history entry
38
+ self._map_call_id_to_lm_info: Dict[str, Tuple[Any, Optional[Any]]] = {}
35
39
 
36
40
  self._origins_metadata: Dict[str, Any] = {"created_from": "dspy"}
37
41
 
@@ -103,7 +107,7 @@ class OpikCallback(dspy_callback.BaseCallback):
103
107
  parent_project_name=current_span_data.project_name,
104
108
  child_project_name=self._project_name,
105
109
  )
106
- span_type = self._get_span_type(instance)
110
+ span_type = get_span_type(instance)
107
111
 
108
112
  span_data = span.SpanData(
109
113
  trace_id=current_span_data.trace_id,
@@ -127,7 +131,7 @@ class OpikCallback(dspy_callback.BaseCallback):
127
131
  current_trace_data.project_name,
128
132
  self._project_name,
129
133
  )
130
- span_type = self._get_span_type(instance)
134
+ span_type = get_span_type(instance)
131
135
 
132
136
  span_data = span.SpanData(
133
137
  trace_id=current_trace_data.id,
@@ -198,13 +202,54 @@ class OpikCallback(dspy_callback.BaseCallback):
198
202
  call_id: str,
199
203
  outputs: Optional[Any],
200
204
  exception: Optional[Exception] = None,
205
+ usage: Optional[llm_usage.OpikUsage] = None,
206
+ extra_metadata: Optional[Dict[str, Any]] = None,
207
+ actual_provider: Optional[str] = None,
208
+ actual_model: Optional[str] = None,
209
+ total_cost: Optional[float] = None,
201
210
  ) -> None:
202
211
  if span_data := self._map_call_id_to_span_data.pop(call_id, None):
203
212
  if exception:
204
213
  error_info = error_info_collector.collect(exception)
205
214
  span_data.update(error_info=error_info)
206
215
 
207
- span_data.update(output={"output": outputs}).init_end_time()
216
+ # Prepare the update dict
217
+ update_kwargs: Dict[str, Any] = {
218
+ "output": {"output": outputs},
219
+ "usage": usage,
220
+ "total_cost": total_cost,
221
+ }
222
+
223
+ # Handle LLM routers like OpenRouter that return the actual serving provider/model
224
+ if extra_metadata is None:
225
+ extra_metadata = {}
226
+
227
+ # Update provider if actual provider differs (e.g., OpenRouter -> Hyperbolic)
228
+ if (
229
+ actual_provider is not None
230
+ and span_data.provider is not None
231
+ and span_data.provider.lower() != actual_provider.lower()
232
+ ):
233
+ # Store the original provider (e.g., "openrouter") in metadata
234
+ extra_metadata["llm_router"] = span_data.provider
235
+ # Update to the actual provider for accurate cost tracking
236
+ update_kwargs["provider"] = actual_provider.lower()
237
+
238
+ if (
239
+ actual_model is not None
240
+ and span_data.model is not None
241
+ and span_data.model != actual_model
242
+ ):
243
+ # Store the original model (e.g., "@preset/qwen") in metadata
244
+ extra_metadata["original_model"] = span_data.model
245
+ # Update to the actual model for accurate cost tracking
246
+ update_kwargs["model"] = actual_model
247
+
248
+ # Only set metadata if we have something to add
249
+ if extra_metadata:
250
+ update_kwargs["metadata"] = extra_metadata
251
+
252
+ span_data.update(**update_kwargs).init_end_time()
208
253
  if tracing_runtime_config.is_tracing_active():
209
254
  self._opik_client.span(**span_data.as_parameters)
210
255
 
@@ -231,7 +276,7 @@ class OpikCallback(dspy_callback.BaseCallback):
231
276
  trace_id = current_callback_context_data.id
232
277
  parent_span_id = None
233
278
 
234
- span_type = self._get_span_type(instance)
279
+ span_type = get_span_type(instance)
235
280
 
236
281
  return span.SpanData(
237
282
  trace_id=trace_id,
@@ -263,6 +308,13 @@ class OpikCallback(dspy_callback.BaseCallback):
263
308
  name=f"{span_data.name}: {provider} - {model}",
264
309
  )
265
310
  self._map_call_id_to_span_data[call_id] = span_data
311
+
312
+ # Store LM instance and expected messages for extracting usage
313
+ self._map_call_id_to_lm_info[call_id] = (
314
+ instance,
315
+ inputs.get("messages"),
316
+ )
317
+
266
318
  self._set_current_context_data(span_data)
267
319
 
268
320
  def on_lm_end(
@@ -271,10 +323,22 @@ class OpikCallback(dspy_callback.BaseCallback):
271
323
  outputs: Optional[Dict[str, Any]],
272
324
  exception: Optional[Exception] = None,
273
325
  ) -> None:
326
+ lm_info = self._extract_lm_info_from_history(call_id)
327
+
328
+ # Add cache_hit to span metadata only when we have a definitive value
329
+ extra_metadata = (
330
+ {"cache_hit": lm_info.cache_hit} if lm_info.cache_hit is not None else None
331
+ )
332
+
274
333
  self._end_span(
275
334
  call_id=call_id,
276
335
  exception=exception,
277
336
  outputs=outputs,
337
+ usage=lm_info.usage,
338
+ extra_metadata=extra_metadata,
339
+ actual_provider=lm_info.actual_provider,
340
+ actual_model=lm_info.actual_model,
341
+ total_cost=lm_info.total_cost,
278
342
  )
279
343
 
280
344
  def on_tool_start(
@@ -316,14 +380,36 @@ class OpikCallback(dspy_callback.BaseCallback):
316
380
  return span_data
317
381
  return self._context_storage.get_trace_data()
318
382
 
319
- def _get_span_type(self, instance: Any) -> types.SpanType:
320
- if isinstance(instance, dspy.Predict):
321
- return "llm"
322
- elif isinstance(instance, dspy.LM):
323
- return "llm"
324
- elif isinstance(instance, dspy.Tool):
325
- return "tool"
326
- return "general"
383
+ def _extract_lm_info_from_history(self, call_id: str) -> LMHistoryInfo:
384
+ """
385
+ Extract token usage, cache status, actual provider, and cost from the LM's history.
386
+
387
+ DSPy stores usage information in the LM's history after each call.
388
+ We verify the history entry matches our expected messages to handle
389
+ potential race conditions with concurrent LM calls.
390
+
391
+ For routers like OpenRouter, the response contains the actual provider
392
+ that served the request (e.g., "Novita", "Together"), which differs from
393
+ the router name used in the model string (e.g., "openrouter").
394
+
395
+ The cost field is provided by providers like OpenRouter and includes
396
+ accurate pricing for all token types (reasoning, cache, multimodal).
397
+
398
+ Returns:
399
+ LMHistoryInfo containing usage, cache_hit, actual_provider, and total_cost.
400
+ """
401
+ lm_info = self._map_call_id_to_lm_info.pop(call_id, None)
402
+ if lm_info is None:
403
+ return LMHistoryInfo(
404
+ usage=None,
405
+ cache_hit=None,
406
+ actual_provider=None,
407
+ actual_model=None,
408
+ total_cost=None,
409
+ )
410
+
411
+ lm_instance, expected_messages = lm_info
412
+ return extract_lm_info_from_history(lm_instance, expected_messages)
327
413
 
328
414
  def _get_opik_metadata(self, instance: Any) -> Dict[str, Any]:
329
415
  graph = None
@@ -0,0 +1,168 @@
1
+ """
2
+ Parsers and data structures for extracting information from DSPy LM responses.
3
+
4
+ This module contains utilities for parsing DSPy LM history entries and
5
+ extracting relevant information like usage, provider, and cost data.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any, Optional
10
+ import logging
11
+
12
+ import dspy
13
+
14
+ from opik import llm_usage, types
15
+
16
+ LOGGER = logging.getLogger(__name__)
17
+
18
+
19
+ @dataclass
20
+ class LMHistoryInfo:
21
+ """
22
+ Information extracted from a DSPy LM history entry.
23
+
24
+ This dataclass holds the parsed information from an LM call's history,
25
+ including usage statistics, cache status, provider information, and cost.
26
+
27
+ Attributes:
28
+ usage: Token usage information (prompt, completion, total tokens)
29
+ cache_hit: Whether the response was served from cache.
30
+ True if cached, False if not, None if unknown.
31
+ actual_provider: The actual provider that served the request.
32
+ This is useful for LLM routers like OpenRouter that may route
33
+ to different underlying providers (e.g., "Novita", "Together").
34
+ actual_model: The actual model that served the request.
35
+ This is useful for LLM routers like OpenRouter when using presets
36
+ (e.g., "@preset/qwen" resolves to "qwen/qwen3-235b-a22b-2507").
37
+ total_cost: The total cost of the request from the provider.
38
+ This includes accurate pricing for all token types.
39
+ """
40
+
41
+ usage: Optional[llm_usage.OpikUsage]
42
+ cache_hit: Optional[bool]
43
+ actual_provider: Optional[str]
44
+ actual_model: Optional[str]
45
+ total_cost: Optional[float]
46
+
47
+
48
+ def get_span_type(instance: Any) -> types.SpanType:
49
+ """
50
+ Determine the span type based on the DSPy instance type.
51
+
52
+ Args:
53
+ instance: A DSPy module, LM, or tool instance.
54
+
55
+ Returns:
56
+ The appropriate span type: "llm" for Predict/LM, "tool" for Tool,
57
+ or "general" for other types.
58
+ """
59
+ if isinstance(instance, dspy.Predict):
60
+ return "llm"
61
+ elif isinstance(instance, dspy.LM):
62
+ return "llm"
63
+ elif isinstance(instance, dspy.Tool):
64
+ return "tool"
65
+ return "general"
66
+
67
+
68
+ def extract_lm_info_from_history(
69
+ lm_instance: Any,
70
+ expected_messages: Optional[Any],
71
+ ) -> LMHistoryInfo:
72
+ """
73
+ Extract token usage, cache status, actual provider, and cost from the LM's history.
74
+
75
+ DSPy stores usage information in the LM's history after each call.
76
+ We verify the history entry matches our expected messages to handle
77
+ potential race conditions with concurrent LM calls.
78
+
79
+ For routers like OpenRouter, the response contains the actual provider
80
+ that served the request (e.g., "Novita", "Together"), which differs from
81
+ the router name used in the model string (e.g., "openrouter").
82
+
83
+ The cost field is provided by providers like OpenRouter and includes
84
+ accurate pricing for all token types (reasoning, cache, multimodal).
85
+
86
+ Args:
87
+ lm_instance: The DSPy LM instance that has the history.
88
+ expected_messages: The expected messages to match in the history entry.
89
+
90
+ Returns:
91
+ LMHistoryInfo containing usage, cache_hit, actual_provider, and total_cost.
92
+ """
93
+ empty_result = LMHistoryInfo(
94
+ usage=None,
95
+ cache_hit=None,
96
+ actual_provider=None,
97
+ actual_model=None,
98
+ total_cost=None,
99
+ )
100
+
101
+ if not hasattr(lm_instance, "history") or not lm_instance.history:
102
+ return empty_result
103
+
104
+ try:
105
+ last_entry = lm_instance.history[-1]
106
+
107
+ # Verify we have the correct history entry by checking messages match
108
+ if last_entry.get("messages") != expected_messages:
109
+ LOGGER.debug(
110
+ "History entry messages don't match expected messages, "
111
+ "skipping usage extraction (possibly due to concurrent LM calls)"
112
+ )
113
+ return empty_result
114
+
115
+ response = last_entry.get("response")
116
+ usage_dict = last_entry.get("usage")
117
+
118
+ # Extract actual provider and model from response (useful for routers like OpenRouter)
119
+ # The response is a LiteLLM ModelResponse object with 'provider' and 'model' attributes
120
+ # when using routers like OpenRouter
121
+ actual_provider: Optional[str] = None
122
+ actual_model: Optional[str] = None
123
+ if response is not None:
124
+ if hasattr(response, "provider"):
125
+ actual_provider = response.provider
126
+ if hasattr(response, "model"):
127
+ actual_model = response.model
128
+
129
+ # Extract cost from history entry or usage dict
130
+ # OpenRouter and other providers return accurate cost including all token types
131
+ total_cost: Optional[float] = None
132
+ if (cost := last_entry.get("cost") or 0) > 0:
133
+ total_cost = cost
134
+ elif usage_dict and (cost := usage_dict.get("cost") or 0) > 0:
135
+ total_cost = cost
136
+
137
+ # Get explicit cache_hit if set, otherwise infer from usage (empty = cached)
138
+ if response is None:
139
+ cache_hit = not usage_dict
140
+ elif hasattr(response, "cache_hit") and response.cache_hit is not None:
141
+ cache_hit = response.cache_hit
142
+ else:
143
+ # Fallback: infer from usage (empty = cached)
144
+ cache_hit = not usage_dict
145
+
146
+ if usage_dict:
147
+ usage = llm_usage.build_opik_usage_from_unknown_provider(usage_dict)
148
+ return LMHistoryInfo(
149
+ usage=usage,
150
+ cache_hit=cache_hit,
151
+ actual_provider=actual_provider,
152
+ actual_model=actual_model,
153
+ total_cost=total_cost,
154
+ )
155
+ else:
156
+ return LMHistoryInfo(
157
+ usage=None,
158
+ cache_hit=cache_hit,
159
+ actual_provider=actual_provider,
160
+ actual_model=actual_model,
161
+ total_cost=None,
162
+ )
163
+ except Exception:
164
+ LOGGER.debug(
165
+ "Failed to extract info from DSPy LM history",
166
+ exc_info=True,
167
+ )
168
+ return empty_result
@@ -0,0 +1,17 @@
1
+ """
2
+ Opik integration for Harbor benchmark evaluation framework.
3
+
4
+ Example:
5
+ >>> from opik.integrations.harbor import track_harbor
6
+ >>> job = Job(config)
7
+ >>> tracked_job = track_harbor(job)
8
+ >>> result = await tracked_job.run()
9
+
10
+ Or enable tracking globally (for CLI usage):
11
+ >>> from opik.integrations.harbor import track_harbor
12
+ >>> track_harbor()
13
+ """
14
+
15
+ from .opik_tracker import track_harbor, reset_harbor_tracking
16
+
17
+ __all__ = ["track_harbor", "reset_harbor_tracking"]