opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/constants.py +2 -0
- opik/api_objects/dataset/dataset.py +133 -40
- opik/api_objects/dataset/rest_operations.py +2 -0
- opik/api_objects/experiment/experiment.py +6 -0
- opik/api_objects/helpers.py +8 -4
- opik/api_objects/local_recording.py +6 -5
- opik/api_objects/observation_data.py +101 -0
- opik/api_objects/opik_client.py +78 -45
- opik/api_objects/opik_query_language.py +9 -3
- opik/api_objects/prompt/chat/chat_prompt.py +18 -1
- opik/api_objects/prompt/client.py +8 -1
- opik/api_objects/span/span_data.py +3 -88
- opik/api_objects/threads/threads_client.py +7 -4
- opik/api_objects/trace/trace_data.py +3 -74
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +14 -12
- opik/config.py +12 -1
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +4 -1
- opik/decorator/base_track_decorator.py +111 -37
- opik/decorator/context_manager/span_context_manager.py +5 -1
- opik/decorator/generator_wrappers.py +5 -4
- opik/decorator/span_creation_handler.py +13 -4
- opik/evaluation/engine/engine.py +111 -28
- opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
- opik/evaluation/evaluator.py +12 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
- opik/evaluation/metrics/heuristics/equals.py +11 -7
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
- opik/evaluation/models/litellm/util.py +4 -20
- opik/evaluation/models/models_factory.py +19 -5
- opik/evaluation/rest_operations.py +3 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/integrations/adk/legacy_opik_tracer.py +9 -11
- opik/integrations/adk/opik_tracer.py +2 -2
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
- opik/integrations/dspy/callback.py +100 -14
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_tracer.py +2 -2
- opik/integrations/langchain/__init__.py +15 -2
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_tracer.py +258 -160
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
- opik/integrations/llama_index/callback.py +43 -6
- opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
- opik/integrations/openai/opik_tracker.py +99 -4
- opik/integrations/openai/videos/__init__.py +9 -0
- opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
- opik/integrations/openai/videos/videos_create_decorator.py +159 -0
- opik/integrations/openai/videos/videos_download_decorator.py +110 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batchers.py +32 -40
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/emulator_message_processor.py +36 -1
- opik/message_processing/emulation/models.py +21 -0
- opik/message_processing/messages.py +9 -0
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
- opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
- opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
- opik/message_processing/queue_consumer.py +4 -2
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +36 -8
- opik/plugins/pytest/experiment_runner.py +1 -1
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +42 -0
- opik/rest_api/datasets/client.py +321 -123
- opik/rest_api/datasets/raw_client.py +470 -145
- opik/rest_api/experiments/client.py +26 -0
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/llm_provider_key/client.py +4 -4
- opik/rest_api/llm_provider_key/raw_client.py +4 -4
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
- opik/rest_api/manual_evaluation/client.py +101 -0
- opik/rest_api/manual_evaluation/raw_client.py +172 -0
- opik/rest_api/optimizations/client.py +0 -166
- opik/rest_api/optimizations/raw_client.py +0 -248
- opik/rest_api/projects/client.py +9 -0
- opik/rest_api/projects/raw_client.py +13 -0
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
- opik/rest_api/prompts/client.py +130 -2
- opik/rest_api/prompts/raw_client.py +175 -0
- opik/rest_api/traces/client.py +101 -0
- opik/rest_api/traces/raw_client.py +120 -0
- opik/rest_api/types/__init__.py +50 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +38 -2
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +1 -1
- opik/rest_api/types/dataset_item_batch.py +4 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +1 -1
- opik/rest_api/types/dataset_item_filter.py +4 -0
- opik/rest_api/types/dataset_item_page_compare.py +0 -1
- opik/rest_api/types/dataset_item_page_public.py +0 -1
- opik/rest_api/types/dataset_item_public.py +1 -1
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_version_public.py +10 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +9 -0
- opik/rest_api/types/experiment_public.py +9 -0
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/llm_as_judge_message_content.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt_version.py +1 -0
- opik/rest_api/types/prompt_version_detail.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +1 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +5 -1
- opik/rest_api/types/provider_api_key_provider.py +2 -1
- opik/rest_api/types/provider_api_key_public.py +5 -1
- opik/rest_api/types/provider_api_key_public_provider.py +2 -1
- opik/rest_api/types/service_toggles_config.py +11 -1
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
- opik/cli/export.py +0 -791
- opik/cli/import_command.py +0 -575
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
opik/file_upload/file_uploader.py

@@ -1,4 +1,5 @@
 import logging
+import os
 from typing import Optional

 import httpx
@@ -28,6 +29,10 @@ def upload_attachment(
             httpx_client=upload_httpx_client,
             monitor=monitor,
         )
+
+        # delete the file after upload if requested
+        if upload_options.delete_after_upload:
+            _delete_attachment_file(upload_options.file_path)
     except Exception as e:
         LOGGER.error(
             "Failed to upload attachment: '%s' from file: [%s] with size: [%s]. Error: %s",
@@ -40,6 +45,14 @@ def upload_attachment(
         raise


+def _delete_attachment_file(file_path: str) -> None:
+    try:
+        os.unlink(file_path)
+    except OSError as e:
+        LOGGER.info(f"Failed to delete attachment file: '{file_path}'. Reason: {e}.")
+        pass
+
+
 def _do_upload_attachment(
     upload_options: file_upload_options.FileUploadOptions,
     rest_client: rest_api_client.OpikApi,
opik/file_upload/upload_options.py

@@ -16,6 +16,7 @@ class FileUploadOptions:
     entity_id: str
     project_name: str
     encoded_url_override: str
+    delete_after_upload: bool


 def file_upload_options_from_attachment(
@@ -32,4 +33,5 @@ def file_upload_options_from_attachment(
         entity_id=attachment.entity_id,
         project_name=attachment.project_name,
         encoded_url_override=attachment.encoded_url_override,
+        delete_after_upload=attachment.delete_after_upload,
     )
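Taken together, the two hunks above thread a new delete_after_upload flag from the attachment metadata down to the uploader, which removes the source file once the upload finishes (presumably for files the SDK materializes itself, such as base64 attachments decoded to a temporary file). A minimal, self-contained sketch of that pattern, independent of the Opik internals; upload_temp_attachment and do_upload are hypothetical stand-ins, not part of this diff:

import os
import tempfile
from typing import Callable


def upload_temp_attachment(data: bytes, do_upload: Callable[[str], None]) -> None:
    # Write the payload to a temp file, upload it, then delete the file,
    # which is the cleanup behaviour the delete_after_upload flag enables above.
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        tmp.write(data)
        tmp.close()
        do_upload(tmp.name)
    finally:
        try:
            os.unlink(tmp.name)
        except OSError:
            pass  # best-effort cleanup, as in _delete_attachment_file: log-and-continue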
opik/integrations/adk/legacy_opik_tracer.py

@@ -158,15 +158,13 @@ class LegacyOpikTracer:
                 input=user_input,
                 type="general",
             )
-
-
-
-
-                opik_context_storage=self._context_storage,
-            )
+            result = span_creation_handler.create_span_respecting_context(
+                start_span_arguments=start_span_arguments,
+                distributed_trace_headers=None,
+                opik_context_storage=self._context_storage,
             )

-            self._start_span(span_data=
+            self._start_span(span_data=result.span_data)
         except Exception as e:
             LOGGER.error(f"Failed during before_agent_callback(): {e}", exc_info=True)

@@ -212,7 +210,7 @@ class LegacyOpikTracer:
             if provider is None:
                 provider = adk_helpers.get_adk_provider()

-
+            result = span_creation_handler.create_span_respecting_context(
                 start_span_arguments=arguments_helpers.StartSpanParameters(
                     name=llm_request.model,
                     project_name=self.project_name,
@@ -226,7 +224,7 @@ class LegacyOpikTracer:
                 opik_context_storage=self._context_storage,
             )

-            self._start_span(span_data=span_data)
+            self._start_span(span_data=result.span_data)

         except Exception as e:
             LOGGER.error(f"Failed during before_model_callback(): {e}", exc_info=True)
@@ -300,7 +298,7 @@ class LegacyOpikTracer:
                 **self.metadata,
             }

-
+            result = span_creation_handler.create_span_respecting_context(
                 start_span_arguments=arguments_helpers.StartSpanParameters(
                     name=tool.name,
                     project_name=self.project_name,
@@ -312,7 +310,7 @@ class LegacyOpikTracer:
                 opik_context_storage=self._context_storage,
             )

-            self._start_span(span_data=span_data)
+            self._start_span(span_data=result.span_data)

         except Exception as e:
             LOGGER.error(f"Failed during before_tool_callback(): {e}", exc_info=True)
opik/integrations/adk/opik_tracer.py

@@ -173,7 +173,7 @@ class OpikTracer:
             # ADK runs `before_model_callback` before running `start_as_current_span` function for the LLM call,
             # which makes it impossible to update the Opik span from this method.
             # So we create a span manually here. This flow is handled inside ADKTracerWrapper.
-
+            result = span_creation_handler.create_span_respecting_context(
                 start_span_arguments=arguments_helpers.StartSpanParameters(
                     name=model,
                     project_name=self.project_name,
@@ -189,7 +189,7 @@ class OpikTracer:
                 distributed_trace_headers=None,
             )

-            context_storage.add_span_data(span_data)
+            context_storage.add_span_data(result.span_data)
         except Exception as e:
             LOGGER.error(f"Failed during before_model_callback(): {e}", exc_info=True)

opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py

@@ -190,11 +190,11 @@ def _prepare_trace_and_span_to_be_finalized(
         type="general",
     )

-
+    span_to_close_in_finally_block = (
         span_creation_handler.create_span_respecting_context(
             start_span_arguments=start_span_arguments,
             distributed_trace_headers=None,
-        )
+        ).span_data
     )
     opik.context_storage.add_span_data(span_to_close_in_finally_block)

opik/integrations/dspy/callback.py

@@ -1,14 +1,16 @@
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
 import logging

 import dspy
 from dspy.utils import callback as dspy_callback

-from opik import context_storage, opik_context, tracing_runtime_config
+from opik import context_storage, opik_context, tracing_runtime_config
+from opik import llm_usage
 from opik.api_objects import helpers, span, trace, opik_client
 from opik.decorator import error_info_collector

 from .graph import build_mermaid_graph_from_module
+from .parsers import LMHistoryInfo, extract_lm_info_from_history, get_span_type

 LOGGER = logging.getLogger(__name__)

@@ -32,6 +34,8 @@ class OpikCallback(dspy_callback.BaseCallback):
     ):
         self._map_call_id_to_span_data: Dict[str, span.SpanData] = {}
         self._map_call_id_to_trace_data: Dict[str, trace.TraceData] = {}
+        # Store (lm_instance, expected_messages) for extracting usage and verifying correct history entry
+        self._map_call_id_to_lm_info: Dict[str, Tuple[Any, Optional[Any]]] = {}

         self._origins_metadata: Dict[str, Any] = {"created_from": "dspy"}

@@ -103,7 +107,7 @@ class OpikCallback(dspy_callback.BaseCallback):
                 parent_project_name=current_span_data.project_name,
                 child_project_name=self._project_name,
             )
-            span_type =
+            span_type = get_span_type(instance)

             span_data = span.SpanData(
                 trace_id=current_span_data.trace_id,
@@ -127,7 +131,7 @@ class OpikCallback(dspy_callback.BaseCallback):
                 current_trace_data.project_name,
                 self._project_name,
             )
-            span_type =
+            span_type = get_span_type(instance)

             span_data = span.SpanData(
                 trace_id=current_trace_data.id,
@@ -198,13 +202,54 @@ class OpikCallback(dspy_callback.BaseCallback):
         call_id: str,
         outputs: Optional[Any],
         exception: Optional[Exception] = None,
+        usage: Optional[llm_usage.OpikUsage] = None,
+        extra_metadata: Optional[Dict[str, Any]] = None,
+        actual_provider: Optional[str] = None,
+        actual_model: Optional[str] = None,
+        total_cost: Optional[float] = None,
     ) -> None:
         if span_data := self._map_call_id_to_span_data.pop(call_id, None):
             if exception:
                 error_info = error_info_collector.collect(exception)
                 span_data.update(error_info=error_info)

-
+            # Prepare the update dict
+            update_kwargs: Dict[str, Any] = {
+                "output": {"output": outputs},
+                "usage": usage,
+                "total_cost": total_cost,
+            }
+
+            # Handle LLM routers like OpenRouter that return the actual serving provider/model
+            if extra_metadata is None:
+                extra_metadata = {}
+
+            # Update provider if actual provider differs (e.g., OpenRouter -> Hyperbolic)
+            if (
+                actual_provider is not None
+                and span_data.provider is not None
+                and span_data.provider.lower() != actual_provider.lower()
+            ):
+                # Store the original provider (e.g., "openrouter") in metadata
+                extra_metadata["llm_router"] = span_data.provider
+                # Update to the actual provider for accurate cost tracking
+                update_kwargs["provider"] = actual_provider.lower()
+
+            if (
+                actual_model is not None
+                and span_data.model is not None
+                and span_data.model != actual_model
+            ):
+                # Store the original model (e.g., "@preset/qwen") in metadata
+                extra_metadata["original_model"] = span_data.model
+                # Update to the actual model for accurate cost tracking
+                update_kwargs["model"] = actual_model
+
+            # Only set metadata if we have something to add
+            if extra_metadata:
+                update_kwargs["metadata"] = extra_metadata
+
+            span_data.update(**update_kwargs).init_end_time()
             if tracing_runtime_config.is_tracing_active():
                 self._opik_client.span(**span_data.as_parameters)

@@ -231,7 +276,7 @@ class OpikCallback(dspy_callback.BaseCallback):
             trace_id = current_callback_context_data.id
             parent_span_id = None

-        span_type =
+        span_type = get_span_type(instance)

         return span.SpanData(
             trace_id=trace_id,
@@ -263,6 +308,13 @@ class OpikCallback(dspy_callback.BaseCallback):
             name=f"{span_data.name}: {provider} - {model}",
         )
         self._map_call_id_to_span_data[call_id] = span_data
+
+        # Store LM instance and expected messages for extracting usage
+        self._map_call_id_to_lm_info[call_id] = (
+            instance,
+            inputs.get("messages"),
+        )
+
         self._set_current_context_data(span_data)

     def on_lm_end(
@@ -271,10 +323,22 @@ class OpikCallback(dspy_callback.BaseCallback):
         outputs: Optional[Dict[str, Any]],
         exception: Optional[Exception] = None,
     ) -> None:
+        lm_info = self._extract_lm_info_from_history(call_id)
+
+        # Add cache_hit to span metadata only when we have a definitive value
+        extra_metadata = (
+            {"cache_hit": lm_info.cache_hit} if lm_info.cache_hit is not None else None
+        )
+
         self._end_span(
             call_id=call_id,
             exception=exception,
             outputs=outputs,
+            usage=lm_info.usage,
+            extra_metadata=extra_metadata,
+            actual_provider=lm_info.actual_provider,
+            actual_model=lm_info.actual_model,
+            total_cost=lm_info.total_cost,
         )

     def on_tool_start(
@@ -316,14 +380,36 @@ class OpikCallback(dspy_callback.BaseCallback):
             return span_data
         return self._context_storage.get_trace_data()

-    def
-
-
-
-
-
-
-
+    def _extract_lm_info_from_history(self, call_id: str) -> LMHistoryInfo:
+        """
+        Extract token usage, cache status, actual provider, and cost from the LM's history.
+
+        DSPy stores usage information in the LM's history after each call.
+        We verify the history entry matches our expected messages to handle
+        potential race conditions with concurrent LM calls.
+
+        For routers like OpenRouter, the response contains the actual provider
+        that served the request (e.g., "Novita", "Together"), which differs from
+        the router name used in the model string (e.g., "openrouter").
+
+        The cost field is provided by providers like OpenRouter and includes
+        accurate pricing for all token types (reasoning, cache, multimodal).
+
+        Returns:
+            LMHistoryInfo containing usage, cache_hit, actual_provider, and total_cost.
+        """
+        lm_info = self._map_call_id_to_lm_info.pop(call_id, None)
+        if lm_info is None:
+            return LMHistoryInfo(
+                usage=None,
+                cache_hit=None,
+                actual_provider=None,
+                actual_model=None,
+                total_cost=None,
+            )
+
+        lm_instance, expected_messages = lm_info
+        return extract_lm_info_from_history(lm_instance, expected_messages)

     def _get_opik_metadata(self, instance: Any) -> Dict[str, Any]:
         graph = None
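For reference, a hedged sketch of how OpikCallback is typically attached to DSPy so these new usage and cost fields appear on LM spans; the project_name and log_graph arguments and the dspy.settings.configure(..., callbacks=[...]) wiring follow the documented integration pattern rather than anything shown in this diff:

import dspy
from opik.integrations.dspy.callback import OpikCallback

lm = dspy.LM("openrouter/qwen/qwen3-235b-a22b-2507")  # any LiteLLM-style model id
opik_callback = OpikCallback(project_name="dspy-demo", log_graph=True)
dspy.settings.configure(lm=lm, callbacks=[opik_callback])

# Every Predict/LM/Tool call now emits an Opik span; with this change, LM spans also
# carry token usage, cache_hit metadata, and router-corrected provider/model/cost.
predictor = dspy.Predict("question -> answer")
print(predictor(question="What is the capital of France?").answer)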
opik/integrations/dspy/parsers.py

@@ -0,0 +1,168 @@
+"""
+Parsers and data structures for extracting information from DSPy LM responses.
+
+This module contains utilities for parsing DSPy LM history entries and
+extracting relevant information like usage, provider, and cost data.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Optional
+import logging
+
+import dspy
+
+from opik import llm_usage, types
+
+LOGGER = logging.getLogger(__name__)
+
+
+@dataclass
+class LMHistoryInfo:
+    """
+    Information extracted from a DSPy LM history entry.
+
+    This dataclass holds the parsed information from an LM call's history,
+    including usage statistics, cache status, provider information, and cost.
+
+    Attributes:
+        usage: Token usage information (prompt, completion, total tokens)
+        cache_hit: Whether the response was served from cache.
+            True if cached, False if not, None if unknown.
+        actual_provider: The actual provider that served the request.
+            This is useful for LLM routers like OpenRouter that may route
+            to different underlying providers (e.g., "Novita", "Together").
+        actual_model: The actual model that served the request.
+            This is useful for LLM routers like OpenRouter when using presets
+            (e.g., "@preset/qwen" resolves to "qwen/qwen3-235b-a22b-2507").
+        total_cost: The total cost of the request from the provider.
+            This includes accurate pricing for all token types.
+    """
+
+    usage: Optional[llm_usage.OpikUsage]
+    cache_hit: Optional[bool]
+    actual_provider: Optional[str]
+    actual_model: Optional[str]
+    total_cost: Optional[float]
+
+
+def get_span_type(instance: Any) -> types.SpanType:
+    """
+    Determine the span type based on the DSPy instance type.
+
+    Args:
+        instance: A DSPy module, LM, or tool instance.
+
+    Returns:
+        The appropriate span type: "llm" for Predict/LM, "tool" for Tool,
+        or "general" for other types.
+    """
+    if isinstance(instance, dspy.Predict):
+        return "llm"
+    elif isinstance(instance, dspy.LM):
+        return "llm"
+    elif isinstance(instance, dspy.Tool):
+        return "tool"
+    return "general"
+
+
+def extract_lm_info_from_history(
+    lm_instance: Any,
+    expected_messages: Optional[Any],
+) -> LMHistoryInfo:
+    """
+    Extract token usage, cache status, actual provider, and cost from the LM's history.
+
+    DSPy stores usage information in the LM's history after each call.
+    We verify the history entry matches our expected messages to handle
+    potential race conditions with concurrent LM calls.
+
+    For routers like OpenRouter, the response contains the actual provider
+    that served the request (e.g., "Novita", "Together"), which differs from
+    the router name used in the model string (e.g., "openrouter").
+
+    The cost field is provided by providers like OpenRouter and includes
+    accurate pricing for all token types (reasoning, cache, multimodal).
+
+    Args:
+        lm_instance: The DSPy LM instance that has the history.
+        expected_messages: The expected messages to match in the history entry.
+
+    Returns:
+        LMHistoryInfo containing usage, cache_hit, actual_provider, and total_cost.
+    """
+    empty_result = LMHistoryInfo(
+        usage=None,
+        cache_hit=None,
+        actual_provider=None,
+        actual_model=None,
+        total_cost=None,
+    )
+
+    if not hasattr(lm_instance, "history") or not lm_instance.history:
+        return empty_result
+
+    try:
+        last_entry = lm_instance.history[-1]
+
+        # Verify we have the correct history entry by checking messages match
+        if last_entry.get("messages") != expected_messages:
+            LOGGER.debug(
+                "History entry messages don't match expected messages, "
+                "skipping usage extraction (possibly due to concurrent LM calls)"
+            )
+            return empty_result
+
+        response = last_entry.get("response")
+        usage_dict = last_entry.get("usage")
+
+        # Extract actual provider and model from response (useful for routers like OpenRouter)
+        # The response is a LiteLLM ModelResponse object with 'provider' and 'model' attributes
+        # when using routers like OpenRouter
+        actual_provider: Optional[str] = None
+        actual_model: Optional[str] = None
+        if response is not None:
+            if hasattr(response, "provider"):
+                actual_provider = response.provider
+            if hasattr(response, "model"):
+                actual_model = response.model
+
+        # Extract cost from history entry or usage dict
+        # OpenRouter and other providers return accurate cost including all token types
+        total_cost: Optional[float] = None
+        if (cost := last_entry.get("cost") or 0) > 0:
+            total_cost = cost
+        elif usage_dict and (cost := usage_dict.get("cost") or 0) > 0:
+            total_cost = cost
+
+        # Get explicit cache_hit if set, otherwise infer from usage (empty = cached)
+        if response is None:
+            cache_hit = not usage_dict
+        elif hasattr(response, "cache_hit") and response.cache_hit is not None:
+            cache_hit = response.cache_hit
+        else:
+            # Fallback: infer from usage (empty = cached)
+            cache_hit = not usage_dict
+
+        if usage_dict:
+            usage = llm_usage.build_opik_usage_from_unknown_provider(usage_dict)
+            return LMHistoryInfo(
+                usage=usage,
+                cache_hit=cache_hit,
+                actual_provider=actual_provider,
+                actual_model=actual_model,
+                total_cost=total_cost,
+            )
+        else:
+            return LMHistoryInfo(
+                usage=None,
+                cache_hit=cache_hit,
+                actual_provider=actual_provider,
+                actual_model=actual_model,
+                total_cost=None,
+            )
+    except Exception:
+        LOGGER.debug(
+            "Failed to extract info from DSPy LM history",
+            exc_info=True,
+        )
+        return empty_result
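A small, hedged illustration of the shape extract_lm_info_from_history expects: a stand-in LM object whose history entry mimics the dict layout the parser reads (messages, usage, cost, response). The SimpleNamespace stand-ins and all values below are invented for the example:

from types import SimpleNamespace

from opik.integrations.dspy.parsers import LMHistoryInfo, extract_lm_info_from_history

messages = [{"role": "user", "content": "Hello"}]
# Stand-in for a LiteLLM ModelResponse as returned through an LLM router.
fake_response = SimpleNamespace(
    provider="Novita", model="qwen/qwen3-235b-a22b-2507", cache_hit=False
)
# Stand-in for a dspy.LM: only the `history` attribute is consulted.
fake_lm = SimpleNamespace(
    history=[
        {
            "messages": messages,
            "usage": {"prompt_tokens": 12, "completion_tokens": 34, "total_tokens": 46},
            "cost": 0.00021,
            "response": fake_response,
        }
    ]
)

info: LMHistoryInfo = extract_lm_info_from_history(fake_lm, expected_messages=messages)
# info.actual_provider == "Novita", info.actual_model is the resolved model id,
# info.total_cost == 0.00021, info.cache_hit is False, and info.usage is built
# from the usage dict via llm_usage.build_opik_usage_from_unknown_provider.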
opik/integrations/harbor/__init__.py

@@ -0,0 +1,17 @@
+"""
+Opik integration for Harbor benchmark evaluation framework.
+
+Example:
+    >>> from opik.integrations.harbor import track_harbor
+    >>> job = Job(config)
+    >>> tracked_job = track_harbor(job)
+    >>> result = await tracked_job.run()
+
+Or enable tracking globally (for CLI usage):
+    >>> from opik.integrations.harbor import track_harbor
+    >>> track_harbor()
+"""
+
+from .opik_tracker import track_harbor, reset_harbor_tracking
+
+__all__ = ["track_harbor", "reset_harbor_tracking"]