opik 1.9.41__py3-none-any.whl → 1.9.86__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/constants.py +2 -0
- opik/api_objects/dataset/dataset.py +133 -40
- opik/api_objects/dataset/rest_operations.py +2 -0
- opik/api_objects/experiment/experiment.py +6 -0
- opik/api_objects/helpers.py +8 -4
- opik/api_objects/local_recording.py +6 -5
- opik/api_objects/observation_data.py +101 -0
- opik/api_objects/opik_client.py +78 -45
- opik/api_objects/opik_query_language.py +9 -3
- opik/api_objects/prompt/chat/chat_prompt.py +18 -1
- opik/api_objects/prompt/client.py +8 -1
- opik/api_objects/span/span_data.py +3 -88
- opik/api_objects/threads/threads_client.py +7 -4
- opik/api_objects/trace/trace_data.py +3 -74
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +14 -12
- opik/config.py +12 -1
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +4 -1
- opik/decorator/base_track_decorator.py +111 -37
- opik/decorator/context_manager/span_context_manager.py +5 -1
- opik/decorator/generator_wrappers.py +5 -4
- opik/decorator/span_creation_handler.py +13 -4
- opik/evaluation/engine/engine.py +111 -28
- opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
- opik/evaluation/evaluator.py +12 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
- opik/evaluation/metrics/heuristics/equals.py +11 -7
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
- opik/evaluation/models/litellm/util.py +4 -20
- opik/evaluation/models/models_factory.py +19 -5
- opik/evaluation/rest_operations.py +3 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/integrations/adk/legacy_opik_tracer.py +9 -11
- opik/integrations/adk/opik_tracer.py +2 -2
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
- opik/integrations/dspy/callback.py +100 -14
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_tracer.py +2 -2
- opik/integrations/langchain/__init__.py +15 -2
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_tracer.py +258 -160
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
- opik/integrations/llama_index/callback.py +43 -6
- opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
- opik/integrations/openai/opik_tracker.py +99 -4
- opik/integrations/openai/videos/__init__.py +9 -0
- opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
- opik/integrations/openai/videos/videos_create_decorator.py +159 -0
- opik/integrations/openai/videos/videos_download_decorator.py +110 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batchers.py +32 -40
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/emulator_message_processor.py +36 -1
- opik/message_processing/emulation/models.py +21 -0
- opik/message_processing/messages.py +9 -0
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
- opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
- opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
- opik/message_processing/queue_consumer.py +4 -2
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +36 -8
- opik/plugins/pytest/experiment_runner.py +1 -1
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +38 -0
- opik/rest_api/datasets/client.py +249 -148
- opik/rest_api/datasets/raw_client.py +356 -217
- opik/rest_api/experiments/client.py +26 -0
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/llm_provider_key/client.py +4 -4
- opik/rest_api/llm_provider_key/raw_client.py +4 -4
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
- opik/rest_api/manual_evaluation/client.py +101 -0
- opik/rest_api/manual_evaluation/raw_client.py +172 -0
- opik/rest_api/optimizations/client.py +0 -166
- opik/rest_api/optimizations/raw_client.py +0 -248
- opik/rest_api/projects/client.py +9 -0
- opik/rest_api/projects/raw_client.py +13 -0
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
- opik/rest_api/prompts/client.py +130 -2
- opik/rest_api/prompts/raw_client.py +175 -0
- opik/rest_api/traces/client.py +101 -0
- opik/rest_api/traces/raw_client.py +120 -0
- opik/rest_api/types/__init__.py +46 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +38 -2
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
- opik/rest_api/types/dataset_item.py +1 -1
- opik/rest_api/types/dataset_item_batch.py +4 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +1 -1
- opik/rest_api/types/dataset_item_filter.py +4 -0
- opik/rest_api/types/dataset_item_page_compare.py +0 -1
- opik/rest_api/types/dataset_item_page_public.py +0 -1
- opik/rest_api/types/dataset_item_public.py +1 -1
- opik/rest_api/types/dataset_version_public.py +5 -0
- opik/rest_api/types/dataset_version_summary.py +5 -0
- opik/rest_api/types/dataset_version_summary_public.py +5 -0
- opik/rest_api/types/experiment.py +9 -0
- opik/rest_api/types/experiment_public.py +9 -0
- opik/rest_api/types/llm_as_judge_message_content.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt_version.py +1 -0
- opik/rest_api/types/prompt_version_detail.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +1 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +5 -1
- opik/rest_api/types/provider_api_key_provider.py +2 -1
- opik/rest_api/types/provider_api_key_public.py +5 -1
- opik/rest_api/types/provider_api_key_public_provider.py +2 -1
- opik/rest_api/types/service_toggles_config.py +11 -1
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/METADATA +5 -5
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/RECORD +190 -141
- opik/cli/export.py +0 -791
- opik/cli/import_command.py +0 -575
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
|
@@ -68,6 +68,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
68
68
|
generations_aggregator: Optional[Callable[[List[Any]], Any]] = None,
|
|
69
69
|
flush: bool = False,
|
|
70
70
|
project_name: Optional[str] = None,
|
|
71
|
+
create_duplicate_root_span: bool = True,
|
|
71
72
|
) -> Union[Callable, Callable[[Callable], Callable]]:
|
|
72
73
|
"""
|
|
73
74
|
Decorator to track the execution of a function.
|
|
@@ -85,6 +86,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
85
86
|
generations_aggregator: Function to aggregate generation results.
|
|
86
87
|
flush: Whether to flush the client after logging.
|
|
87
88
|
project_name: The name of the project to log data.
|
|
89
|
+
create_duplicate_root_span: Whether to create a root span duplicating the root trace data.
|
|
88
90
|
|
|
89
91
|
Returns:
|
|
90
92
|
Callable: The decorated function(if used without parentheses)
|
|
@@ -113,6 +115,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
113
115
|
generations_aggregator=generations_aggregator,
|
|
114
116
|
flush=flush,
|
|
115
117
|
project_name=project_name,
|
|
118
|
+
create_duplicate_root_span=create_duplicate_root_span,
|
|
116
119
|
)
|
|
117
120
|
|
|
118
121
|
if callable(name):
|
|
@@ -314,7 +317,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
314
317
|
def wrapper(*args, **kwargs) -> Any: # type: ignore
|
|
315
318
|
if not tracing_runtime_config.is_tracing_active():
|
|
316
319
|
return func(*args, **kwargs)
|
|
317
|
-
self._before_call(
|
|
320
|
+
should_process_span_data = self._before_call(
|
|
318
321
|
func=func,
|
|
319
322
|
track_options=track_options,
|
|
320
323
|
args=args,
|
|
@@ -350,6 +353,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
350
353
|
error_info=error_info,
|
|
351
354
|
capture_output=track_options.capture_output,
|
|
352
355
|
flush=track_options.flush,
|
|
356
|
+
should_process_span_data=should_process_span_data,
|
|
353
357
|
)
|
|
354
358
|
if func_exception is not None:
|
|
355
359
|
raise func_exception
|
|
@@ -368,7 +372,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
368
372
|
async def wrapper(*args, **kwargs) -> Any: # type: ignore
|
|
369
373
|
if not tracing_runtime_config.is_tracing_active():
|
|
370
374
|
return await func(*args, **kwargs)
|
|
371
|
-
self._before_call(
|
|
375
|
+
should_process_span_data = self._before_call(
|
|
372
376
|
func=func,
|
|
373
377
|
track_options=track_options,
|
|
374
378
|
args=args,
|
|
@@ -403,6 +407,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
403
407
|
error_info=error_info,
|
|
404
408
|
capture_output=track_options.capture_output,
|
|
405
409
|
flush=track_options.flush,
|
|
410
|
+
should_process_span_data=should_process_span_data,
|
|
406
411
|
)
|
|
407
412
|
if func_exception is not None:
|
|
408
413
|
raise func_exception
|
|
@@ -417,14 +422,14 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
417
422
|
track_options: arguments_helpers.TrackOptions,
|
|
418
423
|
args: Tuple,
|
|
419
424
|
kwargs: Dict[str, Any],
|
|
420
|
-
) ->
|
|
425
|
+
) -> bool:
|
|
421
426
|
try:
|
|
422
|
-
self.__before_call_unsafe(
|
|
427
|
+
return self.__before_call_unsafe(
|
|
423
428
|
func=func,
|
|
424
429
|
track_options=track_options,
|
|
425
430
|
args=args,
|
|
426
431
|
kwargs=kwargs,
|
|
427
|
-
)
|
|
432
|
+
).should_process_span_data
|
|
428
433
|
except Exception as exception:
|
|
429
434
|
LOGGER.error(
|
|
430
435
|
logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_CREATION_FOR_TRACKED_FUNCTION,
|
|
@@ -433,6 +438,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
433
438
|
str(exception),
|
|
434
439
|
exc_info=True,
|
|
435
440
|
)
|
|
441
|
+
return False
|
|
436
442
|
|
|
437
443
|
def __before_call_unsafe(
|
|
438
444
|
self,
|
|
@@ -440,7 +446,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
440
446
|
track_options: arguments_helpers.TrackOptions,
|
|
441
447
|
args: Tuple,
|
|
442
448
|
kwargs: Dict[str, Any],
|
|
443
|
-
) ->
|
|
449
|
+
) -> span_creation_handler.SpanCreationResult:
|
|
444
450
|
track_start_options = self._prepare_tracking_start_options(
|
|
445
451
|
func=func,
|
|
446
452
|
track_options=track_options,
|
|
@@ -448,11 +454,12 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
448
454
|
kwargs=kwargs,
|
|
449
455
|
)
|
|
450
456
|
|
|
451
|
-
add_start_candidates(
|
|
457
|
+
return add_start_candidates(
|
|
452
458
|
start_span_parameters=track_start_options.start_span_parameters,
|
|
453
459
|
opik_distributed_trace_headers=track_start_options.opik_distributed_trace_headers,
|
|
454
460
|
opik_args_data=track_start_options.opik_args,
|
|
455
461
|
tracing_active=tracing_runtime_config.is_tracing_active(),
|
|
462
|
+
create_duplicate_root_span=track_options.create_duplicate_root_span,
|
|
456
463
|
)
|
|
457
464
|
|
|
458
465
|
def _after_call(
|
|
@@ -463,6 +470,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
463
470
|
generators_span_to_end: Optional[span.SpanData] = None,
|
|
464
471
|
generators_trace_to_end: Optional[trace.TraceData] = None,
|
|
465
472
|
flush: bool = False,
|
|
473
|
+
should_process_span_data: bool = True,
|
|
466
474
|
) -> None:
|
|
467
475
|
try:
|
|
468
476
|
self.__after_call_unsafe(
|
|
@@ -472,6 +480,7 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
472
480
|
generators_span_to_end=generators_span_to_end,
|
|
473
481
|
generators_trace_to_end=generators_trace_to_end,
|
|
474
482
|
flush=flush,
|
|
483
|
+
should_process_span_data=should_process_span_data,
|
|
475
484
|
)
|
|
476
485
|
except Exception as exception:
|
|
477
486
|
LOGGER.error(
|
|
@@ -486,12 +495,19 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
486
495
|
output: Optional[Any],
|
|
487
496
|
error_info: Optional[ErrorInfoDict],
|
|
488
497
|
capture_output: bool,
|
|
489
|
-
generators_span_to_end: Optional[span.SpanData]
|
|
490
|
-
generators_trace_to_end: Optional[trace.TraceData]
|
|
491
|
-
flush: bool
|
|
498
|
+
generators_span_to_end: Optional[span.SpanData],
|
|
499
|
+
generators_trace_to_end: Optional[trace.TraceData],
|
|
500
|
+
flush: bool,
|
|
501
|
+
should_process_span_data: bool,
|
|
492
502
|
) -> None:
|
|
503
|
+
span_data_to_end: Optional[span.SpanData] = None
|
|
493
504
|
if generators_span_to_end is None:
|
|
494
|
-
|
|
505
|
+
if should_process_span_data:
|
|
506
|
+
# the span data must be present in the context stack, otherwise something is wrong
|
|
507
|
+
span_data_to_end, trace_data_to_end = pop_end_candidates()
|
|
508
|
+
else:
|
|
509
|
+
# the span data is not in the context, only the root trace data there
|
|
510
|
+
trace_data_to_end = pop_end_candidate_trace_data()
|
|
495
511
|
else:
|
|
496
512
|
span_data_to_end, trace_data_to_end = (
|
|
497
513
|
generators_span_to_end,
|
|
@@ -499,20 +515,27 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
499
515
|
)
|
|
500
516
|
|
|
501
517
|
if output is not None:
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
518
|
+
if should_process_span_data and span_data_to_end is not None:
|
|
519
|
+
# create end arguments from current span data only if appropriate
|
|
520
|
+
try:
|
|
521
|
+
end_arguments = self._end_span_inputs_preprocessor(
|
|
522
|
+
output=output,
|
|
523
|
+
capture_output=capture_output,
|
|
524
|
+
current_span_data=span_data_to_end,
|
|
525
|
+
)
|
|
526
|
+
except Exception as e:
|
|
527
|
+
LOGGER.error(
|
|
528
|
+
logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_FINALIZATION_FOR_TRACKED_FUNCTION,
|
|
529
|
+
output,
|
|
530
|
+
str(e),
|
|
531
|
+
exc_info=True,
|
|
532
|
+
)
|
|
533
|
+
|
|
534
|
+
end_arguments = arguments_helpers.EndSpanParameters(
|
|
535
|
+
output={"output": output}
|
|
536
|
+
)
|
|
537
|
+
else:
|
|
538
|
+
# just use output as end arguments
|
|
516
539
|
end_arguments = arguments_helpers.EndSpanParameters(
|
|
517
540
|
output={"output": output}
|
|
518
541
|
)
|
|
@@ -521,11 +544,12 @@ class BaseTrackDecorator(abc.ABC):
|
|
|
521
544
|
|
|
522
545
|
client = opik_client.get_client_cached()
|
|
523
546
|
|
|
524
|
-
span_data_to_end
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
547
|
+
if should_process_span_data and span_data_to_end is not None:
|
|
548
|
+
# save span data only if appropriate
|
|
549
|
+
span_data_to_end.init_end_time().update(
|
|
550
|
+
**end_arguments.to_kwargs(),
|
|
551
|
+
)
|
|
552
|
+
client.span(**span_data_to_end.as_parameters)
|
|
529
553
|
|
|
530
554
|
if trace_data_to_end is not None:
|
|
531
555
|
trace_data_to_end.init_end_time().update(
|
|
@@ -598,8 +622,26 @@ def pop_end_candidates() -> Tuple[span.SpanData, Optional[trace.TraceData]]:
|
|
|
598
622
|
span_data_to_end is not None
|
|
599
623
|
), "When pop_end_candidates is called, top span data must not be None. Otherwise something is wrong."
|
|
600
624
|
|
|
601
|
-
trace_data_to_end =
|
|
625
|
+
trace_data_to_end = pop_end_candidate_trace_data()
|
|
626
|
+
return span_data_to_end, trace_data_to_end
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def pop_end_candidate_trace_data() -> Optional[trace.TraceData]:
|
|
630
|
+
"""
|
|
631
|
+
Pops the most recently created trace data from the stack if it meets specific criteria.
|
|
632
|
+
|
|
633
|
+
This function checks whether the context storage's span data stack is empty, and if so, it attempts
|
|
634
|
+
to pop and return the most recently created trace data associated with the context. The trace data
|
|
635
|
+
is only removed if its ID is part of a predefined set of trace IDs created using a decorator. If the
|
|
636
|
+
criteria are not met, None is returned.
|
|
602
637
|
|
|
638
|
+
Note: Decorator can't attach any child objects to the popped ones because
|
|
639
|
+
they are no longer in the context stack.
|
|
640
|
+
|
|
641
|
+
Returns:
|
|
642
|
+
The trace data popped from the stack if the criteria are met;
|
|
643
|
+
otherwise, None.
|
|
644
|
+
"""
|
|
603
645
|
possible_trace_data_to_end = context_storage.get_trace_data()
|
|
604
646
|
if (
|
|
605
647
|
context_storage.span_data_stack_empty()
|
|
@@ -608,8 +650,9 @@ def pop_end_candidates() -> Tuple[span.SpanData, Optional[trace.TraceData]]:
|
|
|
608
650
|
):
|
|
609
651
|
trace_data_to_end = context_storage.pop_trace_data()
|
|
610
652
|
TRACES_CREATED_BY_DECORATOR.discard(possible_trace_data_to_end.id)
|
|
653
|
+
return trace_data_to_end
|
|
611
654
|
|
|
612
|
-
return
|
|
655
|
+
return None
|
|
613
656
|
|
|
614
657
|
|
|
615
658
|
def add_start_candidates(
|
|
@@ -617,6 +660,7 @@ def add_start_candidates(
|
|
|
617
660
|
opik_distributed_trace_headers: Optional[DistributedTraceHeadersDict],
|
|
618
661
|
opik_args_data: Optional[opik_args.OpikArgs],
|
|
619
662
|
tracing_active: bool,
|
|
663
|
+
create_duplicate_root_span: bool,
|
|
620
664
|
) -> span_creation_handler.SpanCreationResult:
|
|
621
665
|
"""
|
|
622
666
|
Handles the creation and registration of a new start span and trace while respecting the
|
|
@@ -631,6 +675,8 @@ def add_start_candidates(
|
|
|
631
675
|
opik_args_data : Optional additional arguments that can be applied to the trace
|
|
632
676
|
data after the span is created.
|
|
633
677
|
tracing_active: A boolean indicating whether a tracing is active.
|
|
678
|
+
create_duplicate_root_span: A boolean indicating whether to create a root span along with the root trace
|
|
679
|
+
and duplicating its data.
|
|
634
680
|
|
|
635
681
|
Returns:
|
|
636
682
|
The result of the span creation, including the span and trace data.
|
|
@@ -638,14 +684,22 @@ def add_start_candidates(
|
|
|
638
684
|
span_creation_result = span_creation_handler.create_span_respecting_context(
|
|
639
685
|
start_span_arguments=start_span_parameters,
|
|
640
686
|
distributed_trace_headers=opik_distributed_trace_headers,
|
|
687
|
+
should_create_duplicate_root_span=create_duplicate_root_span,
|
|
641
688
|
)
|
|
642
|
-
|
|
689
|
+
if span_creation_result.should_process_span_data:
|
|
690
|
+
context_storage.add_span_data(span_creation_result.span_data)
|
|
643
691
|
|
|
644
|
-
|
|
645
|
-
|
|
692
|
+
if tracing_active:
|
|
693
|
+
client = opik_client.get_client_cached()
|
|
646
694
|
|
|
647
|
-
|
|
648
|
-
|
|
695
|
+
if client.config.log_start_trace_span:
|
|
696
|
+
client.span(**span_creation_result.span_data.as_start_parameters)
|
|
697
|
+
else:
|
|
698
|
+
_show_root_span_not_created_warning_if_needed(
|
|
699
|
+
start_span_parameters=start_span_parameters,
|
|
700
|
+
tracing_active=tracing_active,
|
|
701
|
+
should_process_span_data=span_creation_result.should_process_span_data,
|
|
702
|
+
)
|
|
649
703
|
|
|
650
704
|
if span_creation_result.trace_data is not None:
|
|
651
705
|
add_start_trace_candidate(
|
|
@@ -691,3 +745,23 @@ def add_start_trace_candidate(
|
|
|
691
745
|
client = opik_client.get_client_cached()
|
|
692
746
|
if client.config.log_start_trace_span:
|
|
693
747
|
client.trace(**trace_data.as_start_parameters)
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def _show_root_span_not_created_warning_if_needed(
|
|
751
|
+
start_span_parameters: arguments_helpers.StartSpanParameters,
|
|
752
|
+
tracing_active: bool,
|
|
753
|
+
should_process_span_data: bool,
|
|
754
|
+
) -> None:
|
|
755
|
+
if not tracing_active:
|
|
756
|
+
return
|
|
757
|
+
|
|
758
|
+
user_provided_span_type_will_be_lost = (
|
|
759
|
+
not should_process_span_data and start_span_parameters.type in ["llm", "tool"]
|
|
760
|
+
)
|
|
761
|
+
if user_provided_span_type_will_be_lost:
|
|
762
|
+
LOGGER.warning(
|
|
763
|
+
"The root span '%s' of type '%s' will not be created because "
|
|
764
|
+
"its creation was explicitly disabled along with the root trace.",
|
|
765
|
+
start_span_parameters.name,
|
|
766
|
+
start_span_parameters.type,
|
|
767
|
+
)
|
|
@@ -65,6 +65,7 @@ def start_as_current_span(
|
|
|
65
65
|
opik_distributed_trace_headers=distributed_headers,
|
|
66
66
|
opik_args_data=None,
|
|
67
67
|
tracing_active=True,
|
|
68
|
+
create_duplicate_root_span=True,
|
|
68
69
|
)
|
|
69
70
|
|
|
70
71
|
end_arguments = arguments_helpers.EndSpanParameters(
|
|
@@ -85,6 +86,7 @@ def start_as_current_span(
|
|
|
85
86
|
end_arguments.metadata = span_creation_result.span_data.metadata or metadata
|
|
86
87
|
end_arguments.provider = span_creation_result.span_data.provider or provider
|
|
87
88
|
end_arguments.model = span_creation_result.span_data.model or model
|
|
89
|
+
end_arguments.attachments = span_creation_result.span_data.attachments
|
|
88
90
|
except Exception as exception:
|
|
89
91
|
LOGGER.error(
|
|
90
92
|
"Error in user's script while executing span context manager: %s",
|
|
@@ -100,8 +102,10 @@ def start_as_current_span(
|
|
|
100
102
|
# save span/trace data at the end of the context manager
|
|
101
103
|
client = opik_client.get_client_cached()
|
|
102
104
|
|
|
105
|
+
# Don't pass attachments to update() since they're already set on span_data
|
|
106
|
+
# and _update_attachments would duplicate them
|
|
103
107
|
span_creation_result.span_data.init_end_time().update(
|
|
104
|
-
**end_arguments.to_kwargs(),
|
|
108
|
+
**end_arguments.to_kwargs(ignore_keys=["attachments"]),
|
|
105
109
|
)
|
|
106
110
|
client.span(**span_creation_result.span_data.as_parameters)
|
|
107
111
|
|
|
@@ -58,12 +58,13 @@ class BaseTrackedGenerator(Generic[YieldType]):
|
|
|
58
58
|
if self._created_span_data is not None:
|
|
59
59
|
return
|
|
60
60
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
self._start_span_arguments, self._opik_distributed_trace_headers
|
|
64
|
-
)
|
|
61
|
+
result = span_creation_handler.create_span_respecting_context(
|
|
62
|
+
self._start_span_arguments, self._opik_distributed_trace_headers
|
|
65
63
|
)
|
|
66
64
|
|
|
65
|
+
self._created_trace_data = result.trace_data
|
|
66
|
+
self._created_span_data = result.span_data
|
|
67
|
+
|
|
67
68
|
def _handle_stop_iteration_before_raising(self) -> None:
|
|
68
69
|
output = _try_aggregate_items(
|
|
69
70
|
self._accumulated_values,
|
|
@@ -23,16 +23,20 @@ class SpanCreationResult(NamedTuple):
|
|
|
23
23
|
with the span if a new trace was created. Can be None if no new trace was created.
|
|
24
24
|
span_data : Data specific to the created span, containing
|
|
25
25
|
information such as span identifiers and timestamps.
|
|
26
|
+
should_process_span_data: A boolean indicating whether created span data should be further processed
|
|
27
|
+
after it was created (saved, logged, etc.).
|
|
26
28
|
"""
|
|
27
29
|
|
|
28
30
|
trace_data: Optional[trace.TraceData]
|
|
29
31
|
span_data: span.SpanData
|
|
32
|
+
should_process_span_data: bool
|
|
30
33
|
|
|
31
34
|
|
|
32
35
|
def create_span_respecting_context(
|
|
33
36
|
start_span_arguments: arguments_helpers.StartSpanParameters,
|
|
34
37
|
distributed_trace_headers: Optional[DistributedTraceHeadersDict],
|
|
35
38
|
opik_context_storage: Optional[context_storage.OpikContextStorage] = None,
|
|
39
|
+
should_create_duplicate_root_span: bool = True,
|
|
36
40
|
) -> SpanCreationResult:
|
|
37
41
|
"""
|
|
38
42
|
Handles different span creation flows.
|
|
@@ -48,7 +52,7 @@ def create_span_respecting_context(
|
|
|
48
52
|
trace_id=distributed_trace_headers["opik_trace_id"],
|
|
49
53
|
)
|
|
50
54
|
|
|
51
|
-
return SpanCreationResult(None, span_data)
|
|
55
|
+
return SpanCreationResult(None, span_data, should_process_span_data=True)
|
|
52
56
|
|
|
53
57
|
current_span_data = opik_context_storage.top_span_data()
|
|
54
58
|
current_trace_data = opik_context_storage.get_trace_data()
|
|
@@ -78,7 +82,7 @@ def create_span_respecting_context(
|
|
|
78
82
|
trace_id=current_span_data.trace_id,
|
|
79
83
|
)
|
|
80
84
|
|
|
81
|
-
return SpanCreationResult(None, span_data)
|
|
85
|
+
return SpanCreationResult(None, span_data, should_process_span_data=True)
|
|
82
86
|
|
|
83
87
|
if current_trace_data is not None and current_span_data is None:
|
|
84
88
|
# By default, we expect trace to be created with a span.
|
|
@@ -100,7 +104,7 @@ def create_span_respecting_context(
|
|
|
100
104
|
trace_id=current_trace_data.id,
|
|
101
105
|
)
|
|
102
106
|
|
|
103
|
-
return SpanCreationResult(None, span_data)
|
|
107
|
+
return SpanCreationResult(None, span_data, should_process_span_data=True)
|
|
104
108
|
|
|
105
109
|
if current_span_data is None and current_trace_data is None:
|
|
106
110
|
# Create a trace and root span because it is
|
|
@@ -113,6 +117,7 @@ def create_span_respecting_context(
|
|
|
113
117
|
metadata=start_span_arguments.metadata,
|
|
114
118
|
tags=start_span_arguments.tags,
|
|
115
119
|
project_name=start_span_arguments.project_name,
|
|
120
|
+
thread_id=start_span_arguments.thread_id,
|
|
116
121
|
)
|
|
117
122
|
|
|
118
123
|
current_span_data = arguments_helpers.create_span_data(
|
|
@@ -121,4 +126,8 @@ def create_span_respecting_context(
|
|
|
121
126
|
trace_id=current_trace_data.id,
|
|
122
127
|
)
|
|
123
128
|
|
|
124
|
-
return SpanCreationResult(
|
|
129
|
+
return SpanCreationResult(
|
|
130
|
+
current_trace_data,
|
|
131
|
+
current_span_data,
|
|
132
|
+
should_process_span_data=should_create_duplicate_root_span,
|
|
133
|
+
)
|
opik/evaluation/engine/engine.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import functools
|
|
2
2
|
import logging
|
|
3
|
-
from typing import List, Optional, Any, Dict
|
|
3
|
+
from typing import List, Optional, Any, Dict, Iterator
|
|
4
4
|
|
|
5
5
|
import opik.logging_messages as logging_messages
|
|
6
6
|
import opik.opik_context as opik_context
|
|
@@ -26,6 +26,30 @@ LOGGER = logging.getLogger(__name__)
|
|
|
26
26
|
|
|
27
27
|
EVALUATION_TASK_NAME = "evaluation_task"
|
|
28
28
|
|
|
29
|
+
EVALUATION_STREAM_DATASET_BATCH_SIZE = 200 # The limit is 10x smaller than the default streaming limit to improve the UX and not wait too long for the first items to be evaluated
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _calculate_total_items(
|
|
33
|
+
dataset: dataset.Dataset,
|
|
34
|
+
nb_samples: Optional[int],
|
|
35
|
+
dataset_item_ids: Optional[List[str]],
|
|
36
|
+
) -> Optional[int]:
|
|
37
|
+
"""
|
|
38
|
+
Calculate the total number of items that will be evaluated.
|
|
39
|
+
|
|
40
|
+
Returns None if the total cannot be determined (e.g., when using a sampler).
|
|
41
|
+
"""
|
|
42
|
+
if dataset_item_ids is not None:
|
|
43
|
+
return len(dataset_item_ids)
|
|
44
|
+
|
|
45
|
+
# If nb_samples is specified and smaller than dataset size, use it
|
|
46
|
+
if nb_samples is not None:
|
|
47
|
+
if dataset.dataset_items_count is not None:
|
|
48
|
+
return min(nb_samples, dataset.dataset_items_count)
|
|
49
|
+
return nb_samples
|
|
50
|
+
|
|
51
|
+
return dataset.dataset_items_count
|
|
52
|
+
|
|
29
53
|
|
|
30
54
|
class EvaluationEngine:
|
|
31
55
|
def __init__(
|
|
@@ -157,34 +181,57 @@ class EvaluationEngine:
|
|
|
157
181
|
|
|
158
182
|
def _compute_test_results_for_llm_task(
|
|
159
183
|
self,
|
|
160
|
-
dataset_items:
|
|
184
|
+
dataset_items: Iterator[dataset_item.DatasetItem],
|
|
161
185
|
task: LLMTask,
|
|
162
186
|
experiment_: Optional[experiment.Experiment],
|
|
163
187
|
trial_count: int,
|
|
164
188
|
description: str,
|
|
189
|
+
total_items: Optional[int] = None,
|
|
165
190
|
) -> List[test_result.TestResult]:
|
|
166
191
|
test_results: List[test_result.TestResult] = []
|
|
167
192
|
|
|
193
|
+
# Cache dataset items for multiple trials
|
|
194
|
+
dataset_items_cache: List[dataset_item.DatasetItem] = []
|
|
195
|
+
|
|
168
196
|
for trial_id in range(trial_count):
|
|
169
|
-
|
|
170
|
-
functools.partial(
|
|
171
|
-
self._compute_test_result_for_llm_task,
|
|
172
|
-
item=item,
|
|
173
|
-
task=task,
|
|
174
|
-
trial_id=trial_id,
|
|
175
|
-
experiment_=experiment_,
|
|
176
|
-
)
|
|
177
|
-
for item in dataset_items
|
|
178
|
-
]
|
|
197
|
+
desc = f"{description} trial {trial_id}" if trial_count > 1 else description
|
|
179
198
|
|
|
180
|
-
|
|
181
|
-
|
|
199
|
+
# Use streaming executor to submit tasks as items arrive
|
|
200
|
+
executor: evaluation_tasks_executor.StreamingExecutor[
|
|
201
|
+
test_result.TestResult
|
|
202
|
+
] = evaluation_tasks_executor.StreamingExecutor(
|
|
182
203
|
workers=self._workers,
|
|
183
204
|
verbose=self._verbose,
|
|
184
|
-
desc=
|
|
185
|
-
|
|
186
|
-
else description,
|
|
205
|
+
desc=desc,
|
|
206
|
+
total=total_items,
|
|
187
207
|
)
|
|
208
|
+
with executor:
|
|
209
|
+
# For first trial, consume from iterator and cache items
|
|
210
|
+
if trial_id == 0:
|
|
211
|
+
for item in dataset_items:
|
|
212
|
+
dataset_items_cache.append(item)
|
|
213
|
+
evaluation_task = functools.partial(
|
|
214
|
+
self._compute_test_result_for_llm_task,
|
|
215
|
+
item=item,
|
|
216
|
+
task=task,
|
|
217
|
+
trial_id=trial_id,
|
|
218
|
+
experiment_=experiment_,
|
|
219
|
+
)
|
|
220
|
+
executor.submit(evaluation_task)
|
|
221
|
+
else:
|
|
222
|
+
# For subsequent trials, use cached items
|
|
223
|
+
for item in dataset_items_cache:
|
|
224
|
+
evaluation_task = functools.partial(
|
|
225
|
+
self._compute_test_result_for_llm_task,
|
|
226
|
+
item=item,
|
|
227
|
+
task=task,
|
|
228
|
+
trial_id=trial_id,
|
|
229
|
+
experiment_=experiment_,
|
|
230
|
+
)
|
|
231
|
+
executor.submit(evaluation_task)
|
|
232
|
+
|
|
233
|
+
# Collect results from executor
|
|
234
|
+
test_results += executor.get_results()
|
|
188
235
|
|
|
189
236
|
return test_results
|
|
190
237
|
|
|
@@ -282,21 +329,54 @@ class EvaluationEngine:
|
|
|
282
329
|
trial_count: int,
|
|
283
330
|
experiment_: Optional[experiment.Experiment],
|
|
284
331
|
) -> List[test_result.TestResult]:
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
332
|
+
# Can't use streaming with these parameters yet, so fall back to non-streaming
|
|
333
|
+
use_streaming = (
|
|
334
|
+
dataset_sampler is None
|
|
335
|
+
and not self._metrics_evaluator.has_task_span_metrics
|
|
288
336
|
)
|
|
289
337
|
|
|
290
|
-
|
|
291
|
-
|
|
338
|
+
# Get dataset items using streaming or non-streaming approach
|
|
339
|
+
if use_streaming:
|
|
340
|
+
dataset_items_iter = dataset_.__internal_api__stream_items_as_dataclasses__(
|
|
341
|
+
nb_samples=nb_samples,
|
|
342
|
+
dataset_item_ids=dataset_item_ids,
|
|
343
|
+
batch_size=EVALUATION_STREAM_DATASET_BATCH_SIZE,
|
|
344
|
+
)
|
|
345
|
+
else:
|
|
346
|
+
LOGGER.info("Dataset streaming disabled due to evaluation parameters")
|
|
347
|
+
dataset_items_list = list(
|
|
348
|
+
dataset_.__internal_api__stream_items_as_dataclasses__(
|
|
349
|
+
nb_samples=nb_samples,
|
|
350
|
+
dataset_item_ids=dataset_item_ids,
|
|
351
|
+
batch_size=EVALUATION_STREAM_DATASET_BATCH_SIZE,
|
|
352
|
+
)
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
if dataset_sampler is not None:
|
|
356
|
+
dataset_items_list = dataset_sampler.sample(dataset_items_list)
|
|
357
|
+
|
|
358
|
+
# Convert list to iterator
|
|
359
|
+
dataset_items_iter = iter(dataset_items_list)
|
|
360
|
+
|
|
361
|
+
# Calculate total items for progress bar
|
|
362
|
+
if use_streaming:
|
|
363
|
+
total_items = _calculate_total_items(
|
|
364
|
+
dataset=dataset_,
|
|
365
|
+
nb_samples=nb_samples,
|
|
366
|
+
dataset_item_ids=dataset_item_ids,
|
|
367
|
+
)
|
|
368
|
+
else:
|
|
369
|
+
# After sampling, the actual count is the length of the list
|
|
370
|
+
total_items = len(dataset_items_list)
|
|
292
371
|
|
|
293
372
|
if not self._metrics_evaluator.has_task_span_metrics:
|
|
294
373
|
return self._compute_test_results_for_llm_task(
|
|
295
|
-
dataset_items=
|
|
374
|
+
dataset_items=dataset_items_iter,
|
|
296
375
|
task=task,
|
|
297
376
|
experiment_=experiment_,
|
|
298
377
|
trial_count=trial_count,
|
|
299
378
|
description="Evaluation",
|
|
379
|
+
total_items=total_items,
|
|
300
380
|
)
|
|
301
381
|
|
|
302
382
|
LOGGER.debug(
|
|
@@ -306,11 +386,12 @@ class EvaluationEngine:
|
|
|
306
386
|
|
|
307
387
|
with local_recording.record_traces_locally(client=self._client) as recording:
|
|
308
388
|
test_results = self._compute_test_results_for_llm_task(
|
|
309
|
-
dataset_items=
|
|
389
|
+
dataset_items=dataset_items_iter,
|
|
310
390
|
task=task,
|
|
311
391
|
experiment_=experiment_,
|
|
312
392
|
trial_count=trial_count,
|
|
313
393
|
description="Evaluation",
|
|
394
|
+
total_items=total_items,
|
|
314
395
|
)
|
|
315
396
|
self._update_test_results_with_task_span_metrics(
|
|
316
397
|
test_results=test_results,
|
|
@@ -339,7 +420,7 @@ class EvaluationEngine:
|
|
|
339
420
|
List of TestResult objects containing scores for each item.
|
|
340
421
|
"""
|
|
341
422
|
# Convert raw items to DatasetItem objects for compatibility
|
|
342
|
-
|
|
423
|
+
dataset_items_list = [
|
|
343
424
|
dataset_item.DatasetItem(
|
|
344
425
|
id=f"temp_item_{idx}",
|
|
345
426
|
**item,
|
|
@@ -349,11 +430,12 @@ class EvaluationEngine:
|
|
|
349
430
|
|
|
350
431
|
if not self._metrics_evaluator.has_task_span_metrics:
|
|
351
432
|
return self._compute_test_results_for_llm_task(
|
|
352
|
-
dataset_items=
|
|
433
|
+
dataset_items=iter(dataset_items_list),
|
|
353
434
|
task=task,
|
|
354
435
|
experiment_=None,
|
|
355
436
|
trial_count=1,
|
|
356
437
|
description="Items evaluation",
|
|
438
|
+
total_items=len(items),
|
|
357
439
|
)
|
|
358
440
|
|
|
359
441
|
LOGGER.debug(
|
|
@@ -363,11 +445,12 @@ class EvaluationEngine:
|
|
|
363
445
|
|
|
364
446
|
with local_recording.record_traces_locally(client=self._client) as recording:
|
|
365
447
|
test_results = self._compute_test_results_for_llm_task(
|
|
366
|
-
dataset_items=
|
|
448
|
+
dataset_items=iter(dataset_items_list),
|
|
367
449
|
task=task,
|
|
368
450
|
experiment_=None,
|
|
369
451
|
trial_count=1,
|
|
370
452
|
description="Items evaluation",
|
|
453
|
+
total_items=len(items),
|
|
371
454
|
)
|
|
372
455
|
self._update_test_results_with_task_span_metrics(
|
|
373
456
|
test_results=test_results,
|
|
@@ -388,7 +471,7 @@ class EvaluationEngine:
|
|
|
388
471
|
for test_case_ in test_cases
|
|
389
472
|
]
|
|
390
473
|
|
|
391
|
-
test_results = evaluation_tasks_executor.execute(
|
|
474
|
+
test_results: List[test_result.TestResult] = evaluation_tasks_executor.execute(
|
|
392
475
|
evaluation_tasks=evaluation_tasks,
|
|
393
476
|
workers=self._workers,
|
|
394
477
|
verbose=self._verbose,
|