opik 1.9.41__py3-none-any.whl → 1.9.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192):
  1. opik/api_objects/attachment/attachment_context.py +36 -0
  2. opik/api_objects/attachment/attachments_extractor.py +153 -0
  3. opik/api_objects/attachment/client.py +1 -0
  4. opik/api_objects/attachment/converters.py +2 -0
  5. opik/api_objects/attachment/decoder.py +18 -0
  6. opik/api_objects/attachment/decoder_base64.py +83 -0
  7. opik/api_objects/attachment/decoder_helpers.py +137 -0
  8. opik/api_objects/constants.py +2 -0
  9. opik/api_objects/dataset/dataset.py +133 -40
  10. opik/api_objects/dataset/rest_operations.py +2 -0
  11. opik/api_objects/experiment/experiment.py +6 -0
  12. opik/api_objects/helpers.py +8 -4
  13. opik/api_objects/local_recording.py +6 -5
  14. opik/api_objects/observation_data.py +101 -0
  15. opik/api_objects/opik_client.py +78 -45
  16. opik/api_objects/opik_query_language.py +9 -3
  17. opik/api_objects/prompt/chat/chat_prompt.py +18 -1
  18. opik/api_objects/prompt/client.py +8 -1
  19. opik/api_objects/span/span_data.py +3 -88
  20. opik/api_objects/threads/threads_client.py +7 -4
  21. opik/api_objects/trace/trace_data.py +3 -74
  22. opik/api_objects/validation_helpers.py +3 -3
  23. opik/cli/exports/__init__.py +131 -0
  24. opik/cli/exports/dataset.py +278 -0
  25. opik/cli/exports/experiment.py +784 -0
  26. opik/cli/exports/project.py +685 -0
  27. opik/cli/exports/prompt.py +578 -0
  28. opik/cli/exports/utils.py +406 -0
  29. opik/cli/harbor.py +39 -0
  30. opik/cli/imports/__init__.py +439 -0
  31. opik/cli/imports/dataset.py +143 -0
  32. opik/cli/imports/experiment.py +1192 -0
  33. opik/cli/imports/project.py +262 -0
  34. opik/cli/imports/prompt.py +177 -0
  35. opik/cli/imports/utils.py +280 -0
  36. opik/cli/main.py +14 -12
  37. opik/config.py +12 -1
  38. opik/datetime_helpers.py +12 -0
  39. opik/decorator/arguments_helpers.py +4 -1
  40. opik/decorator/base_track_decorator.py +111 -37
  41. opik/decorator/context_manager/span_context_manager.py +5 -1
  42. opik/decorator/generator_wrappers.py +5 -4
  43. opik/decorator/span_creation_handler.py +13 -4
  44. opik/evaluation/engine/engine.py +111 -28
  45. opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
  46. opik/evaluation/evaluator.py +12 -0
  47. opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
  48. opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
  49. opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
  50. opik/evaluation/metrics/heuristics/equals.py +11 -7
  51. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
  52. opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
  53. opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
  54. opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
  55. opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
  56. opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
  57. opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
  58. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
  59. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
  60. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
  61. opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
  62. opik/evaluation/metrics/ragas_metric.py +43 -23
  63. opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
  64. opik/evaluation/models/litellm/util.py +4 -20
  65. opik/evaluation/models/models_factory.py +19 -5
  66. opik/evaluation/rest_operations.py +3 -3
  67. opik/evaluation/threads/helpers.py +3 -2
  68. opik/file_upload/file_uploader.py +13 -0
  69. opik/file_upload/upload_options.py +2 -0
  70. opik/integrations/adk/legacy_opik_tracer.py +9 -11
  71. opik/integrations/adk/opik_tracer.py +2 -2
  72. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
  73. opik/integrations/dspy/callback.py +100 -14
  74. opik/integrations/dspy/parsers.py +168 -0
  75. opik/integrations/harbor/__init__.py +17 -0
  76. opik/integrations/harbor/experiment_service.py +269 -0
  77. opik/integrations/harbor/opik_tracker.py +528 -0
  78. opik/integrations/haystack/opik_tracer.py +2 -2
  79. opik/integrations/langchain/__init__.py +15 -2
  80. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  81. opik/integrations/langchain/opik_tracer.py +258 -160
  82. opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
  83. opik/integrations/llama_index/callback.py +43 -6
  84. opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
  85. opik/integrations/openai/opik_tracker.py +99 -4
  86. opik/integrations/openai/videos/__init__.py +9 -0
  87. opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
  88. opik/integrations/openai/videos/videos_create_decorator.py +159 -0
  89. opik/integrations/openai/videos/videos_download_decorator.py +110 -0
  90. opik/message_processing/batching/base_batcher.py +14 -21
  91. opik/message_processing/batching/batch_manager.py +22 -10
  92. opik/message_processing/batching/batchers.py +32 -40
  93. opik/message_processing/batching/flushing_thread.py +0 -3
  94. opik/message_processing/emulation/emulator_message_processor.py +36 -1
  95. opik/message_processing/emulation/models.py +21 -0
  96. opik/message_processing/messages.py +9 -0
  97. opik/message_processing/preprocessing/__init__.py +0 -0
  98. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  99. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  100. opik/message_processing/preprocessing/constants.py +1 -0
  101. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  102. opik/message_processing/preprocessing/preprocessor.py +36 -0
  103. opik/message_processing/processors/__init__.py +0 -0
  104. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  105. opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
  106. opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
  107. opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
  108. opik/message_processing/queue_consumer.py +4 -2
  109. opik/message_processing/streamer.py +71 -33
  110. opik/message_processing/streamer_constructors.py +36 -8
  111. opik/plugins/pytest/experiment_runner.py +1 -1
  112. opik/plugins/pytest/hooks.py +5 -3
  113. opik/rest_api/__init__.py +38 -0
  114. opik/rest_api/datasets/client.py +249 -148
  115. opik/rest_api/datasets/raw_client.py +356 -217
  116. opik/rest_api/experiments/client.py +26 -0
  117. opik/rest_api/experiments/raw_client.py +26 -0
  118. opik/rest_api/llm_provider_key/client.py +4 -4
  119. opik/rest_api/llm_provider_key/raw_client.py +4 -4
  120. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
  121. opik/rest_api/manual_evaluation/client.py +101 -0
  122. opik/rest_api/manual_evaluation/raw_client.py +172 -0
  123. opik/rest_api/optimizations/client.py +0 -166
  124. opik/rest_api/optimizations/raw_client.py +0 -248
  125. opik/rest_api/projects/client.py +9 -0
  126. opik/rest_api/projects/raw_client.py +13 -0
  127. opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
  128. opik/rest_api/prompts/client.py +130 -2
  129. opik/rest_api/prompts/raw_client.py +175 -0
  130. opik/rest_api/traces/client.py +101 -0
  131. opik/rest_api/traces/raw_client.py +120 -0
  132. opik/rest_api/types/__init__.py +46 -0
  133. opik/rest_api/types/audio_url.py +19 -0
  134. opik/rest_api/types/audio_url_public.py +19 -0
  135. opik/rest_api/types/audio_url_write.py +19 -0
  136. opik/rest_api/types/automation_rule_evaluator.py +38 -2
  137. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
  138. opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
  139. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  140. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  141. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  142. opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
  143. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  144. opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
  145. opik/rest_api/types/dataset_item.py +1 -1
  146. opik/rest_api/types/dataset_item_batch.py +4 -0
  147. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  148. opik/rest_api/types/dataset_item_compare.py +1 -1
  149. opik/rest_api/types/dataset_item_filter.py +4 -0
  150. opik/rest_api/types/dataset_item_page_compare.py +0 -1
  151. opik/rest_api/types/dataset_item_page_public.py +0 -1
  152. opik/rest_api/types/dataset_item_public.py +1 -1
  153. opik/rest_api/types/dataset_version_public.py +5 -0
  154. opik/rest_api/types/dataset_version_summary.py +5 -0
  155. opik/rest_api/types/dataset_version_summary_public.py +5 -0
  156. opik/rest_api/types/experiment.py +9 -0
  157. opik/rest_api/types/experiment_public.py +9 -0
  158. opik/rest_api/types/llm_as_judge_message_content.py +2 -0
  159. opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
  160. opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
  161. opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
  162. opik/rest_api/types/project.py +1 -0
  163. opik/rest_api/types/project_detailed.py +1 -0
  164. opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
  165. opik/rest_api/types/project_reference.py +31 -0
  166. opik/rest_api/types/project_reference_public.py +31 -0
  167. opik/rest_api/types/project_stats_summary_item.py +1 -0
  168. opik/rest_api/types/prompt_version.py +1 -0
  169. opik/rest_api/types/prompt_version_detail.py +1 -0
  170. opik/rest_api/types/prompt_version_page_public.py +5 -0
  171. opik/rest_api/types/prompt_version_public.py +1 -0
  172. opik/rest_api/types/prompt_version_update.py +33 -0
  173. opik/rest_api/types/provider_api_key.py +5 -1
  174. opik/rest_api/types/provider_api_key_provider.py +2 -1
  175. opik/rest_api/types/provider_api_key_public.py +5 -1
  176. opik/rest_api/types/provider_api_key_public_provider.py +2 -1
  177. opik/rest_api/types/service_toggles_config.py +11 -1
  178. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  179. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  180. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  181. opik/types.py +36 -0
  182. opik/validation/chat_prompt_messages.py +241 -0
  183. opik/validation/feedback_score.py +3 -3
  184. opik/validation/validator.py +28 -0
  185. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/METADATA +5 -5
  186. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/RECORD +190 -141
  187. opik/cli/export.py +0 -791
  188. opik/cli/import_command.py +0 -575
  189. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
  190. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
  191. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
  192. {opik-1.9.41.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
@@ -68,6 +68,7 @@ class BaseTrackDecorator(abc.ABC):
68
68
  generations_aggregator: Optional[Callable[[List[Any]], Any]] = None,
69
69
  flush: bool = False,
70
70
  project_name: Optional[str] = None,
71
+ create_duplicate_root_span: bool = True,
71
72
  ) -> Union[Callable, Callable[[Callable], Callable]]:
72
73
  """
73
74
  Decorator to track the execution of a function.
@@ -85,6 +86,7 @@ class BaseTrackDecorator(abc.ABC):
85
86
  generations_aggregator: Function to aggregate generation results.
86
87
  flush: Whether to flush the client after logging.
87
88
  project_name: The name of the project to log data.
89
+ create_duplicate_root_span: Whether to create a root span duplicating the root trace data.
88
90
 
89
91
  Returns:
90
92
  Callable: The decorated function(if used without parentheses)
@@ -113,6 +115,7 @@ class BaseTrackDecorator(abc.ABC):
113
115
  generations_aggregator=generations_aggregator,
114
116
  flush=flush,
115
117
  project_name=project_name,
118
+ create_duplicate_root_span=create_duplicate_root_span,
116
119
  )
117
120
 
118
121
  if callable(name):
@@ -314,7 +317,7 @@ class BaseTrackDecorator(abc.ABC):
314
317
  def wrapper(*args, **kwargs) -> Any: # type: ignore
315
318
  if not tracing_runtime_config.is_tracing_active():
316
319
  return func(*args, **kwargs)
317
- self._before_call(
320
+ should_process_span_data = self._before_call(
318
321
  func=func,
319
322
  track_options=track_options,
320
323
  args=args,
@@ -350,6 +353,7 @@ class BaseTrackDecorator(abc.ABC):
350
353
  error_info=error_info,
351
354
  capture_output=track_options.capture_output,
352
355
  flush=track_options.flush,
356
+ should_process_span_data=should_process_span_data,
353
357
  )
354
358
  if func_exception is not None:
355
359
  raise func_exception
@@ -368,7 +372,7 @@ class BaseTrackDecorator(abc.ABC):
368
372
  async def wrapper(*args, **kwargs) -> Any: # type: ignore
369
373
  if not tracing_runtime_config.is_tracing_active():
370
374
  return await func(*args, **kwargs)
371
- self._before_call(
375
+ should_process_span_data = self._before_call(
372
376
  func=func,
373
377
  track_options=track_options,
374
378
  args=args,
@@ -403,6 +407,7 @@ class BaseTrackDecorator(abc.ABC):
403
407
  error_info=error_info,
404
408
  capture_output=track_options.capture_output,
405
409
  flush=track_options.flush,
410
+ should_process_span_data=should_process_span_data,
406
411
  )
407
412
  if func_exception is not None:
408
413
  raise func_exception
@@ -417,14 +422,14 @@ class BaseTrackDecorator(abc.ABC):
417
422
  track_options: arguments_helpers.TrackOptions,
418
423
  args: Tuple,
419
424
  kwargs: Dict[str, Any],
420
- ) -> None:
425
+ ) -> bool:
421
426
  try:
422
- self.__before_call_unsafe(
427
+ return self.__before_call_unsafe(
423
428
  func=func,
424
429
  track_options=track_options,
425
430
  args=args,
426
431
  kwargs=kwargs,
427
- )
432
+ ).should_process_span_data
428
433
  except Exception as exception:
429
434
  LOGGER.error(
430
435
  logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_CREATION_FOR_TRACKED_FUNCTION,
@@ -433,6 +438,7 @@ class BaseTrackDecorator(abc.ABC):
433
438
  str(exception),
434
439
  exc_info=True,
435
440
  )
441
+ return False
436
442
 
437
443
  def __before_call_unsafe(
438
444
  self,
@@ -440,7 +446,7 @@ class BaseTrackDecorator(abc.ABC):
440
446
  track_options: arguments_helpers.TrackOptions,
441
447
  args: Tuple,
442
448
  kwargs: Dict[str, Any],
443
- ) -> None:
449
+ ) -> span_creation_handler.SpanCreationResult:
444
450
  track_start_options = self._prepare_tracking_start_options(
445
451
  func=func,
446
452
  track_options=track_options,
@@ -448,11 +454,12 @@ class BaseTrackDecorator(abc.ABC):
448
454
  kwargs=kwargs,
449
455
  )
450
456
 
451
- add_start_candidates(
457
+ return add_start_candidates(
452
458
  start_span_parameters=track_start_options.start_span_parameters,
453
459
  opik_distributed_trace_headers=track_start_options.opik_distributed_trace_headers,
454
460
  opik_args_data=track_start_options.opik_args,
455
461
  tracing_active=tracing_runtime_config.is_tracing_active(),
462
+ create_duplicate_root_span=track_options.create_duplicate_root_span,
456
463
  )
457
464
 
458
465
  def _after_call(
@@ -463,6 +470,7 @@ class BaseTrackDecorator(abc.ABC):
463
470
  generators_span_to_end: Optional[span.SpanData] = None,
464
471
  generators_trace_to_end: Optional[trace.TraceData] = None,
465
472
  flush: bool = False,
473
+ should_process_span_data: bool = True,
466
474
  ) -> None:
467
475
  try:
468
476
  self.__after_call_unsafe(
@@ -472,6 +480,7 @@ class BaseTrackDecorator(abc.ABC):
472
480
  generators_span_to_end=generators_span_to_end,
473
481
  generators_trace_to_end=generators_trace_to_end,
474
482
  flush=flush,
483
+ should_process_span_data=should_process_span_data,
475
484
  )
476
485
  except Exception as exception:
477
486
  LOGGER.error(
@@ -486,12 +495,19 @@ class BaseTrackDecorator(abc.ABC):
486
495
  output: Optional[Any],
487
496
  error_info: Optional[ErrorInfoDict],
488
497
  capture_output: bool,
489
- generators_span_to_end: Optional[span.SpanData] = None,
490
- generators_trace_to_end: Optional[trace.TraceData] = None,
491
- flush: bool = False,
498
+ generators_span_to_end: Optional[span.SpanData],
499
+ generators_trace_to_end: Optional[trace.TraceData],
500
+ flush: bool,
501
+ should_process_span_data: bool,
492
502
  ) -> None:
503
+ span_data_to_end: Optional[span.SpanData] = None
493
504
  if generators_span_to_end is None:
494
- span_data_to_end, trace_data_to_end = pop_end_candidates()
505
+ if should_process_span_data:
506
+ # the span data must be present in the context stack, otherwise something is wrong
507
+ span_data_to_end, trace_data_to_end = pop_end_candidates()
508
+ else:
509
+ # the span data is not in the context, only the root trace data there
510
+ trace_data_to_end = pop_end_candidate_trace_data()
495
511
  else:
496
512
  span_data_to_end, trace_data_to_end = (
497
513
  generators_span_to_end,
@@ -499,20 +515,27 @@ class BaseTrackDecorator(abc.ABC):
499
515
  )
500
516
 
501
517
  if output is not None:
502
- try:
503
- end_arguments = self._end_span_inputs_preprocessor(
504
- output=output,
505
- capture_output=capture_output,
506
- current_span_data=span_data_to_end,
507
- )
508
- except Exception as e:
509
- LOGGER.error(
510
- logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_FINALIZATION_FOR_TRACKED_FUNCTION,
511
- output,
512
- str(e),
513
- exc_info=True,
514
- )
515
-
518
+ if should_process_span_data and span_data_to_end is not None:
519
+ # create end arguments from current span data only if appropriate
520
+ try:
521
+ end_arguments = self._end_span_inputs_preprocessor(
522
+ output=output,
523
+ capture_output=capture_output,
524
+ current_span_data=span_data_to_end,
525
+ )
526
+ except Exception as e:
527
+ LOGGER.error(
528
+ logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_FINALIZATION_FOR_TRACKED_FUNCTION,
529
+ output,
530
+ str(e),
531
+ exc_info=True,
532
+ )
533
+
534
+ end_arguments = arguments_helpers.EndSpanParameters(
535
+ output={"output": output}
536
+ )
537
+ else:
538
+ # just use output as end arguments
516
539
  end_arguments = arguments_helpers.EndSpanParameters(
517
540
  output={"output": output}
518
541
  )
@@ -521,11 +544,12 @@ class BaseTrackDecorator(abc.ABC):
521
544
 
522
545
  client = opik_client.get_client_cached()
523
546
 
524
- span_data_to_end.init_end_time().update(
525
- **end_arguments.to_kwargs(),
526
- )
527
-
528
- client.span(**span_data_to_end.as_parameters)
547
+ if should_process_span_data and span_data_to_end is not None:
548
+ # save span data only if appropriate
549
+ span_data_to_end.init_end_time().update(
550
+ **end_arguments.to_kwargs(),
551
+ )
552
+ client.span(**span_data_to_end.as_parameters)
529
553
 
530
554
  if trace_data_to_end is not None:
531
555
  trace_data_to_end.init_end_time().update(
@@ -598,8 +622,26 @@ def pop_end_candidates() -> Tuple[span.SpanData, Optional[trace.TraceData]]:
598
622
  span_data_to_end is not None
599
623
  ), "When pop_end_candidates is called, top span data must not be None. Otherwise something is wrong."
600
624
 
601
- trace_data_to_end = None
625
+ trace_data_to_end = pop_end_candidate_trace_data()
626
+ return span_data_to_end, trace_data_to_end
627
+
628
+
629
+ def pop_end_candidate_trace_data() -> Optional[trace.TraceData]:
630
+ """
631
+ Pops the most recently created trace data from the stack if it meets specific criteria.
632
+
633
+ This function checks whether the context storage's span data stack is empty, and if so, it attempts
634
+ to pop and return the most recently created trace data associated with the context. The trace data
635
+ is only removed if its ID is part of a predefined set of trace IDs created using a decorator. If the
636
+ criteria are not met, None is returned.
602
637
 
638
+ Note: Decorator can't attach any child objects to the popped ones because
639
+ they are no longer in the context stack.
640
+
641
+ Returns:
642
+ The trace data popped from the stack if the criteria are met;
643
+ otherwise, None.
644
+ """
603
645
  possible_trace_data_to_end = context_storage.get_trace_data()
604
646
  if (
605
647
  context_storage.span_data_stack_empty()
@@ -608,8 +650,9 @@ def pop_end_candidates() -> Tuple[span.SpanData, Optional[trace.TraceData]]:
608
650
  ):
609
651
  trace_data_to_end = context_storage.pop_trace_data()
610
652
  TRACES_CREATED_BY_DECORATOR.discard(possible_trace_data_to_end.id)
653
+ return trace_data_to_end
611
654
 
612
- return span_data_to_end, trace_data_to_end
655
+ return None
613
656
 
614
657
 
615
658
  def add_start_candidates(
@@ -617,6 +660,7 @@ def add_start_candidates(
617
660
  opik_distributed_trace_headers: Optional[DistributedTraceHeadersDict],
618
661
  opik_args_data: Optional[opik_args.OpikArgs],
619
662
  tracing_active: bool,
663
+ create_duplicate_root_span: bool,
620
664
  ) -> span_creation_handler.SpanCreationResult:
621
665
  """
622
666
  Handles the creation and registration of a new start span and trace while respecting the
@@ -631,6 +675,8 @@ def add_start_candidates(
631
675
  opik_args_data : Optional additional arguments that can be applied to the trace
632
676
  data after the span is created.
633
677
  tracing_active: A boolean indicating whether a tracing is active.
678
+ create_duplicate_root_span: A boolean indicating whether to create a root span along with the root trace
679
+ and duplicating its data.
634
680
 
635
681
  Returns:
636
682
  The result of the span creation, including the span and trace data.
@@ -638,14 +684,22 @@ def add_start_candidates(
638
684
  span_creation_result = span_creation_handler.create_span_respecting_context(
639
685
  start_span_arguments=start_span_parameters,
640
686
  distributed_trace_headers=opik_distributed_trace_headers,
687
+ should_create_duplicate_root_span=create_duplicate_root_span,
641
688
  )
642
- context_storage.add_span_data(span_creation_result.span_data)
689
+ if span_creation_result.should_process_span_data:
690
+ context_storage.add_span_data(span_creation_result.span_data)
643
691
 
644
- if tracing_active:
645
- client = opik_client.get_client_cached()
692
+ if tracing_active:
693
+ client = opik_client.get_client_cached()
646
694
 
647
- if client.config.log_start_trace_span:
648
- client.span(**span_creation_result.span_data.as_start_parameters)
695
+ if client.config.log_start_trace_span:
696
+ client.span(**span_creation_result.span_data.as_start_parameters)
697
+ else:
698
+ _show_root_span_not_created_warning_if_needed(
699
+ start_span_parameters=start_span_parameters,
700
+ tracing_active=tracing_active,
701
+ should_process_span_data=span_creation_result.should_process_span_data,
702
+ )
649
703
 
650
704
  if span_creation_result.trace_data is not None:
651
705
  add_start_trace_candidate(
@@ -691,3 +745,23 @@ def add_start_trace_candidate(
691
745
  client = opik_client.get_client_cached()
692
746
  if client.config.log_start_trace_span:
693
747
  client.trace(**trace_data.as_start_parameters)
748
+
749
+
750
+ def _show_root_span_not_created_warning_if_needed(
751
+ start_span_parameters: arguments_helpers.StartSpanParameters,
752
+ tracing_active: bool,
753
+ should_process_span_data: bool,
754
+ ) -> None:
755
+ if not tracing_active:
756
+ return
757
+
758
+ user_provided_span_type_will_be_lost = (
759
+ not should_process_span_data and start_span_parameters.type in ["llm", "tool"]
760
+ )
761
+ if user_provided_span_type_will_be_lost:
762
+ LOGGER.warning(
763
+ "The root span '%s' of type '%s' will not be created because "
764
+ "its creation was explicitly disabled along with the root trace.",
765
+ start_span_parameters.name,
766
+ start_span_parameters.type,
767
+ )
@@ -65,6 +65,7 @@ def start_as_current_span(
65
65
  opik_distributed_trace_headers=distributed_headers,
66
66
  opik_args_data=None,
67
67
  tracing_active=True,
68
+ create_duplicate_root_span=True,
68
69
  )
69
70
 
70
71
  end_arguments = arguments_helpers.EndSpanParameters(
@@ -85,6 +86,7 @@ def start_as_current_span(
85
86
  end_arguments.metadata = span_creation_result.span_data.metadata or metadata
86
87
  end_arguments.provider = span_creation_result.span_data.provider or provider
87
88
  end_arguments.model = span_creation_result.span_data.model or model
89
+ end_arguments.attachments = span_creation_result.span_data.attachments
88
90
  except Exception as exception:
89
91
  LOGGER.error(
90
92
  "Error in user's script while executing span context manager: %s",
@@ -100,8 +102,10 @@ def start_as_current_span(
100
102
  # save span/trace data at the end of the context manager
101
103
  client = opik_client.get_client_cached()
102
104
 
105
+ # Don't pass attachments to update() since they're already set on span_data
106
+ # and _update_attachments would duplicate them
103
107
  span_creation_result.span_data.init_end_time().update(
104
- **end_arguments.to_kwargs(),
108
+ **end_arguments.to_kwargs(ignore_keys=["attachments"]),
105
109
  )
106
110
  client.span(**span_creation_result.span_data.as_parameters)
107
111
 
@@ -58,12 +58,13 @@ class BaseTrackedGenerator(Generic[YieldType]):
58
58
  if self._created_span_data is not None:
59
59
  return
60
60
 
61
- self._created_trace_data, self._created_span_data = (
62
- span_creation_handler.create_span_respecting_context(
63
- self._start_span_arguments, self._opik_distributed_trace_headers
64
- )
61
+ result = span_creation_handler.create_span_respecting_context(
62
+ self._start_span_arguments, self._opik_distributed_trace_headers
65
63
  )
66
64
 
65
+ self._created_trace_data = result.trace_data
66
+ self._created_span_data = result.span_data
67
+
67
68
  def _handle_stop_iteration_before_raising(self) -> None:
68
69
  output = _try_aggregate_items(
69
70
  self._accumulated_values,
@@ -23,16 +23,20 @@ class SpanCreationResult(NamedTuple):
23
23
  with the span if a new trace was created. Can be None if no new trace was created.
24
24
  span_data : Data specific to the created span, containing
25
25
  information such as span identifiers and timestamps.
26
+ should_process_span_data: A boolean indicating whether created span data should be further processed
27
+ after it was created (saved, logged, etc.).
26
28
  """
27
29
 
28
30
  trace_data: Optional[trace.TraceData]
29
31
  span_data: span.SpanData
32
+ should_process_span_data: bool
30
33
 
31
34
 
32
35
  def create_span_respecting_context(
33
36
  start_span_arguments: arguments_helpers.StartSpanParameters,
34
37
  distributed_trace_headers: Optional[DistributedTraceHeadersDict],
35
38
  opik_context_storage: Optional[context_storage.OpikContextStorage] = None,
39
+ should_create_duplicate_root_span: bool = True,
36
40
  ) -> SpanCreationResult:
37
41
  """
38
42
  Handles different span creation flows.
@@ -48,7 +52,7 @@ def create_span_respecting_context(
48
52
  trace_id=distributed_trace_headers["opik_trace_id"],
49
53
  )
50
54
 
51
- return SpanCreationResult(None, span_data)
55
+ return SpanCreationResult(None, span_data, should_process_span_data=True)
52
56
 
53
57
  current_span_data = opik_context_storage.top_span_data()
54
58
  current_trace_data = opik_context_storage.get_trace_data()
@@ -78,7 +82,7 @@ def create_span_respecting_context(
78
82
  trace_id=current_span_data.trace_id,
79
83
  )
80
84
 
81
- return SpanCreationResult(None, span_data)
85
+ return SpanCreationResult(None, span_data, should_process_span_data=True)
82
86
 
83
87
  if current_trace_data is not None and current_span_data is None:
84
88
  # By default, we expect trace to be created with a span.
@@ -100,7 +104,7 @@ def create_span_respecting_context(
100
104
  trace_id=current_trace_data.id,
101
105
  )
102
106
 
103
- return SpanCreationResult(None, span_data)
107
+ return SpanCreationResult(None, span_data, should_process_span_data=True)
104
108
 
105
109
  if current_span_data is None and current_trace_data is None:
106
110
  # Create a trace and root span because it is
@@ -113,6 +117,7 @@ def create_span_respecting_context(
113
117
  metadata=start_span_arguments.metadata,
114
118
  tags=start_span_arguments.tags,
115
119
  project_name=start_span_arguments.project_name,
120
+ thread_id=start_span_arguments.thread_id,
116
121
  )
117
122
 
118
123
  current_span_data = arguments_helpers.create_span_data(
@@ -121,4 +126,8 @@ def create_span_respecting_context(
121
126
  trace_id=current_trace_data.id,
122
127
  )
123
128
 
124
- return SpanCreationResult(current_trace_data, current_span_data)
129
+ return SpanCreationResult(
130
+ current_trace_data,
131
+ current_span_data,
132
+ should_process_span_data=should_create_duplicate_root_span,
133
+ )
@@ -1,6 +1,6 @@
1
1
  import functools
2
2
  import logging
3
- from typing import List, Optional, Any, Dict
3
+ from typing import List, Optional, Any, Dict, Iterator
4
4
 
5
5
  import opik.logging_messages as logging_messages
6
6
  import opik.opik_context as opik_context
@@ -26,6 +26,30 @@ LOGGER = logging.getLogger(__name__)
26
26
 
27
27
  EVALUATION_TASK_NAME = "evaluation_task"
28
28
 
29
+ EVALUATION_STREAM_DATASET_BATCH_SIZE = 200 # The limit is 10x smaller than the default streaming limit to improve the UX and not wait too long for the first items to be evaluated
30
+
31
+
32
+ def _calculate_total_items(
33
+ dataset: dataset.Dataset,
34
+ nb_samples: Optional[int],
35
+ dataset_item_ids: Optional[List[str]],
36
+ ) -> Optional[int]:
37
+ """
38
+ Calculate the total number of items that will be evaluated.
39
+
40
+ Returns None if the total cannot be determined (e.g., when using a sampler).
41
+ """
42
+ if dataset_item_ids is not None:
43
+ return len(dataset_item_ids)
44
+
45
+ # If nb_samples is specified and smaller than dataset size, use it
46
+ if nb_samples is not None:
47
+ if dataset.dataset_items_count is not None:
48
+ return min(nb_samples, dataset.dataset_items_count)
49
+ return nb_samples
50
+
51
+ return dataset.dataset_items_count
52
+
29
53
 
30
54
  class EvaluationEngine:
31
55
  def __init__(
@@ -157,34 +181,57 @@ class EvaluationEngine:
157
181
 
158
182
  def _compute_test_results_for_llm_task(
159
183
  self,
160
- dataset_items: List[dataset_item.DatasetItem],
184
+ dataset_items: Iterator[dataset_item.DatasetItem],
161
185
  task: LLMTask,
162
186
  experiment_: Optional[experiment.Experiment],
163
187
  trial_count: int,
164
188
  description: str,
189
+ total_items: Optional[int] = None,
165
190
  ) -> List[test_result.TestResult]:
166
191
  test_results: List[test_result.TestResult] = []
167
192
 
193
+ # Cache dataset items for multiple trials
194
+ dataset_items_cache: List[dataset_item.DatasetItem] = []
195
+
168
196
  for trial_id in range(trial_count):
169
- evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
170
- functools.partial(
171
- self._compute_test_result_for_llm_task,
172
- item=item,
173
- task=task,
174
- trial_id=trial_id,
175
- experiment_=experiment_,
176
- )
177
- for item in dataset_items
178
- ]
197
+ desc = f"{description} trial {trial_id}" if trial_count > 1 else description
179
198
 
180
- test_results += evaluation_tasks_executor.execute(
181
- evaluation_tasks=evaluation_tasks,
199
+ # Use streaming executor to submit tasks as items arrive
200
+ executor: evaluation_tasks_executor.StreamingExecutor[
201
+ test_result.TestResult
202
+ ] = evaluation_tasks_executor.StreamingExecutor(
182
203
  workers=self._workers,
183
204
  verbose=self._verbose,
184
- desc=f"{description} trial {trial_id}"
185
- if trial_count > 1
186
- else description,
205
+ desc=desc,
206
+ total=total_items,
187
207
  )
208
+ with executor:
209
+ # For first trial, consume from iterator and cache items
210
+ if trial_id == 0:
211
+ for item in dataset_items:
212
+ dataset_items_cache.append(item)
213
+ evaluation_task = functools.partial(
214
+ self._compute_test_result_for_llm_task,
215
+ item=item,
216
+ task=task,
217
+ trial_id=trial_id,
218
+ experiment_=experiment_,
219
+ )
220
+ executor.submit(evaluation_task)
221
+ else:
222
+ # For subsequent trials, use cached items
223
+ for item in dataset_items_cache:
224
+ evaluation_task = functools.partial(
225
+ self._compute_test_result_for_llm_task,
226
+ item=item,
227
+ task=task,
228
+ trial_id=trial_id,
229
+ experiment_=experiment_,
230
+ )
231
+ executor.submit(evaluation_task)
232
+
233
+ # Collect results from executor
234
+ test_results += executor.get_results()
188
235
 
189
236
  return test_results
190
237
 
@@ -282,21 +329,54 @@ class EvaluationEngine:
282
329
  trial_count: int,
283
330
  experiment_: Optional[experiment.Experiment],
284
331
  ) -> List[test_result.TestResult]:
285
- dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
286
- nb_samples=nb_samples,
287
- dataset_item_ids=dataset_item_ids,
332
+ # Can't use streaming with these parameters yet, so fall back to non-streaming
333
+ use_streaming = (
334
+ dataset_sampler is None
335
+ and not self._metrics_evaluator.has_task_span_metrics
288
336
  )
289
337
 
290
- if dataset_sampler is not None:
291
- dataset_items = dataset_sampler.sample(dataset_items)
338
+ # Get dataset items using streaming or non-streaming approach
339
+ if use_streaming:
340
+ dataset_items_iter = dataset_.__internal_api__stream_items_as_dataclasses__(
341
+ nb_samples=nb_samples,
342
+ dataset_item_ids=dataset_item_ids,
343
+ batch_size=EVALUATION_STREAM_DATASET_BATCH_SIZE,
344
+ )
345
+ else:
346
+ LOGGER.info("Dataset streaming disabled due to evaluation parameters")
347
+ dataset_items_list = list(
348
+ dataset_.__internal_api__stream_items_as_dataclasses__(
349
+ nb_samples=nb_samples,
350
+ dataset_item_ids=dataset_item_ids,
351
+ batch_size=EVALUATION_STREAM_DATASET_BATCH_SIZE,
352
+ )
353
+ )
354
+
355
+ if dataset_sampler is not None:
356
+ dataset_items_list = dataset_sampler.sample(dataset_items_list)
357
+
358
+ # Convert list to iterator
359
+ dataset_items_iter = iter(dataset_items_list)
360
+
361
+ # Calculate total items for progress bar
362
+ if use_streaming:
363
+ total_items = _calculate_total_items(
364
+ dataset=dataset_,
365
+ nb_samples=nb_samples,
366
+ dataset_item_ids=dataset_item_ids,
367
+ )
368
+ else:
369
+ # After sampling, the actual count is the length of the list
370
+ total_items = len(dataset_items_list)
292
371
 
293
372
  if not self._metrics_evaluator.has_task_span_metrics:
294
373
  return self._compute_test_results_for_llm_task(
295
- dataset_items=dataset_items,
374
+ dataset_items=dataset_items_iter,
296
375
  task=task,
297
376
  experiment_=experiment_,
298
377
  trial_count=trial_count,
299
378
  description="Evaluation",
379
+ total_items=total_items,
300
380
  )
301
381
 
302
382
  LOGGER.debug(
@@ -306,11 +386,12 @@ class EvaluationEngine:
306
386
 
307
387
  with local_recording.record_traces_locally(client=self._client) as recording:
308
388
  test_results = self._compute_test_results_for_llm_task(
309
- dataset_items=dataset_items,
389
+ dataset_items=dataset_items_iter,
310
390
  task=task,
311
391
  experiment_=experiment_,
312
392
  trial_count=trial_count,
313
393
  description="Evaluation",
394
+ total_items=total_items,
314
395
  )
315
396
  self._update_test_results_with_task_span_metrics(
316
397
  test_results=test_results,
@@ -339,7 +420,7 @@ class EvaluationEngine:
339
420
  List of TestResult objects containing scores for each item.
340
421
  """
341
422
  # Convert raw items to DatasetItem objects for compatibility
342
- dataset_items = [
423
+ dataset_items_list = [
343
424
  dataset_item.DatasetItem(
344
425
  id=f"temp_item_{idx}",
345
426
  **item,
@@ -349,11 +430,12 @@ class EvaluationEngine:
349
430
 
350
431
  if not self._metrics_evaluator.has_task_span_metrics:
351
432
  return self._compute_test_results_for_llm_task(
352
- dataset_items=dataset_items,
433
+ dataset_items=iter(dataset_items_list),
353
434
  task=task,
354
435
  experiment_=None,
355
436
  trial_count=1,
356
437
  description="Items evaluation",
438
+ total_items=len(items),
357
439
  )
358
440
 
359
441
  LOGGER.debug(
@@ -363,11 +445,12 @@ class EvaluationEngine:
363
445
 
364
446
  with local_recording.record_traces_locally(client=self._client) as recording:
365
447
  test_results = self._compute_test_results_for_llm_task(
366
- dataset_items=dataset_items,
448
+ dataset_items=iter(dataset_items_list),
367
449
  task=task,
368
450
  experiment_=None,
369
451
  trial_count=1,
370
452
  description="Items evaluation",
453
+ total_items=len(items),
371
454
  )
372
455
  self._update_test_results_with_task_span_metrics(
373
456
  test_results=test_results,
@@ -388,7 +471,7 @@ class EvaluationEngine:
388
471
  for test_case_ in test_cases
389
472
  ]
390
473
 
391
- test_results = evaluation_tasks_executor.execute(
474
+ test_results: List[test_result.TestResult] = evaluation_tasks_executor.execute(
392
475
  evaluation_tasks=evaluation_tasks,
393
476
  workers=self._workers,
394
477
  verbose=self._verbose,