PyPI - opik - Versions diffs - 1.9.41__py3-none-any.whl → 1.9.86__py3-none-any.whl - Mend

opik: opik-1.9.41-py3-none-any.whl → opik-1.9.86-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

opik/api_objects/attachment/attachment_context.py +36 -0
opik/api_objects/attachment/attachments_extractor.py +153 -0
opik/api_objects/attachment/client.py +1 -0
opik/api_objects/attachment/converters.py +2 -0
opik/api_objects/attachment/decoder.py +18 -0
opik/api_objects/attachment/decoder_base64.py +83 -0
opik/api_objects/attachment/decoder_helpers.py +137 -0
opik/api_objects/constants.py +2 -0
opik/api_objects/dataset/dataset.py +133 -40
opik/api_objects/dataset/rest_operations.py +2 -0
opik/api_objects/experiment/experiment.py +6 -0
opik/api_objects/helpers.py +8 -4
opik/api_objects/local_recording.py +6 -5
opik/api_objects/observation_data.py +101 -0
opik/api_objects/opik_client.py +78 -45
opik/api_objects/opik_query_language.py +9 -3
opik/api_objects/prompt/chat/chat_prompt.py +18 -1
opik/api_objects/prompt/client.py +8 -1
opik/api_objects/span/span_data.py +3 -88
opik/api_objects/threads/threads_client.py +7 -4
opik/api_objects/trace/trace_data.py +3 -74
opik/api_objects/validation_helpers.py +3 -3
opik/cli/exports/__init__.py +131 -0
opik/cli/exports/dataset.py +278 -0
opik/cli/exports/experiment.py +784 -0
opik/cli/exports/project.py +685 -0
opik/cli/exports/prompt.py +578 -0
opik/cli/exports/utils.py +406 -0
opik/cli/harbor.py +39 -0
opik/cli/imports/__init__.py +439 -0
opik/cli/imports/dataset.py +143 -0
opik/cli/imports/experiment.py +1192 -0
opik/cli/imports/project.py +262 -0
opik/cli/imports/prompt.py +177 -0
opik/cli/imports/utils.py +280 -0
opik/cli/main.py +14 -12
opik/config.py +12 -1
opik/datetime_helpers.py +12 -0
opik/decorator/arguments_helpers.py +4 -1
opik/decorator/base_track_decorator.py +111 -37
opik/decorator/context_manager/span_context_manager.py +5 -1
opik/decorator/generator_wrappers.py +5 -4
opik/decorator/span_creation_handler.py +13 -4
opik/evaluation/engine/engine.py +111 -28
opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
opik/evaluation/evaluator.py +12 -0
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
opik/evaluation/metrics/heuristics/equals.py +11 -7
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
opik/evaluation/metrics/ragas_metric.py +43 -23
opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
opik/evaluation/models/litellm/util.py +4 -20
opik/evaluation/models/models_factory.py +19 -5
opik/evaluation/rest_operations.py +3 -3
opik/evaluation/threads/helpers.py +3 -2
opik/file_upload/file_uploader.py +13 -0
opik/file_upload/upload_options.py +2 -0
opik/integrations/adk/legacy_opik_tracer.py +9 -11
opik/integrations/adk/opik_tracer.py +2 -2
opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
opik/integrations/dspy/callback.py +100 -14
opik/integrations/dspy/parsers.py +168 -0
opik/integrations/harbor/__init__.py +17 -0
opik/integrations/harbor/experiment_service.py +269 -0
opik/integrations/harbor/opik_tracker.py +528 -0
opik/integrations/haystack/opik_tracer.py +2 -2
opik/integrations/langchain/__init__.py +15 -2
opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
opik/integrations/langchain/opik_tracer.py +258 -160
opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
opik/integrations/llama_index/callback.py +43 -6
opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
opik/integrations/openai/opik_tracker.py +99 -4
opik/integrations/openai/videos/__init__.py +9 -0
opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
opik/integrations/openai/videos/videos_create_decorator.py +159 -0
opik/integrations/openai/videos/videos_download_decorator.py +110 -0
opik/message_processing/batching/base_batcher.py +14 -21
opik/message_processing/batching/batch_manager.py +22 -10
opik/message_processing/batching/batchers.py +32 -40
opik/message_processing/batching/flushing_thread.py +0 -3
opik/message_processing/emulation/emulator_message_processor.py +36 -1
opik/message_processing/emulation/models.py +21 -0
opik/message_processing/messages.py +9 -0
opik/message_processing/preprocessing/__init__.py +0 -0
opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
opik/message_processing/preprocessing/constants.py +1 -0
opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
opik/message_processing/preprocessing/preprocessor.py +36 -0
opik/message_processing/processors/__init__.py +0 -0
opik/message_processing/processors/attachments_extraction_processor.py +146 -0
opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
opik/message_processing/queue_consumer.py +4 -2
opik/message_processing/streamer.py +71 -33
opik/message_processing/streamer_constructors.py +36 -8
opik/plugins/pytest/experiment_runner.py +1 -1
opik/plugins/pytest/hooks.py +5 -3
opik/rest_api/__init__.py +38 -0
opik/rest_api/datasets/client.py +249 -148
opik/rest_api/datasets/raw_client.py +356 -217
opik/rest_api/experiments/client.py +26 -0
opik/rest_api/experiments/raw_client.py +26 -0
opik/rest_api/llm_provider_key/client.py +4 -4
opik/rest_api/llm_provider_key/raw_client.py +4 -4
opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
opik/rest_api/manual_evaluation/client.py +101 -0
opik/rest_api/manual_evaluation/raw_client.py +172 -0
opik/rest_api/optimizations/client.py +0 -166
opik/rest_api/optimizations/raw_client.py +0 -248
opik/rest_api/projects/client.py +9 -0
opik/rest_api/projects/raw_client.py +13 -0
opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
opik/rest_api/prompts/client.py +130 -2
opik/rest_api/prompts/raw_client.py +175 -0
opik/rest_api/traces/client.py +101 -0
opik/rest_api/traces/raw_client.py +120 -0
opik/rest_api/types/__init__.py +46 -0
opik/rest_api/types/audio_url.py +19 -0
opik/rest_api/types/audio_url_public.py +19 -0
opik/rest_api/types/audio_url_write.py +19 -0
opik/rest_api/types/automation_rule_evaluator.py +38 -2
opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
opik/rest_api/types/dataset_item.py +1 -1
opik/rest_api/types/dataset_item_batch.py +4 -0
opik/rest_api/types/dataset_item_changes_public.py +5 -0
opik/rest_api/types/dataset_item_compare.py +1 -1
opik/rest_api/types/dataset_item_filter.py +4 -0
opik/rest_api/types/dataset_item_page_compare.py +0 -1
opik/rest_api/types/dataset_item_page_public.py +0 -1
opik/rest_api/types/dataset_item_public.py +1 -1
opik/rest_api/types/dataset_version_public.py +5 -0
opik/rest_api/types/dataset_version_summary.py +5 -0
opik/rest_api/types/dataset_version_summary_public.py +5 -0
opik/rest_api/types/experiment.py +9 -0
opik/rest_api/types/experiment_public.py +9 -0
opik/rest_api/types/llm_as_judge_message_content.py +2 -0
opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
opik/rest_api/types/project.py +1 -0
opik/rest_api/types/project_detailed.py +1 -0
opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
opik/rest_api/types/project_reference.py +31 -0
opik/rest_api/types/project_reference_public.py +31 -0
opik/rest_api/types/project_stats_summary_item.py +1 -0
opik/rest_api/types/prompt_version.py +1 -0
opik/rest_api/types/prompt_version_detail.py +1 -0
opik/rest_api/types/prompt_version_page_public.py +5 -0
opik/rest_api/types/prompt_version_public.py +1 -0
opik/rest_api/types/prompt_version_update.py +33 -0
opik/rest_api/types/provider_api_key.py +5 -1
opik/rest_api/types/provider_api_key_provider.py +2 -1
opik/rest_api/types/provider_api_key_public.py +5 -1
opik/rest_api/types/provider_api_key_public_provider.py +2 -1
opik/rest_api/types/service_toggles_config.py +11 -1
opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
opik/types.py +36 -0
opik/validation/chat_prompt_messages.py +241 -0
opik/validation/feedback_score.py +3 -3
opik/validation/validator.py +28 -0
{opik-1.9.41.dist-info → opik-1.9.86.dist-info}/METADATA +5 -5
{opik-1.9.41.dist-info → opik-1.9.86.dist-info}/RECORD +190 -141
opik/cli/export.py +0 -791
opik/cli/import_command.py +0 -575
{opik-1.9.41.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
{opik-1.9.41.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
{opik-1.9.41.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
{opik-1.9.41.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0

opik/decorator/base_track_decorator.py CHANGED Viewed

@@ -68,6 +68,7 @@ class BaseTrackDecorator(abc.ABC):
         generations_aggregator: Optional[Callable[[List[Any]], Any]] = None,
         flush: bool = False,
         project_name: Optional[str] = None,
+        create_duplicate_root_span: bool = True,
     ) -> Union[Callable, Callable[[Callable], Callable]]:
         """
         Decorator to track the execution of a function.
@@ -85,6 +86,7 @@ class BaseTrackDecorator(abc.ABC):
             generations_aggregator: Function to aggregate generation results.
             flush: Whether to flush the client after logging.
             project_name: The name of the project to log data.
+            create_duplicate_root_span: Whether to create a root span duplicating the root trace data.
         Returns:
             Callable: The decorated function(if used without parentheses)
@@ -113,6 +115,7 @@ class BaseTrackDecorator(abc.ABC):
             generations_aggregator=generations_aggregator,
             flush=flush,
             project_name=project_name,
+            create_duplicate_root_span=create_duplicate_root_span,
         )
         if callable(name):
@@ -314,7 +317,7 @@ class BaseTrackDecorator(abc.ABC):
         def wrapper(*args, **kwargs) -> Any:  # type: ignore
             if not tracing_runtime_config.is_tracing_active():
                 return func(*args, **kwargs)
-            self._before_call(
+            should_process_span_data = self._before_call(
                 func=func,
                 track_options=track_options,
                 args=args,
@@ -350,6 +353,7 @@ class BaseTrackDecorator(abc.ABC):
                 error_info=error_info,
                 capture_output=track_options.capture_output,
                 flush=track_options.flush,
+                should_process_span_data=should_process_span_data,
             )
             if func_exception is not None:
                 raise func_exception
@@ -368,7 +372,7 @@ class BaseTrackDecorator(abc.ABC):
         async def wrapper(*args, **kwargs) -> Any:  # type: ignore
             if not tracing_runtime_config.is_tracing_active():
                 return await func(*args, **kwargs)
-            self._before_call(
+            should_process_span_data = self._before_call(
                 func=func,
                 track_options=track_options,
                 args=args,
@@ -403,6 +407,7 @@ class BaseTrackDecorator(abc.ABC):
                 error_info=error_info,
                 capture_output=track_options.capture_output,
                 flush=track_options.flush,
+                should_process_span_data=should_process_span_data,
             )
             if func_exception is not None:
                 raise func_exception
@@ -417,14 +422,14 @@ class BaseTrackDecorator(abc.ABC):
         track_options: arguments_helpers.TrackOptions,
         args: Tuple,
         kwargs: Dict[str, Any],
-    ) -> None:
+    ) -> bool:
         try:
-            self.__before_call_unsafe(
+            return self.__before_call_unsafe(
                 func=func,
                 track_options=track_options,
                 args=args,
                 kwargs=kwargs,
-            )
+            ).should_process_span_data
         except Exception as exception:
             LOGGER.error(
                 logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_CREATION_FOR_TRACKED_FUNCTION,
@@ -433,6 +438,7 @@ class BaseTrackDecorator(abc.ABC):
                 str(exception),
                 exc_info=True,
             )
+        return False
     def __before_call_unsafe(
         self,
@@ -440,7 +446,7 @@ class BaseTrackDecorator(abc.ABC):
         track_options: arguments_helpers.TrackOptions,
         args: Tuple,
         kwargs: Dict[str, Any],
-    ) -> None:
+    ) -> span_creation_handler.SpanCreationResult:
         track_start_options = self._prepare_tracking_start_options(
             func=func,
             track_options=track_options,
@@ -448,11 +454,12 @@ class BaseTrackDecorator(abc.ABC):
             kwargs=kwargs,
         )
-        add_start_candidates(
+        return add_start_candidates(
             start_span_parameters=track_start_options.start_span_parameters,
             opik_distributed_trace_headers=track_start_options.opik_distributed_trace_headers,
             opik_args_data=track_start_options.opik_args,
             tracing_active=tracing_runtime_config.is_tracing_active(),
+            create_duplicate_root_span=track_options.create_duplicate_root_span,
         )
     def _after_call(
@@ -463,6 +470,7 @@ class BaseTrackDecorator(abc.ABC):
         generators_span_to_end: Optional[span.SpanData] = None,
         generators_trace_to_end: Optional[trace.TraceData] = None,
         flush: bool = False,
+        should_process_span_data: bool = True,
     ) -> None:
         try:
             self.__after_call_unsafe(
@@ -472,6 +480,7 @@ class BaseTrackDecorator(abc.ABC):
                 generators_span_to_end=generators_span_to_end,
                 generators_trace_to_end=generators_trace_to_end,
                 flush=flush,
+                should_process_span_data=should_process_span_data,
             )
         except Exception as exception:
             LOGGER.error(
@@ -486,12 +495,19 @@ class BaseTrackDecorator(abc.ABC):
         output: Optional[Any],
         error_info: Optional[ErrorInfoDict],
         capture_output: bool,
-        generators_span_to_end: Optional[span.SpanData] = None,
-        generators_trace_to_end: Optional[trace.TraceData] = None,
-        flush: bool = False,
+        generators_span_to_end: Optional[span.SpanData],
+        generators_trace_to_end: Optional[trace.TraceData],
+        flush: bool,
+        should_process_span_data: bool,
     ) -> None:
+        span_data_to_end: Optional[span.SpanData] = None
         if generators_span_to_end is None:
-            span_data_to_end, trace_data_to_end = pop_end_candidates()
+            if should_process_span_data:
+                # the span data must be present in the context stack, otherwise something is wrong
+                span_data_to_end, trace_data_to_end = pop_end_candidates()
+            else:
+                # the span data is not in the context, only the root trace data there
+                trace_data_to_end = pop_end_candidate_trace_data()
         else:
             span_data_to_end, trace_data_to_end = (
                 generators_span_to_end,
@@ -499,20 +515,27 @@ class BaseTrackDecorator(abc.ABC):
             )
         if output is not None:
-            try:
-                end_arguments = self._end_span_inputs_preprocessor(
-                    output=output,
-                    capture_output=capture_output,
-                    current_span_data=span_data_to_end,
-                )
-            except Exception as e:
-                LOGGER.error(
-                    logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_FINALIZATION_FOR_TRACKED_FUNCTION,
-                    output,
-                    str(e),
-                    exc_info=True,
-                )
+            if should_process_span_data and span_data_to_end is not None:
+                # create end arguments from current span data only if appropriate
+                try:
+                    end_arguments = self._end_span_inputs_preprocessor(
+                        output=output,
+                        capture_output=capture_output,
+                        current_span_data=span_data_to_end,
+                    )
+                except Exception as e:
+                    LOGGER.error(
+                        logging_messages.UNEXPECTED_EXCEPTION_ON_SPAN_FINALIZATION_FOR_TRACKED_FUNCTION,
+                        output,
+                        str(e),
+                        exc_info=True,
+                    )
+                    end_arguments = arguments_helpers.EndSpanParameters(
+                        output={"output": output}
+                    )
+            else:
+                # just use output as end arguments
                 end_arguments = arguments_helpers.EndSpanParameters(
                     output={"output": output}
                 )
@@ -521,11 +544,12 @@ class BaseTrackDecorator(abc.ABC):
         client = opik_client.get_client_cached()
-        span_data_to_end.init_end_time().update(
-            **end_arguments.to_kwargs(),
-        )
-        client.span(**span_data_to_end.as_parameters)
+        if should_process_span_data and span_data_to_end is not None:
+            # save span data only if appropriate
+            span_data_to_end.init_end_time().update(
+                **end_arguments.to_kwargs(),
+            )
+            client.span(**span_data_to_end.as_parameters)
         if trace_data_to_end is not None:
             trace_data_to_end.init_end_time().update(
@@ -598,8 +622,26 @@ def pop_end_candidates() -> Tuple[span.SpanData, Optional[trace.TraceData]]:
         span_data_to_end is not None
     ), "When pop_end_candidates is called, top span data must not be None. Otherwise something is wrong."
-    trace_data_to_end = None
+    trace_data_to_end = pop_end_candidate_trace_data()
+    return span_data_to_end, trace_data_to_end
+def pop_end_candidate_trace_data() -> Optional[trace.TraceData]:
+    """
+    Pops the most recently created trace data from the stack if it meets specific criteria.
+    This function checks whether the context storage's span data stack is empty, and if so, it attempts
+    to pop and return the most recently created trace data associated with the context. The trace data
+    is only removed if its ID is part of a predefined set of trace IDs created using a decorator. If the
+    criteria are not met, None is returned.
+    Note: Decorator can't attach any child objects to the popped ones because
+    they are no longer in the context stack.
+    Returns:
+        The trace data popped from the stack if the criteria are met;
+        otherwise, None.
+    """
     possible_trace_data_to_end = context_storage.get_trace_data()
     if (
         context_storage.span_data_stack_empty()
@@ -608,8 +650,9 @@ def pop_end_candidates() -> Tuple[span.SpanData, Optional[trace.TraceData]]:
     ):
         trace_data_to_end = context_storage.pop_trace_data()
         TRACES_CREATED_BY_DECORATOR.discard(possible_trace_data_to_end.id)
+        return trace_data_to_end
-    return span_data_to_end, trace_data_to_end
+    return None
 def add_start_candidates(
@@ -617,6 +660,7 @@ def add_start_candidates(
     opik_distributed_trace_headers: Optional[DistributedTraceHeadersDict],
     opik_args_data: Optional[opik_args.OpikArgs],
     tracing_active: bool,
+    create_duplicate_root_span: bool,
 ) -> span_creation_handler.SpanCreationResult:
     """
     Handles the creation and registration of a new start span and trace while respecting the
@@ -631,6 +675,8 @@ def add_start_candidates(
         opik_args_data : Optional additional arguments that can be applied to the trace
             data after the span is created.
         tracing_active: A boolean indicating whether a tracing is active.
+        create_duplicate_root_span: A boolean indicating whether to create a root span along with the root trace
+            and duplicating its data.
     Returns:
         The result of the span creation, including the span and trace data.
@@ -638,14 +684,22 @@ def add_start_candidates(
     span_creation_result = span_creation_handler.create_span_respecting_context(
         start_span_arguments=start_span_parameters,
         distributed_trace_headers=opik_distributed_trace_headers,
+        should_create_duplicate_root_span=create_duplicate_root_span,
     )
-    context_storage.add_span_data(span_creation_result.span_data)
+    if span_creation_result.should_process_span_data:
+        context_storage.add_span_data(span_creation_result.span_data)
-    if tracing_active:
-        client = opik_client.get_client_cached()
+        if tracing_active:
+            client = opik_client.get_client_cached()
-        if client.config.log_start_trace_span:
-            client.span(**span_creation_result.span_data.as_start_parameters)
+            if client.config.log_start_trace_span:
+                client.span(**span_creation_result.span_data.as_start_parameters)
+    else:
+        _show_root_span_not_created_warning_if_needed(
+            start_span_parameters=start_span_parameters,
+            tracing_active=tracing_active,
+            should_process_span_data=span_creation_result.should_process_span_data,
+        )
     if span_creation_result.trace_data is not None:
         add_start_trace_candidate(
@@ -691,3 +745,23 @@ def add_start_trace_candidate(
     client = opik_client.get_client_cached()
     if client.config.log_start_trace_span:
         client.trace(**trace_data.as_start_parameters)
+def _show_root_span_not_created_warning_if_needed(
+    start_span_parameters: arguments_helpers.StartSpanParameters,
+    tracing_active: bool,
+    should_process_span_data: bool,
+) -> None:
+    if not tracing_active:
+        return
+    user_provided_span_type_will_be_lost = (
+        not should_process_span_data and start_span_parameters.type in ["llm", "tool"]
+    )
+    if user_provided_span_type_will_be_lost:
+        LOGGER.warning(
+            "The root span '%s' of type '%s' will not be created because "
+            "its creation was explicitly disabled along with the root trace.",
+            start_span_parameters.name,
+            start_span_parameters.type,
+        )

opik/decorator/context_manager/span_context_manager.py CHANGED Viewed

@@ -65,6 +65,7 @@ def start_as_current_span(
         opik_distributed_trace_headers=distributed_headers,
         opik_args_data=None,
         tracing_active=True,
+        create_duplicate_root_span=True,
     )
     end_arguments = arguments_helpers.EndSpanParameters(
@@ -85,6 +86,7 @@ def start_as_current_span(
         end_arguments.metadata = span_creation_result.span_data.metadata or metadata
         end_arguments.provider = span_creation_result.span_data.provider or provider
         end_arguments.model = span_creation_result.span_data.model or model
+        end_arguments.attachments = span_creation_result.span_data.attachments
     except Exception as exception:
         LOGGER.error(
             "Error in user's script while executing span context manager: %s",
@@ -100,8 +102,10 @@ def start_as_current_span(
         # save span/trace data at the end of the context manager
         client = opik_client.get_client_cached()
+        # Don't pass attachments to update() since they're already set on span_data
+        # and _update_attachments would duplicate them
         span_creation_result.span_data.init_end_time().update(
-            **end_arguments.to_kwargs(),
+            **end_arguments.to_kwargs(ignore_keys=["attachments"]),
         )
         client.span(**span_creation_result.span_data.as_parameters)

opik/decorator/generator_wrappers.py CHANGED Viewed

@@ -58,12 +58,13 @@ class BaseTrackedGenerator(Generic[YieldType]):
         if self._created_span_data is not None:
             return
-        self._created_trace_data, self._created_span_data = (
-            span_creation_handler.create_span_respecting_context(
-                self._start_span_arguments, self._opik_distributed_trace_headers
-            )
+        result = span_creation_handler.create_span_respecting_context(
+            self._start_span_arguments, self._opik_distributed_trace_headers
         )
+        self._created_trace_data = result.trace_data
+        self._created_span_data = result.span_data
     def _handle_stop_iteration_before_raising(self) -> None:
         output = _try_aggregate_items(
             self._accumulated_values,

opik/decorator/span_creation_handler.py CHANGED Viewed

@@ -23,16 +23,20 @@ class SpanCreationResult(NamedTuple):
             with the span if a new trace was created. Can be None if no new trace was created.
         span_data : Data specific to the created span, containing
             information such as span identifiers and timestamps.
+        should_process_span_data: A boolean indicating whether created span data should be further processed
+            after it was created (saved, logged, etc.).
     """
     trace_data: Optional[trace.TraceData]
     span_data: span.SpanData
+    should_process_span_data: bool
 def create_span_respecting_context(
     start_span_arguments: arguments_helpers.StartSpanParameters,
     distributed_trace_headers: Optional[DistributedTraceHeadersDict],
     opik_context_storage: Optional[context_storage.OpikContextStorage] = None,
+    should_create_duplicate_root_span: bool = True,
 ) -> SpanCreationResult:
     """
     Handles different span creation flows.
@@ -48,7 +52,7 @@ def create_span_respecting_context(
             trace_id=distributed_trace_headers["opik_trace_id"],
         )
-        return SpanCreationResult(None, span_data)
+        return SpanCreationResult(None, span_data, should_process_span_data=True)
     current_span_data = opik_context_storage.top_span_data()
     current_trace_data = opik_context_storage.get_trace_data()
@@ -78,7 +82,7 @@ def create_span_respecting_context(
             trace_id=current_span_data.trace_id,
         )
-        return SpanCreationResult(None, span_data)
+        return SpanCreationResult(None, span_data, should_process_span_data=True)
     if current_trace_data is not None and current_span_data is None:
         # By default, we expect trace to be created with a span.
@@ -100,7 +104,7 @@ def create_span_respecting_context(
             trace_id=current_trace_data.id,
         )
-        return SpanCreationResult(None, span_data)
+        return SpanCreationResult(None, span_data, should_process_span_data=True)
     if current_span_data is None and current_trace_data is None:
         # Create a trace and root span because it is
@@ -113,6 +117,7 @@ def create_span_respecting_context(
             metadata=start_span_arguments.metadata,
             tags=start_span_arguments.tags,
             project_name=start_span_arguments.project_name,
+            thread_id=start_span_arguments.thread_id,
         )
         current_span_data = arguments_helpers.create_span_data(
@@ -121,4 +126,8 @@ def create_span_respecting_context(
             trace_id=current_trace_data.id,
         )
-    return SpanCreationResult(current_trace_data, current_span_data)
+    return SpanCreationResult(
+        current_trace_data,
+        current_span_data,
+        should_process_span_data=should_create_duplicate_root_span,
+    )

opik/evaluation/engine/engine.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import functools
 import logging
-from typing import List, Optional, Any, Dict
+from typing import List, Optional, Any, Dict, Iterator
 import opik.logging_messages as logging_messages
 import opik.opik_context as opik_context
@@ -26,6 +26,30 @@ LOGGER = logging.getLogger(__name__)
 EVALUATION_TASK_NAME = "evaluation_task"
+EVALUATION_STREAM_DATASET_BATCH_SIZE = 200  # The limit is 10x smaller than the default streaming limit to improve the UX and not wait too long for the first items to be evaluated
+def _calculate_total_items(
+    dataset: dataset.Dataset,
+    nb_samples: Optional[int],
+    dataset_item_ids: Optional[List[str]],
+) -> Optional[int]:
+    """
+    Calculate the total number of items that will be evaluated.
+    Returns None if the total cannot be determined (e.g., when using a sampler).
+    """
+    if dataset_item_ids is not None:
+        return len(dataset_item_ids)
+    # If nb_samples is specified and smaller than dataset size, use it
+    if nb_samples is not None:
+        if dataset.dataset_items_count is not None:
+            return min(nb_samples, dataset.dataset_items_count)
+        return nb_samples
+    return dataset.dataset_items_count
 class EvaluationEngine:
     def __init__(
@@ -157,34 +181,57 @@ class EvaluationEngine:
     def _compute_test_results_for_llm_task(
         self,
-        dataset_items: List[dataset_item.DatasetItem],
+        dataset_items: Iterator[dataset_item.DatasetItem],
         task: LLMTask,
         experiment_: Optional[experiment.Experiment],
         trial_count: int,
         description: str,
+        total_items: Optional[int] = None,
     ) -> List[test_result.TestResult]:
         test_results: List[test_result.TestResult] = []
+        # Cache dataset items for multiple trials
+        dataset_items_cache: List[dataset_item.DatasetItem] = []
         for trial_id in range(trial_count):
-            evaluation_tasks: List[EvaluationTask[test_result.TestResult]] = [
-                functools.partial(
-                    self._compute_test_result_for_llm_task,
-                    item=item,
-                    task=task,
-                    trial_id=trial_id,
-                    experiment_=experiment_,
-                )
-                for item in dataset_items
-            ]
+            desc = f"{description} trial {trial_id}" if trial_count > 1 else description
-            test_results += evaluation_tasks_executor.execute(
-                evaluation_tasks=evaluation_tasks,
+            # Use streaming executor to submit tasks as items arrive
+            executor: evaluation_tasks_executor.StreamingExecutor[
+                test_result.TestResult
+            ] = evaluation_tasks_executor.StreamingExecutor(
                 workers=self._workers,
                 verbose=self._verbose,
-                desc=f"{description} trial {trial_id}"
-                if trial_count > 1
-                else description,
+                desc=desc,
+                total=total_items,
             )
+            with executor:
+                # For first trial, consume from iterator and cache items
+                if trial_id == 0:
+                    for item in dataset_items:
+                        dataset_items_cache.append(item)
+                        evaluation_task = functools.partial(
+                            self._compute_test_result_for_llm_task,
+                            item=item,
+                            task=task,
+                            trial_id=trial_id,
+                            experiment_=experiment_,
+                        )
+                        executor.submit(evaluation_task)
+                else:
+                    # For subsequent trials, use cached items
+                    for item in dataset_items_cache:
+                        evaluation_task = functools.partial(
+                            self._compute_test_result_for_llm_task,
+                            item=item,
+                            task=task,
+                            trial_id=trial_id,
+                            experiment_=experiment_,
+                        )
+                        executor.submit(evaluation_task)
+                # Collect results from executor
+                test_results += executor.get_results()
         return test_results
@@ -282,21 +329,54 @@ class EvaluationEngine:
         trial_count: int,
         experiment_: Optional[experiment.Experiment],
     ) -> List[test_result.TestResult]:
-        dataset_items = dataset_.__internal_api__get_items_as_dataclasses__(
-            nb_samples=nb_samples,
-            dataset_item_ids=dataset_item_ids,
+        # Streaming is not supported yet with a dataset sampler or task span metrics, so fall back to non-streaming in those cases
+        use_streaming = (
+            dataset_sampler is None
+            and not self._metrics_evaluator.has_task_span_metrics
         )
-        if dataset_sampler is not None:
-            dataset_items = dataset_sampler.sample(dataset_items)
+        # Get dataset items using streaming or non-streaming approach
+        if use_streaming:
+            dataset_items_iter = dataset_.__internal_api__stream_items_as_dataclasses__(
+                nb_samples=nb_samples,
+                dataset_item_ids=dataset_item_ids,
+                batch_size=EVALUATION_STREAM_DATASET_BATCH_SIZE,
+            )
+        else:
+            LOGGER.info("Dataset streaming disabled due to evaluation parameters")
+            dataset_items_list = list(
+                dataset_.__internal_api__stream_items_as_dataclasses__(
+                    nb_samples=nb_samples,
+                    dataset_item_ids=dataset_item_ids,
+                    batch_size=EVALUATION_STREAM_DATASET_BATCH_SIZE,
+                )
+            )
+            if dataset_sampler is not None:
+                dataset_items_list = dataset_sampler.sample(dataset_items_list)
+            # Convert list to iterator
+            dataset_items_iter = iter(dataset_items_list)
+        # Calculate total items for progress bar
+        if use_streaming:
+            total_items = _calculate_total_items(
+                dataset=dataset_,
+                nb_samples=nb_samples,
+                dataset_item_ids=dataset_item_ids,
+            )
+        else:
+            # After sampling, the actual count is the length of the list
+            total_items = len(dataset_items_list)
         if not self._metrics_evaluator.has_task_span_metrics:
             return self._compute_test_results_for_llm_task(
-                dataset_items=dataset_items,
+                dataset_items=dataset_items_iter,
                 task=task,
                 experiment_=experiment_,
                 trial_count=trial_count,
                 description="Evaluation",
+                total_items=total_items,
             )
         LOGGER.debug(
@@ -306,11 +386,12 @@ class EvaluationEngine:
         with local_recording.record_traces_locally(client=self._client) as recording:
             test_results = self._compute_test_results_for_llm_task(
-                dataset_items=dataset_items,
+                dataset_items=dataset_items_iter,
                 task=task,
                 experiment_=experiment_,
                 trial_count=trial_count,
                 description="Evaluation",
+                total_items=total_items,
             )
             self._update_test_results_with_task_span_metrics(
                 test_results=test_results,
@@ -339,7 +420,7 @@ class EvaluationEngine:
             List of TestResult objects containing scores for each item.
         """
         # Convert raw items to DatasetItem objects for compatibility
-        dataset_items = [
+        dataset_items_list = [
             dataset_item.DatasetItem(
                 id=f"temp_item_{idx}",
                 **item,
@@ -349,11 +430,12 @@ class EvaluationEngine:
         if not self._metrics_evaluator.has_task_span_metrics:
             return self._compute_test_results_for_llm_task(
-                dataset_items=dataset_items,
+                dataset_items=iter(dataset_items_list),
                 task=task,
                 experiment_=None,
                 trial_count=1,
                 description="Items evaluation",
+                total_items=len(items),
             )
         LOGGER.debug(
@@ -363,11 +445,12 @@ class EvaluationEngine:
         with local_recording.record_traces_locally(client=self._client) as recording:
             test_results = self._compute_test_results_for_llm_task(
-                dataset_items=dataset_items,
+                dataset_items=iter(dataset_items_list),
                 task=task,
                 experiment_=None,
                 trial_count=1,
                 description="Items evaluation",
+                total_items=len(items),
             )
             self._update_test_results_with_task_span_metrics(
                 test_results=test_results,
@@ -388,7 +471,7 @@ class EvaluationEngine:
             for test_case_ in test_cases
         ]
-        test_results = evaluation_tasks_executor.execute(
+        test_results: List[test_result.TestResult] = evaluation_tasks_executor.execute(
             evaluation_tasks=evaluation_tasks,
             workers=self._workers,
             verbose=self._verbose,

opik 1.9.41__py3-none-any.whl → 1.9.86__py3-none-any.whl

opik 1.9.41py3-none-any.whl → 1.9.86py3-none-any.whl