PyPI - opik - Versions diffs - 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl - Mend

opik 1.9.39-py3-none-any.whl → 1.9.86-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (195) hide show

opik/api_objects/attachment/attachment_context.py +36 -0
opik/api_objects/attachment/attachments_extractor.py +153 -0
opik/api_objects/attachment/client.py +1 -0
opik/api_objects/attachment/converters.py +2 -0
opik/api_objects/attachment/decoder.py +18 -0
opik/api_objects/attachment/decoder_base64.py +83 -0
opik/api_objects/attachment/decoder_helpers.py +137 -0
opik/api_objects/constants.py +2 -0
opik/api_objects/dataset/dataset.py +133 -40
opik/api_objects/dataset/rest_operations.py +2 -0
opik/api_objects/experiment/experiment.py +6 -0
opik/api_objects/helpers.py +8 -4
opik/api_objects/local_recording.py +6 -5
opik/api_objects/observation_data.py +101 -0
opik/api_objects/opik_client.py +78 -45
opik/api_objects/opik_query_language.py +9 -3
opik/api_objects/prompt/chat/chat_prompt.py +18 -1
opik/api_objects/prompt/client.py +8 -1
opik/api_objects/span/span_data.py +3 -88
opik/api_objects/threads/threads_client.py +7 -4
opik/api_objects/trace/trace_data.py +3 -74
opik/api_objects/validation_helpers.py +3 -3
opik/cli/exports/__init__.py +131 -0
opik/cli/exports/dataset.py +278 -0
opik/cli/exports/experiment.py +784 -0
opik/cli/exports/project.py +685 -0
opik/cli/exports/prompt.py +578 -0
opik/cli/exports/utils.py +406 -0
opik/cli/harbor.py +39 -0
opik/cli/imports/__init__.py +439 -0
opik/cli/imports/dataset.py +143 -0
opik/cli/imports/experiment.py +1192 -0
opik/cli/imports/project.py +262 -0
opik/cli/imports/prompt.py +177 -0
opik/cli/imports/utils.py +280 -0
opik/cli/main.py +14 -12
opik/config.py +12 -1
opik/datetime_helpers.py +12 -0
opik/decorator/arguments_helpers.py +4 -1
opik/decorator/base_track_decorator.py +111 -37
opik/decorator/context_manager/span_context_manager.py +5 -1
opik/decorator/generator_wrappers.py +5 -4
opik/decorator/span_creation_handler.py +13 -4
opik/evaluation/engine/engine.py +111 -28
opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
opik/evaluation/evaluator.py +12 -0
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
opik/evaluation/metrics/heuristics/equals.py +11 -7
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
opik/evaluation/metrics/ragas_metric.py +43 -23
opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
opik/evaluation/models/litellm/util.py +4 -20
opik/evaluation/models/models_factory.py +19 -5
opik/evaluation/rest_operations.py +3 -3
opik/evaluation/threads/helpers.py +3 -2
opik/file_upload/file_uploader.py +13 -0
opik/file_upload/upload_options.py +2 -0
opik/integrations/adk/legacy_opik_tracer.py +9 -11
opik/integrations/adk/opik_tracer.py +2 -2
opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
opik/integrations/dspy/callback.py +100 -14
opik/integrations/dspy/parsers.py +168 -0
opik/integrations/harbor/__init__.py +17 -0
opik/integrations/harbor/experiment_service.py +269 -0
opik/integrations/harbor/opik_tracker.py +528 -0
opik/integrations/haystack/opik_tracer.py +2 -2
opik/integrations/langchain/__init__.py +15 -2
opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
opik/integrations/langchain/opik_tracer.py +258 -160
opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
opik/integrations/llama_index/callback.py +43 -6
opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
opik/integrations/openai/opik_tracker.py +99 -4
opik/integrations/openai/videos/__init__.py +9 -0
opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
opik/integrations/openai/videos/videos_create_decorator.py +159 -0
opik/integrations/openai/videos/videos_download_decorator.py +110 -0
opik/message_processing/batching/base_batcher.py +14 -21
opik/message_processing/batching/batch_manager.py +22 -10
opik/message_processing/batching/batchers.py +32 -40
opik/message_processing/batching/flushing_thread.py +0 -3
opik/message_processing/emulation/emulator_message_processor.py +36 -1
opik/message_processing/emulation/models.py +21 -0
opik/message_processing/messages.py +9 -0
opik/message_processing/preprocessing/__init__.py +0 -0
opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
opik/message_processing/preprocessing/constants.py +1 -0
opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
opik/message_processing/preprocessing/preprocessor.py +36 -0
opik/message_processing/processors/__init__.py +0 -0
opik/message_processing/processors/attachments_extraction_processor.py +146 -0
opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
opik/message_processing/queue_consumer.py +4 -2
opik/message_processing/streamer.py +71 -33
opik/message_processing/streamer_constructors.py +36 -8
opik/plugins/pytest/experiment_runner.py +1 -1
opik/plugins/pytest/hooks.py +5 -3
opik/rest_api/__init__.py +42 -0
opik/rest_api/datasets/client.py +321 -123
opik/rest_api/datasets/raw_client.py +470 -145
opik/rest_api/experiments/client.py +26 -0
opik/rest_api/experiments/raw_client.py +26 -0
opik/rest_api/llm_provider_key/client.py +4 -4
opik/rest_api/llm_provider_key/raw_client.py +4 -4
opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
opik/rest_api/manual_evaluation/client.py +101 -0
opik/rest_api/manual_evaluation/raw_client.py +172 -0
opik/rest_api/optimizations/client.py +0 -166
opik/rest_api/optimizations/raw_client.py +0 -248
opik/rest_api/projects/client.py +9 -0
opik/rest_api/projects/raw_client.py +13 -0
opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
opik/rest_api/prompts/client.py +130 -2
opik/rest_api/prompts/raw_client.py +175 -0
opik/rest_api/traces/client.py +101 -0
opik/rest_api/traces/raw_client.py +120 -0
opik/rest_api/types/__init__.py +50 -0
opik/rest_api/types/audio_url.py +19 -0
opik/rest_api/types/audio_url_public.py +19 -0
opik/rest_api/types/audio_url_write.py +19 -0
opik/rest_api/types/automation_rule_evaluator.py +38 -2
opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
opik/rest_api/types/dataset.py +2 -0
opik/rest_api/types/dataset_item.py +1 -1
opik/rest_api/types/dataset_item_batch.py +4 -0
opik/rest_api/types/dataset_item_changes_public.py +5 -0
opik/rest_api/types/dataset_item_compare.py +1 -1
opik/rest_api/types/dataset_item_filter.py +4 -0
opik/rest_api/types/dataset_item_page_compare.py +0 -1
opik/rest_api/types/dataset_item_page_public.py +0 -1
opik/rest_api/types/dataset_item_public.py +1 -1
opik/rest_api/types/dataset_public.py +2 -0
opik/rest_api/types/dataset_version_public.py +10 -0
opik/rest_api/types/dataset_version_summary.py +46 -0
opik/rest_api/types/dataset_version_summary_public.py +46 -0
opik/rest_api/types/experiment.py +9 -0
opik/rest_api/types/experiment_public.py +9 -0
opik/rest_api/types/group_content_with_aggregations.py +1 -0
opik/rest_api/types/llm_as_judge_message_content.py +2 -0
opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
opik/rest_api/types/project.py +1 -0
opik/rest_api/types/project_detailed.py +1 -0
opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
opik/rest_api/types/project_reference.py +31 -0
opik/rest_api/types/project_reference_public.py +31 -0
opik/rest_api/types/project_stats_summary_item.py +1 -0
opik/rest_api/types/prompt_version.py +1 -0
opik/rest_api/types/prompt_version_detail.py +1 -0
opik/rest_api/types/prompt_version_page_public.py +5 -0
opik/rest_api/types/prompt_version_public.py +1 -0
opik/rest_api/types/prompt_version_update.py +33 -0
opik/rest_api/types/provider_api_key.py +5 -1
opik/rest_api/types/provider_api_key_provider.py +2 -1
opik/rest_api/types/provider_api_key_public.py +5 -1
opik/rest_api/types/provider_api_key_public_provider.py +2 -1
opik/rest_api/types/service_toggles_config.py +11 -1
opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
opik/types.py +36 -0
opik/validation/chat_prompt_messages.py +241 -0
opik/validation/feedback_score.py +3 -3
opik/validation/validator.py +28 -0
{opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
{opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
opik/cli/export.py +0 -791
opik/cli/import_command.py +0 -575
{opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
{opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
{opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
{opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0

opik/integrations/harbor/experiment_service.py ADDED Viewed

@@ -0,0 +1,269 @@
+"""
+Experiment service for Harbor integration.
+This module manages the connection between Harbor benchmark runs and Opik experiments,
+enabling evaluation tracking and result visualization.
+Harbor Terminology Mapping to Opik:
+-----------------------------------
+- **Harbor Job**: A benchmark run that evaluates one or more agents on a dataset.
+  Maps to an Opik Experiment.
+- **Harbor Trial**: A single agent run on a single task within a job.
+  Each trial produces one Opik Trace (capturing the agent's execution).
+- **Harbor Source**: The benchmark dataset being used (e.g., "terminal-bench", "swe-bench").
+  Maps to an Opik Dataset. Each source gets its own dataset.
+- **Harbor Task**: A specific problem/challenge within a dataset (e.g., "fix-git" task).
+  Maps to an Opik Dataset Item.
+Flow Overview:
+--------------
+1. When a Harbor job starts, this service is initialized with an experiment name.
+2. For each unique source (benchmark dataset), we create/get an Opik Dataset and Experiment.
+3. For each trial (agent run on a task), we:
+   a. Create a dataset item for the task (or reuse existing one if task was run before)
+   b. Link the trial's trace to the experiment via ExperimentItemReferences
+4. This allows viewing all trial results in Opik's experiment comparison UI.
+"""
+import logging
+from datetime import datetime
+from typing import Any, Dict, Optional, Set, TYPE_CHECKING
+from opik.api_objects import opik_client
+from opik.api_objects.experiment import experiment_item
+if TYPE_CHECKING:
+    from opik.api_objects.experiment.experiment import Experiment
+    from opik.api_objects.dataset.dataset import Dataset
+# Module-level logger, named after this module.
+LOGGER = logging.getLogger(__name__)
+# Global singleton service instance (one per Harbor job).
+# Created by setup_lazy(), read through get_service(), and cleared by reset().
+_SERVICE: Optional["HarborExperimentService"] = None
+class HarborExperimentService:
+    """
+    Manages Opik datasets and experiments for Harbor benchmark jobs.
+    This service handles the mapping between Harbor's evaluation structure and Opik's
+    experiment tracking:
+    - Each Harbor source (benchmark dataset) → One Opik Dataset + One Opik Experiment
+    - Each Harbor task → One Opik Dataset Item
+    - Each Harbor trial → One Opik Trace, linked to the experiment
+    The service uses lazy initialization - datasets and experiments are created
+    on-demand when the first trial for a source is linked.
+    Attributes:
+        _experiment_name: Name for experiments created by this service.
+        _experiment_config: Config dict stored on experiments (agent/model info).
+        _client: Cached Opik client instance.
+        _datasets: Map of source name → Opik Dataset.
+        _experiments: Map of source name → Opik Experiment.
+        _linked_trials: Set of trial names already linked (prevents duplicates).
+    """
+    def __init__(
+        self,
+        experiment_name: str,
+        experiment_config: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        Initialize the experiment service.
+        Args:
+            experiment_name: Name for experiments. Typically includes job_id
+                for uniqueness (e.g., "harbor-job-abc123").
+            experiment_config: Optional config dict to store on experiments.
+                Typically contains agent/model info (e.g., {"agent_name": "terminus",
+                "model_name": "gpt-4o"}).
+        """
+        self._experiment_name = experiment_name
+        self._experiment_config = experiment_config or {}
+        self._experiment_config["created_from"] = "harbor"
+        self._client = opik_client.get_client_cached()
+        # Lazy-initialized per source (benchmark dataset)
+        self._datasets: Dict[str, "Dataset"] = {}
+        self._experiments: Dict[str, "Experiment"] = {}
+        # Track which trials have been linked to avoid duplicates
+        self._linked_trials: Set[str] = set()
+    def _ensure_dataset_and_experiment(self, source: str) -> None:
+        """
+        Ensure an Opik Dataset and Experiment exist for the given source.
+        Creates them lazily on first access. Each Harbor source (benchmark dataset)
+        gets its own Opik Dataset and Experiment pair.
+        Args:
+            source: The Harbor source/benchmark name (e.g., "terminal-bench").
+        """
+        if source in self._experiments:
+            return
+        try:
+            # Create or get the dataset for this benchmark source
+            dataset = self._client.get_or_create_dataset(
+                name=source,
+                description=f"Harbor benchmark dataset: {source}",
+            )
+            self._datasets[source] = dataset
+            LOGGER.info("Using dataset '%s' for Harbor source", source)
+            # Create a new experiment for this job run
+            experiment = self._client.create_experiment(
+                dataset_name=source,
+                name=self._experiment_name,
+                experiment_config=self._experiment_config,
+            )
+            self._experiments[source] = experiment
+            LOGGER.info(
+                "Created experiment '%s' for dataset '%s'",
+                self._experiment_name,
+                source,
+            )
+        except Exception as e:
+            LOGGER.warning(
+                "Failed to create dataset/experiment for source '%s': %s",
+                source,
+                e,
+            )
+    def link_trial_to_experiment(
+        self,
+        trial_name: str,
+        trace_id: str,
+        source: Optional[str] = None,
+        task_name: Optional[str] = None,
+    ) -> None:
+        """
+        Link a Harbor trial's trace to the Opik experiment.
+        This creates the connection between a trial's execution trace and the
+        experiment, enabling the trial to appear in Opik's experiment comparison UI.
+        The flow:
+        1. Ensure dataset and experiment exist for the source
+        2. Create or find the dataset item for this task
+        3. Link the trace to the experiment via the dataset item
+        Args:
+            trial_name: Unique identifier for the trial (e.g., "task__abc123").
+                Used to prevent duplicate linking.
+            trace_id: The Opik trace ID for this trial's execution.
+            source: The Harbor source/benchmark name. Defaults to "harbor-default".
+            task_name: The task name within the benchmark (e.g., "fix-git").
+                Used to create/find the dataset item.
+        """
+        source = source or "harbor-default"
+        # Prevent duplicate linking of the same trial
+        if trial_name in self._linked_trials:
+            return
+        # Ensure we have a dataset and experiment for this source
+        self._ensure_dataset_and_experiment(source)
+        experiment = self._experiments.get(source)
+        dataset = self._datasets.get(source)
+        if experiment is None or dataset is None:
+            LOGGER.warning(
+                "Failed to create experiment/dataset for source '%s', "
+                "trial '%s' will not be linked",
+                source,
+                trial_name,
+            )
+            return
+        try:
+            # Insert the task as a dataset item (idempotent - duplicates are handled)
+            dataset.insert([{"task_name": task_name}])
+            # Find the dataset item ID for this task.
+            # We search because the same task may have been inserted in a previous run,
+            # and we want to reuse the existing item ID for proper experiment linking.
+            items = dataset.get_items()
+            dataset_item_id = None
+            for item in items:
+                if item.get("task_name") == task_name:
+                    dataset_item_id = item.get("id")
+                    break
+            if dataset_item_id is None:
+                LOGGER.warning("Could not find dataset item for task '%s'", task_name)
+                return
+            # Link the trace to the experiment via the dataset item
+            experiment.insert(
+                [
+                    experiment_item.ExperimentItemReferences(
+                        dataset_item_id=dataset_item_id,
+                        trace_id=trace_id,
+                    )
+                ]
+            )
+            self._linked_trials.add(trial_name)
+            LOGGER.debug(
+                "Linked trial '%s' (trace %s) to experiment '%s'",
+                trial_name,
+                trace_id,
+                self._experiment_name,
+            )
+        except Exception as e:
+            LOGGER.warning("Failed to link trial '%s' to experiment: %s", trial_name, e)
+def setup_lazy(
+    experiment_name: Optional[str] = None,
+    experiment_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Setup the experiment service lazily.
+    Called when the first Harbor trial runs. Creates the global service instance
+    that will be used for all subsequent trial linking. Datasets and experiments
+    are created on-demand when trials are linked.
+    Args:
+        experiment_name: Name for the experiment. If None, auto-generates
+            a timestamped name like "harbor-20241209-143000".
+        experiment_config: Optional config dict to store on experiments.
+            Typically contains agent/model info (e.g., {"agent_name": "terminus",
+            "model_name": "gpt-4o"}).
+    """
+    global _SERVICE
+    if _SERVICE is not None:
+        LOGGER.debug("Experiment service already setup, skipping")
+        return
+    if experiment_name is None:
+        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        experiment_name = f"harbor-{timestamp}"
+    _SERVICE = HarborExperimentService(
+        experiment_name=experiment_name,
+        experiment_config=experiment_config,
+    )
+    LOGGER.info("Experiment service setup for '%s'", experiment_name)
+def get_service() -> Optional[HarborExperimentService]:
+    """Get the current experiment service instance, or None if not initialized."""
+    return _SERVICE
+def reset() -> None:
+    """Reset the experiment service. Used for testing."""
+    global _SERVICE
+    _SERVICE = None

opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl

opik 1.9.39-py3-none-any.whl → 1.9.86-py3-none-any.whl