opik 1.9.39__py3-none-any.whl → 1.9.86__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +1 -0
- opik/api_objects/attachment/converters.py +2 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/constants.py +2 -0
- opik/api_objects/dataset/dataset.py +133 -40
- opik/api_objects/dataset/rest_operations.py +2 -0
- opik/api_objects/experiment/experiment.py +6 -0
- opik/api_objects/helpers.py +8 -4
- opik/api_objects/local_recording.py +6 -5
- opik/api_objects/observation_data.py +101 -0
- opik/api_objects/opik_client.py +78 -45
- opik/api_objects/opik_query_language.py +9 -3
- opik/api_objects/prompt/chat/chat_prompt.py +18 -1
- opik/api_objects/prompt/client.py +8 -1
- opik/api_objects/span/span_data.py +3 -88
- opik/api_objects/threads/threads_client.py +7 -4
- opik/api_objects/trace/trace_data.py +3 -74
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +14 -12
- opik/config.py +12 -1
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +4 -1
- opik/decorator/base_track_decorator.py +111 -37
- opik/decorator/context_manager/span_context_manager.py +5 -1
- opik/decorator/generator_wrappers.py +5 -4
- opik/decorator/span_creation_handler.py +13 -4
- opik/evaluation/engine/engine.py +111 -28
- opik/evaluation/engine/evaluation_tasks_executor.py +71 -19
- opik/evaluation/evaluator.py +12 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +3 -1
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +3 -1
- opik/evaluation/metrics/heuristics/equals.py +11 -7
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/factuality/metric.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/moderation/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +4 -2
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +3 -1
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +3 -1
- opik/evaluation/metrics/ragas_metric.py +43 -23
- opik/evaluation/models/litellm/litellm_chat_model.py +7 -2
- opik/evaluation/models/litellm/util.py +4 -20
- opik/evaluation/models/models_factory.py +19 -5
- opik/evaluation/rest_operations.py +3 -3
- opik/evaluation/threads/helpers.py +3 -2
- opik/file_upload/file_uploader.py +13 -0
- opik/file_upload/upload_options.py +2 -0
- opik/integrations/adk/legacy_opik_tracer.py +9 -11
- opik/integrations/adk/opik_tracer.py +2 -2
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +2 -2
- opik/integrations/dspy/callback.py +100 -14
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/opik_tracer.py +2 -2
- opik/integrations/langchain/__init__.py +15 -2
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_tracer.py +258 -160
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +7 -4
- opik/integrations/llama_index/callback.py +43 -6
- opik/integrations/openai/agents/opik_tracing_processor.py +8 -10
- opik/integrations/openai/opik_tracker.py +99 -4
- opik/integrations/openai/videos/__init__.py +9 -0
- opik/integrations/openai/videos/binary_response_write_to_file_decorator.py +88 -0
- opik/integrations/openai/videos/videos_create_decorator.py +159 -0
- opik/integrations/openai/videos/videos_download_decorator.py +110 -0
- opik/message_processing/batching/base_batcher.py +14 -21
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batchers.py +32 -40
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/emulation/emulator_message_processor.py +36 -1
- opik/message_processing/emulation/models.py +21 -0
- opik/message_processing/messages.py +9 -0
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/{message_processors.py → processors/message_processors.py} +15 -1
- opik/message_processing/{message_processors_chain.py → processors/message_processors_chain.py} +3 -2
- opik/message_processing/{online_message_processor.py → processors/online_message_processor.py} +11 -9
- opik/message_processing/queue_consumer.py +4 -2
- opik/message_processing/streamer.py +71 -33
- opik/message_processing/streamer_constructors.py +36 -8
- opik/plugins/pytest/experiment_runner.py +1 -1
- opik/plugins/pytest/hooks.py +5 -3
- opik/rest_api/__init__.py +42 -0
- opik/rest_api/datasets/client.py +321 -123
- opik/rest_api/datasets/raw_client.py +470 -145
- opik/rest_api/experiments/client.py +26 -0
- opik/rest_api/experiments/raw_client.py +26 -0
- opik/rest_api/llm_provider_key/client.py +4 -4
- opik/rest_api/llm_provider_key/raw_client.py +4 -4
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +2 -1
- opik/rest_api/manual_evaluation/client.py +101 -0
- opik/rest_api/manual_evaluation/raw_client.py +172 -0
- opik/rest_api/optimizations/client.py +0 -166
- opik/rest_api/optimizations/raw_client.py +0 -248
- opik/rest_api/projects/client.py +9 -0
- opik/rest_api/projects/raw_client.py +13 -0
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +4 -0
- opik/rest_api/prompts/client.py +130 -2
- opik/rest_api/prompts/raw_client.py +175 -0
- opik/rest_api/traces/client.py +101 -0
- opik/rest_api/traces/raw_client.py +120 -0
- opik/rest_api/types/__init__.py +50 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +38 -2
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_public.py +33 -2
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +27 -1
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_write.py +27 -1
- opik/rest_api/types/dataset.py +2 -0
- opik/rest_api/types/dataset_item.py +1 -1
- opik/rest_api/types/dataset_item_batch.py +4 -0
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +1 -1
- opik/rest_api/types/dataset_item_filter.py +4 -0
- opik/rest_api/types/dataset_item_page_compare.py +0 -1
- opik/rest_api/types/dataset_item_page_public.py +0 -1
- opik/rest_api/types/dataset_item_public.py +1 -1
- opik/rest_api/types/dataset_public.py +2 -0
- opik/rest_api/types/dataset_version_public.py +10 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/experiment.py +9 -0
- opik/rest_api/types/experiment_public.py +9 -0
- opik/rest_api/types/group_content_with_aggregations.py +1 -0
- opik/rest_api/types/llm_as_judge_message_content.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +2 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +2 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +1 -1
- opik/rest_api/types/project.py +1 -0
- opik/rest_api/types/project_detailed.py +1 -0
- opik/rest_api/types/project_metric_response_public_metric_type.py +4 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stats_summary_item.py +1 -0
- opik/rest_api/types/prompt_version.py +1 -0
- opik/rest_api/types/prompt_version_detail.py +1 -0
- opik/rest_api/types/prompt_version_page_public.py +5 -0
- opik/rest_api/types/prompt_version_public.py +1 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +5 -1
- opik/rest_api/types/provider_api_key_provider.py +2 -1
- opik/rest_api/types/provider_api_key_public.py +5 -1
- opik/rest_api/types/provider_api_key_public_provider.py +2 -1
- opik/rest_api/types/service_toggles_config.py +11 -1
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/types.py +36 -0
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +3 -3
- opik/validation/validator.py +28 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/METADATA +7 -7
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/RECORD +193 -142
- opik/cli/export.py +0 -791
- opik/cli/import_command.py +0 -575
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/WHEEL +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/entry_points.txt +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/licenses/LICENSE +0 -0
- {opik-1.9.39.dist-info → opik-1.9.86.dist-info}/top_level.txt +0 -0
opik/evaluation/engine/evaluation_tasks_executor.py
CHANGED
@@ -1,5 +1,5 @@
 from concurrent import futures
-from typing import List, TypeVar
+from typing import Any, List, Optional, TypeVar, Generic

 from ...environment import get_tqdm_for_current_environment
 from .types import EvaluationTask
@@ -9,6 +9,70 @@ _tqdm = get_tqdm_for_current_environment()
 T = TypeVar("T")


+class StreamingExecutor(Generic[T]):
+    """
+    Executor that accepts and processes evaluation tasks incrementally using a thread pool.
+
+    Tasks can be submitted one at a time and will begin executing immediately, allowing
+    for streaming behavior regardless of the number of workers configured.
+    """
+
+    def __init__(
+        self,
+        workers: int,
+        verbose: int,
+        desc: str = "Evaluation",
+        total: Optional[int] = None,
+    ):
+        self._workers = workers
+        self._verbose = verbose
+        self._desc = desc
+        self._total = total
+        self._task_count = 0
+        self._pool: futures.ThreadPoolExecutor
+        self._submitted_futures: List[futures.Future[T]] = []
+        self._progress_bar: Optional[Any] = None
+
+    def __enter__(self) -> "StreamingExecutor[T]":
+        self._pool = futures.ThreadPoolExecutor(max_workers=self._workers)
+        self._pool.__enter__()
+        # Initialize progress bar on enter
+        self._progress_bar = _tqdm(
+            disable=(self._verbose < 1),
+            desc=self._desc,
+            total=self._total,
+        )
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        # Close progress bar if it exists
+        if self._progress_bar is not None:
+            self._progress_bar.close()
+        self._pool.__exit__(exc_type, exc_val, exc_tb)
+
+    def submit(self, task: EvaluationTask[T]) -> None:
+        """Submit a task to the thread pool for execution."""
+        self._task_count += 1
+        future = self._pool.submit(task)
+        self._submitted_futures.append(future)
+
+    def get_results(self) -> List[T]:
+        """Collect results from futures as they complete with progress bar."""
+        results: List[T] = []
+
+        # Update total if it wasn't known initially
+        if self._progress_bar is not None and self._total is None:
+            self._progress_bar.total = self._task_count
+
+        # Process futures as they complete and update progress bar
+        for future in futures.as_completed(self._submitted_futures):
+            results.append(future.result())
+            if self._progress_bar is not None:
+                self._progress_bar.update(1)
+
+        return results
+
+
 def execute(
     evaluation_tasks: List[EvaluationTask[T]],
     workers: int,
@@ -28,21 +92,9 @@ def execute(

         return test_results

-    with
-
-
-
-
-
-            test_result_future.result()
-            for test_result_future in _tqdm(
-                futures.as_completed(
-                    test_result_futures,
-                ),
-                disable=(verbose < 1),
-                desc=desc,
-                total=len(test_result_futures),
-            )
-        ]
-
-        return test_results
+    with StreamingExecutor[T](
+        workers=workers, verbose=verbose, desc=desc, total=len(evaluation_tasks)
+    ) as executor:
+        for evaluation_task in evaluation_tasks:
+            executor.submit(evaluation_task)
+        return executor.get_results()
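The new `StreamingExecutor` replaces the previous submit-all-then-collect loop in `execute`. A minimal usage sketch mirroring the new `execute()` body above; the lambda tasks are stand-ins for real `EvaluationTask[T]` callables and are illustrative only:

```python
from opik.evaluation.engine.evaluation_tasks_executor import StreamingExecutor

# Zero-argument callables standing in for EvaluationTask[T] objects.
tasks = [lambda i=i: i * i for i in range(10)]

# Entering the context starts a thread pool and a tqdm progress bar;
# submit() schedules each task immediately; get_results() collects
# results as futures complete (completion order, not submission order).
with StreamingExecutor[int](
    workers=4, verbose=1, desc="Evaluation", total=len(tasks)
) as executor:
    for task in tasks:
        executor.submit(task)
    results = executor.get_results()

print(sorted(results))
```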
opik/evaluation/evaluator.py
CHANGED
@@ -88,6 +88,7 @@ def evaluate(
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+    experiment_tags: Optional[List[str]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
@@ -156,6 +157,8 @@ def evaluate(
             Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
             These scores are computed after all test results are collected and represent aggregate
             metrics across the entire experiment.
+
+        experiment_tags: Optional list of tags to associate with the experiment.
     """
     experiment_scoring_functions = (
         [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -178,6 +181,7 @@ def evaluate(
         dataset_name=dataset.name,
         experiment_config=experiment_config,
         prompts=checked_prompts,
+        tags=experiment_tags,
     )

     # wrap scoring functions if any
@@ -506,6 +510,7 @@ def evaluate_prompt(
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+    experiment_tags: Optional[List[str]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs prompt evaluation on a given dataset.
@@ -556,6 +561,8 @@ def evaluate_prompt(
             Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
             These scores are computed after all test results are collected and represent aggregate
             metrics across the entire experiment.
+
+        experiment_tags: List of tags to be associated with the experiment.
     """
     experiment_scoring_functions = (
         [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -593,6 +600,7 @@ def evaluate_prompt(
         dataset_name=dataset.name,
         experiment_config=experiment_config,
         prompts=prompts,
+        tags=experiment_tags,
     )

     # wrap scoring functions if any
@@ -691,6 +699,7 @@ def evaluate_optimization_trial(
     dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
     trial_count: int = 1,
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+    experiment_tags: Optional[List[str]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset.
@@ -758,6 +767,8 @@ def evaluate_optimization_trial(
             Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
             These scores are computed after all test results are collected and represent aggregate
             metrics across the entire experiment.
+
+        experiment_tags: A list of tags to associate with the experiment.
     """
     experiment_scoring_functions = (
         [] if experiment_scoring_functions is None else experiment_scoring_functions
@@ -792,6 +803,7 @@ def evaluate_optimization_trial(
         prompts=checked_prompts,
         type="trial",
         optimization_id=optimization_id,
+        tags=experiment_tags,
     )

     return _evaluate_task(
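The user-visible change in `evaluator.py` is the new optional `experiment_tags` argument on `evaluate`, `evaluate_prompt`, and `evaluate_optimization_trial`, forwarded as `tags=` when the experiment is created. A hedged sketch of how it would be passed; the dataset name, task body, and metric choice are placeholders, not from the diff:

```python
import opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import Equals

client = opik.Opik()
dataset = client.get_dataset(name="my-dataset")  # assumes this dataset already exists

def task(item: dict) -> dict:
    # Placeholder task: echo the item's reference so the example is self-contained.
    return {"output": item.get("reference", "")}

result = evaluate(
    dataset=dataset,
    task=task,
    scoring_metrics=[Equals()],
    experiment_tags=["nightly", "prompt-v2"],  # new parameter per the diff above
)
```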
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py
CHANGED
@@ -93,7 +93,9 @@ class ConversationalCoherenceMetric(ConversationThreadMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
+            self._model = models_factory.get(
+                model_name=model, track=self.track, temperature=temperature
+            )

     def score(
         self,
opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py
CHANGED
@@ -80,7 +80,9 @@ class SessionCompletenessQuality(ConversationThreadMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
+            self._model = models_factory.get(
+                model_name=model, track=self.track, temperature=temperature
+            )

     def score(
         self,
opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py
CHANGED
@@ -92,7 +92,9 @@ class UserFrustrationMetric(ConversationThreadMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
+            self._model = models_factory.get(
+                model_name=model, track=self.track, temperature=temperature
+            )

     def score(
         self,
opik/evaluation/metrics/heuristics/equals.py
CHANGED
@@ -42,22 +42,26 @@ class Equals(base_metric.BaseMetric):
         self._case_sensitive = case_sensitive

     def score(
-        self, output:
+        self, output: Any, reference: Any, **ignored_kwargs: Any
     ) -> score_result.ScoreResult:
         """
-        Calculate the score based on whether the output
+        Calculate the score based on whether the output exactly matches the expected output.

         Args:
-            output: The output
-            reference: The expected output
+            output: The output to check. Will be converted to string for comparison.
+            reference: The expected output to compare against. Will be converted to string for comparison.
             **ignored_kwargs: Additional keyword arguments that are ignored.

         Returns:
-            score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the
+            score_result.ScoreResult: A ScoreResult object with a value of 1.0 if the values match,
                 0.0 otherwise.
         """
-
-
+        # Convert to string to handle numeric and other types
+        output_str = str(output)
+        reference_str = str(reference)
+
+        value_left = output_str if self._case_sensitive else output_str.lower()
+        value_right = reference_str if self._case_sensitive else reference_str.lower()

         if value_left == value_right:
             return score_result.ScoreResult(value=1.0, name=self.name)
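`Equals` now coerces both values to `str` before comparing, so numeric outputs no longer fail against string references. A small illustration; the expected scores follow from the new code above and are not separately verified:

```python
from opik.evaluation.metrics import Equals

strict = Equals(case_sensitive=True)
relaxed = Equals(case_sensitive=False)

print(strict.score(output=42, reference="42").value)           # 1.0 - both sides become "42"
print(strict.score(output="Hello", reference="hello").value)   # 0.0 - case differs
print(relaxed.score(output="Hello", reference="hello").value)  # 1.0 - lowercased before comparing
```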
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py
CHANGED
@@ -88,7 +88,9 @@ class AnswerRelevance(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def _init_few_shot_examples(
         self,
opik/evaluation/metrics/llm_judges/context_precision/metric.py
CHANGED
@@ -76,7 +76,9 @@ class ContextPrecision(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/context_recall/metric.py
CHANGED
@@ -74,7 +74,9 @@ class ContextRecall(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/factuality/metric.py
CHANGED
@@ -63,7 +63,7 @@ class Factuality(base_metric.BaseMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            self._model = models_factory.get(model_name=model, track=self.track)

     def score(
         self, input: str, output: str, context: List[str], **ignored_kwargs: Any
opik/evaluation/metrics/llm_judges/g_eval/metric.py
CHANGED
@@ -127,7 +127,9 @@ class GEval(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

         if (
             hasattr(self._model, "supported_params")
opik/evaluation/metrics/llm_judges/hallucination/metric.py
CHANGED
@@ -73,7 +73,9 @@ class Hallucination(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/moderation/metric.py
CHANGED
@@ -70,7 +70,9 @@ class Moderation(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(self, output: str, **ignored_kwargs: Any) -> score_result.ScoreResult:
         """
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py
CHANGED
@@ -69,7 +69,9 @@ class StructuredOutputCompliance(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/syc_eval/metric.py
CHANGED
@@ -93,7 +93,7 @@ class SycEval(base_metric.BaseMetric):
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(model_name=model)
+            self._model = models_factory.get(model_name=model, track=self.track)

     def _init_rebuttal_model(
         self, rebuttal_model: Optional[Union[str, base_model.OpikBaseModel]]
@@ -101,7 +101,9 @@ class SycEval(base_metric.BaseMetric):
         if isinstance(rebuttal_model, base_model.OpikBaseModel):
             self._rebuttal_model = rebuttal_model
         else:
-            self._rebuttal_model = models_factory.get(
+            self._rebuttal_model = models_factory.get(
+                model_name=rebuttal_model, track=self.track
+            )

     def score(
         self,
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py
CHANGED
@@ -84,7 +84,9 @@ class TrajectoryAccuracy(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self,
opik/evaluation/metrics/llm_judges/usefulness/metric.py
CHANGED
@@ -68,7 +68,9 @@ class Usefulness(base_metric.BaseMetric):
         if self._seed is not None:
             model_kwargs["seed"] = self._seed

-        self._model = models_factory.get(
+        self._model = models_factory.get(
+            model_name=model, track=self.track, **model_kwargs
+        )

     def score(
         self, input: str, output: str, **ignored_kwargs: Any
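All of the LLM-judge metrics above now forward their own `track` flag into `models_factory.get`, so disabling tracking on a metric also disables tracing of its judge-model calls (see the `models_factory.py` and `litellm_chat_model.py` hunks further down). A hedged sketch; constructor arguments other than `model` and `track` are assumed from the existing metric APIs:

```python
from opik.evaluation.metrics import Hallucination

# With track=False the metric's judge LLM calls are no longer wrapped by the
# LiteLLM tracking integration, instead of inheriting the global behavior.
metric = Hallucination(model="gpt-4o-mini", track=False)
result = metric.score(
    input="What is the capital of France?",
    output="The capital of France is Paris.",
)
print(result.value, result.reason)
```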
opik/evaluation/metrics/ragas_metric.py
CHANGED
@@ -1,20 +1,13 @@
-import asyncio
-
 from opik.evaluation.metrics import base_metric, score_result
 import opik.exceptions as exceptions

 from typing import Dict, Any, Optional, TYPE_CHECKING
+import opik.opik_context as opik_context

 if TYPE_CHECKING:
     from ragas import metrics as ragas_metrics
     from ragas import dataset_schema as ragas_dataset_schema
-
-
-def get_or_create_asyncio_loop() -> asyncio.AbstractEventLoop:
-    try:
-        return asyncio.get_running_loop()
-    except RuntimeError:
-        return asyncio.new_event_loop()
+    from opik.integrations.langchain import OpikTracer


 class RagasMetricWrapper(base_metric.BaseMetric):
@@ -37,16 +30,6 @@ class RagasMetricWrapper(base_metric.BaseMetric):
             ragas_metrics.MetricType.SINGLE_TURN.name
         ]

-        self._opik_tracer = None
-        if self.track:
-            from opik.integrations.langchain import OpikTracer
-
-            self._opik_tracer = OpikTracer()
-
-            self.callbacks = [self._opik_tracer]
-        else:
-            self.callbacks = []
-
     def _create_ragas_single_turn_sample(
         self, input_dict: Dict[str, Any]
     ) -> "ragas_dataset_schema.SingleTurnSample":
@@ -80,13 +63,50 @@ class RagasMetricWrapper(base_metric.BaseMetric):
     async def ascore(self, **kwargs: Any) -> score_result.ScoreResult:
         sample = self._create_ragas_single_turn_sample(kwargs)

-
-
-        )
+        callbacks = [_get_opik_tracer_instance()] if self.track else []
+
+        score = await self.ragas_metric.single_turn_ascore(sample, callbacks=callbacks)
         return score_result.ScoreResult(value=score, name=self.name)

     def score(self, **kwargs: Any) -> score_result.ScoreResult:
         sample = self._create_ragas_single_turn_sample(kwargs)

-
+        callbacks = [_get_opik_tracer_instance()] if self.track else []
+
+        score = self.ragas_metric.single_turn_score(sample, callbacks=callbacks)
         return score_result.ScoreResult(value=score, name=self.name)
+
+
+def _get_opik_tracer_instance() -> "OpikTracer":
+    from opik.integrations.langchain import OpikTracer
+
+    current_span_data = opik_context.get_current_span_data()
+    current_trace_data = opik_context.get_current_trace_data()
+    project_name = None
+
+    if current_span_data is not None:
+        project_name = (
+            current_trace_data.project_name
+            if current_trace_data is not None
+            else current_span_data.project_name
+        )
+
+    # OPIK-3505: Why opik_context_read_only_mode=True?
+    #
+    # Problem: Ragas runs metrics concurrently under the hood with a manual management
+    # of the event loop. It was discovered that these metrics share the same context and so
+    # ContextVar used in Opik context storage can't be modified safely by them because concurrent
+    # operations share the same span stack.
+    #
+    # Solution: Disable context modification (opik_context_read_only_mode=True).
+    # OpikTracer will still create spans/traces and track parent-child relationships
+    # using LangChain's Run IDs, but won't modify the shared ContextVar storage.
+    #
+    # Trade-off: @track-decorated functions called within Ragas won't be attached
+    # to the Ragas spans. This is acceptable since Ragas metrics are self-contained
+    # and don't typically call user-defined tracked functions.
+    opik_tracer = OpikTracer(
+        opik_context_read_only_mode=True,
+        project_name=project_name,
+    )
+    return opik_tracer
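The wrapper no longer caches an `OpikTracer` on the instance; each `score`/`ascore` call builds a fresh tracer in `opik_context_read_only_mode=True` and inherits the project name from the current span or trace. A hedged usage sketch: the wrapper's constructor arguments and the Ragas field names below are assumptions, and the wrapped Ragas metric still needs an evaluator LLM configured to actually produce a score:

```python
from ragas.metrics import AnswerRelevancy

from opik.evaluation.metrics.ragas_metric import RagasMetricWrapper

# Assumed constructor: the wrapper stores the ragas metric and a track flag.
metric = RagasMetricWrapper(ragas_metric=AnswerRelevancy(), track=True)

# Keyword names depend on the wrapped metric's required columns (assumed here).
result = metric.score(
    user_input="What is the capital of France?",
    response="Paris is the capital of France.",
    retrieved_contexts=["Paris is the capital and largest city of France."],
)
print(result.value)  # traced via a per-call OpikTracer(opik_context_read_only_mode=True)
```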
opik/evaluation/models/litellm/litellm_chat_model.py
CHANGED
@@ -59,6 +59,7 @@ class LiteLLMChatModel(base_model.OpikBaseModel):
         self,
         model_name: str = "gpt-5-nano",
         must_support_arguments: Optional[List[str]] = None,
+        track: bool = True,
         **completion_kwargs: Any,
     ) -> None:
         import litellm
@@ -75,7 +76,8 @@ class LiteLLMChatModel(base_model.OpikBaseModel):
                 `litellm.get_supported_openai_params(model_name)` call is used to get
                 supported arguments. If any is missing, ValueError is raised.
                 You can pass the arguments from the table: https://docs.litellm.ai/docs/completion/input#translated-openai-params
-
+            track: Whether to track the model calls. When False, disables tracing for this model instance.
+                Defaults to True.
             completion_kwargs: key-value arguments to always pass additionally into `litellm.completion` function.
         """
         super().__init__(model_name=model_name)
@@ -100,7 +102,10 @@ class LiteLLMChatModel(base_model.OpikBaseModel):

         config = opik_config.OpikConfig()

-        if config
+        # Enable tracking only if both track parameter is True and config allows it
+        enable_tracking = track and config.enable_litellm_models_monitoring
+
+        if enable_tracking:
             self._litellm_completion = litellm_integration.track_completion()(
                 litellm.completion
             )
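`LiteLLMChatModel` gains a `track` constructor flag; tracking now requires both `track=True` and the `enable_litellm_models_monitoring` config setting. A minimal sketch, assuming the documented `LiteLLMChatModel` import path and `generate_string` interface:

```python
from opik.evaluation.models import LiteLLMChatModel

# track=False skips wrapping litellm.completion with the Opik tracking decorator,
# regardless of the enable_litellm_models_monitoring config flag.
untracked_model = LiteLLMChatModel(model_name="gpt-4o-mini", track=False, temperature=0.0)
answer = untracked_model.generate_string(input="Reply with the single word: pong")
print(answer)
```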
opik/evaluation/models/litellm/util.py
CHANGED
@@ -93,30 +93,14 @@ def _apply_qwen_dashscope_filters(
 ) -> None:
     """Apply Qwen/DashScope specific parameter filters.

-
-    in [0, 5]. When logprobs is false, drops top_logprobs; when logprobs is
-    true, clamps top_logprobs into [0, 5].
+    Does not return log probabilities.
     """

     unsupported: list[tuple[str, Any]] = []

-
-
-
-        unsupported.append(("top_logprobs", params["top_logprobs"]))
-    else:
-        if "top_logprobs" in params:
-            raw_top_logprobs = params["top_logprobs"]
-            try:
-                top_logprobs = int(raw_top_logprobs)
-            except (TypeError, ValueError):
-                unsupported.append(("top_logprobs", raw_top_logprobs))
-            else:
-                if top_logprobs < 0:
-                    top_logprobs = 0
-                elif top_logprobs > 5:
-                    top_logprobs = 5
-                params["top_logprobs"] = top_logprobs
+    for param in ("logprobs", "top_logprobs"):
+        if param in params:
+            unsupported.append((param, params[param]))

     _drop_unsupported_params_with_warning(
         params,
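The Qwen/DashScope filter no longer tries to clamp `top_logprobs`; both `logprobs` and `top_logprobs` are now simply collected and dropped with a warning. A standalone sketch of the new behavior; `_drop_unsupported_params_with_warning` is an internal helper, so the re-implementation below is illustrative only:

```python
import warnings
from typing import Any, Dict

def drop_qwen_unsupported(params: Dict[str, Any]) -> None:
    """Illustrative stand-in for the simplified filter in litellm/util.py."""
    unsupported = []
    for param in ("logprobs", "top_logprobs"):
        if param in params:
            unsupported.append((param, params.pop(param)))
    if unsupported:
        warnings.warn(f"Dropping unsupported Qwen/DashScope params: {unsupported}")

params = {"temperature": 0.2, "logprobs": True, "top_logprobs": 3}
drop_qwen_unsupported(params)
print(params)  # {'temperature': 0.2}
```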
opik/evaluation/models/models_factory.py
CHANGED
@@ -18,18 +18,32 @@ def _freeze(value: Any) -> Any:
     return value


-def _make_cache_key(model_name: str, model_kwargs: Dict[str, Any]) -> Any:
+def _make_cache_key(model_name: str, track: bool, model_kwargs: Dict[str, Any]) -> Any:
     frozen_kwargs = frozenset((k, _freeze(v)) for k, v in model_kwargs.items())
-    return (model_name, frozen_kwargs)
+    return (model_name, track, frozen_kwargs)


-def get(
+def get(
+    model_name: Optional[str], track: bool = True, **model_kwargs: Any
+) -> base_model.OpikBaseModel:
+    """
+    Get or create a cached LiteLLM chat model instance.
+
+    Args:
+        model_name: The name of the model to use. Defaults to DEFAULT_GPT_MODEL_NAME if None.
+        track: Whether to track the model calls. When False, disables tracing for this model instance.
+            Defaults to True.
+        **model_kwargs: Additional keyword arguments to pass to the model constructor.
+
+    Returns:
+        A cached or newly created OpikBaseModel instance.
+    """
     if model_name is None:
         model_name = DEFAULT_GPT_MODEL_NAME

-    cache_key = _make_cache_key(model_name, model_kwargs)
+    cache_key = _make_cache_key(model_name, track, model_kwargs)
     if cache_key not in _MODEL_CACHE:
         _MODEL_CACHE[cache_key] = litellm_chat_model.LiteLLMChatModel(
-            model_name=model_name, **model_kwargs
+            model_name=model_name, track=track, **model_kwargs
         )
     return _MODEL_CACHE[cache_key]
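The model cache key now includes the `track` flag, so tracked and untracked instances of the same model no longer collide. A hedged sketch of calling the factory directly; this is an internal module that most users reach indirectly through metric constructors:

```python
from opik.evaluation.models import models_factory

tracked = models_factory.get(model_name="gpt-4o-mini", temperature=0.0)
untracked = models_factory.get(model_name="gpt-4o-mini", temperature=0.0, track=False)

# Same model name and kwargs, but separate cache entries because the key
# is now (model_name, track, frozen_kwargs).
assert tracked is not untracked
assert models_factory.get(model_name="gpt-4o-mini", temperature=0.0) is tracked
```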
opik/evaluation/rest_operations.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 from typing import List, Optional

 from opik.api_objects import dataset, experiment, opik_client
-from opik.types import
+from opik.types import BatchFeedbackScoreDict
 from . import test_case
 from .metrics import score_result
 from .types import ScoringKeyMappingType
@@ -80,13 +80,13 @@ def log_test_result_feedback_scores(
     trace_id: str,
     project_name: Optional[str],
 ) -> None:
-    all_trace_scores: List[
+    all_trace_scores: List[BatchFeedbackScoreDict] = []

     for score_result_ in score_results:
         if score_result_.scoring_failed:
             continue

-        trace_score =
+        trace_score = BatchFeedbackScoreDict(
             id=trace_id,
             name=score_result_.name,
             value=score_result_.value,
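`rest_operations.py` (above) and `threads/helpers.py` (below) now build feedback scores as `BatchFeedbackScoreDict` entries from `opik.types` in place of the previous type, which is truncated in this view. A small sketch of the shape being constructed, based only on the fields visible in these hunks; the id value is a placeholder:

```python
from opik.types import BatchFeedbackScoreDict

# Fields shown in the diff: id, name, value. The TypedDict may allow more.
score: BatchFeedbackScoreDict = BatchFeedbackScoreDict(
    id="trace-or-thread-id-placeholder",
    name="equals_metric",
    value=1.0,
)
print(score)
```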
opik/evaluation/threads/helpers.py
CHANGED
@@ -4,7 +4,7 @@ from . import evaluation_result
 from ...api_objects import opik_client
 from ...api_objects.conversation import conversation_thread, conversation_factory
 from ...rest_api import TraceThread, JsonListStringPublic
-from ...types import
+from ...types import BatchFeedbackScoreDict
 from ...api_objects.threads import threads_client


@@ -15,7 +15,7 @@ def log_feedback_scores(
 ) -> None:
     for result in results:
         feedback_scores = [
-
+            BatchFeedbackScoreDict(
                 id=result.thread_id,
                 name=score.name,
                 value=score.value,
@@ -42,6 +42,7 @@ def load_conversation_thread(
         project_name=project_name,
         filter_string=f'thread_id = "{thread.id}"',
         max_results=max_results,
+        truncate=False,
     )
     return conversation_factory.create_conversation_from_traces(
         traces=traces,