PyPI - opik - Versions diffs - 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl - Mend

opik: opik-1.8.39-py3-none-any.whl → opik-1.9.71-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (592)

opik/__init__.py +19 -3
opik/anonymizer/__init__.py +5 -0
opik/anonymizer/anonymizer.py +12 -0
opik/anonymizer/factory.py +80 -0
opik/anonymizer/recursive_anonymizer.py +64 -0
opik/anonymizer/rules.py +56 -0
opik/anonymizer/rules_anonymizer.py +35 -0
opik/api_objects/attachment/attachment_context.py +36 -0
opik/api_objects/attachment/attachments_extractor.py +153 -0
opik/api_objects/attachment/client.py +1 -0
opik/api_objects/attachment/converters.py +2 -0
opik/api_objects/attachment/decoder.py +18 -0
opik/api_objects/attachment/decoder_base64.py +83 -0
opik/api_objects/attachment/decoder_helpers.py +137 -0
opik/api_objects/data_helpers.py +79 -0
opik/api_objects/dataset/dataset.py +64 -4
opik/api_objects/dataset/rest_operations.py +11 -2
opik/api_objects/experiment/experiment.py +57 -57
opik/api_objects/experiment/experiment_item.py +2 -1
opik/api_objects/experiment/experiments_client.py +64 -0
opik/api_objects/experiment/helpers.py +35 -11
opik/api_objects/experiment/rest_operations.py +65 -5
opik/api_objects/helpers.py +8 -5
opik/api_objects/local_recording.py +81 -0
opik/api_objects/opik_client.py +600 -108
opik/api_objects/opik_query_language.py +39 -5
opik/api_objects/prompt/__init__.py +12 -2
opik/api_objects/prompt/base_prompt.py +69 -0
opik/api_objects/prompt/base_prompt_template.py +29 -0
opik/api_objects/prompt/chat/__init__.py +1 -0
opik/api_objects/prompt/chat/chat_prompt.py +210 -0
opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
opik/api_objects/prompt/client.py +189 -47
opik/api_objects/prompt/text/__init__.py +1 -0
opik/api_objects/prompt/text/prompt.py +174 -0
opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
opik/api_objects/prompt/types.py +23 -0
opik/api_objects/search_helpers.py +89 -0
opik/api_objects/span/span_data.py +35 -25
opik/api_objects/threads/threads_client.py +39 -5
opik/api_objects/trace/trace_client.py +52 -2
opik/api_objects/trace/trace_data.py +15 -24
opik/api_objects/validation_helpers.py +3 -3
opik/cli/__init__.py +5 -0
opik/cli/__main__.py +6 -0
opik/cli/configure.py +66 -0
opik/cli/exports/__init__.py +131 -0
opik/cli/exports/dataset.py +278 -0
opik/cli/exports/experiment.py +784 -0
opik/cli/exports/project.py +685 -0
opik/cli/exports/prompt.py +578 -0
opik/cli/exports/utils.py +406 -0
opik/cli/harbor.py +39 -0
opik/cli/healthcheck.py +21 -0
opik/cli/imports/__init__.py +439 -0
opik/cli/imports/dataset.py +143 -0
opik/cli/imports/experiment.py +1192 -0
opik/cli/imports/project.py +262 -0
opik/cli/imports/prompt.py +177 -0
opik/cli/imports/utils.py +280 -0
opik/cli/main.py +49 -0
opik/cli/proxy.py +93 -0
opik/cli/usage_report/__init__.py +16 -0
opik/cli/usage_report/charts.py +783 -0
opik/cli/usage_report/cli.py +274 -0
opik/cli/usage_report/constants.py +9 -0
opik/cli/usage_report/extraction.py +749 -0
opik/cli/usage_report/pdf.py +244 -0
opik/cli/usage_report/statistics.py +78 -0
opik/cli/usage_report/utils.py +235 -0
opik/config.py +13 -7
opik/configurator/configure.py +17 -0
opik/datetime_helpers.py +12 -0
opik/decorator/arguments_helpers.py +9 -1
opik/decorator/base_track_decorator.py +205 -133
opik/decorator/context_manager/span_context_manager.py +123 -0
opik/decorator/context_manager/trace_context_manager.py +84 -0
opik/decorator/opik_args/__init__.py +13 -0
opik/decorator/opik_args/api_classes.py +71 -0
opik/decorator/opik_args/helpers.py +120 -0
opik/decorator/span_creation_handler.py +25 -6
opik/dict_utils.py +3 -3
opik/evaluation/__init__.py +13 -2
opik/evaluation/engine/engine.py +272 -75
opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
opik/evaluation/engine/helpers.py +31 -6
opik/evaluation/engine/metrics_evaluator.py +237 -0
opik/evaluation/evaluation_result.py +168 -2
opik/evaluation/evaluator.py +533 -62
opik/evaluation/metrics/__init__.py +103 -4
opik/evaluation/metrics/aggregated_metric.py +35 -6
opik/evaluation/metrics/base_metric.py +1 -1
opik/evaluation/metrics/conversation/__init__.py +48 -0
opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
opik/evaluation/metrics/conversation/helpers.py +14 -15
opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
opik/evaluation/metrics/conversation/types.py +4 -5
opik/evaluation/metrics/conversation_types.py +9 -0
opik/evaluation/metrics/heuristics/bertscore.py +107 -0
opik/evaluation/metrics/heuristics/bleu.py +35 -15
opik/evaluation/metrics/heuristics/chrf.py +127 -0
opik/evaluation/metrics/heuristics/contains.py +47 -11
opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
opik/evaluation/metrics/heuristics/gleu.py +113 -0
opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
opik/evaluation/metrics/heuristics/meteor.py +119 -0
opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
opik/evaluation/metrics/heuristics/readability.py +129 -0
opik/evaluation/metrics/heuristics/rouge.py +26 -9
opik/evaluation/metrics/heuristics/spearman.py +88 -0
opik/evaluation/metrics/heuristics/tone.py +155 -0
opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
opik/evaluation/metrics/ragas_metric.py +43 -23
opik/evaluation/models/__init__.py +8 -0
opik/evaluation/models/base_model.py +107 -1
opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
opik/evaluation/models/langchain/message_converters.py +97 -15
opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
opik/evaluation/models/litellm/util.py +125 -0
opik/evaluation/models/litellm/warning_filters.py +16 -4
opik/evaluation/models/model_capabilities.py +187 -0
opik/evaluation/models/models_factory.py +25 -3
opik/evaluation/preprocessing.py +92 -0
opik/evaluation/report.py +70 -12
opik/evaluation/rest_operations.py +49 -45
opik/evaluation/samplers/__init__.py +4 -0
opik/evaluation/samplers/base_dataset_sampler.py +40 -0
opik/evaluation/samplers/random_dataset_sampler.py +48 -0
opik/evaluation/score_statistics.py +66 -0
opik/evaluation/scorers/__init__.py +4 -0
opik/evaluation/scorers/scorer_function.py +55 -0
opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
opik/evaluation/test_case.py +3 -2
opik/evaluation/test_result.py +1 -0
opik/evaluation/threads/evaluator.py +31 -3
opik/evaluation/threads/helpers.py +3 -2
opik/evaluation/types.py +9 -1
opik/exceptions.py +33 -0
opik/file_upload/file_uploader.py +13 -0
opik/file_upload/upload_options.py +2 -0
opik/hooks/__init__.py +23 -0
opik/hooks/anonymizer_hook.py +36 -0
opik/hooks/httpx_client_hook.py +112 -0
opik/httpx_client.py +12 -9
opik/id_helpers.py +18 -0
opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
opik/integrations/adk/helpers.py +16 -7
opik/integrations/adk/legacy_opik_tracer.py +7 -4
opik/integrations/adk/opik_tracer.py +14 -1
opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
opik/integrations/adk/recursive_callback_injector.py +4 -7
opik/integrations/bedrock/converse/__init__.py +0 -0
opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
opik/integrations/bedrock/invoke_model/__init__.py +0 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
opik/integrations/bedrock/invoke_model/response_types.py +34 -0
opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
opik/integrations/bedrock/opik_tracker.py +42 -4
opik/integrations/bedrock/types.py +19 -0
opik/integrations/crewai/crewai_decorator.py +8 -51
opik/integrations/crewai/opik_tracker.py +31 -10
opik/integrations/crewai/patchers/__init__.py +5 -0
opik/integrations/crewai/patchers/flow.py +118 -0
opik/integrations/crewai/patchers/litellm_completion.py +30 -0
opik/integrations/crewai/patchers/llm_client.py +207 -0
opik/integrations/dspy/callback.py +80 -17
opik/integrations/dspy/parsers.py +168 -0
opik/integrations/harbor/__init__.py +17 -0
opik/integrations/harbor/experiment_service.py +269 -0
opik/integrations/harbor/opik_tracker.py +528 -0
opik/integrations/haystack/opik_connector.py +2 -2
opik/integrations/haystack/opik_tracer.py +3 -7
opik/integrations/langchain/__init__.py +3 -1
opik/integrations/langchain/helpers.py +96 -0
opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
opik/integrations/langchain/opik_encoder_extension.py +1 -1
opik/integrations/langchain/opik_tracer.py +474 -229
opik/integrations/litellm/__init__.py +5 -0
opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
opik/integrations/litellm/litellm_completion_decorator.py +242 -0
opik/integrations/litellm/opik_tracker.py +43 -0
opik/integrations/litellm/stream_patchers.py +151 -0
opik/integrations/llama_index/callback.py +146 -107
opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
opik/integrations/openai/opik_tracker.py +1 -1
opik/integrations/sagemaker/auth.py +5 -1
opik/llm_usage/google_usage.py +3 -1
opik/llm_usage/opik_usage.py +7 -8
opik/llm_usage/opik_usage_factory.py +4 -2
opik/logging_messages.py +6 -0
opik/message_processing/batching/base_batcher.py +14 -21
opik/message_processing/batching/batch_manager.py +22 -10
opik/message_processing/batching/batch_manager_constuctors.py +10 -0
opik/message_processing/batching/batchers.py +59 -27
opik/message_processing/batching/flushing_thread.py +0 -3
opik/message_processing/emulation/__init__.py +0 -0
opik/message_processing/emulation/emulator_message_processor.py +578 -0
opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
opik/message_processing/emulation/models.py +162 -0
opik/message_processing/encoder_helpers.py +79 -0
opik/message_processing/messages.py +56 -1
opik/message_processing/preprocessing/__init__.py +0 -0
opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
opik/message_processing/preprocessing/constants.py +1 -0
opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
opik/message_processing/preprocessing/preprocessor.py +36 -0
opik/message_processing/processors/__init__.py +0 -0
opik/message_processing/processors/attachments_extraction_processor.py +146 -0
opik/message_processing/processors/message_processors.py +92 -0
opik/message_processing/processors/message_processors_chain.py +96 -0
opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
opik/message_processing/queue_consumer.py +9 -3
opik/message_processing/streamer.py +71 -33
opik/message_processing/streamer_constructors.py +43 -10
opik/opik_context.py +16 -4
opik/plugins/pytest/hooks.py +5 -3
opik/rest_api/__init__.py +346 -15
opik/rest_api/alerts/__init__.py +7 -0
opik/rest_api/alerts/client.py +667 -0
opik/rest_api/alerts/raw_client.py +1015 -0
opik/rest_api/alerts/types/__init__.py +7 -0
opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
opik/rest_api/annotation_queues/__init__.py +4 -0
opik/rest_api/annotation_queues/client.py +668 -0
opik/rest_api/annotation_queues/raw_client.py +1019 -0
opik/rest_api/automation_rule_evaluators/client.py +34 -2
opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
opik/rest_api/client.py +15 -0
opik/rest_api/dashboards/__init__.py +4 -0
opik/rest_api/dashboards/client.py +462 -0
opik/rest_api/dashboards/raw_client.py +648 -0
opik/rest_api/datasets/client.py +1310 -44
opik/rest_api/datasets/raw_client.py +2269 -358
opik/rest_api/experiments/__init__.py +2 -2
opik/rest_api/experiments/client.py +191 -5
opik/rest_api/experiments/raw_client.py +301 -7
opik/rest_api/experiments/types/__init__.py +4 -1
opik/rest_api/experiments/types/experiment_update_status.py +5 -0
opik/rest_api/experiments/types/experiment_update_type.py +5 -0
opik/rest_api/experiments/types/experiment_write_status.py +5 -0
opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
opik/rest_api/llm_provider_key/client.py +20 -0
opik/rest_api/llm_provider_key/raw_client.py +20 -0
opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
opik/rest_api/manual_evaluation/__init__.py +4 -0
opik/rest_api/manual_evaluation/client.py +347 -0
opik/rest_api/manual_evaluation/raw_client.py +543 -0
opik/rest_api/optimizations/client.py +145 -9
opik/rest_api/optimizations/raw_client.py +237 -13
opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
opik/rest_api/prompts/__init__.py +2 -2
opik/rest_api/prompts/client.py +227 -6
opik/rest_api/prompts/raw_client.py +331 -2
opik/rest_api/prompts/types/__init__.py +3 -1
opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
opik/rest_api/spans/__init__.py +0 -2
opik/rest_api/spans/client.py +238 -76
opik/rest_api/spans/raw_client.py +307 -95
opik/rest_api/spans/types/__init__.py +0 -2
opik/rest_api/traces/client.py +572 -161
opik/rest_api/traces/raw_client.py +736 -229
opik/rest_api/types/__init__.py +352 -17
opik/rest_api/types/aggregation_data.py +1 -0
opik/rest_api/types/alert.py +33 -0
opik/rest_api/types/alert_alert_type.py +5 -0
opik/rest_api/types/alert_page_public.py +24 -0
opik/rest_api/types/alert_public.py +33 -0
opik/rest_api/types/alert_public_alert_type.py +5 -0
opik/rest_api/types/alert_trigger.py +27 -0
opik/rest_api/types/alert_trigger_config.py +28 -0
opik/rest_api/types/alert_trigger_config_public.py +28 -0
opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
opik/rest_api/types/alert_trigger_config_type.py +10 -0
opik/rest_api/types/alert_trigger_config_write.py +22 -0
opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
opik/rest_api/types/alert_trigger_event_type.py +19 -0
opik/rest_api/types/alert_trigger_public.py +27 -0
opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
opik/rest_api/types/alert_trigger_write.py +23 -0
opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
opik/rest_api/types/alert_write.py +28 -0
opik/rest_api/types/alert_write_alert_type.py +5 -0
opik/rest_api/types/annotation_queue.py +42 -0
opik/rest_api/types/annotation_queue_batch.py +27 -0
opik/rest_api/types/annotation_queue_item_ids.py +19 -0
opik/rest_api/types/annotation_queue_page_public.py +28 -0
opik/rest_api/types/annotation_queue_public.py +38 -0
opik/rest_api/types/annotation_queue_public_scope.py +5 -0
opik/rest_api/types/annotation_queue_reviewer.py +20 -0
opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
opik/rest_api/types/annotation_queue_scope.py +5 -0
opik/rest_api/types/annotation_queue_write.py +31 -0
opik/rest_api/types/annotation_queue_write_scope.py +5 -0
opik/rest_api/types/audio_url.py +19 -0
opik/rest_api/types/audio_url_public.py +19 -0
opik/rest_api/types/audio_url_write.py +19 -0
opik/rest_api/types/automation_rule_evaluator.py +62 -2
opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
opik/rest_api/types/boolean_feedback_definition.py +25 -0
opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
opik/rest_api/types/boolean_feedback_detail.py +29 -0
opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
opik/rest_api/types/dashboard_page_public.py +24 -0
opik/rest_api/types/dashboard_public.py +30 -0
opik/rest_api/types/dataset.py +4 -0
opik/rest_api/types/dataset_expansion.py +42 -0
opik/rest_api/types/dataset_expansion_response.py +39 -0
opik/rest_api/types/dataset_item.py +2 -0
opik/rest_api/types/dataset_item_changes_public.py +5 -0
opik/rest_api/types/dataset_item_compare.py +2 -0
opik/rest_api/types/dataset_item_filter.py +27 -0
opik/rest_api/types/dataset_item_filter_operator.py +21 -0
opik/rest_api/types/dataset_item_page_compare.py +5 -0
opik/rest_api/types/dataset_item_page_public.py +5 -0
opik/rest_api/types/dataset_item_public.py +2 -0
opik/rest_api/types/dataset_item_update.py +39 -0
opik/rest_api/types/dataset_item_write.py +1 -0
opik/rest_api/types/dataset_public.py +4 -0
opik/rest_api/types/dataset_public_status.py +5 -0
opik/rest_api/types/dataset_status.py +5 -0
opik/rest_api/types/dataset_version_diff.py +22 -0
opik/rest_api/types/dataset_version_diff_stats.py +24 -0
opik/rest_api/types/dataset_version_page_public.py +23 -0
opik/rest_api/types/dataset_version_public.py +59 -0
opik/rest_api/types/dataset_version_summary.py +46 -0
opik/rest_api/types/dataset_version_summary_public.py +46 -0
opik/rest_api/types/experiment.py +7 -2
opik/rest_api/types/experiment_group_response.py +2 -0
opik/rest_api/types/experiment_public.py +7 -2
opik/rest_api/types/experiment_public_status.py +5 -0
opik/rest_api/types/experiment_score.py +20 -0
opik/rest_api/types/experiment_score_public.py +20 -0
opik/rest_api/types/experiment_score_write.py +20 -0
opik/rest_api/types/experiment_status.py +5 -0
opik/rest_api/types/feedback.py +25 -1
opik/rest_api/types/feedback_create.py +20 -1
opik/rest_api/types/feedback_object_public.py +27 -1
opik/rest_api/types/feedback_public.py +25 -1
opik/rest_api/types/feedback_score_batch_item.py +2 -1
opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
opik/rest_api/types/feedback_score_public.py +4 -0
opik/rest_api/types/feedback_update.py +20 -1
opik/rest_api/types/group_content_with_aggregations.py +1 -0
opik/rest_api/types/group_detail.py +19 -0
opik/rest_api/types/group_details.py +20 -0
opik/rest_api/types/guardrail.py +1 -0
opik/rest_api/types/guardrail_write.py +1 -0
opik/rest_api/types/ids_holder.py +19 -0
opik/rest_api/types/image_url.py +20 -0
opik/rest_api/types/image_url_public.py +20 -0
opik/rest_api/types/image_url_write.py +20 -0
opik/rest_api/types/llm_as_judge_message.py +5 -1
opik/rest_api/types/llm_as_judge_message_content.py +26 -0
opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
opik/rest_api/types/llm_as_judge_message_public.py +5 -1
opik/rest_api/types/llm_as_judge_message_write.py +5 -1
opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
opik/rest_api/types/manual_evaluation_request.py +38 -0
opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
opik/rest_api/types/manual_evaluation_response.py +27 -0
opik/rest_api/types/optimization.py +4 -2
opik/rest_api/types/optimization_public.py +4 -2
opik/rest_api/types/optimization_public_status.py +3 -1
opik/rest_api/types/optimization_status.py +3 -1
opik/rest_api/types/optimization_studio_config.py +27 -0
opik/rest_api/types/optimization_studio_config_public.py +27 -0
opik/rest_api/types/optimization_studio_config_write.py +27 -0
opik/rest_api/types/optimization_studio_log.py +22 -0
opik/rest_api/types/optimization_write.py +4 -2
opik/rest_api/types/optimization_write_status.py +3 -1
opik/rest_api/types/project.py +1 -0
opik/rest_api/types/project_detailed.py +1 -0
opik/rest_api/types/project_reference.py +31 -0
opik/rest_api/types/project_reference_public.py +31 -0
opik/rest_api/types/project_stats_summary_item.py +1 -0
opik/rest_api/types/prompt.py +6 -0
opik/rest_api/types/prompt_detail.py +6 -0
opik/rest_api/types/prompt_detail_template_structure.py +5 -0
opik/rest_api/types/prompt_public.py +6 -0
opik/rest_api/types/prompt_public_template_structure.py +5 -0
opik/rest_api/types/prompt_template_structure.py +5 -0
opik/rest_api/types/prompt_version.py +3 -0
opik/rest_api/types/prompt_version_detail.py +3 -0
opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
opik/rest_api/types/prompt_version_link.py +1 -0
opik/rest_api/types/prompt_version_link_public.py +1 -0
opik/rest_api/types/prompt_version_page_public.py +5 -0
opik/rest_api/types/prompt_version_public.py +3 -0
opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
opik/rest_api/types/prompt_version_template_structure.py +5 -0
opik/rest_api/types/prompt_version_update.py +33 -0
opik/rest_api/types/provider_api_key.py +9 -0
opik/rest_api/types/provider_api_key_provider.py +1 -1
opik/rest_api/types/provider_api_key_public.py +9 -0
opik/rest_api/types/provider_api_key_public_provider.py +1 -1
opik/rest_api/types/score_name.py +1 -0
opik/rest_api/types/service_toggles_config.py +18 -0
opik/rest_api/types/span.py +1 -2
opik/rest_api/types/span_enrichment_options.py +31 -0
opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
opik/rest_api/types/span_filter.py +23 -0
opik/rest_api/types/span_filter_operator.py +21 -0
opik/rest_api/types/span_filter_write.py +23 -0
opik/rest_api/types/span_filter_write_operator.py +21 -0
opik/rest_api/types/span_llm_as_judge_code.py +27 -0
opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
opik/rest_api/types/span_public.py +1 -2
opik/rest_api/types/span_update.py +46 -0
opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
opik/rest_api/types/span_write.py +1 -2
opik/rest_api/types/studio_evaluation.py +20 -0
opik/rest_api/types/studio_evaluation_public.py +20 -0
opik/rest_api/types/studio_evaluation_write.py +20 -0
opik/rest_api/types/studio_llm_model.py +21 -0
opik/rest_api/types/studio_llm_model_public.py +21 -0
opik/rest_api/types/studio_llm_model_write.py +21 -0
opik/rest_api/types/studio_message.py +20 -0
opik/rest_api/types/studio_message_public.py +20 -0
opik/rest_api/types/studio_message_write.py +20 -0
opik/rest_api/types/studio_metric.py +21 -0
opik/rest_api/types/studio_metric_public.py +21 -0
opik/rest_api/types/studio_metric_write.py +21 -0
opik/rest_api/types/studio_optimizer.py +21 -0
opik/rest_api/types/studio_optimizer_public.py +21 -0
opik/rest_api/types/studio_optimizer_write.py +21 -0
opik/rest_api/types/studio_prompt.py +20 -0
opik/rest_api/types/studio_prompt_public.py +20 -0
opik/rest_api/types/studio_prompt_write.py +20 -0
opik/rest_api/types/trace.py +11 -2
opik/rest_api/types/trace_enrichment_options.py +32 -0
opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
opik/rest_api/types/trace_filter.py +23 -0
opik/rest_api/types/trace_filter_operator.py +21 -0
opik/rest_api/types/trace_filter_write.py +23 -0
opik/rest_api/types/trace_filter_write_operator.py +21 -0
opik/rest_api/types/trace_public.py +11 -2
opik/rest_api/types/trace_thread_filter_write.py +23 -0
opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
opik/rest_api/types/trace_thread_identifier.py +1 -0
opik/rest_api/types/trace_update.py +39 -0
opik/rest_api/types/trace_write.py +1 -2
opik/rest_api/types/value_entry.py +2 -0
opik/rest_api/types/value_entry_compare.py +2 -0
opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
opik/rest_api/types/value_entry_public.py +2 -0
opik/rest_api/types/video_url.py +19 -0
opik/rest_api/types/video_url_public.py +19 -0
opik/rest_api/types/video_url_write.py +19 -0
opik/rest_api/types/webhook.py +28 -0
opik/rest_api/types/webhook_examples.py +19 -0
opik/rest_api/types/webhook_public.py +28 -0
opik/rest_api/types/webhook_test_result.py +23 -0
opik/rest_api/types/webhook_test_result_status.py +5 -0
opik/rest_api/types/webhook_write.py +23 -0
opik/rest_api/types/welcome_wizard_tracking.py +22 -0
opik/rest_api/types/workspace_configuration.py +5 -0
opik/rest_api/welcome_wizard/__init__.py +4 -0
opik/rest_api/welcome_wizard/client.py +195 -0
opik/rest_api/welcome_wizard/raw_client.py +208 -0
opik/rest_api/workspaces/client.py +14 -2
opik/rest_api/workspaces/raw_client.py +10 -0
opik/s3_httpx_client.py +14 -1
opik/simulation/__init__.py +6 -0
opik/simulation/simulated_user.py +99 -0
opik/simulation/simulator.py +108 -0
opik/synchronization.py +5 -6
opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
opik/types.py +36 -0
opik/validation/chat_prompt_messages.py +241 -0
opik/validation/feedback_score.py +3 -3
opik/validation/validator.py +28 -0
opik-1.9.71.dist-info/METADATA +370 -0
opik-1.9.71.dist-info/RECORD +1110 -0
opik/api_objects/prompt/prompt.py +0 -112
opik/cli.py +0 -193
opik/hooks.py +0 -13
opik/integrations/bedrock/chunks_aggregator.py +0 -55
opik/integrations/bedrock/helpers.py +0 -8
opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
opik-1.8.39.dist-info/METADATA +0 -339
opik-1.8.39.dist-info/RECORD +0 -790
/opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
/opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
/opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
/opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
/opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
/opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
/opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
/opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
{opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0

opik/evaluation/preprocessing.py (added)

@@ -0,0 +1,92 @@
+"""Shared text preprocessing utilities for metrics."""
+from __future__ import annotations
+import re
+import string
+import unicodedata
+from typing import Callable, Literal
+try:  # optional dependency for emoji detection
+    import emoji
+except ImportError:  # pragma: no cover
+    emoji = None  # type: ignore
+_Normalizer = Callable[[str], str]
+def normalize_text(
+    text: str,
+    *,
+    lowercase: bool = True,
+    strip_accents: bool = False,
+    remove_punctuation: bool = False,
+    keep_emoji: bool = True,
+    normalize_form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFKC",
+) -> str:
+    """Normalize text before metric processing.
+    Args:
+        text: Input string.
+        lowercase: Whether to lowercase the text.
+        strip_accents: Remove diacritical marks.
+        remove_punctuation: Strip ASCII punctuation.
+        keep_emoji: Preserve emoji characters; if False they are removed.
+        normalize_form: Unicode normalization form to apply (default NFKC).
+    """
+    normalized = unicodedata.normalize(normalize_form, text)
+    if lowercase:
+        normalized = normalized.lower()
+    if not keep_emoji:
+        normalized = _remove_emoji(normalized)
+    if strip_accents:
+        normalized = _strip_accents(normalized)
+    if remove_punctuation:
+        normalized = _remove_punctuation(normalized)
+    normalized = _collapse_whitespace(normalized)
+    return normalized.strip()
+def _remove_emoji(text: str) -> str:
+    if emoji is None:  # pragma: no cover
+        return "".join(
+            ch for ch in text if unicodedata.category(ch) not in {"So", "Sk"}
+        )
+    return emoji.replace_emoji(text, replace="")
+def _strip_accents(text: str) -> str:
+    decomposed = unicodedata.normalize("NFD", text)
+    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
+def _remove_punctuation(text: str) -> str:
+    translator = str.maketrans("", "", string.punctuation)
+    stripped = text.translate(translator)
+    return re.sub(
+        r"[\u2010-\u2015\u2018-\u201f\u2020-\u2027\u2030-\u2043]", "", stripped
+    )
+def _collapse_whitespace(text: str) -> str:
+    return re.sub(r"\s+", " ", text)
+DEFAULT_NORMALIZER: _Normalizer = normalize_text
+def ascii_normalizer(text: str) -> str:
+    return normalize_text(
+        text,
+        strip_accents=True,
+        remove_punctuation=True,
+        keep_emoji=False,
+    )
+ASCII_NORMALIZER: _Normalizer = ascii_normalizer

opik/evaluation/report.py CHANGED Viewed

@@ -1,11 +1,11 @@
 from collections import defaultdict
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 from rich import align, console, panel, table, text
-from .. import url_helpers
-from . import test_result
+from . import test_result, evaluation_result
+from .metrics import score_result
 def _format_time(seconds: float) -> str:
@@ -42,7 +42,10 @@ def _compute_average_scores(
 def display_experiment_results(
-    dataset_name: str, total_time: float, test_results: List[test_result.TestResult]
+    dataset_name: str,
+    total_time: float,
+    test_results: List[test_result.TestResult],
+    experiment_scores: Optional[List[score_result.ScoreResult]] = None,
 ) -> None:
     average_scores, failed_scores = _compute_average_scores(test_results)
     nb_items = len(test_results)
@@ -63,6 +66,14 @@ def display_experiment_results(
             score_strings += text.Text(f" - {failed_scores[name]} failed", style="red")
         score_strings += text.Text("\n")
+    # Add experiment scores if available
+    if experiment_scores:
+        for score in experiment_scores:
+            score_strings += text.Text(
+                f"{score.name}: {score.value:.4f}", style="green bold"
+            )
+            score_strings += text.Text("\n")
     aligned_test_results = align.Align.left(score_strings)
     # Combine table, time text, and test results
@@ -87,16 +98,63 @@ def display_experiment_results(
     console_container.print("Uploading results to Opik ... ")
-def display_experiment_link(
-    experiment_id: str, dataset_id: str, url_override: str
-) -> None:
+def display_experiment_link(experiment_url: str) -> None:
     console_container = console.Console()
-    experiment_url = url_helpers.get_experiment_url_by_id(
-        experiment_id=experiment_id,
-        dataset_id=dataset_id,
-        url_override=url_override,
-    )
     console_container.print(
         f"View the results [link={experiment_url}]in your Opik dashboard[/link]."
     )
+def display_evaluation_scores_statistics(
+    dataset_name: str,
+    evaluation_results: evaluation_result.EvaluationResult,
+) -> None:
+    """
+    Displays evaluation scores statistics for a given dataset.
+    The function generates a summary of evaluation scores including mean, max,
+    min, and optionally standard deviation for each metric in the evaluation
+    results. The summarized scores are formatted and presented in a table
+    within a panel for user clarity.
+    Args:
+        dataset_name: Name of the dataset for which evaluation statistics are
+            being displayed.
+        evaluation_results: An object containing evaluation results with
+            aggregated scores and statistical data.
+    """
+    aggregated_view = evaluation_results.aggregate_evaluation_scores()
+    if not aggregated_view.aggregated_scores:
+        return
+    # Create a table for the statistics
+    stats_table = table.Table()
+    stats_table.add_column("Name", style="cyan", no_wrap=True)
+    stats_table.add_column("Mean", justify="right", style="green")
+    stats_table.add_column("Min", justify="right", style="yellow")
+    stats_table.add_column("Max", justify="right", style="yellow")
+    stats_table.add_column("Std", justify="right", style="magenta")
+    # Add rows for each metric
+    for name, stats in aggregated_view.aggregated_scores.items():
+        std_value = f"{stats.std:.4f}" if stats.std is not None else "N/A"
+        stats_table.add_row(
+            name,
+            f"{stats.mean:.4f}",
+            f"{stats.min:.4f}",
+            f"{stats.max:.4f}",
+            std_value,
+        )
+    # Create a panel with the table inside
+    panel_content = panel.Panel(
+        stats_table,
+        title=f"Evaluation statistics for {dataset_name}",
+        title_align="left",
+        expand=False,
+    )
+    # Display results
+    console_container = console.Console()
+    console_container.print(panel_content)

opik/evaluation/rest_operations.py CHANGED Viewed

@@ -1,11 +1,14 @@
+import logging
 from typing import List, Optional
-from opik.api_objects import experiment, opik_client
-from opik.types import FeedbackScoreDict
-from . import test_case, test_result
-from .metrics import arguments_helpers
+from opik.api_objects import dataset, experiment, opik_client
+from opik.types import BatchFeedbackScoreDict
+from . import test_case
+from .metrics import score_result
 from .types import ScoringKeyMappingType
+LOGGER = logging.getLogger(__name__)
 def get_experiment_with_unique_name(
     client: opik_client.Opik, experiment_name: str
@@ -34,63 +37,64 @@ def get_trace_project_name(client: opik_client.Opik, trace_id: str) -> str:
 def get_experiment_test_cases(
-    client: opik_client.Opik,
-    experiment_id: str,
-    dataset_id: str,
+    experiment_: experiment.Experiment,
+    dataset_: dataset.Dataset,
     scoring_key_mapping: Optional[ScoringKeyMappingType],
 ) -> List[test_case.TestCase]:
+    experiment_items = experiment_.get_items()
+    # Fetch dataset items to get input data for bulk-uploaded experiment items
+    dataset_items_by_id = {item["id"]: item for item in dataset_.get_items()}
     test_cases = []
-    page = 1
+    for item in experiment_items:
+        dataset_item_data = dataset_items_by_id.get(item.dataset_item_id)
-    while True:
-        experiment_items_page = (
-            client._rest_client.datasets.find_dataset_items_with_experiment_items(
-                id=dataset_id, experiment_ids=f'["{experiment_id}"]', page=page
+        if dataset_item_data is None:
+            LOGGER.error(
+                f"Unexpected error: Dataset item with id {item.dataset_item_id} not found, skipping experiment item {item.id}"
+            )
+            continue
+        if item.evaluation_task_output is None:
+            LOGGER.error(
+                f"Unexpected error: Evaluation task output is None for experiment item {item.id}, skipping experiment item"
+            )
+            continue
+        test_cases.append(
+            test_case.TestCase(
+                trace_id=item.trace_id,
+                dataset_item_id=item.dataset_item_id,
+                task_output=item.evaluation_task_output,
+                dataset_item_content=dataset_item_data,
             )
         )
-        if len(experiment_items_page.content) == 0:
-            break
-        for item in experiment_items_page.content:
-            experiment_item = item.experiment_items[0]
-            test_cases += [
-                test_case.TestCase(
-                    trace_id=experiment_item.trace_id,
-                    dataset_item_id=experiment_item.dataset_item_id,
-                    task_output=experiment_item.output,
-                    scoring_inputs=arguments_helpers.create_scoring_inputs(
-                        dataset_item=experiment_item.input,
-                        task_output=experiment_item.output,
-                        scoring_key_mapping=scoring_key_mapping,
-                    ),
-                )
-            ]
-        page += 1
     return test_cases
-def log_test_result_scores(
+def log_test_result_feedback_scores(
     client: opik_client.Opik,
-    test_result: test_result.TestResult,
+    score_results: List[score_result.ScoreResult],
+    trace_id: str,
     project_name: Optional[str],
 ) -> None:
-    all_trace_scores: List[FeedbackScoreDict] = []
+    all_trace_scores: List[BatchFeedbackScoreDict] = []
-    for score_result in test_result.score_results:
-        if score_result.scoring_failed:
+    for score_result_ in score_results:
+        if score_result_.scoring_failed:
             continue
-        trace_score = FeedbackScoreDict(
-            id=test_result.test_case.trace_id,
-            name=score_result.name,
-            value=score_result.value,
-            reason=score_result.reason,
+        trace_score = BatchFeedbackScoreDict(
+            id=trace_id,
+            name=score_result_.name,
+            value=score_result_.value,
+            reason=score_result_.reason,
         )
         all_trace_scores.append(trace_score)
-    client.log_traces_feedback_scores(
-        scores=all_trace_scores, project_name=project_name
-    )
+    if len(all_trace_scores) > 0:
+        client.log_traces_feedback_scores(
+            scores=all_trace_scores, project_name=project_name
+        )

opik/evaluation/samplers/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .base_dataset_sampler import BaseDatasetSampler
+from .random_dataset_sampler import RandomDatasetSampler
+__all__ = ["BaseDatasetSampler", "RandomDatasetSampler"]

opik/evaluation/samplers/base_dataset_sampler.py ADDED Viewed

@@ -0,0 +1,40 @@
+import abc
+from typing import List
+from opik.api_objects.dataset import dataset_item
+class BaseDatasetSampler(abc.ABC):
+    """
+    Defines the BaseDatasetSampler for sampling dataset items.
+    This is an abstract base class that provides the definition
+    for dataset sampling. It requires implementation of the `sample`
+    method in subclasses, which specifies the sampling logic tailored
+    to specific needs.
+    Methods in this class are enforced to be redefined in any
+    concrete implementation.
+    """
+    @abc.abstractmethod
+    def sample(
+        self, data_item: List[dataset_item.DatasetItem]
+    ) -> List[dataset_item.DatasetItem]:
+        """
+        Samples and filters a list of dataset items according to a specific implementation.
+        Args:
+            data_item (List[dataset_item.DatasetItem]): A list of DatasetItem objects to be
+                sampled and filtered.
+        Returns:
+            List[dataset_item.DatasetItem]: A list of DatasetItem objects resulting
+                from the sampling process.
+        Raises:
+            NotImplementedError: If the method is not implemented in a subclass.
+        """
+        pass

opik/evaluation/samplers/random_dataset_sampler.py ADDED Viewed

@@ -0,0 +1,48 @@
+import random
+from typing import List, Optional
+from opik.api_objects.dataset import dataset_item
+from . import base_dataset_sampler
+class RandomDatasetSampler(base_dataset_sampler.BaseDatasetSampler):
+    def __init__(
+        self, max_samples: int, shuffle: bool = True, seed: Optional[int] = None
+    ) -> None:
+        """Samples a random subset of dataset items.
+        This class is a dataset sampler that selects a random subset of items from a dataset.
+        The number of items to sample can be specified, and shuffling can be enabled or disabled.
+        An optional random seed can be provided for reproducibility.
+        Args:
+            max_samples: The maximum number of samples to generate.
+            shuffle: Whether to shuffle the samples. Default is True, False provides a speedup
+                for large datasets.
+            seed: Seed for the random number generator. If None, then fresh, unpredictable
+                entropy will be pulled from the OS.
+        """
+        self.max_samples = max_samples
+        self.shuffle = shuffle
+        self.seed = seed
+    def sample(
+        self, data_items: List[dataset_item.DatasetItem]
+    ) -> List[dataset_item.DatasetItem]:
+        if len(data_items) == 0:
+            return []
+        # Create a random number generator with the specified seed
+        rng = random.Random(self.seed)
+        # Determine how many samples to take
+        sample_size = min(len(data_items), self.max_samples)
+        # Do sample first to avoid shuffling the entire dataset
+        items = rng.sample(data_items, sample_size)
+        if self.shuffle:
+            rng.shuffle(items)
+        return items

opik/evaluation/score_statistics.py ADDED Viewed

@@ -0,0 +1,66 @@
+import dataclasses
+import math
+import statistics
+from collections import defaultdict
+from typing import List, Optional, Dict
+from opik.evaluation import test_result
+@dataclasses.dataclass
+class ScoreStatistics:
+    """Statistics for a single score metric across multiple trials."""
+    mean: float
+    max: float
+    min: float
+    values: List[float]
+    std: Optional[float] = None  # Standard deviation (None if count < 2)
+def calculate_aggregated_statistics(
+    evaluation_results: List[test_result.TestResult],
+) -> Dict[str, ScoreStatistics]:
+    """
+    Calculate mean, max, and min scores for each score name in the evaluation test results.
+    Args:
+        evaluation_results: List of TestResult objects to be aggregated
+    Returns:
+        Dict mapping score names to their aggregated statistics
+    """
+    if not evaluation_results:
+        return {}
+    # Group scores by name across all trials
+    scores_by_name = defaultdict(list)
+    for test_result_ in evaluation_results:
+        for score_result in test_result_.score_results:
+            # Only include successful scores with valid values
+            if not score_result.scoring_failed and _is_valid_score_value(
+                score_result.value
+            ):
+                scores_by_name[score_result.name].append(score_result.value)
+    # Calculate aggregated statistics for each score name
+    aggregated_scores = {}
+    for score_name, values in scores_by_name.items():
+        if values:
+            std = statistics.stdev(values) if len(values) >= 2 else None
+            aggregated_scores[score_name] = ScoreStatistics(
+                mean=statistics.mean(values),
+                max=max(values),
+                min=min(values),
+                values=values.copy(),  # Store the actual values used
+                std=std,
+            )
+    return aggregated_scores
+def _is_valid_score_value(value: float) -> bool:
+    """Check if a score value is valid for statistical calculations."""
+    return isinstance(value, (int, float)) and math.isfinite(value)

opik/evaluation/scorers/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .scorer_function import ScorerFunction
+from .scorer_wrapper_metric import ScorerWrapperMetric
+__all__ = ["ScorerFunction", "ScorerWrapperMetric"]

opik/evaluation/scorers/scorer_function.py ADDED Viewed

@@ -0,0 +1,55 @@
+import inspect
+from typing import Any, Dict, Optional, Protocol, Union, List
+from opik.evaluation.metrics import score_result
+from opik.message_processing.emulation import models
+class ScorerFunctionProtocol(Protocol):
+    """
+    Represents a protocol defining the structure for a scorer function.
+    This protocol serves as a contract for implementing scorer functions used in
+    evaluating tasks. A scorer function adhering to this protocol should take
+    dataset item data, task outputs, and optionally a task span model as input
+    parameters and return a scoring result.
+    """
+    def __call__(
+        self,
+        dataset_item: Dict[str, Any],
+        task_outputs: Dict[str, Any],
+        task_span: Optional[models.SpanModel] = None,
+    ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]: ...
+ScorerFunction = ScorerFunctionProtocol
+EXPECTED_SCORER_FUNCTION_PARAMETERS = ["dataset_item", "task_outputs"]
+def validate_scorer_function(scorer_function: ScorerFunction) -> None:
+    if not callable(scorer_function):
+        raise ValueError("scorer_function must be a callable function")
+    parameters = inspect.signature(scorer_function).parameters
+    names = set(parameters.keys())
+    # Check if it has both dataset_item and task_outputs
+    has_dataset_item_and_task_outputs = all(
+        param in names for param in EXPECTED_SCORER_FUNCTION_PARAMETERS
+    )
+    # Check if it has at least one task_span parameter
+    has_task_span = "task_span" in names
+    if not (has_dataset_item_and_task_outputs or has_task_span):
+        raise ValueError(
+            f"scorer_function must have either both 'dataset_item' and 'task_outputs' parameters "
+            f"or at least one 'task_span' parameter. Found parameters: {list(names)}"
+        )
+def has_task_span_in_parameters(scorer_function: ScorerFunction) -> bool:
+    return "task_span" in inspect.signature(scorer_function).parameters

opik/evaluation/scorers/scorer_wrapper_metric.py ADDED Viewed

@@ -0,0 +1,130 @@
+from typing import Any, Callable, Dict, Optional, List, Union
+from opik.evaluation.metrics import base_metric, score_result
+from . import scorer_function
+from ...message_processing.emulation import models
+class ScorerWrapperMetric(base_metric.BaseMetric):
+    """
+    A wrapper metric that adapts a ScorerFunction to the BaseMetric interface.
+    This class allows using ScorerFunction instances as BaseMetric instances,
+    providing compatibility between the two interfaces.
+    Args:
+        scorer: The ScorerFunction to wrap
+        name: Optional name for the metric. If not provided, uses the class name.
+        track: Whether to track the metric. Defaults to True.
+        project_name: Optional project name for tracking.
+    Raises:
+        ValueError if the scorer function is invalid.
+    Example:
+        >>> def my_scorer(dataset_item: Dict[str, Any], task_outputs: Dict[str, Any]) -> score_result.ScoreResult:
+        >>>     return score_result.ScoreResult(name="my_metric", value=1.0)
+        >>>
+        >>> wrapper = ScorerWrapperMetric(scorer_function=my_scorer, name="wrapped_scorer")
+        >>> result = wrapper.score(dataset_item={"text": "hello"}, task_outputs={"text": "hello"})
+    """
+    def __init__(
+        self,
+        scorer: scorer_function.ScorerFunction,
+        name: str,
+        track: bool = True,
+        project_name: Optional[str] = None,
+    ) -> None:
+        super().__init__(name=name, track=track, project_name=project_name)
+        self.scorer = scorer
+        # validate scorer function
+        scorer_function.validate_scorer_function(scorer)
+    def score(
+        self,
+        dataset_item: Dict[str, Any],
+        task_outputs: Dict[str, Any],
+        **kwargs: Any,
+    ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
+        """
+        Score using the wrapped ScorerFunction.
+        Args:
+            dataset_item: The dataset item data to score against
+            task_outputs: The output dictionary to be scored - can be the output of LLM task, etc.
+            **kwargs: Additional keyword arguments (ignored by the scorer function)
+        Returns:
+            ScoreResult from the wrapped scorer function
+        """
+        return self.scorer(dataset_item=dataset_item, task_outputs=task_outputs)
+class ScorerWrapperMetricTaskSpan(ScorerWrapperMetric):
+    def __init__(
+        self,
+        scorer: scorer_function.ScorerFunction,
+        name: str,
+        track: bool = True,
+        project_name: Optional[str] = None,
+    ) -> None:
+        super().__init__(
+            scorer=scorer, name=name, track=track, project_name=project_name
+        )
+    def score(
+        self,
+        dataset_item: Dict[str, Any],
+        task_outputs: Dict[str, Any],
+        task_span: Optional[models.SpanModel] = None,
+        **kwargs: Any,
+    ) -> Union[score_result.ScoreResult, List[score_result.ScoreResult]]:
+        """
+        Score using the wrapped ScorerFunction.
+        Args:
+            dataset_item: The dataset item data to score against
+            task_outputs: The output dictionary to be scored - can be the output of LLM task, etc.
+            task_span: The collected task span data.
+            **kwargs: Additional keyword arguments (ignored by the scorer function)
+        Returns:
+            ScoreResult from the wrapped scorer function
+        """
+        if task_span is not None and scorer_function.has_task_span_in_parameters(
+            self.scorer
+        ):
+            return self.scorer(
+                dataset_item=dataset_item,
+                task_outputs=task_outputs,
+                task_span=task_span,
+            )
+        return self.scorer(dataset_item=dataset_item, task_outputs=task_outputs)
+def _scorer_name(scorer: Callable) -> str:
+    return scorer.__name__
+def wrap_scorer_functions(
+    scorer_functions: List[scorer_function.ScorerFunction], project_name: Optional[str]
+) -> List[base_metric.BaseMetric]:
+    metrics: List[base_metric.BaseMetric] = []
+    for f in scorer_functions:
+        name = _scorer_name(f)
+        if scorer_function.has_task_span_in_parameters(f):
+            metrics.append(
+                ScorerWrapperMetricTaskSpan(
+                    scorer=f, project_name=project_name, name=name
+                )
+            )
+        else:
+            metrics.append(
+                ScorerWrapperMetric(scorer=f, project_name=project_name, name=name)
+            )
+    return metrics

opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

opik 1.8.39py3-none-any.whl → 1.9.71py3-none-any.whl