opik 1.6.4__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- opik/__init__.py +33 -2
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/__init__.py +5 -0
- opik/api_objects/attachment/attachment.py +20 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +220 -0
- opik/api_objects/attachment/converters.py +51 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/conversation/__init__.py +0 -0
- opik/api_objects/conversation/conversation_factory.py +43 -0
- opik/api_objects/conversation/conversation_thread.py +49 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +107 -45
- opik/api_objects/dataset/rest_operations.py +12 -3
- opik/api_objects/experiment/experiment.py +81 -45
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +88 -19
- opik/api_objects/helpers.py +104 -7
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +872 -174
- opik/api_objects/opik_query_language.py +136 -18
- opik/api_objects/optimization/__init__.py +3 -0
- opik/api_objects/optimization/optimization.py +39 -0
- opik/api_objects/prompt/__init__.py +13 -1
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +193 -41
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/text/prompt_template.py +55 -0
- opik/api_objects/prompt/types.py +29 -0
- opik/api_objects/rest_stream_parser.py +98 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_client.py +165 -45
- opik/api_objects/span/span_data.py +136 -25
- opik/api_objects/threads/__init__.py +0 -0
- opik/api_objects/threads/threads_client.py +185 -0
- opik/api_objects/trace/trace_client.py +72 -36
- opik/api_objects/trace/trace_data.py +112 -26
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +62 -4
- opik/configurator/configure.py +45 -6
- opik/configurator/opik_rest_helpers.py +4 -1
- opik/context_storage.py +164 -65
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +298 -146
- opik/decorator/context_manager/__init__.py +0 -0
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/generator_wrappers.py +3 -2
- opik/decorator/inspect_helpers.py +11 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +49 -21
- opik/decorator/tracker.py +9 -1
- opik/dict_utils.py +3 -3
- opik/environment.py +13 -1
- opik/error_tracking/api.py +1 -1
- opik/error_tracking/before_send.py +6 -5
- opik/error_tracking/environment_details.py +29 -7
- opik/error_tracking/error_filtering/filter_by_response_status_code.py +42 -0
- opik/error_tracking/error_filtering/filter_chain_builder.py +14 -3
- opik/evaluation/__init__.py +14 -2
- opik/evaluation/engine/engine.py +280 -82
- opik/evaluation/engine/evaluation_tasks_executor.py +15 -10
- opik/evaluation/engine/helpers.py +34 -9
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/engine/types.py +5 -4
- opik/evaluation/evaluation_result.py +169 -2
- opik/evaluation/evaluator.py +659 -58
- opik/evaluation/metrics/__init__.py +121 -6
- opik/evaluation/metrics/aggregated_metric.py +92 -0
- opik/evaluation/metrics/arguments_helpers.py +15 -21
- opik/evaluation/metrics/arguments_validator.py +38 -0
- opik/evaluation/metrics/base_metric.py +20 -10
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +79 -0
- opik/evaluation/metrics/conversation/conversation_turns_factory.py +39 -0
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +84 -0
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/__init__.py +0 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +274 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/schema.py +16 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/templates.py +95 -0
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/__init__.py +0 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +295 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/schema.py +22 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/templates.py +139 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +277 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/schema.py +16 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/templates.py +135 -0
- opik/evaluation/metrics/conversation/types.py +34 -0
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +43 -16
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +50 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/equals.py +4 -1
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/is_json.py +9 -3
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/levenshtein_ratio.py +6 -5
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/regex_match.py +4 -1
- opik/evaluation/metrics/heuristics/rouge.py +148 -0
- opik/evaluation/metrics/heuristics/sentiment.py +98 -0
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +27 -30
- opik/evaluation/metrics/llm_judges/answer_relevance/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/templates.py +10 -10
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +28 -31
- opik/evaluation/metrics/llm_judges/context_precision/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/context_precision/template.py +7 -7
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +27 -31
- opik/evaluation/metrics/llm_judges/context_recall/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/context_recall/template.py +7 -7
- opik/evaluation/metrics/llm_judges/factuality/metric.py +7 -26
- opik/evaluation/metrics/llm_judges/factuality/parser.py +35 -0
- opik/evaluation/metrics/llm_judges/factuality/template.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +244 -113
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +161 -0
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +23 -27
- opik/evaluation/metrics/llm_judges/hallucination/parser.py +29 -0
- opik/evaluation/metrics/llm_judges/hallucination/template.py +2 -4
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +23 -28
- opik/evaluation/metrics/llm_judges/moderation/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/moderation/template.py +2 -2
- opik/evaluation/metrics/llm_judges/parsing_helpers.py +26 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +171 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/parser.py +38 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/templates.py +65 -0
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +23 -32
- opik/evaluation/metrics/llm_judges/usefulness/parser.py +28 -0
- opik/evaluation/metrics/ragas_metric.py +112 -0
- opik/evaluation/models/__init__.py +10 -0
- opik/evaluation/models/base_model.py +140 -18
- opik/evaluation/models/langchain/__init__.py +3 -0
- opik/evaluation/models/langchain/langchain_chat_model.py +166 -0
- opik/evaluation/models/langchain/message_converters.py +106 -0
- opik/evaluation/models/langchain/opik_monitoring.py +23 -0
- opik/evaluation/models/litellm/litellm_chat_model.py +186 -40
- opik/evaluation/models/litellm/opik_monitor.py +24 -21
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/__init__.py +0 -0
- opik/evaluation/threads/context_helper.py +32 -0
- opik/evaluation/threads/evaluation_engine.py +181 -0
- opik/evaluation/threads/evaluation_result.py +18 -0
- opik/evaluation/threads/evaluator.py +120 -0
- opik/evaluation/threads/helpers.py +51 -0
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +116 -3
- opik/file_upload/__init__.py +0 -0
- opik/file_upload/base_upload_manager.py +39 -0
- opik/file_upload/file_upload_monitor.py +14 -0
- opik/file_upload/file_uploader.py +141 -0
- opik/file_upload/mime_type.py +9 -0
- opik/file_upload/s3_multipart_upload/__init__.py +0 -0
- opik/file_upload/s3_multipart_upload/file_parts_strategy.py +89 -0
- opik/file_upload/s3_multipart_upload/s3_file_uploader.py +86 -0
- opik/file_upload/s3_multipart_upload/s3_upload_error.py +29 -0
- opik/file_upload/thread_pool.py +17 -0
- opik/file_upload/upload_client.py +114 -0
- opik/file_upload/upload_manager.py +255 -0
- opik/file_upload/upload_options.py +37 -0
- opik/format_helpers.py +17 -0
- opik/guardrails/__init__.py +4 -0
- opik/guardrails/guardrail.py +157 -0
- opik/guardrails/guards/__init__.py +5 -0
- opik/guardrails/guards/guard.py +17 -0
- opik/guardrails/guards/pii.py +47 -0
- opik/guardrails/guards/topic.py +76 -0
- opik/guardrails/rest_api_client.py +34 -0
- opik/guardrails/schemas.py +24 -0
- opik/guardrails/tracing.py +61 -0
- opik/healthcheck/__init__.py +2 -1
- opik/healthcheck/checks.py +2 -2
- opik/healthcheck/rich_representation.py +1 -1
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +75 -4
- opik/id_helpers.py +18 -0
- opik/integrations/adk/__init__.py +14 -0
- opik/integrations/adk/callback_context_info_extractors.py +32 -0
- opik/integrations/adk/graph/__init__.py +0 -0
- opik/integrations/adk/graph/mermaid_graph_builder.py +128 -0
- opik/integrations/adk/graph/nodes.py +101 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +41 -0
- opik/integrations/adk/helpers.py +48 -0
- opik/integrations/adk/legacy_opik_tracer.py +381 -0
- opik/integrations/adk/opik_tracer.py +370 -0
- opik/integrations/adk/patchers/__init__.py +4 -0
- opik/integrations/adk/patchers/adk_otel_tracer/__init__.py +0 -0
- opik/integrations/adk/patchers/adk_otel_tracer/llm_span_helpers.py +30 -0
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +201 -0
- opik/integrations/adk/patchers/litellm_wrappers.py +91 -0
- opik/integrations/adk/patchers/llm_response_wrapper.py +105 -0
- opik/integrations/adk/patchers/patchers.py +64 -0
- opik/integrations/adk/recursive_callback_injector.py +126 -0
- opik/integrations/aisuite/aisuite_decorator.py +8 -3
- opik/integrations/aisuite/opik_tracker.py +1 -0
- opik/integrations/anthropic/messages_create_decorator.py +8 -3
- opik/integrations/anthropic/opik_tracker.py +0 -1
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +18 -8
- opik/integrations/bedrock/invoke_agent_decorator.py +12 -7
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +43 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +34 -56
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +246 -84
- opik/integrations/dspy/graph.py +88 -0
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/genai/encoder_extension.py +2 -6
- opik/integrations/genai/generate_content_decorator.py +20 -13
- opik/integrations/guardrails/guardrails_decorator.py +4 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/constants.py +35 -0
- opik/integrations/haystack/converters.py +1 -2
- opik/integrations/haystack/opik_connector.py +28 -6
- opik/integrations/haystack/opik_span_bridge.py +284 -0
- opik/integrations/haystack/opik_tracer.py +124 -222
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +2 -2
- opik/integrations/langchain/opik_tracer.py +641 -206
- opik/integrations/langchain/provider_usage_extractors/__init__.py +5 -0
- opik/integrations/langchain/provider_usage_extractors/anthropic_usage_extractor.py +101 -0
- opik/integrations/langchain/provider_usage_extractors/anthropic_vertexai_usage_extractor.py +67 -0
- opik/integrations/langchain/provider_usage_extractors/bedrock_usage_extractor.py +94 -0
- opik/integrations/langchain/provider_usage_extractors/google_generative_ai_usage_extractor.py +109 -0
- opik/integrations/langchain/provider_usage_extractors/groq_usage_extractor.py +92 -0
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/__init__.py +15 -0
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +134 -0
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/langchain_usage.py +163 -0
- opik/integrations/langchain/provider_usage_extractors/openai_usage_extractor.py +124 -0
- opik/integrations/langchain/provider_usage_extractors/provider_usage_extractor_protocol.py +29 -0
- opik/integrations/langchain/provider_usage_extractors/usage_extractor.py +48 -0
- opik/integrations/langchain/provider_usage_extractors/vertexai_usage_extractor.py +109 -0
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +179 -78
- opik/integrations/llama_index/event_parsing_utils.py +29 -9
- opik/integrations/openai/agents/opik_tracing_processor.py +204 -32
- opik/integrations/openai/agents/span_data_parsers.py +15 -6
- opik/integrations/openai/chat_completion_chunks_aggregator.py +1 -1
- opik/integrations/openai/{openai_decorator.py → openai_chat_completions_decorator.py} +45 -35
- opik/integrations/openai/openai_responses_decorator.py +158 -0
- opik/integrations/openai/opik_tracker.py +94 -13
- opik/integrations/openai/response_events_aggregator.py +36 -0
- opik/integrations/openai/stream_patchers.py +125 -15
- opik/integrations/sagemaker/auth.py +5 -1
- opik/jsonable_encoder.py +29 -1
- opik/llm_usage/base_original_provider_usage.py +15 -8
- opik/llm_usage/bedrock_usage.py +8 -2
- opik/llm_usage/google_usage.py +6 -1
- opik/llm_usage/llm_usage_info.py +6 -0
- opik/llm_usage/{openai_usage.py → openai_chat_completions_usage.py} +2 -12
- opik/llm_usage/{openai_agent_usage.py → openai_responses_usage.py} +7 -15
- opik/llm_usage/opik_usage.py +36 -10
- opik/llm_usage/opik_usage_factory.py +35 -19
- opik/logging_messages.py +19 -7
- opik/message_processing/arguments_utils.py +22 -0
- opik/message_processing/batching/base_batcher.py +45 -17
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +36 -11
- opik/message_processing/batching/batchers.py +167 -44
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/batching/sequence_splitter.py +50 -5
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/message_queue.py +79 -0
- opik/message_processing/messages.py +154 -12
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/processors/online_message_processor.py +324 -0
- opik/message_processing/queue_consumer.py +61 -13
- opik/message_processing/streamer.py +102 -31
- opik/message_processing/streamer_constructors.py +67 -12
- opik/opik_context.py +103 -11
- opik/plugins/pytest/decorator.py +2 -2
- opik/plugins/pytest/experiment_runner.py +3 -2
- opik/plugins/pytest/hooks.py +6 -4
- opik/rate_limit/__init__.py +0 -0
- opik/rate_limit/rate_limit.py +25 -0
- opik/rest_api/__init__.py +643 -11
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/attachments/__init__.py +17 -0
- opik/rest_api/attachments/client.py +752 -0
- opik/rest_api/attachments/raw_client.py +1125 -0
- opik/rest_api/attachments/types/__init__.py +15 -0
- opik/rest_api/attachments/types/attachment_list_request_entity_type.py +5 -0
- opik/rest_api/attachments/types/download_attachment_request_entity_type.py +5 -0
- opik/rest_api/attachments/types/start_multipart_upload_request_entity_type.py +5 -0
- opik/rest_api/attachments/types/upload_attachment_request_entity_type.py +5 -0
- opik/rest_api/automation_rule_evaluators/__init__.py +2 -0
- opik/rest_api/automation_rule_evaluators/client.py +182 -1162
- opik/rest_api/automation_rule_evaluators/raw_client.py +598 -0
- opik/rest_api/chat_completions/__init__.py +2 -0
- opik/rest_api/chat_completions/client.py +115 -149
- opik/rest_api/chat_completions/raw_client.py +339 -0
- opik/rest_api/check/__init__.py +2 -0
- opik/rest_api/check/client.py +88 -106
- opik/rest_api/check/raw_client.py +258 -0
- opik/rest_api/client.py +112 -212
- opik/rest_api/core/__init__.py +5 -0
- opik/rest_api/core/api_error.py +12 -6
- opik/rest_api/core/client_wrapper.py +4 -14
- opik/rest_api/core/datetime_utils.py +1 -3
- opik/rest_api/core/file.py +2 -5
- opik/rest_api/core/http_client.py +42 -120
- opik/rest_api/core/http_response.py +55 -0
- opik/rest_api/core/jsonable_encoder.py +1 -4
- opik/rest_api/core/pydantic_utilities.py +79 -147
- opik/rest_api/core/query_encoder.py +1 -3
- opik/rest_api/core/serialization.py +10 -10
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/__init__.py +5 -0
- opik/rest_api/datasets/client.py +1638 -1091
- opik/rest_api/datasets/raw_client.py +3389 -0
- opik/rest_api/datasets/types/__init__.py +8 -0
- opik/rest_api/datasets/types/dataset_update_visibility.py +5 -0
- opik/rest_api/datasets/types/dataset_write_visibility.py +5 -0
- opik/rest_api/errors/__init__.py +2 -0
- opik/rest_api/errors/bad_request_error.py +4 -3
- opik/rest_api/errors/conflict_error.py +4 -3
- opik/rest_api/errors/forbidden_error.py +4 -2
- opik/rest_api/errors/not_found_error.py +4 -3
- opik/rest_api/errors/not_implemented_error.py +4 -3
- opik/rest_api/errors/unauthorized_error.py +4 -3
- opik/rest_api/errors/unprocessable_entity_error.py +4 -3
- opik/rest_api/experiments/__init__.py +5 -0
- opik/rest_api/experiments/client.py +676 -752
- opik/rest_api/experiments/raw_client.py +1872 -0
- opik/rest_api/experiments/types/__init__.py +10 -0
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/experiments/types/experiment_write_type.py +5 -0
- opik/rest_api/feedback_definitions/__init__.py +2 -0
- opik/rest_api/feedback_definitions/client.py +96 -370
- opik/rest_api/feedback_definitions/raw_client.py +541 -0
- opik/rest_api/feedback_definitions/types/__init__.py +2 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -3
- opik/rest_api/guardrails/__init__.py +4 -0
- opik/rest_api/guardrails/client.py +104 -0
- opik/rest_api/guardrails/raw_client.py +102 -0
- opik/rest_api/llm_provider_key/__init__.py +2 -0
- opik/rest_api/llm_provider_key/client.py +166 -440
- opik/rest_api/llm_provider_key/raw_client.py +643 -0
- opik/rest_api/llm_provider_key/types/__init__.py +2 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/open_telemetry_ingestion/__init__.py +2 -0
- opik/rest_api/open_telemetry_ingestion/client.py +38 -63
- opik/rest_api/open_telemetry_ingestion/raw_client.py +88 -0
- opik/rest_api/optimizations/__init__.py +7 -0
- opik/rest_api/optimizations/client.py +704 -0
- opik/rest_api/optimizations/raw_client.py +920 -0
- opik/rest_api/optimizations/types/__init__.py +7 -0
- opik/rest_api/optimizations/types/optimization_update_status.py +7 -0
- opik/rest_api/projects/__init__.py +10 -1
- opik/rest_api/projects/client.py +180 -855
- opik/rest_api/projects/raw_client.py +1216 -0
- opik/rest_api/projects/types/__init__.py +11 -4
- opik/rest_api/projects/types/project_metric_request_public_interval.py +1 -3
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +11 -1
- opik/rest_api/projects/types/project_update_visibility.py +5 -0
- opik/rest_api/projects/types/project_write_visibility.py +5 -0
- opik/rest_api/prompts/__init__.py +4 -2
- opik/rest_api/prompts/client.py +381 -970
- opik/rest_api/prompts/raw_client.py +1634 -0
- opik/rest_api/prompts/types/__init__.py +5 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/raw_client.py +156 -0
- opik/rest_api/redirect/__init__.py +4 -0
- opik/rest_api/redirect/client.py +375 -0
- opik/rest_api/redirect/raw_client.py +566 -0
- opik/rest_api/service_toggles/__init__.py +4 -0
- opik/rest_api/service_toggles/client.py +91 -0
- opik/rest_api/service_toggles/raw_client.py +93 -0
- opik/rest_api/spans/__init__.py +2 -0
- opik/rest_api/spans/client.py +659 -1354
- opik/rest_api/spans/raw_client.py +2383 -0
- opik/rest_api/spans/types/__init__.py +2 -0
- opik/rest_api/spans/types/find_feedback_score_names_1_request_type.py +1 -3
- opik/rest_api/spans/types/get_span_stats_request_type.py +1 -3
- opik/rest_api/spans/types/get_spans_by_project_request_type.py +1 -3
- opik/rest_api/spans/types/span_search_stream_request_public_type.py +1 -3
- opik/rest_api/system_usage/__init__.py +2 -0
- opik/rest_api/system_usage/client.py +157 -216
- opik/rest_api/system_usage/raw_client.py +455 -0
- opik/rest_api/traces/__init__.py +2 -0
- opik/rest_api/traces/client.py +2102 -1625
- opik/rest_api/traces/raw_client.py +4144 -0
- opik/rest_api/types/__init__.py +629 -24
- opik/rest_api/types/aggregation_data.py +27 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/{json_schema_element.py → annotation_queue_item_ids.py} +5 -7
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/{workspace_metadata.py → annotation_queue_reviewer.py} +6 -7
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/assistant_message.py +7 -8
- opik/rest_api/types/assistant_message_role.py +1 -3
- opik/rest_api/types/attachment.py +22 -0
- opik/rest_api/types/attachment_page.py +28 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +160 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +143 -0
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_write.py +143 -0
- opik/rest_api/types/avg_value_stat_public.py +3 -5
- opik/rest_api/types/batch_delete.py +3 -5
- opik/rest_api/types/batch_delete_by_project.py +20 -0
- opik/rest_api/types/bi_information.py +3 -5
- opik/rest_api/types/bi_information_response.py +4 -6
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/categorical_feedback_definition.py +5 -7
- opik/rest_api/types/categorical_feedback_definition_create.py +4 -6
- opik/rest_api/types/categorical_feedback_definition_public.py +5 -7
- opik/rest_api/types/categorical_feedback_definition_update.py +4 -6
- opik/rest_api/types/categorical_feedback_detail.py +3 -5
- opik/rest_api/types/categorical_feedback_detail_create.py +3 -5
- opik/rest_api/types/categorical_feedback_detail_public.py +3 -5
- opik/rest_api/types/categorical_feedback_detail_update.py +3 -5
- opik/rest_api/types/chat_completion_choice.py +4 -6
- opik/rest_api/types/chat_completion_response.py +5 -6
- opik/rest_api/types/check.py +22 -0
- opik/rest_api/types/{json_node_compare.py → check_name.py} +1 -1
- opik/rest_api/types/check_public.py +22 -0
- opik/rest_api/types/check_public_name.py +5 -0
- opik/rest_api/types/check_public_result.py +5 -0
- opik/rest_api/types/check_result.py +5 -0
- opik/rest_api/types/chunked_output_json_node.py +4 -6
- opik/rest_api/types/chunked_output_json_node_public.py +4 -6
- opik/rest_api/types/chunked_output_json_node_public_type.py +6 -10
- opik/rest_api/types/chunked_output_json_node_type.py +6 -10
- opik/rest_api/types/column.py +8 -10
- opik/rest_api/types/column_compare.py +8 -10
- opik/rest_api/types/column_public.py +8 -10
- opik/rest_api/types/column_types_item.py +1 -3
- opik/rest_api/types/comment.py +4 -6
- opik/rest_api/types/comment_compare.py +4 -6
- opik/rest_api/types/comment_public.py +4 -6
- opik/rest_api/types/complete_multipart_upload_request.py +33 -0
- opik/rest_api/types/complete_multipart_upload_request_entity_type.py +5 -0
- opik/rest_api/types/completion_tokens_details.py +3 -5
- opik/rest_api/types/count_value_stat_public.py +3 -5
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/data_point_double.py +21 -0
- opik/rest_api/types/data_point_number_public.py +3 -5
- opik/rest_api/types/dataset.py +14 -6
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +9 -8
- opik/rest_api/types/dataset_item_batch.py +3 -5
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +9 -8
- opik/rest_api/types/dataset_item_compare_source.py +1 -3
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +10 -7
- opik/rest_api/types/dataset_item_page_public.py +10 -7
- opik/rest_api/types/dataset_item_public.py +9 -8
- opik/rest_api/types/dataset_item_public_source.py +1 -3
- opik/rest_api/types/dataset_item_source.py +1 -3
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +5 -6
- opik/rest_api/types/dataset_item_write_source.py +1 -3
- opik/rest_api/types/dataset_page_public.py +9 -6
- opik/rest_api/types/dataset_public.py +14 -6
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_public_visibility.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/dataset_visibility.py +5 -0
- opik/rest_api/types/delete_attachments_request.py +23 -0
- opik/rest_api/types/delete_attachments_request_entity_type.py +5 -0
- opik/rest_api/types/delete_feedback_score.py +4 -5
- opik/rest_api/types/delete_ids_holder.py +19 -0
- opik/rest_api/types/delta.py +7 -9
- opik/rest_api/types/error_count_with_deviation.py +21 -0
- opik/rest_api/types/error_count_with_deviation_detailed.py +21 -0
- opik/rest_api/types/error_info.py +3 -5
- opik/rest_api/types/error_info_experiment_item_bulk_write_view.py +21 -0
- opik/rest_api/types/error_info_public.py +3 -5
- opik/rest_api/types/error_info_write.py +3 -5
- opik/rest_api/types/error_message.py +3 -5
- opik/rest_api/types/error_message_detail.py +3 -5
- opik/rest_api/types/error_message_detailed.py +3 -5
- opik/rest_api/types/error_message_public.py +3 -5
- opik/rest_api/types/experiment.py +21 -10
- opik/rest_api/types/experiment_group_aggregations_response.py +20 -0
- opik/rest_api/types/experiment_group_response.py +22 -0
- opik/rest_api/types/experiment_item.py +14 -11
- opik/rest_api/types/experiment_item_bulk_record.py +27 -0
- opik/rest_api/types/experiment_item_bulk_record_experiment_item_bulk_write_view.py +27 -0
- opik/rest_api/types/experiment_item_bulk_upload.py +27 -0
- opik/rest_api/types/experiment_item_compare.py +14 -11
- opik/rest_api/types/experiment_item_compare_trace_visibility_mode.py +5 -0
- opik/rest_api/types/experiment_item_public.py +6 -6
- opik/rest_api/types/experiment_item_public_trace_visibility_mode.py +5 -0
- opik/rest_api/types/experiment_item_trace_visibility_mode.py +5 -0
- opik/rest_api/types/experiment_page_public.py +9 -6
- opik/rest_api/types/experiment_public.py +21 -10
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_public_type.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/experiment_type.py +5 -0
- opik/rest_api/types/export_trace_service_request.py +5 -0
- opik/rest_api/types/feedback.py +40 -27
- opik/rest_api/types/feedback_create.py +27 -13
- opik/rest_api/types/feedback_definition_page_public.py +4 -6
- opik/rest_api/types/feedback_object_public.py +40 -27
- opik/rest_api/types/feedback_public.py +40 -27
- opik/rest_api/types/feedback_score.py +7 -7
- opik/rest_api/types/feedback_score_average.py +3 -5
- opik/rest_api/types/feedback_score_average_detailed.py +3 -5
- opik/rest_api/types/feedback_score_average_public.py +3 -5
- opik/rest_api/types/feedback_score_batch.py +4 -6
- opik/rest_api/types/feedback_score_batch_item.py +6 -6
- opik/rest_api/types/feedback_score_batch_item_source.py +1 -3
- opik/rest_api/types/feedback_score_batch_item_thread.py +32 -0
- opik/rest_api/types/feedback_score_batch_item_thread_source.py +5 -0
- opik/rest_api/types/feedback_score_compare.py +7 -7
- opik/rest_api/types/feedback_score_compare_source.py +1 -3
- opik/rest_api/types/feedback_score_experiment_item_bulk_write_view.py +31 -0
- opik/rest_api/types/feedback_score_experiment_item_bulk_write_view_source.py +5 -0
- opik/rest_api/types/feedback_score_names.py +4 -6
- opik/rest_api/types/feedback_score_public.py +11 -7
- opik/rest_api/types/feedback_score_public_source.py +1 -3
- opik/rest_api/types/feedback_score_source.py +1 -3
- opik/rest_api/types/feedback_update.py +27 -13
- opik/rest_api/types/function.py +4 -7
- opik/rest_api/types/function_call.py +3 -5
- opik/rest_api/types/group_content.py +19 -0
- opik/rest_api/types/group_content_with_aggregations.py +21 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +34 -0
- opik/rest_api/types/guardrail_batch.py +20 -0
- opik/rest_api/types/guardrail_name.py +5 -0
- opik/rest_api/types/guardrail_result.py +5 -0
- opik/rest_api/types/guardrail_write.py +33 -0
- opik/rest_api/types/guardrail_write_name.py +5 -0
- opik/rest_api/types/guardrail_write_result.py +5 -0
- opik/rest_api/types/guardrails_validation.py +21 -0
- opik/rest_api/types/guardrails_validation_public.py +21 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/json_list_string.py +7 -0
- opik/rest_api/types/json_list_string_compare.py +7 -0
- opik/rest_api/types/json_list_string_experiment_item_bulk_write_view.py +7 -0
- opik/rest_api/types/json_list_string_public.py +7 -0
- opik/rest_api/types/json_list_string_write.py +7 -0
- opik/rest_api/types/json_schema.py +5 -8
- opik/rest_api/types/llm_as_judge_code.py +8 -12
- opik/rest_api/types/llm_as_judge_code_public.py +8 -12
- opik/rest_api/types/llm_as_judge_code_write.py +8 -12
- opik/rest_api/types/llm_as_judge_message.py +9 -7
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +9 -7
- opik/rest_api/types/llm_as_judge_message_public_role.py +1 -1
- opik/rest_api/types/llm_as_judge_message_role.py +1 -1
- opik/rest_api/types/llm_as_judge_message_write.py +9 -7
- opik/rest_api/types/llm_as_judge_message_write_role.py +1 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +6 -5
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +6 -5
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +6 -5
- opik/rest_api/types/llm_as_judge_output_schema.py +4 -6
- opik/rest_api/types/llm_as_judge_output_schema_public.py +4 -6
- opik/rest_api/types/llm_as_judge_output_schema_public_type.py +1 -3
- opik/rest_api/types/llm_as_judge_output_schema_type.py +1 -3
- opik/rest_api/types/llm_as_judge_output_schema_write.py +4 -6
- opik/rest_api/types/llm_as_judge_output_schema_write_type.py +1 -3
- opik/rest_api/types/log_item.py +5 -7
- opik/rest_api/types/log_item_level.py +1 -3
- opik/rest_api/types/log_page.py +4 -6
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/multipart_upload_part.py +20 -0
- opik/rest_api/types/numerical_feedback_definition.py +5 -7
- opik/rest_api/types/numerical_feedback_definition_create.py +4 -6
- opik/rest_api/types/numerical_feedback_definition_public.py +5 -7
- opik/rest_api/types/numerical_feedback_definition_update.py +4 -6
- opik/rest_api/types/numerical_feedback_detail.py +3 -5
- opik/rest_api/types/numerical_feedback_detail_create.py +3 -5
- opik/rest_api/types/numerical_feedback_detail_public.py +3 -5
- opik/rest_api/types/numerical_feedback_detail_update.py +3 -5
- opik/rest_api/types/optimization.py +37 -0
- opik/rest_api/types/optimization_page_public.py +28 -0
- opik/rest_api/types/optimization_public.py +37 -0
- opik/rest_api/types/optimization_public_status.py +7 -0
- opik/rest_api/types/optimization_status.py +7 -0
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +30 -0
- opik/rest_api/types/optimization_write_status.py +7 -0
- opik/rest_api/types/page_columns.py +4 -6
- opik/rest_api/types/percentage_value_stat_public.py +4 -6
- opik/rest_api/types/percentage_values.py +8 -16
- opik/rest_api/types/percentage_values_detailed.py +8 -16
- opik/rest_api/types/percentage_values_public.py +8 -16
- opik/rest_api/types/project.py +12 -7
- opik/rest_api/types/project_detailed.py +12 -7
- opik/rest_api/types/project_detailed_visibility.py +5 -0
- opik/rest_api/types/project_metric_response_public.py +5 -9
- opik/rest_api/types/project_metric_response_public_interval.py +1 -3
- opik/rest_api/types/project_metric_response_public_metric_type.py +11 -1
- opik/rest_api/types/project_page_public.py +8 -10
- opik/rest_api/types/project_public.py +6 -6
- opik/rest_api/types/project_public_visibility.py +5 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stat_item_object_public.py +8 -17
- opik/rest_api/types/project_stats_public.py +4 -6
- opik/rest_api/types/project_stats_summary.py +4 -6
- opik/rest_api/types/project_stats_summary_item.py +9 -6
- opik/rest_api/types/project_visibility.py +5 -0
- opik/rest_api/types/prompt.py +12 -7
- opik/rest_api/types/prompt_detail.py +12 -7
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_page_public.py +9 -6
- opik/rest_api/types/prompt_public.py +11 -6
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_tokens_details.py +19 -0
- opik/rest_api/types/prompt_version.py +7 -6
- opik/rest_api/types/prompt_version_detail.py +7 -6
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +4 -5
- opik/rest_api/types/prompt_version_link_public.py +4 -5
- opik/rest_api/types/prompt_version_link_write.py +3 -5
- opik/rest_api/types/prompt_version_page_public.py +9 -6
- opik/rest_api/types/prompt_version_public.py +7 -6
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +18 -8
- opik/rest_api/types/provider_api_key_page_public.py +27 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +18 -8
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/response_format.py +5 -7
- opik/rest_api/types/response_format_type.py +1 -3
- opik/rest_api/types/result.py +21 -0
- opik/rest_api/types/results_number_public.py +4 -6
- opik/rest_api/types/score_name.py +4 -5
- opik/rest_api/types/service_toggles_config.py +44 -0
- opik/rest_api/types/span.py +13 -15
- opik/rest_api/types/span_batch.py +4 -6
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +39 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view_type.py +5 -0
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_public.py +4 -6
- opik/rest_api/types/span_filter_public_operator.py +2 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_page_public.py +9 -6
- opik/rest_api/types/span_public.py +19 -16
- opik/rest_api/types/span_public_type.py +1 -1
- opik/rest_api/types/span_type.py +1 -1
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_update_type.py +5 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +13 -14
- opik/rest_api/types/span_write_type.py +1 -1
- opik/rest_api/types/spans_count_response.py +20 -0
- opik/rest_api/types/start_multipart_upload_response.py +20 -0
- opik/rest_api/types/stream_options.py +3 -5
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/tool.py +4 -6
- opik/rest_api/types/tool_call.py +4 -6
- opik/rest_api/types/trace.py +26 -12
- opik/rest_api/types/trace_batch.py +4 -6
- opik/rest_api/types/trace_count_response.py +4 -6
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +41 -0
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_public.py +23 -0
- opik/rest_api/types/trace_filter_public_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_page_public.py +8 -10
- opik/rest_api/types/trace_public.py +27 -13
- opik/rest_api/types/trace_public_visibility_mode.py +5 -0
- opik/rest_api/types/trace_thread.py +18 -9
- opik/rest_api/types/trace_thread_filter.py +23 -0
- opik/rest_api/types/trace_thread_filter_operator.py +21 -0
- opik/rest_api/types/trace_thread_filter_public.py +23 -0
- opik/rest_api/types/trace_thread_filter_public_operator.py +21 -0
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +22 -0
- opik/rest_api/types/trace_thread_llm_as_judge_code.py +26 -0
- opik/rest_api/types/trace_thread_llm_as_judge_code_public.py +26 -0
- opik/rest_api/types/trace_thread_llm_as_judge_code_write.py +26 -0
- opik/rest_api/types/trace_thread_page.py +9 -6
- opik/rest_api/types/trace_thread_status.py +5 -0
- opik/rest_api/types/trace_thread_update.py +19 -0
- opik/rest_api/types/trace_thread_user_defined_metric_python_code.py +19 -0
- opik/rest_api/types/trace_thread_user_defined_metric_python_code_public.py +19 -0
- opik/rest_api/types/trace_thread_user_defined_metric_python_code_write.py +19 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_visibility_mode.py +5 -0
- opik/rest_api/types/trace_write.py +10 -11
- opik/rest_api/types/usage.py +6 -6
- opik/rest_api/types/user_defined_metric_python_code.py +3 -5
- opik/rest_api/types/user_defined_metric_python_code_public.py +3 -5
- opik/rest_api/types/user_defined_metric_python_code_write.py +3 -5
- opik/rest_api/types/value_entry.py +27 -0
- opik/rest_api/types/value_entry_compare.py +27 -0
- opik/rest_api/types/value_entry_compare_source.py +5 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +27 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view_source.py +5 -0
- opik/rest_api/types/value_entry_public.py +27 -0
- opik/rest_api/types/value_entry_public_source.py +5 -0
- opik/rest_api/types/value_entry_source.py +5 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +27 -0
- opik/rest_api/types/workspace_metric_request.py +24 -0
- opik/rest_api/types/workspace_metric_response.py +20 -0
- opik/rest_api/types/workspace_metrics_summary_request.py +23 -0
- opik/rest_api/types/workspace_metrics_summary_response.py +20 -0
- opik/rest_api/types/workspace_name_holder.py +19 -0
- opik/rest_api/types/workspace_spans_count.py +20 -0
- opik/rest_api/types/workspace_trace_count.py +3 -5
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/__init__.py +2 -0
- opik/rest_api/workspaces/client.py +550 -77
- opik/rest_api/workspaces/raw_client.py +923 -0
- opik/rest_client_configurator/api.py +1 -0
- opik/rest_client_configurator/retry_decorator.py +1 -0
- opik/s3_httpx_client.py +67 -0
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +11 -24
- opik/tracing_runtime_config.py +48 -0
- opik/types.py +48 -2
- opik/url_helpers.py +13 -3
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +4 -5
- opik/validation/parameter.py +122 -0
- opik/validation/parameters_validator.py +175 -0
- opik/validation/validator.py +30 -2
- opik/validation/validator_helpers.py +147 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/WHEEL +1 -1
- opik-1.9.71.dist-info/licenses/LICENSE +203 -0
- opik/api_objects/prompt/prompt.py +0 -107
- opik/api_objects/prompt/prompt_template.py +0 -35
- opik/cli.py +0 -193
- opik/evaluation/metrics/models.py +0 -8
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/integrations/langchain/google_run_helpers.py +0 -75
- opik/integrations/langchain/openai_run_helpers.py +0 -122
- opik/message_processing/message_processors.py +0 -203
- opik/rest_api/types/delta_role.py +0 -7
- opik/rest_api/types/json_object_schema.py +0 -34
- opik-1.6.4.dist-info/METADATA +0 -270
- opik-1.6.4.dist-info/RECORD +0 -507
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
opik/evaluation/evaluator.py
CHANGED
@@ -1,37 +1,98 @@
 import logging
 import time
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union, cast
 
-from .. import
+from ..api_objects.prompt import base_prompt
 from ..api_objects import opik_client
-from ..api_objects
+from ..api_objects import dataset, experiment
 from ..api_objects.experiment import helpers as experiment_helpers
-from ..api_objects.prompt import
-from . import
-from .
-
-
+from ..api_objects.prompt.chat import chat_prompt_template
+from ..api_objects.prompt import types as prompt_types
+from . import (
+    asyncio_support,
+    engine,
+    evaluation_result,
+    report,
+    rest_operations,
+    samplers,
+)
+from .metrics import base_metric, score_result
+from .models import ModelCapabilities, base_model, models_factory
+from .scorers import scorer_function, scorer_wrapper_metric
+from . import test_result
+from .types import ExperimentScoreFunction, LLMTask, ScoringKeyMappingType
+from .. import url_helpers
 
 LOGGER = logging.getLogger(__name__)
+MODALITY_SUPPORT_DOC_URL = (
+    "https://www.comet.com/docs/opik/evaluation/evaluate_multimodal"
+)
+
+
+def _try_notifying_about_experiment_completion(
+    experiment: experiment.Experiment,
+) -> None:
+    try:
+        experiment.experiments_rest_client.finish_experiments(ids=[experiment.id])
+    except Exception:
+        LOGGER.debug(
+            "Failed to notify backend about the experiment completion. Experiment ID: %s",
+            experiment.id,
+            exc_info=True,
+        )
+
+
+def _compute_experiment_scores(
+    experiment_scoring_functions: List[ExperimentScoreFunction],
+    test_results: List[test_result.TestResult],
+) -> List[score_result.ScoreResult]:
+    """Compute experiment-level scores from test results."""
+    if not experiment_scoring_functions or not test_results:
+        return []
+
+    all_scores: List[score_result.ScoreResult] = []
+    for score_function in experiment_scoring_functions:
+        try:
+            scores = score_function(test_results)
+            # Handle Union[ScoreResult, List[ScoreResult]]
+            if isinstance(scores, list):
+                all_scores.extend(scores)
+            else:
+                all_scores.append(scores)
+        except Exception as e:
+            LOGGER.warning(
+                "Failed to compute experiment score: %s",
+                e,
+                exc_info=True,
+            )
+
+    return all_scores
 
 
 def evaluate(
     dataset: dataset.Dataset,
     task: LLMTask,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
-    prompts: Optional[List[
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     dataset_item_ids: Optional[List[str]] = None,
+    dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
+    trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
-    Performs task evaluation on a given dataset.
+    Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
+    evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
+    to receive inputs and outputs from the task.
 
     Args:
         dataset: An Opik dataset instance
@@ -39,6 +100,10 @@ def evaluate(
         task: A callable object that takes dict with dataset item content
             as input and returns dict which will later be used for scoring.
 
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: The name of the experiment associated with evaluation run.
             If None, a generated name will be used.
 
@@ -53,8 +118,16 @@ def evaluate(
             are mandatory in `task`-returned dictionary.
             If no value provided, the experiment won't have any scoring metrics.
 
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
         verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
-            0 - no outputs, 1 - outputs are enabled (default)
+            0 - no outputs, 1 - outputs are enabled (default), 2 - outputs are enabled and detailed statistics
+            are displayed.
 
         nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
 
@@ -73,9 +146,20 @@ def evaluate(
             `{"input": "user_question"}` to map the "user_question" key to "input".
 
         dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+        dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+            If not provided, all samples in the dataset will be evaluated.
+
+        trial_count: number of times to run the task and evaluate the task output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
-
-
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
 
     checked_prompts = experiment_helpers.handle_prompt_args(
         prompt=prompt,
@@ -84,6 +168,11 @@ def evaluate(
 
     client = opik_client.get_client_cached()
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -91,56 +180,130 @@ def evaluate(
         prompts=checked_prompts,
     )
 
+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    return _evaluate_task(
+        client=client,
+        experiment=experiment,
+        dataset=dataset,
+        task=task,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+        verbose=verbose,
+        nb_samples=nb_samples,
+        task_threads=task_threads,
+        scoring_key_mapping=scoring_key_mapping,
+        dataset_item_ids=dataset_item_ids,
+        dataset_sampler=dataset_sampler,
+        trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
+    )
+
+
+def _evaluate_task(
+    *,
+    client: opik_client.Opik,
+    experiment: experiment.Experiment,
+    dataset: dataset.Dataset,
+    task: LLMTask,
+    scoring_metrics: List[base_metric.BaseMetric],
+    project_name: Optional[str],
+    verbose: int,
+    nb_samples: Optional[int],
+    task_threads: int,
+    scoring_key_mapping: Optional[ScoringKeyMappingType],
+    dataset_item_ids: Optional[List[str]],
+    dataset_sampler: Optional[samplers.BaseDatasetSampler],
+    trial_count: int,
+    experiment_scoring_functions: List[ExperimentScoreFunction],
+) -> evaluation_result.EvaluationResult:
     start_time = time.time()
 
     with asyncio_support.async_http_connections_expire_immediately():
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=task_threads,
             verbose=verbose,
             scoring_key_mapping=scoring_key_mapping,
         )
-        test_results = evaluation_engine.
+        test_results = evaluation_engine.evaluate_llm_task_on_dataset(
             dataset_=dataset,
             task=task,
             nb_samples=nb_samples,
             dataset_item_ids=dataset_item_ids,
+            dataset_sampler=dataset_sampler,
+            trial_count=trial_count,
+            experiment_=experiment,
         )
 
     total_time = time.time() - start_time
 
-
-
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
+    if verbose >= 1:
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )
 
-
+    experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
         url_override=client.config.url_override,
     )
 
+    report.display_experiment_link(experiment_url=experiment_url)
+
     client.flush()
 
+    _try_notifying_about_experiment_completion(experiment)
+
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
+        dataset_id=dataset.id,
         experiment_id=experiment.id,
         experiment_name=experiment.name,
         test_results=test_results,
+        experiment_url=experiment_url,
+        trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
     )
 
+    if verbose >= 2:
+        report.display_evaluation_scores_statistics(
+            dataset_name=dataset.name,
+            evaluation_results=evaluation_result_,
+        )
+
     return evaluation_result_
 
 
 def evaluate_experiment(
     experiment_name: str,
     scoring_metrics: List[base_metric.BaseMetric],
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
     scoring_threads: int = 16,
     verbose: int = 1,
     scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
     experiment_id: Optional[str] = None,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
-    """Update existing experiment with new evaluation metrics.
+    """Update the existing experiment with new evaluation metrics. You can use either `scoring_metrics` or `scorer_functions` to calculate
+    evaluation metrics. The scorer functions doesn't require `scoring_key_mapping` and use reserved parameters
+    to receive inputs and outputs from the task.
 
     Args:
         experiment_name: The name of the experiment to update.
@@ -151,15 +314,32 @@ def evaluate_experiment(
             of the `score` method in metrics that you need to find out which keys
             are mandatory in `task`-returned dictionary.
 
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
         scoring_threads: amount of thread workers to run scoring metrics.
 
         verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
 
         scoring_key_mapping: A dictionary that allows you to rename keys present in either the dataset item or the task output
-            so that they match the keys expected by the scoring metrics. For example if you have a dataset item with the following content:
+            so that they match the keys expected by the scoring metrics. For example, if you have a dataset item with the following content:
             {"user_question": "What is Opik ?"} and a scoring metric that expects a key "input", you can use scoring_key_mapping
             `{"input": "user_question"}` to map the "user_question" key to "input".
+
+        experiment_id: The ID of the experiment to evaluate. If not provided, the experiment will be evaluated based on the experiment name.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     start_time = time.time()
 
     client = opik_client.get_client_cached()
@@ -172,10 +352,11 @@ def evaluate_experiment(
         client=client, experiment_name=experiment_name
     )
 
+    dataset_ = client.get_dataset(name=experiment.dataset_name)
+
     test_cases = rest_operations.get_experiment_test_cases(
-
-
-        dataset_id=experiment.dataset_id,
+        experiment_=experiment,
+        dataset_=dataset_,
         scoring_key_mapping=scoring_key_mapping,
     )
     first_trace_id = test_cases[0].trace_id
@@ -183,11 +364,17 @@ def evaluate_experiment(
         client=client, trace_id=first_trace_id
    )
 
+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     with asyncio_support.async_http_connections_expire_immediately():
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=scoring_threads,
             verbose=verbose,
@@ -199,47 +386,104 @@ def evaluate_experiment(
 
     total_time = time.time() - start_time
 
-
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
+
+    if verbose >= 1:
         report.display_experiment_results(
-
+            dataset_.name,
+            total_time,
+            test_results,
+            computed_experiment_scores,
         )
 
-
-        dataset_id=experiment.dataset_id,
+    experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
+        dataset_id=dataset_.id,
         url_override=client.config.url_override,
     )
 
+    report.display_experiment_link(experiment_url=experiment_url)
+
+    _try_notifying_about_experiment_completion(experiment)
+
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
+        dataset_id=dataset_.id,
         experiment_id=experiment.id,
         experiment_name=experiment.name,
         test_results=test_results,
+        experiment_url=experiment_url,
+        trial_count=1,
+        experiment_scores=computed_experiment_scores,
     )
 
+    if verbose >= 2:
+        report.display_evaluation_scores_statistics(
+            dataset_name=dataset_.name,
+            evaluation_results=evaluation_result_,
+        )
+
     return evaluation_result_
 
 
 def _build_prompt_evaluation_task(
     model: base_model.OpikBaseModel, messages: List[Dict[str, Any]]
 ) -> Callable[[Dict[str, Any]], Dict[str, Any]]:
-
-
-
-
-
-
-
-
-
-
-
+    supported_modalities = cast(
+        prompt_types.SupportedModalities,
+        {
+            "vision": ModelCapabilities.supports_vision(
+                getattr(model, "model_name", None)
+            ),
+            "video": ModelCapabilities.supports_video(
+                getattr(model, "model_name", None)
+            ),
+        },
+    )
+    # Disable placeholder validation since we pass all dataset item fields to format()
+    chat_prompt_template_ = chat_prompt_template.ChatPromptTemplate(
+        messages=messages, validate_placeholders=False
+    )
 
-
+    required_modalities = chat_prompt_template_.required_modalities()
+    unsupported_modalities = {
+        modality
+        for modality in required_modalities
+        if not supported_modalities.get(modality, False)
+    }
+
+    if unsupported_modalities:
+        modalities_list = ", ".join(sorted(unsupported_modalities))
+        LOGGER.warning(
+            "Model '%s' does not support %s content. Multimedia parts will be flattened "
+            "to text placeholders. See %s for supported models and customization options.",
+            getattr(model, "model_name", "unknown"),
+            modalities_list,
+            MODALITY_SUPPORT_DOC_URL,
+        )
 
-
-
-
-
+    def _prompt_evaluation_task(prompt_variables: Dict[str, Any]) -> Dict[str, Any]:
+        template_type_override = prompt_variables.get("type")
+        processed_messages = chat_prompt_template_.format(
+            variables=prompt_variables,
+            supported_modalities=supported_modalities,
+            template_type=template_type_override,
+        )
+
+        with base_model.get_provider_response(
+            model_provider=model, messages=processed_messages
+        ) as llm_output:
+            return {
+                "input": processed_messages,
+                "output": llm_output.choices[0].message.content,
+            }
 
     return _prompt_evaluation_task
 
@@ -249,14 +493,19 @@ def evaluate_prompt(
     messages: List[Dict[str, Any]],
     model: Optional[Union[str, base_model.OpikBaseModel]] = None,
     scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
     experiment_name: Optional[str] = None,
     project_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
     nb_samples: Optional[int] = None,
     task_threads: int = 16,
-    prompt: Optional[
+    prompt: Optional[base_prompt.BasePrompt] = None,
     dataset_item_ids: Optional[List[str]] = None,
+    dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
+    trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs prompt evaluation on a given dataset.
@@ -271,6 +520,17 @@ def evaluate_prompt(
         scoring_metrics: List of metrics to calculate during evaluation.
             The LLM input and output will be passed as arguments to each metric `score(...)` method.
 
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
         experiment_name: name of the experiment.
 
         project_name: The name of the project to log data
@@ -286,28 +546,48 @@ def evaluate_prompt(
         prompt: Prompt object to link with experiment.
 
         dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+        dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+            If not provided, all samples in the dataset will be evaluated.
+
+        trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
     """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
     if isinstance(model, str):
-
+        opik_model = models_factory.get(model_name=model)
     elif not isinstance(model, base_model.OpikBaseModel):
         raise ValueError("`model` must be either a string or an OpikBaseModel instance")
+    else:
+        opik_model = model
 
     if experiment_config is None:
-        experiment_config = {
+        experiment_config = {
+            "prompt_template": messages,
+            "model": opik_model.model_name,
+        }
     else:
         if "prompt_template" not in experiment_config:
             experiment_config["prompt_template"] = messages
 
         if "model" not in experiment_config:
-            experiment_config["model"] =
-
-    if scoring_metrics is None:
-        scoring_metrics = []
+            experiment_config["model"] = opik_model.model_name
 
     client = opik_client.get_client_cached()
 
     prompts = [prompt] if prompt else None
 
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
     experiment = client.create_experiment(
         name=experiment_name,
         dataset_name=dataset.name,
@@ -315,42 +595,363 @@ def evaluate_prompt(
         prompts=prompts,
     )
 
+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
     start_time = time.time()
 
     with asyncio_support.async_http_connections_expire_immediately():
         evaluation_engine = engine.EvaluationEngine(
             client=client,
             project_name=project_name,
-            experiment_=experiment,
             scoring_metrics=scoring_metrics,
             workers=task_threads,
             verbose=verbose,
             scoring_key_mapping=None,
         )
-        test_results = evaluation_engine.
+        test_results = evaluation_engine.evaluate_llm_task_on_dataset(
             dataset_=dataset,
-            task=_build_prompt_evaluation_task(model=
+            task=_build_prompt_evaluation_task(model=opik_model, messages=messages),
             nb_samples=nb_samples,
             dataset_item_ids=dataset_item_ids,
+            dataset_sampler=dataset_sampler,
+            trial_count=trial_count,
+            experiment_=experiment,
         )
 
     total_time = time.time() - start_time
 
-
-
+    # Compute experiment scores
+    computed_experiment_scores = _compute_experiment_scores(
+        experiment_scoring_functions=experiment_scoring_functions,
+        test_results=test_results,
+    )
 
-
+    if verbose >= 1:
+        report.display_experiment_results(
+            dataset.name, total_time, test_results, computed_experiment_scores
+        )
+
+    experiment_url = url_helpers.get_experiment_url_by_id(
         experiment_id=experiment.id,
         dataset_id=dataset.id,
         url_override=client.config.url_override,
     )
 
+    report.display_experiment_link(experiment_url=experiment_url)
+
     client.flush()
 
+    _try_notifying_about_experiment_completion(experiment)
+
+    # Log experiment scores to backend
+    if computed_experiment_scores:
+        experiment.log_experiment_scores(score_results=computed_experiment_scores)
+
     evaluation_result_ = evaluation_result.EvaluationResult(
         experiment_id=experiment.id,
+        dataset_id=dataset.id,
         experiment_name=experiment.name,
         test_results=test_results,
+        experiment_url=experiment_url,
+        trial_count=trial_count,
+        experiment_scores=computed_experiment_scores,
    )
 
+    if verbose >= 2:
+        report.display_evaluation_scores_statistics(
+            dataset_name=dataset.name,
+            evaluation_results=evaluation_result_,
+        )
+
     return evaluation_result_
+
+
+def evaluate_optimization_trial(
+    optimization_id: str,
+    dataset: dataset.Dataset,
+    task: LLMTask,
+    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    experiment_name_prefix: Optional[str] = None,
+    experiment_name: Optional[str] = None,
+    project_name: Optional[str] = None,
+    experiment_config: Optional[Dict[str, Any]] = None,
+    verbose: int = 1,
+    nb_samples: Optional[int] = None,
+    task_threads: int = 16,
+    prompt: Optional[base_prompt.BasePrompt] = None,
+    prompts: Optional[List[base_prompt.BasePrompt]] = None,
+    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+    dataset_item_ids: Optional[List[str]] = None,
+    dataset_sampler: Optional[samplers.BaseDatasetSampler] = None,
+    trial_count: int = 1,
+    experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
+) -> evaluation_result.EvaluationResult:
+    """
+    Performs task evaluation on a given dataset.
+
+    Args:
+        optimization_id: The ID of the optimization associated with the experiment.
+
+        dataset: An Opik dataset instance
+
+        task: A callable object that takes dict with dataset item content
+            as input and returns dict which will later be used for scoring.
+
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function includes a scoring method that accepts predefined
+            arguments supplied by the evaluation engine:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+            • task_span - the data collected during the LLM task execution [optional].
+
+        experiment_name_prefix: The prefix to be added to automatically generated experiment names to make them unique
+            but grouped under the same prefix. For example, if you set `experiment_name_prefix="my-experiment"`,
+            the first experiment created will be named `my-experiment-<unique-random-part>`.
+
+        experiment_name: The name of the experiment associated with evaluation run.
+            If None, a generated name will be used.
+
+        project_name: The name of the project. If not provided, traces and spans will be logged to the `Default Project`
+
+        experiment_config: The dictionary with parameters that describe experiment
+
+        scoring_metrics: List of metrics to calculate during evaluation.
+            Each metric has `score(...)` method, arguments for this method
+            are taken from the `task` output, check the signature
+            of the `score` method in metrics that you need to find out which keys
+            are mandatory in `task`-returned dictionary.
+            If no value provided, the experiment won't have any scoring metrics.
+
+        verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
+            0 - no outputs, 1 - outputs are enabled (default).
+
+        nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
+
+        task_threads: number of thread workers to run tasks. If set to 1, no additional
+            threads are created, all tasks executed in the current thread sequentially.
+            are executed sequentially in the current thread.
+            Use more than 1 worker if your task object is compatible with sharing across threads.
+
+        prompt: Prompt object to link with experiment. Deprecated, use `prompts` argument instead.
+
+        prompts: A list of Prompt objects to link with experiment.
+
+        scoring_key_mapping: A dictionary that allows you to rename keys present in either the dataset item or the task output
+            so that they match the keys expected by the scoring metrics. For example if you have a dataset item with the following content:
+            {"user_question": "What is Opik ?"} and a scoring metric that expects a key "input", you can use scoring_key_mapping
+            `{"input": "user_question"}` to map the "user_question" key to "input".
+
+        dataset_item_ids: list of dataset item ids to evaluate. If not provided, all samples in the dataset will be evaluated.
+
+        dataset_sampler: An instance of a dataset sampler that will be used to sample dataset items for evaluation.
+            If not provided, all samples in the dataset will be evaluated.
+
+        trial_count: number of times to execute the prompt and evaluate the LLM output for every dataset item.
+
+        experiment_scoring_functions: List of callable functions that compute experiment-level scores.
+            Each function takes a list of TestResult objects and returns a list of ScoreResult objects.
+            These scores are computed after all test results are collected and represent aggregate
+            metrics across the entire experiment.
+    """
+    experiment_scoring_functions = (
+        [] if experiment_scoring_functions is None else experiment_scoring_functions
+    )
+
+    if scoring_metrics is None:
+        scoring_metrics = []
+
+    checked_prompts = experiment_helpers.handle_prompt_args(
+        prompt=prompt,
+        prompts=prompts,
+    )
+
+    # wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    client = opik_client.get_client_cached()
+
+    experiment_name = _use_or_create_experiment_name(
+        experiment_name=experiment_name,
+        experiment_name_prefix=experiment_name_prefix,
+    )
+
+    experiment = client.create_experiment(
+        name=experiment_name,
+        dataset_name=dataset.name,
+        experiment_config=experiment_config,
+        prompts=checked_prompts,
+        type="trial",
+        optimization_id=optimization_id,
+    )
+
+    return _evaluate_task(
+        client=client,
+        experiment=experiment,
+        dataset=dataset,
+        task=task,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+        verbose=verbose,
+        nb_samples=nb_samples,
+        task_threads=task_threads,
+        scoring_key_mapping=scoring_key_mapping,
+        dataset_item_ids=dataset_item_ids,
+        dataset_sampler=dataset_sampler,
+        trial_count=trial_count,
+        experiment_scoring_functions=experiment_scoring_functions,
+    )
+
+
+def evaluate_on_dict_items(
+    items: List[Dict[str, Any]],
+    task: LLMTask,
+    scoring_metrics: Optional[List[base_metric.BaseMetric]] = None,
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]] = None,
+    project_name: Optional[str] = None,
+    verbose: int = 0,
+    scoring_key_mapping: Optional[ScoringKeyMappingType] = None,
+    scoring_threads: int = 16,
+) -> evaluation_result.EvaluationResultOnDictItems:
+    """
+    Lightweight evaluation function that evaluates a task on dataset items (as dictionaries)
+    without requiring a Dataset object or creating an experiment.
+
+    This function is useful for optimization scenarios where you need to evaluate many
+    candidate solutions quickly using Opik's metric infrastructure. It creates traces for
+    tracking but doesn't require experiment setup or dataset management.
+
+    Args:
+        items: List of dataset item contents (dictionaries with the data to evaluate).
+
+        task: A callable object that takes dict with dataset item content
+            as input and returns dict which will later be used for scoring.
+
+        scoring_metrics: List of metrics to calculate during evaluation.
+            Each metric's `score(...)` method will be called with arguments taken from
+            the dataset item and task output.
+
+        scoring_functions: List of scorer functions to be executed during evaluation.
+            Each scorer function accepts predefined arguments:
+            • dataset_item — a dictionary containing the dataset item content,
+            • task_outputs — a dictionary containing the LLM task output.
+
+        project_name: The name of the project for logging traces.
+
+        verbose: Controls evaluation output logs and progress bars.
+            0 - no outputs (default), 1 - enable outputs.
+
+        scoring_key_mapping: A dictionary that allows you to rename keys present in either
+            the dataset item or the task output to match the keys expected by scoring metrics.
+
+        scoring_threads: Number of thread workers to run scoring metrics.
+
+    Returns:
+        EvaluationResultOnDictItems object containing test results and providing methods
+        to aggregate scores, similar to the regular evaluation result.
+
+    Example:
+        ```python
+        import opik
+        from opik.evaluation.metrics import Equals
+
+        items = [
+            {"input": "What is 2+2?", "expected_output": "4"},
+            {"input": "What is 3+3?", "expected_output": "6"},
+        ]
+
+        def my_task(item):
+            # Your LLM call here
+            question = item["input"]
+            # ... call model ...
+            return {"output": model_output}
+
+        result = opik.evaluate_on_dict_items(
+            items=items,
+            task=my_task,
+            scoring_metrics=[Equals()],
+            scoring_key_mapping={"reference": "expected_output"},
+        )
+
+        # Access individual test results
+        for test_result in result.test_results:
+            print(f"Score: {test_result.score_results[0].value}")
+
+        # Get aggregated statistics
+        aggregated = result.aggregate_evaluation_scores()
+        print(f"Mean equals score: {aggregated['equals_metric'].mean}")
+        ```
+    """
+    # Wrap scoring functions if any
+    scoring_metrics = _wrap_scoring_functions(
+        scoring_functions=scoring_functions,
+        scoring_metrics=scoring_metrics,
+        project_name=project_name,
+    )
+
+    if not scoring_metrics:
+        LOGGER.warning("No scoring metrics provided for items evaluation")
+        return evaluation_result.EvaluationResultOnDictItems(test_results=[])
+
+    client = opik_client.get_client_cached()
+
+    # Create evaluation engine
+    with asyncio_support.async_http_connections_expire_immediately():
+        evaluation_engine = engine.EvaluationEngine(
+            client=client,
+            project_name=project_name,
+            scoring_metrics=scoring_metrics,
+            workers=scoring_threads,
+            verbose=verbose,
+            scoring_key_mapping=scoring_key_mapping,
+        )
+
+        # Use the new evaluate_items method
+        test_results = evaluation_engine.evaluate_llm_task_on_dict_items(
+            items=items,
+            task=task,
+        )
+
+    return evaluation_result.EvaluationResultOnDictItems(
+        test_results=test_results,
+    )
+
+
+def _wrap_scoring_functions(
+    scoring_functions: Optional[List[scorer_function.ScorerFunction]],
+    scoring_metrics: Optional[List[base_metric.BaseMetric]],
+    project_name: Optional[str],
+) -> List[base_metric.BaseMetric]:
+    if scoring_functions:
+        function_metrics = scorer_wrapper_metric.wrap_scorer_functions(
+            scoring_functions, project_name=project_name
+        )
+        if scoring_metrics:
+            scoring_metrics.extend(function_metrics)
+        else:
+            scoring_metrics = function_metrics
+
+    return scoring_metrics if scoring_metrics else []
+
+
+def _use_or_create_experiment_name(
+    experiment_name: Optional[str], experiment_name_prefix: Optional[str]
+) -> Optional[str]:
+    if experiment_name:
+        return experiment_name
+
+    if experiment_name_prefix:
+        return experiment_helpers.generate_unique_experiment_name(
+            experiment_name_prefix
+        )
+    else:
+        return None