opik 1.6.4__py3-none-any.whl → 1.9.71__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opik/__init__.py +33 -2
- opik/anonymizer/__init__.py +5 -0
- opik/anonymizer/anonymizer.py +12 -0
- opik/anonymizer/factory.py +80 -0
- opik/anonymizer/recursive_anonymizer.py +64 -0
- opik/anonymizer/rules.py +56 -0
- opik/anonymizer/rules_anonymizer.py +35 -0
- opik/api_objects/attachment/__init__.py +5 -0
- opik/api_objects/attachment/attachment.py +20 -0
- opik/api_objects/attachment/attachment_context.py +36 -0
- opik/api_objects/attachment/attachments_extractor.py +153 -0
- opik/api_objects/attachment/client.py +220 -0
- opik/api_objects/attachment/converters.py +51 -0
- opik/api_objects/attachment/decoder.py +18 -0
- opik/api_objects/attachment/decoder_base64.py +83 -0
- opik/api_objects/attachment/decoder_helpers.py +137 -0
- opik/api_objects/conversation/__init__.py +0 -0
- opik/api_objects/conversation/conversation_factory.py +43 -0
- opik/api_objects/conversation/conversation_thread.py +49 -0
- opik/api_objects/data_helpers.py +79 -0
- opik/api_objects/dataset/dataset.py +107 -45
- opik/api_objects/dataset/rest_operations.py +12 -3
- opik/api_objects/experiment/experiment.py +81 -45
- opik/api_objects/experiment/experiment_item.py +2 -1
- opik/api_objects/experiment/experiments_client.py +64 -0
- opik/api_objects/experiment/helpers.py +35 -11
- opik/api_objects/experiment/rest_operations.py +88 -19
- opik/api_objects/helpers.py +104 -7
- opik/api_objects/local_recording.py +81 -0
- opik/api_objects/opik_client.py +872 -174
- opik/api_objects/opik_query_language.py +136 -18
- opik/api_objects/optimization/__init__.py +3 -0
- opik/api_objects/optimization/optimization.py +39 -0
- opik/api_objects/prompt/__init__.py +13 -1
- opik/api_objects/prompt/base_prompt.py +69 -0
- opik/api_objects/prompt/base_prompt_template.py +29 -0
- opik/api_objects/prompt/chat/__init__.py +1 -0
- opik/api_objects/prompt/chat/chat_prompt.py +210 -0
- opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
- opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
- opik/api_objects/prompt/client.py +193 -41
- opik/api_objects/prompt/text/__init__.py +1 -0
- opik/api_objects/prompt/text/prompt.py +174 -0
- opik/api_objects/prompt/text/prompt_template.py +55 -0
- opik/api_objects/prompt/types.py +29 -0
- opik/api_objects/rest_stream_parser.py +98 -0
- opik/api_objects/search_helpers.py +89 -0
- opik/api_objects/span/span_client.py +165 -45
- opik/api_objects/span/span_data.py +136 -25
- opik/api_objects/threads/__init__.py +0 -0
- opik/api_objects/threads/threads_client.py +185 -0
- opik/api_objects/trace/trace_client.py +72 -36
- opik/api_objects/trace/trace_data.py +112 -26
- opik/api_objects/validation_helpers.py +3 -3
- opik/cli/__init__.py +5 -0
- opik/cli/__main__.py +6 -0
- opik/cli/configure.py +66 -0
- opik/cli/exports/__init__.py +131 -0
- opik/cli/exports/dataset.py +278 -0
- opik/cli/exports/experiment.py +784 -0
- opik/cli/exports/project.py +685 -0
- opik/cli/exports/prompt.py +578 -0
- opik/cli/exports/utils.py +406 -0
- opik/cli/harbor.py +39 -0
- opik/cli/healthcheck.py +21 -0
- opik/cli/imports/__init__.py +439 -0
- opik/cli/imports/dataset.py +143 -0
- opik/cli/imports/experiment.py +1192 -0
- opik/cli/imports/project.py +262 -0
- opik/cli/imports/prompt.py +177 -0
- opik/cli/imports/utils.py +280 -0
- opik/cli/main.py +49 -0
- opik/cli/proxy.py +93 -0
- opik/cli/usage_report/__init__.py +16 -0
- opik/cli/usage_report/charts.py +783 -0
- opik/cli/usage_report/cli.py +274 -0
- opik/cli/usage_report/constants.py +9 -0
- opik/cli/usage_report/extraction.py +749 -0
- opik/cli/usage_report/pdf.py +244 -0
- opik/cli/usage_report/statistics.py +78 -0
- opik/cli/usage_report/utils.py +235 -0
- opik/config.py +62 -4
- opik/configurator/configure.py +45 -6
- opik/configurator/opik_rest_helpers.py +4 -1
- opik/context_storage.py +164 -65
- opik/datetime_helpers.py +12 -0
- opik/decorator/arguments_helpers.py +9 -1
- opik/decorator/base_track_decorator.py +298 -146
- opik/decorator/context_manager/__init__.py +0 -0
- opik/decorator/context_manager/span_context_manager.py +123 -0
- opik/decorator/context_manager/trace_context_manager.py +84 -0
- opik/decorator/generator_wrappers.py +3 -2
- opik/decorator/inspect_helpers.py +11 -0
- opik/decorator/opik_args/__init__.py +13 -0
- opik/decorator/opik_args/api_classes.py +71 -0
- opik/decorator/opik_args/helpers.py +120 -0
- opik/decorator/span_creation_handler.py +49 -21
- opik/decorator/tracker.py +9 -1
- opik/dict_utils.py +3 -3
- opik/environment.py +13 -1
- opik/error_tracking/api.py +1 -1
- opik/error_tracking/before_send.py +6 -5
- opik/error_tracking/environment_details.py +29 -7
- opik/error_tracking/error_filtering/filter_by_response_status_code.py +42 -0
- opik/error_tracking/error_filtering/filter_chain_builder.py +14 -3
- opik/evaluation/__init__.py +14 -2
- opik/evaluation/engine/engine.py +280 -82
- opik/evaluation/engine/evaluation_tasks_executor.py +15 -10
- opik/evaluation/engine/helpers.py +34 -9
- opik/evaluation/engine/metrics_evaluator.py +237 -0
- opik/evaluation/engine/types.py +5 -4
- opik/evaluation/evaluation_result.py +169 -2
- opik/evaluation/evaluator.py +659 -58
- opik/evaluation/metrics/__init__.py +121 -6
- opik/evaluation/metrics/aggregated_metric.py +92 -0
- opik/evaluation/metrics/arguments_helpers.py +15 -21
- opik/evaluation/metrics/arguments_validator.py +38 -0
- opik/evaluation/metrics/base_metric.py +20 -10
- opik/evaluation/metrics/conversation/__init__.py +48 -0
- opik/evaluation/metrics/conversation/conversation_thread_metric.py +79 -0
- opik/evaluation/metrics/conversation/conversation_turns_factory.py +39 -0
- opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
- opik/evaluation/metrics/conversation/helpers.py +84 -0
- opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
- opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
- opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
- opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/__init__.py +0 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +274 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/schema.py +16 -0
- opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/templates.py +95 -0
- opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/__init__.py +0 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +295 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/schema.py +22 -0
- opik/evaluation/metrics/conversation/llm_judges/session_completeness/templates.py +139 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +277 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/schema.py +16 -0
- opik/evaluation/metrics/conversation/llm_judges/user_frustration/templates.py +135 -0
- opik/evaluation/metrics/conversation/types.py +34 -0
- opik/evaluation/metrics/conversation_types.py +9 -0
- opik/evaluation/metrics/heuristics/bertscore.py +107 -0
- opik/evaluation/metrics/heuristics/bleu.py +43 -16
- opik/evaluation/metrics/heuristics/chrf.py +127 -0
- opik/evaluation/metrics/heuristics/contains.py +50 -11
- opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
- opik/evaluation/metrics/heuristics/equals.py +4 -1
- opik/evaluation/metrics/heuristics/gleu.py +113 -0
- opik/evaluation/metrics/heuristics/is_json.py +9 -3
- opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
- opik/evaluation/metrics/heuristics/levenshtein_ratio.py +6 -5
- opik/evaluation/metrics/heuristics/meteor.py +119 -0
- opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
- opik/evaluation/metrics/heuristics/readability.py +129 -0
- opik/evaluation/metrics/heuristics/regex_match.py +4 -1
- opik/evaluation/metrics/heuristics/rouge.py +148 -0
- opik/evaluation/metrics/heuristics/sentiment.py +98 -0
- opik/evaluation/metrics/heuristics/spearman.py +88 -0
- opik/evaluation/metrics/heuristics/tone.py +155 -0
- opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +27 -30
- opik/evaluation/metrics/llm_judges/answer_relevance/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/answer_relevance/templates.py +10 -10
- opik/evaluation/metrics/llm_judges/context_precision/metric.py +28 -31
- opik/evaluation/metrics/llm_judges/context_precision/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/context_precision/template.py +7 -7
- opik/evaluation/metrics/llm_judges/context_recall/metric.py +27 -31
- opik/evaluation/metrics/llm_judges/context_recall/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/context_recall/template.py +7 -7
- opik/evaluation/metrics/llm_judges/factuality/metric.py +7 -26
- opik/evaluation/metrics/llm_judges/factuality/parser.py +35 -0
- opik/evaluation/metrics/llm_judges/factuality/template.py +1 -1
- opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
- opik/evaluation/metrics/llm_judges/g_eval/metric.py +244 -113
- opik/evaluation/metrics/llm_judges/g_eval/parser.py +161 -0
- opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
- opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
- opik/evaluation/metrics/llm_judges/hallucination/metric.py +23 -27
- opik/evaluation/metrics/llm_judges/hallucination/parser.py +29 -0
- opik/evaluation/metrics/llm_judges/hallucination/template.py +2 -4
- opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
- opik/evaluation/metrics/llm_judges/moderation/metric.py +23 -28
- opik/evaluation/metrics/llm_judges/moderation/parser.py +27 -0
- opik/evaluation/metrics/llm_judges/moderation/template.py +2 -2
- opik/evaluation/metrics/llm_judges/parsing_helpers.py +26 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
- opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
- opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
- opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
- opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
- opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/__init__.py +3 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +171 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/parser.py +38 -0
- opik/evaluation/metrics/llm_judges/trajectory_accuracy/templates.py +65 -0
- opik/evaluation/metrics/llm_judges/usefulness/metric.py +23 -32
- opik/evaluation/metrics/llm_judges/usefulness/parser.py +28 -0
- opik/evaluation/metrics/ragas_metric.py +112 -0
- opik/evaluation/models/__init__.py +10 -0
- opik/evaluation/models/base_model.py +140 -18
- opik/evaluation/models/langchain/__init__.py +3 -0
- opik/evaluation/models/langchain/langchain_chat_model.py +166 -0
- opik/evaluation/models/langchain/message_converters.py +106 -0
- opik/evaluation/models/langchain/opik_monitoring.py +23 -0
- opik/evaluation/models/litellm/litellm_chat_model.py +186 -40
- opik/evaluation/models/litellm/opik_monitor.py +24 -21
- opik/evaluation/models/litellm/util.py +125 -0
- opik/evaluation/models/litellm/warning_filters.py +16 -4
- opik/evaluation/models/model_capabilities.py +187 -0
- opik/evaluation/models/models_factory.py +25 -3
- opik/evaluation/preprocessing.py +92 -0
- opik/evaluation/report.py +70 -12
- opik/evaluation/rest_operations.py +49 -45
- opik/evaluation/samplers/__init__.py +4 -0
- opik/evaluation/samplers/base_dataset_sampler.py +40 -0
- opik/evaluation/samplers/random_dataset_sampler.py +48 -0
- opik/evaluation/score_statistics.py +66 -0
- opik/evaluation/scorers/__init__.py +4 -0
- opik/evaluation/scorers/scorer_function.py +55 -0
- opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
- opik/evaluation/test_case.py +3 -2
- opik/evaluation/test_result.py +1 -0
- opik/evaluation/threads/__init__.py +0 -0
- opik/evaluation/threads/context_helper.py +32 -0
- opik/evaluation/threads/evaluation_engine.py +181 -0
- opik/evaluation/threads/evaluation_result.py +18 -0
- opik/evaluation/threads/evaluator.py +120 -0
- opik/evaluation/threads/helpers.py +51 -0
- opik/evaluation/types.py +9 -1
- opik/exceptions.py +116 -3
- opik/file_upload/__init__.py +0 -0
- opik/file_upload/base_upload_manager.py +39 -0
- opik/file_upload/file_upload_monitor.py +14 -0
- opik/file_upload/file_uploader.py +141 -0
- opik/file_upload/mime_type.py +9 -0
- opik/file_upload/s3_multipart_upload/__init__.py +0 -0
- opik/file_upload/s3_multipart_upload/file_parts_strategy.py +89 -0
- opik/file_upload/s3_multipart_upload/s3_file_uploader.py +86 -0
- opik/file_upload/s3_multipart_upload/s3_upload_error.py +29 -0
- opik/file_upload/thread_pool.py +17 -0
- opik/file_upload/upload_client.py +114 -0
- opik/file_upload/upload_manager.py +255 -0
- opik/file_upload/upload_options.py +37 -0
- opik/format_helpers.py +17 -0
- opik/guardrails/__init__.py +4 -0
- opik/guardrails/guardrail.py +157 -0
- opik/guardrails/guards/__init__.py +5 -0
- opik/guardrails/guards/guard.py +17 -0
- opik/guardrails/guards/pii.py +47 -0
- opik/guardrails/guards/topic.py +76 -0
- opik/guardrails/rest_api_client.py +34 -0
- opik/guardrails/schemas.py +24 -0
- opik/guardrails/tracing.py +61 -0
- opik/healthcheck/__init__.py +2 -1
- opik/healthcheck/checks.py +2 -2
- opik/healthcheck/rich_representation.py +1 -1
- opik/hooks/__init__.py +23 -0
- opik/hooks/anonymizer_hook.py +36 -0
- opik/hooks/httpx_client_hook.py +112 -0
- opik/httpx_client.py +75 -4
- opik/id_helpers.py +18 -0
- opik/integrations/adk/__init__.py +14 -0
- opik/integrations/adk/callback_context_info_extractors.py +32 -0
- opik/integrations/adk/graph/__init__.py +0 -0
- opik/integrations/adk/graph/mermaid_graph_builder.py +128 -0
- opik/integrations/adk/graph/nodes.py +101 -0
- opik/integrations/adk/graph/subgraph_edges_builders.py +41 -0
- opik/integrations/adk/helpers.py +48 -0
- opik/integrations/adk/legacy_opik_tracer.py +381 -0
- opik/integrations/adk/opik_tracer.py +370 -0
- opik/integrations/adk/patchers/__init__.py +4 -0
- opik/integrations/adk/patchers/adk_otel_tracer/__init__.py +0 -0
- opik/integrations/adk/patchers/adk_otel_tracer/llm_span_helpers.py +30 -0
- opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +201 -0
- opik/integrations/adk/patchers/litellm_wrappers.py +91 -0
- opik/integrations/adk/patchers/llm_response_wrapper.py +105 -0
- opik/integrations/adk/patchers/patchers.py +64 -0
- opik/integrations/adk/recursive_callback_injector.py +126 -0
- opik/integrations/aisuite/aisuite_decorator.py +8 -3
- opik/integrations/aisuite/opik_tracker.py +1 -0
- opik/integrations/anthropic/messages_create_decorator.py +8 -3
- opik/integrations/anthropic/opik_tracker.py +0 -1
- opik/integrations/bedrock/converse/__init__.py +0 -0
- opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
- opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +18 -8
- opik/integrations/bedrock/invoke_agent_decorator.py +12 -7
- opik/integrations/bedrock/invoke_model/__init__.py +0 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
- opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
- opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
- opik/integrations/bedrock/invoke_model/response_types.py +34 -0
- opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
- opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
- opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
- opik/integrations/bedrock/opik_tracker.py +43 -4
- opik/integrations/bedrock/types.py +19 -0
- opik/integrations/crewai/crewai_decorator.py +34 -56
- opik/integrations/crewai/opik_tracker.py +31 -10
- opik/integrations/crewai/patchers/__init__.py +5 -0
- opik/integrations/crewai/patchers/flow.py +118 -0
- opik/integrations/crewai/patchers/litellm_completion.py +30 -0
- opik/integrations/crewai/patchers/llm_client.py +207 -0
- opik/integrations/dspy/callback.py +246 -84
- opik/integrations/dspy/graph.py +88 -0
- opik/integrations/dspy/parsers.py +168 -0
- opik/integrations/genai/encoder_extension.py +2 -6
- opik/integrations/genai/generate_content_decorator.py +20 -13
- opik/integrations/guardrails/guardrails_decorator.py +4 -0
- opik/integrations/harbor/__init__.py +17 -0
- opik/integrations/harbor/experiment_service.py +269 -0
- opik/integrations/harbor/opik_tracker.py +528 -0
- opik/integrations/haystack/constants.py +35 -0
- opik/integrations/haystack/converters.py +1 -2
- opik/integrations/haystack/opik_connector.py +28 -6
- opik/integrations/haystack/opik_span_bridge.py +284 -0
- opik/integrations/haystack/opik_tracer.py +124 -222
- opik/integrations/langchain/__init__.py +3 -1
- opik/integrations/langchain/helpers.py +96 -0
- opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
- opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
- opik/integrations/langchain/opik_encoder_extension.py +2 -2
- opik/integrations/langchain/opik_tracer.py +641 -206
- opik/integrations/langchain/provider_usage_extractors/__init__.py +5 -0
- opik/integrations/langchain/provider_usage_extractors/anthropic_usage_extractor.py +101 -0
- opik/integrations/langchain/provider_usage_extractors/anthropic_vertexai_usage_extractor.py +67 -0
- opik/integrations/langchain/provider_usage_extractors/bedrock_usage_extractor.py +94 -0
- opik/integrations/langchain/provider_usage_extractors/google_generative_ai_usage_extractor.py +109 -0
- opik/integrations/langchain/provider_usage_extractors/groq_usage_extractor.py +92 -0
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/__init__.py +15 -0
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +134 -0
- opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/langchain_usage.py +163 -0
- opik/integrations/langchain/provider_usage_extractors/openai_usage_extractor.py +124 -0
- opik/integrations/langchain/provider_usage_extractors/provider_usage_extractor_protocol.py +29 -0
- opik/integrations/langchain/provider_usage_extractors/usage_extractor.py +48 -0
- opik/integrations/langchain/provider_usage_extractors/vertexai_usage_extractor.py +109 -0
- opik/integrations/litellm/__init__.py +5 -0
- opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
- opik/integrations/litellm/litellm_completion_decorator.py +242 -0
- opik/integrations/litellm/opik_tracker.py +43 -0
- opik/integrations/litellm/stream_patchers.py +151 -0
- opik/integrations/llama_index/callback.py +179 -78
- opik/integrations/llama_index/event_parsing_utils.py +29 -9
- opik/integrations/openai/agents/opik_tracing_processor.py +204 -32
- opik/integrations/openai/agents/span_data_parsers.py +15 -6
- opik/integrations/openai/chat_completion_chunks_aggregator.py +1 -1
- opik/integrations/openai/{openai_decorator.py → openai_chat_completions_decorator.py} +45 -35
- opik/integrations/openai/openai_responses_decorator.py +158 -0
- opik/integrations/openai/opik_tracker.py +94 -13
- opik/integrations/openai/response_events_aggregator.py +36 -0
- opik/integrations/openai/stream_patchers.py +125 -15
- opik/integrations/sagemaker/auth.py +5 -1
- opik/jsonable_encoder.py +29 -1
- opik/llm_usage/base_original_provider_usage.py +15 -8
- opik/llm_usage/bedrock_usage.py +8 -2
- opik/llm_usage/google_usage.py +6 -1
- opik/llm_usage/llm_usage_info.py +6 -0
- opik/llm_usage/{openai_usage.py → openai_chat_completions_usage.py} +2 -12
- opik/llm_usage/{openai_agent_usage.py → openai_responses_usage.py} +7 -15
- opik/llm_usage/opik_usage.py +36 -10
- opik/llm_usage/opik_usage_factory.py +35 -19
- opik/logging_messages.py +19 -7
- opik/message_processing/arguments_utils.py +22 -0
- opik/message_processing/batching/base_batcher.py +45 -17
- opik/message_processing/batching/batch_manager.py +22 -10
- opik/message_processing/batching/batch_manager_constuctors.py +36 -11
- opik/message_processing/batching/batchers.py +167 -44
- opik/message_processing/batching/flushing_thread.py +0 -3
- opik/message_processing/batching/sequence_splitter.py +50 -5
- opik/message_processing/emulation/__init__.py +0 -0
- opik/message_processing/emulation/emulator_message_processor.py +578 -0
- opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
- opik/message_processing/emulation/models.py +162 -0
- opik/message_processing/encoder_helpers.py +79 -0
- opik/message_processing/message_queue.py +79 -0
- opik/message_processing/messages.py +154 -12
- opik/message_processing/preprocessing/__init__.py +0 -0
- opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
- opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
- opik/message_processing/preprocessing/constants.py +1 -0
- opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
- opik/message_processing/preprocessing/preprocessor.py +36 -0
- opik/message_processing/processors/__init__.py +0 -0
- opik/message_processing/processors/attachments_extraction_processor.py +146 -0
- opik/message_processing/processors/message_processors.py +92 -0
- opik/message_processing/processors/message_processors_chain.py +96 -0
- opik/message_processing/processors/online_message_processor.py +324 -0
- opik/message_processing/queue_consumer.py +61 -13
- opik/message_processing/streamer.py +102 -31
- opik/message_processing/streamer_constructors.py +67 -12
- opik/opik_context.py +103 -11
- opik/plugins/pytest/decorator.py +2 -2
- opik/plugins/pytest/experiment_runner.py +3 -2
- opik/plugins/pytest/hooks.py +6 -4
- opik/rate_limit/__init__.py +0 -0
- opik/rate_limit/rate_limit.py +25 -0
- opik/rest_api/__init__.py +643 -11
- opik/rest_api/alerts/__init__.py +7 -0
- opik/rest_api/alerts/client.py +667 -0
- opik/rest_api/alerts/raw_client.py +1015 -0
- opik/rest_api/alerts/types/__init__.py +7 -0
- opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
- opik/rest_api/annotation_queues/__init__.py +4 -0
- opik/rest_api/annotation_queues/client.py +668 -0
- opik/rest_api/annotation_queues/raw_client.py +1019 -0
- opik/rest_api/attachments/__init__.py +17 -0
- opik/rest_api/attachments/client.py +752 -0
- opik/rest_api/attachments/raw_client.py +1125 -0
- opik/rest_api/attachments/types/__init__.py +15 -0
- opik/rest_api/attachments/types/attachment_list_request_entity_type.py +5 -0
- opik/rest_api/attachments/types/download_attachment_request_entity_type.py +5 -0
- opik/rest_api/attachments/types/start_multipart_upload_request_entity_type.py +5 -0
- opik/rest_api/attachments/types/upload_attachment_request_entity_type.py +5 -0
- opik/rest_api/automation_rule_evaluators/__init__.py +2 -0
- opik/rest_api/automation_rule_evaluators/client.py +182 -1162
- opik/rest_api/automation_rule_evaluators/raw_client.py +598 -0
- opik/rest_api/chat_completions/__init__.py +2 -0
- opik/rest_api/chat_completions/client.py +115 -149
- opik/rest_api/chat_completions/raw_client.py +339 -0
- opik/rest_api/check/__init__.py +2 -0
- opik/rest_api/check/client.py +88 -106
- opik/rest_api/check/raw_client.py +258 -0
- opik/rest_api/client.py +112 -212
- opik/rest_api/core/__init__.py +5 -0
- opik/rest_api/core/api_error.py +12 -6
- opik/rest_api/core/client_wrapper.py +4 -14
- opik/rest_api/core/datetime_utils.py +1 -3
- opik/rest_api/core/file.py +2 -5
- opik/rest_api/core/http_client.py +42 -120
- opik/rest_api/core/http_response.py +55 -0
- opik/rest_api/core/jsonable_encoder.py +1 -4
- opik/rest_api/core/pydantic_utilities.py +79 -147
- opik/rest_api/core/query_encoder.py +1 -3
- opik/rest_api/core/serialization.py +10 -10
- opik/rest_api/dashboards/__init__.py +4 -0
- opik/rest_api/dashboards/client.py +462 -0
- opik/rest_api/dashboards/raw_client.py +648 -0
- opik/rest_api/datasets/__init__.py +5 -0
- opik/rest_api/datasets/client.py +1638 -1091
- opik/rest_api/datasets/raw_client.py +3389 -0
- opik/rest_api/datasets/types/__init__.py +8 -0
- opik/rest_api/datasets/types/dataset_update_visibility.py +5 -0
- opik/rest_api/datasets/types/dataset_write_visibility.py +5 -0
- opik/rest_api/errors/__init__.py +2 -0
- opik/rest_api/errors/bad_request_error.py +4 -3
- opik/rest_api/errors/conflict_error.py +4 -3
- opik/rest_api/errors/forbidden_error.py +4 -2
- opik/rest_api/errors/not_found_error.py +4 -3
- opik/rest_api/errors/not_implemented_error.py +4 -3
- opik/rest_api/errors/unauthorized_error.py +4 -3
- opik/rest_api/errors/unprocessable_entity_error.py +4 -3
- opik/rest_api/experiments/__init__.py +5 -0
- opik/rest_api/experiments/client.py +676 -752
- opik/rest_api/experiments/raw_client.py +1872 -0
- opik/rest_api/experiments/types/__init__.py +10 -0
- opik/rest_api/experiments/types/experiment_update_status.py +5 -0
- opik/rest_api/experiments/types/experiment_update_type.py +5 -0
- opik/rest_api/experiments/types/experiment_write_status.py +5 -0
- opik/rest_api/experiments/types/experiment_write_type.py +5 -0
- opik/rest_api/feedback_definitions/__init__.py +2 -0
- opik/rest_api/feedback_definitions/client.py +96 -370
- opik/rest_api/feedback_definitions/raw_client.py +541 -0
- opik/rest_api/feedback_definitions/types/__init__.py +2 -0
- opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -3
- opik/rest_api/guardrails/__init__.py +4 -0
- opik/rest_api/guardrails/client.py +104 -0
- opik/rest_api/guardrails/raw_client.py +102 -0
- opik/rest_api/llm_provider_key/__init__.py +2 -0
- opik/rest_api/llm_provider_key/client.py +166 -440
- opik/rest_api/llm_provider_key/raw_client.py +643 -0
- opik/rest_api/llm_provider_key/types/__init__.py +2 -0
- opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
- opik/rest_api/manual_evaluation/__init__.py +4 -0
- opik/rest_api/manual_evaluation/client.py +347 -0
- opik/rest_api/manual_evaluation/raw_client.py +543 -0
- opik/rest_api/open_telemetry_ingestion/__init__.py +2 -0
- opik/rest_api/open_telemetry_ingestion/client.py +38 -63
- opik/rest_api/open_telemetry_ingestion/raw_client.py +88 -0
- opik/rest_api/optimizations/__init__.py +7 -0
- opik/rest_api/optimizations/client.py +704 -0
- opik/rest_api/optimizations/raw_client.py +920 -0
- opik/rest_api/optimizations/types/__init__.py +7 -0
- opik/rest_api/optimizations/types/optimization_update_status.py +7 -0
- opik/rest_api/projects/__init__.py +10 -1
- opik/rest_api/projects/client.py +180 -855
- opik/rest_api/projects/raw_client.py +1216 -0
- opik/rest_api/projects/types/__init__.py +11 -4
- opik/rest_api/projects/types/project_metric_request_public_interval.py +1 -3
- opik/rest_api/projects/types/project_metric_request_public_metric_type.py +11 -1
- opik/rest_api/projects/types/project_update_visibility.py +5 -0
- opik/rest_api/projects/types/project_write_visibility.py +5 -0
- opik/rest_api/prompts/__init__.py +4 -2
- opik/rest_api/prompts/client.py +381 -970
- opik/rest_api/prompts/raw_client.py +1634 -0
- opik/rest_api/prompts/types/__init__.py +5 -1
- opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
- opik/rest_api/raw_client.py +156 -0
- opik/rest_api/redirect/__init__.py +4 -0
- opik/rest_api/redirect/client.py +375 -0
- opik/rest_api/redirect/raw_client.py +566 -0
- opik/rest_api/service_toggles/__init__.py +4 -0
- opik/rest_api/service_toggles/client.py +91 -0
- opik/rest_api/service_toggles/raw_client.py +93 -0
- opik/rest_api/spans/__init__.py +2 -0
- opik/rest_api/spans/client.py +659 -1354
- opik/rest_api/spans/raw_client.py +2383 -0
- opik/rest_api/spans/types/__init__.py +2 -0
- opik/rest_api/spans/types/find_feedback_score_names_1_request_type.py +1 -3
- opik/rest_api/spans/types/get_span_stats_request_type.py +1 -3
- opik/rest_api/spans/types/get_spans_by_project_request_type.py +1 -3
- opik/rest_api/spans/types/span_search_stream_request_public_type.py +1 -3
- opik/rest_api/system_usage/__init__.py +2 -0
- opik/rest_api/system_usage/client.py +157 -216
- opik/rest_api/system_usage/raw_client.py +455 -0
- opik/rest_api/traces/__init__.py +2 -0
- opik/rest_api/traces/client.py +2102 -1625
- opik/rest_api/traces/raw_client.py +4144 -0
- opik/rest_api/types/__init__.py +629 -24
- opik/rest_api/types/aggregation_data.py +27 -0
- opik/rest_api/types/alert.py +33 -0
- opik/rest_api/types/alert_alert_type.py +5 -0
- opik/rest_api/types/alert_page_public.py +24 -0
- opik/rest_api/types/alert_public.py +33 -0
- opik/rest_api/types/alert_public_alert_type.py +5 -0
- opik/rest_api/types/alert_trigger.py +27 -0
- opik/rest_api/types/alert_trigger_config.py +28 -0
- opik/rest_api/types/alert_trigger_config_public.py +28 -0
- opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_type.py +10 -0
- opik/rest_api/types/alert_trigger_config_write.py +22 -0
- opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
- opik/rest_api/types/alert_trigger_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_public.py +27 -0
- opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
- opik/rest_api/types/alert_trigger_write.py +23 -0
- opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
- opik/rest_api/types/alert_write.py +28 -0
- opik/rest_api/types/alert_write_alert_type.py +5 -0
- opik/rest_api/types/annotation_queue.py +42 -0
- opik/rest_api/types/annotation_queue_batch.py +27 -0
- opik/rest_api/types/{json_schema_element.py → annotation_queue_item_ids.py} +5 -7
- opik/rest_api/types/annotation_queue_page_public.py +28 -0
- opik/rest_api/types/annotation_queue_public.py +38 -0
- opik/rest_api/types/annotation_queue_public_scope.py +5 -0
- opik/rest_api/types/{workspace_metadata.py → annotation_queue_reviewer.py} +6 -7
- opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
- opik/rest_api/types/annotation_queue_scope.py +5 -0
- opik/rest_api/types/annotation_queue_write.py +31 -0
- opik/rest_api/types/annotation_queue_write_scope.py +5 -0
- opik/rest_api/types/assistant_message.py +7 -8
- opik/rest_api/types/assistant_message_role.py +1 -3
- opik/rest_api/types/attachment.py +22 -0
- opik/rest_api/types/attachment_page.py +28 -0
- opik/rest_api/types/audio_url.py +19 -0
- opik/rest_api/types/audio_url_public.py +19 -0
- opik/rest_api/types/audio_url_write.py +19 -0
- opik/rest_api/types/automation_rule_evaluator.py +160 -0
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_page_public.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_public.py +155 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update.py +143 -0
- opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +22 -0
- opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +6 -6
- opik/rest_api/types/automation_rule_evaluator_write.py +143 -0
- opik/rest_api/types/avg_value_stat_public.py +3 -5
- opik/rest_api/types/batch_delete.py +3 -5
- opik/rest_api/types/batch_delete_by_project.py +20 -0
- opik/rest_api/types/bi_information.py +3 -5
- opik/rest_api/types/bi_information_response.py +4 -6
- opik/rest_api/types/boolean_feedback_definition.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
- opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
- opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
- opik/rest_api/types/boolean_feedback_detail.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
- opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
- opik/rest_api/types/categorical_feedback_definition.py +5 -7
- opik/rest_api/types/categorical_feedback_definition_create.py +4 -6
- opik/rest_api/types/categorical_feedback_definition_public.py +5 -7
- opik/rest_api/types/categorical_feedback_definition_update.py +4 -6
- opik/rest_api/types/categorical_feedback_detail.py +3 -5
- opik/rest_api/types/categorical_feedback_detail_create.py +3 -5
- opik/rest_api/types/categorical_feedback_detail_public.py +3 -5
- opik/rest_api/types/categorical_feedback_detail_update.py +3 -5
- opik/rest_api/types/chat_completion_choice.py +4 -6
- opik/rest_api/types/chat_completion_response.py +5 -6
- opik/rest_api/types/check.py +22 -0
- opik/rest_api/types/{json_node_compare.py → check_name.py} +1 -1
- opik/rest_api/types/check_public.py +22 -0
- opik/rest_api/types/check_public_name.py +5 -0
- opik/rest_api/types/check_public_result.py +5 -0
- opik/rest_api/types/check_result.py +5 -0
- opik/rest_api/types/chunked_output_json_node.py +4 -6
- opik/rest_api/types/chunked_output_json_node_public.py +4 -6
- opik/rest_api/types/chunked_output_json_node_public_type.py +6 -10
- opik/rest_api/types/chunked_output_json_node_type.py +6 -10
- opik/rest_api/types/column.py +8 -10
- opik/rest_api/types/column_compare.py +8 -10
- opik/rest_api/types/column_public.py +8 -10
- opik/rest_api/types/column_types_item.py +1 -3
- opik/rest_api/types/comment.py +4 -6
- opik/rest_api/types/comment_compare.py +4 -6
- opik/rest_api/types/comment_public.py +4 -6
- opik/rest_api/types/complete_multipart_upload_request.py +33 -0
- opik/rest_api/types/complete_multipart_upload_request_entity_type.py +5 -0
- opik/rest_api/types/completion_tokens_details.py +3 -5
- opik/rest_api/types/count_value_stat_public.py +3 -5
- opik/rest_api/types/dashboard_page_public.py +24 -0
- opik/rest_api/types/dashboard_public.py +30 -0
- opik/rest_api/types/data_point_double.py +21 -0
- opik/rest_api/types/data_point_number_public.py +3 -5
- opik/rest_api/types/dataset.py +14 -6
- opik/rest_api/types/dataset_expansion.py +42 -0
- opik/rest_api/types/dataset_expansion_response.py +39 -0
- opik/rest_api/types/dataset_item.py +9 -8
- opik/rest_api/types/dataset_item_batch.py +3 -5
- opik/rest_api/types/dataset_item_changes_public.py +5 -0
- opik/rest_api/types/dataset_item_compare.py +9 -8
- opik/rest_api/types/dataset_item_compare_source.py +1 -3
- opik/rest_api/types/dataset_item_filter.py +27 -0
- opik/rest_api/types/dataset_item_filter_operator.py +21 -0
- opik/rest_api/types/dataset_item_page_compare.py +10 -7
- opik/rest_api/types/dataset_item_page_public.py +10 -7
- opik/rest_api/types/dataset_item_public.py +9 -8
- opik/rest_api/types/dataset_item_public_source.py +1 -3
- opik/rest_api/types/dataset_item_source.py +1 -3
- opik/rest_api/types/dataset_item_update.py +39 -0
- opik/rest_api/types/dataset_item_write.py +5 -6
- opik/rest_api/types/dataset_item_write_source.py +1 -3
- opik/rest_api/types/dataset_page_public.py +9 -6
- opik/rest_api/types/dataset_public.py +14 -6
- opik/rest_api/types/dataset_public_status.py +5 -0
- opik/rest_api/types/dataset_public_visibility.py +5 -0
- opik/rest_api/types/dataset_status.py +5 -0
- opik/rest_api/types/dataset_version_diff.py +22 -0
- opik/rest_api/types/dataset_version_diff_stats.py +24 -0
- opik/rest_api/types/dataset_version_page_public.py +23 -0
- opik/rest_api/types/dataset_version_public.py +59 -0
- opik/rest_api/types/dataset_version_summary.py +46 -0
- opik/rest_api/types/dataset_version_summary_public.py +46 -0
- opik/rest_api/types/dataset_visibility.py +5 -0
- opik/rest_api/types/delete_attachments_request.py +23 -0
- opik/rest_api/types/delete_attachments_request_entity_type.py +5 -0
- opik/rest_api/types/delete_feedback_score.py +4 -5
- opik/rest_api/types/delete_ids_holder.py +19 -0
- opik/rest_api/types/delta.py +7 -9
- opik/rest_api/types/error_count_with_deviation.py +21 -0
- opik/rest_api/types/error_count_with_deviation_detailed.py +21 -0
- opik/rest_api/types/error_info.py +3 -5
- opik/rest_api/types/error_info_experiment_item_bulk_write_view.py +21 -0
- opik/rest_api/types/error_info_public.py +3 -5
- opik/rest_api/types/error_info_write.py +3 -5
- opik/rest_api/types/error_message.py +3 -5
- opik/rest_api/types/error_message_detail.py +3 -5
- opik/rest_api/types/error_message_detailed.py +3 -5
- opik/rest_api/types/error_message_public.py +3 -5
- opik/rest_api/types/experiment.py +21 -10
- opik/rest_api/types/experiment_group_aggregations_response.py +20 -0
- opik/rest_api/types/experiment_group_response.py +22 -0
- opik/rest_api/types/experiment_item.py +14 -11
- opik/rest_api/types/experiment_item_bulk_record.py +27 -0
- opik/rest_api/types/experiment_item_bulk_record_experiment_item_bulk_write_view.py +27 -0
- opik/rest_api/types/experiment_item_bulk_upload.py +27 -0
- opik/rest_api/types/experiment_item_compare.py +14 -11
- opik/rest_api/types/experiment_item_compare_trace_visibility_mode.py +5 -0
- opik/rest_api/types/experiment_item_public.py +6 -6
- opik/rest_api/types/experiment_item_public_trace_visibility_mode.py +5 -0
- opik/rest_api/types/experiment_item_trace_visibility_mode.py +5 -0
- opik/rest_api/types/experiment_page_public.py +9 -6
- opik/rest_api/types/experiment_public.py +21 -10
- opik/rest_api/types/experiment_public_status.py +5 -0
- opik/rest_api/types/experiment_public_type.py +5 -0
- opik/rest_api/types/experiment_score.py +20 -0
- opik/rest_api/types/experiment_score_public.py +20 -0
- opik/rest_api/types/experiment_score_write.py +20 -0
- opik/rest_api/types/experiment_status.py +5 -0
- opik/rest_api/types/experiment_type.py +5 -0
- opik/rest_api/types/export_trace_service_request.py +5 -0
- opik/rest_api/types/feedback.py +40 -27
- opik/rest_api/types/feedback_create.py +27 -13
- opik/rest_api/types/feedback_definition_page_public.py +4 -6
- opik/rest_api/types/feedback_object_public.py +40 -27
- opik/rest_api/types/feedback_public.py +40 -27
- opik/rest_api/types/feedback_score.py +7 -7
- opik/rest_api/types/feedback_score_average.py +3 -5
- opik/rest_api/types/feedback_score_average_detailed.py +3 -5
- opik/rest_api/types/feedback_score_average_public.py +3 -5
- opik/rest_api/types/feedback_score_batch.py +4 -6
- opik/rest_api/types/feedback_score_batch_item.py +6 -6
- opik/rest_api/types/feedback_score_batch_item_source.py +1 -3
- opik/rest_api/types/feedback_score_batch_item_thread.py +32 -0
- opik/rest_api/types/feedback_score_batch_item_thread_source.py +5 -0
- opik/rest_api/types/feedback_score_compare.py +7 -7
- opik/rest_api/types/feedback_score_compare_source.py +1 -3
- opik/rest_api/types/feedback_score_experiment_item_bulk_write_view.py +31 -0
- opik/rest_api/types/feedback_score_experiment_item_bulk_write_view_source.py +5 -0
- opik/rest_api/types/feedback_score_names.py +4 -6
- opik/rest_api/types/feedback_score_public.py +11 -7
- opik/rest_api/types/feedback_score_public_source.py +1 -3
- opik/rest_api/types/feedback_score_source.py +1 -3
- opik/rest_api/types/feedback_update.py +27 -13
- opik/rest_api/types/function.py +4 -7
- opik/rest_api/types/function_call.py +3 -5
- opik/rest_api/types/group_content.py +19 -0
- opik/rest_api/types/group_content_with_aggregations.py +21 -0
- opik/rest_api/types/group_detail.py +19 -0
- opik/rest_api/types/group_details.py +20 -0
- opik/rest_api/types/guardrail.py +34 -0
- opik/rest_api/types/guardrail_batch.py +20 -0
- opik/rest_api/types/guardrail_name.py +5 -0
- opik/rest_api/types/guardrail_result.py +5 -0
- opik/rest_api/types/guardrail_write.py +33 -0
- opik/rest_api/types/guardrail_write_name.py +5 -0
- opik/rest_api/types/guardrail_write_result.py +5 -0
- opik/rest_api/types/guardrails_validation.py +21 -0
- opik/rest_api/types/guardrails_validation_public.py +21 -0
- opik/rest_api/types/ids_holder.py +19 -0
- opik/rest_api/types/image_url.py +20 -0
- opik/rest_api/types/image_url_public.py +20 -0
- opik/rest_api/types/image_url_write.py +20 -0
- opik/rest_api/types/json_list_string.py +7 -0
- opik/rest_api/types/json_list_string_compare.py +7 -0
- opik/rest_api/types/json_list_string_experiment_item_bulk_write_view.py +7 -0
- opik/rest_api/types/json_list_string_public.py +7 -0
- opik/rest_api/types/json_list_string_write.py +7 -0
- opik/rest_api/types/json_schema.py +5 -8
- opik/rest_api/types/llm_as_judge_code.py +8 -12
- opik/rest_api/types/llm_as_judge_code_public.py +8 -12
- opik/rest_api/types/llm_as_judge_code_write.py +8 -12
- opik/rest_api/types/llm_as_judge_message.py +9 -7
- opik/rest_api/types/llm_as_judge_message_content.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
- opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
- opik/rest_api/types/llm_as_judge_message_public.py +9 -7
- opik/rest_api/types/llm_as_judge_message_public_role.py +1 -1
- opik/rest_api/types/llm_as_judge_message_role.py +1 -1
- opik/rest_api/types/llm_as_judge_message_write.py +9 -7
- opik/rest_api/types/llm_as_judge_message_write_role.py +1 -1
- opik/rest_api/types/llm_as_judge_model_parameters.py +6 -5
- opik/rest_api/types/llm_as_judge_model_parameters_public.py +6 -5
- opik/rest_api/types/llm_as_judge_model_parameters_write.py +6 -5
- opik/rest_api/types/llm_as_judge_output_schema.py +4 -6
- opik/rest_api/types/llm_as_judge_output_schema_public.py +4 -6
- opik/rest_api/types/llm_as_judge_output_schema_public_type.py +1 -3
- opik/rest_api/types/llm_as_judge_output_schema_type.py +1 -3
- opik/rest_api/types/llm_as_judge_output_schema_write.py +4 -6
- opik/rest_api/types/llm_as_judge_output_schema_write_type.py +1 -3
- opik/rest_api/types/log_item.py +5 -7
- opik/rest_api/types/log_item_level.py +1 -3
- opik/rest_api/types/log_page.py +4 -6
- opik/rest_api/types/manual_evaluation_request.py +38 -0
- opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
- opik/rest_api/types/manual_evaluation_response.py +27 -0
- opik/rest_api/types/multipart_upload_part.py +20 -0
- opik/rest_api/types/numerical_feedback_definition.py +5 -7
- opik/rest_api/types/numerical_feedback_definition_create.py +4 -6
- opik/rest_api/types/numerical_feedback_definition_public.py +5 -7
- opik/rest_api/types/numerical_feedback_definition_update.py +4 -6
- opik/rest_api/types/numerical_feedback_detail.py +3 -5
- opik/rest_api/types/numerical_feedback_detail_create.py +3 -5
- opik/rest_api/types/numerical_feedback_detail_public.py +3 -5
- opik/rest_api/types/numerical_feedback_detail_update.py +3 -5
- opik/rest_api/types/optimization.py +37 -0
- opik/rest_api/types/optimization_page_public.py +28 -0
- opik/rest_api/types/optimization_public.py +37 -0
- opik/rest_api/types/optimization_public_status.py +7 -0
- opik/rest_api/types/optimization_status.py +7 -0
- opik/rest_api/types/optimization_studio_config.py +27 -0
- opik/rest_api/types/optimization_studio_config_public.py +27 -0
- opik/rest_api/types/optimization_studio_config_write.py +27 -0
- opik/rest_api/types/optimization_studio_log.py +22 -0
- opik/rest_api/types/optimization_write.py +30 -0
- opik/rest_api/types/optimization_write_status.py +7 -0
- opik/rest_api/types/page_columns.py +4 -6
- opik/rest_api/types/percentage_value_stat_public.py +4 -6
- opik/rest_api/types/percentage_values.py +8 -16
- opik/rest_api/types/percentage_values_detailed.py +8 -16
- opik/rest_api/types/percentage_values_public.py +8 -16
- opik/rest_api/types/project.py +12 -7
- opik/rest_api/types/project_detailed.py +12 -7
- opik/rest_api/types/project_detailed_visibility.py +5 -0
- opik/rest_api/types/project_metric_response_public.py +5 -9
- opik/rest_api/types/project_metric_response_public_interval.py +1 -3
- opik/rest_api/types/project_metric_response_public_metric_type.py +11 -1
- opik/rest_api/types/project_page_public.py +8 -10
- opik/rest_api/types/project_public.py +6 -6
- opik/rest_api/types/project_public_visibility.py +5 -0
- opik/rest_api/types/project_reference.py +31 -0
- opik/rest_api/types/project_reference_public.py +31 -0
- opik/rest_api/types/project_stat_item_object_public.py +8 -17
- opik/rest_api/types/project_stats_public.py +4 -6
- opik/rest_api/types/project_stats_summary.py +4 -6
- opik/rest_api/types/project_stats_summary_item.py +9 -6
- opik/rest_api/types/project_visibility.py +5 -0
- opik/rest_api/types/prompt.py +12 -7
- opik/rest_api/types/prompt_detail.py +12 -7
- opik/rest_api/types/prompt_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_page_public.py +9 -6
- opik/rest_api/types/prompt_public.py +11 -6
- opik/rest_api/types/prompt_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_template_structure.py +5 -0
- opik/rest_api/types/prompt_tokens_details.py +19 -0
- opik/rest_api/types/prompt_version.py +7 -6
- opik/rest_api/types/prompt_version_detail.py +7 -6
- opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_link.py +4 -5
- opik/rest_api/types/prompt_version_link_public.py +4 -5
- opik/rest_api/types/prompt_version_link_write.py +3 -5
- opik/rest_api/types/prompt_version_page_public.py +9 -6
- opik/rest_api/types/prompt_version_public.py +7 -6
- opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_template_structure.py +5 -0
- opik/rest_api/types/prompt_version_update.py +33 -0
- opik/rest_api/types/provider_api_key.py +18 -8
- opik/rest_api/types/provider_api_key_page_public.py +27 -0
- opik/rest_api/types/provider_api_key_provider.py +1 -1
- opik/rest_api/types/provider_api_key_public.py +18 -8
- opik/rest_api/types/provider_api_key_public_provider.py +1 -1
- opik/rest_api/types/response_format.py +5 -7
- opik/rest_api/types/response_format_type.py +1 -3
- opik/rest_api/types/result.py +21 -0
- opik/rest_api/types/results_number_public.py +4 -6
- opik/rest_api/types/score_name.py +4 -5
- opik/rest_api/types/service_toggles_config.py +44 -0
- opik/rest_api/types/span.py +13 -15
- opik/rest_api/types/span_batch.py +4 -6
- opik/rest_api/types/span_enrichment_options.py +31 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view.py +39 -0
- opik/rest_api/types/span_experiment_item_bulk_write_view_type.py +5 -0
- opik/rest_api/types/span_filter.py +23 -0
- opik/rest_api/types/span_filter_operator.py +21 -0
- opik/rest_api/types/span_filter_public.py +4 -6
- opik/rest_api/types/span_filter_public_operator.py +2 -0
- opik/rest_api/types/span_filter_write.py +23 -0
- opik/rest_api/types/span_filter_write_operator.py +21 -0
- opik/rest_api/types/span_llm_as_judge_code.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
- opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
- opik/rest_api/types/span_page_public.py +9 -6
- opik/rest_api/types/span_public.py +19 -16
- opik/rest_api/types/span_public_type.py +1 -1
- opik/rest_api/types/span_type.py +1 -1
- opik/rest_api/types/span_update.py +46 -0
- opik/rest_api/types/span_update_type.py +5 -0
- opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
- opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
- opik/rest_api/types/span_write.py +13 -14
- opik/rest_api/types/span_write_type.py +1 -1
- opik/rest_api/types/spans_count_response.py +20 -0
- opik/rest_api/types/start_multipart_upload_response.py +20 -0
- opik/rest_api/types/stream_options.py +3 -5
- opik/rest_api/types/studio_evaluation.py +20 -0
- opik/rest_api/types/studio_evaluation_public.py +20 -0
- opik/rest_api/types/studio_evaluation_write.py +20 -0
- opik/rest_api/types/studio_llm_model.py +21 -0
- opik/rest_api/types/studio_llm_model_public.py +21 -0
- opik/rest_api/types/studio_llm_model_write.py +21 -0
- opik/rest_api/types/studio_message.py +20 -0
- opik/rest_api/types/studio_message_public.py +20 -0
- opik/rest_api/types/studio_message_write.py +20 -0
- opik/rest_api/types/studio_metric.py +21 -0
- opik/rest_api/types/studio_metric_public.py +21 -0
- opik/rest_api/types/studio_metric_write.py +21 -0
- opik/rest_api/types/studio_optimizer.py +21 -0
- opik/rest_api/types/studio_optimizer_public.py +21 -0
- opik/rest_api/types/studio_optimizer_write.py +21 -0
- opik/rest_api/types/studio_prompt.py +20 -0
- opik/rest_api/types/studio_prompt_public.py +20 -0
- opik/rest_api/types/studio_prompt_write.py +20 -0
- opik/rest_api/types/tool.py +4 -6
- opik/rest_api/types/tool_call.py +4 -6
- opik/rest_api/types/trace.py +26 -12
- opik/rest_api/types/trace_batch.py +4 -6
- opik/rest_api/types/trace_count_response.py +4 -6
- opik/rest_api/types/trace_enrichment_options.py +32 -0
- opik/rest_api/types/trace_experiment_item_bulk_write_view.py +41 -0
- opik/rest_api/types/trace_filter.py +23 -0
- opik/rest_api/types/trace_filter_operator.py +21 -0
- opik/rest_api/types/trace_filter_public.py +23 -0
- opik/rest_api/types/trace_filter_public_operator.py +21 -0
- opik/rest_api/types/trace_filter_write.py +23 -0
- opik/rest_api/types/trace_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_page_public.py +8 -10
- opik/rest_api/types/trace_public.py +27 -13
- opik/rest_api/types/trace_public_visibility_mode.py +5 -0
- opik/rest_api/types/trace_thread.py +18 -9
- opik/rest_api/types/trace_thread_filter.py +23 -0
- opik/rest_api/types/trace_thread_filter_operator.py +21 -0
- opik/rest_api/types/trace_thread_filter_public.py +23 -0
- opik/rest_api/types/trace_thread_filter_public_operator.py +21 -0
- opik/rest_api/types/trace_thread_filter_write.py +23 -0
- opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
- opik/rest_api/types/trace_thread_identifier.py +22 -0
- opik/rest_api/types/trace_thread_llm_as_judge_code.py +26 -0
- opik/rest_api/types/trace_thread_llm_as_judge_code_public.py +26 -0
- opik/rest_api/types/trace_thread_llm_as_judge_code_write.py +26 -0
- opik/rest_api/types/trace_thread_page.py +9 -6
- opik/rest_api/types/trace_thread_status.py +5 -0
- opik/rest_api/types/trace_thread_update.py +19 -0
- opik/rest_api/types/trace_thread_user_defined_metric_python_code.py +19 -0
- opik/rest_api/types/trace_thread_user_defined_metric_python_code_public.py +19 -0
- opik/rest_api/types/trace_thread_user_defined_metric_python_code_write.py +19 -0
- opik/rest_api/types/trace_update.py +39 -0
- opik/rest_api/types/trace_visibility_mode.py +5 -0
- opik/rest_api/types/trace_write.py +10 -11
- opik/rest_api/types/usage.py +6 -6
- opik/rest_api/types/user_defined_metric_python_code.py +3 -5
- opik/rest_api/types/user_defined_metric_python_code_public.py +3 -5
- opik/rest_api/types/user_defined_metric_python_code_write.py +3 -5
- opik/rest_api/types/value_entry.py +27 -0
- opik/rest_api/types/value_entry_compare.py +27 -0
- opik/rest_api/types/value_entry_compare_source.py +5 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +27 -0
- opik/rest_api/types/value_entry_experiment_item_bulk_write_view_source.py +5 -0
- opik/rest_api/types/value_entry_public.py +27 -0
- opik/rest_api/types/value_entry_public_source.py +5 -0
- opik/rest_api/types/value_entry_source.py +5 -0
- opik/rest_api/types/video_url.py +19 -0
- opik/rest_api/types/video_url_public.py +19 -0
- opik/rest_api/types/video_url_write.py +19 -0
- opik/rest_api/types/webhook.py +28 -0
- opik/rest_api/types/webhook_examples.py +19 -0
- opik/rest_api/types/webhook_public.py +28 -0
- opik/rest_api/types/webhook_test_result.py +23 -0
- opik/rest_api/types/webhook_test_result_status.py +5 -0
- opik/rest_api/types/webhook_write.py +23 -0
- opik/rest_api/types/welcome_wizard_tracking.py +22 -0
- opik/rest_api/types/workspace_configuration.py +27 -0
- opik/rest_api/types/workspace_metric_request.py +24 -0
- opik/rest_api/types/workspace_metric_response.py +20 -0
- opik/rest_api/types/workspace_metrics_summary_request.py +23 -0
- opik/rest_api/types/workspace_metrics_summary_response.py +20 -0
- opik/rest_api/types/workspace_name_holder.py +19 -0
- opik/rest_api/types/workspace_spans_count.py +20 -0
- opik/rest_api/types/workspace_trace_count.py +3 -5
- opik/rest_api/welcome_wizard/__init__.py +4 -0
- opik/rest_api/welcome_wizard/client.py +195 -0
- opik/rest_api/welcome_wizard/raw_client.py +208 -0
- opik/rest_api/workspaces/__init__.py +2 -0
- opik/rest_api/workspaces/client.py +550 -77
- opik/rest_api/workspaces/raw_client.py +923 -0
- opik/rest_client_configurator/api.py +1 -0
- opik/rest_client_configurator/retry_decorator.py +1 -0
- opik/s3_httpx_client.py +67 -0
- opik/simulation/__init__.py +6 -0
- opik/simulation/simulated_user.py +99 -0
- opik/simulation/simulator.py +108 -0
- opik/synchronization.py +11 -24
- opik/tracing_runtime_config.py +48 -0
- opik/types.py +48 -2
- opik/url_helpers.py +13 -3
- opik/validation/chat_prompt_messages.py +241 -0
- opik/validation/feedback_score.py +4 -5
- opik/validation/parameter.py +122 -0
- opik/validation/parameters_validator.py +175 -0
- opik/validation/validator.py +30 -2
- opik/validation/validator_helpers.py +147 -0
- opik-1.9.71.dist-info/METADATA +370 -0
- opik-1.9.71.dist-info/RECORD +1110 -0
- {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/WHEEL +1 -1
- opik-1.9.71.dist-info/licenses/LICENSE +203 -0
- opik/api_objects/prompt/prompt.py +0 -107
- opik/api_objects/prompt/prompt_template.py +0 -35
- opik/cli.py +0 -193
- opik/evaluation/metrics/models.py +0 -8
- opik/hooks.py +0 -13
- opik/integrations/bedrock/chunks_aggregator.py +0 -55
- opik/integrations/bedrock/helpers.py +0 -8
- opik/integrations/langchain/google_run_helpers.py +0 -75
- opik/integrations/langchain/openai_run_helpers.py +0 -122
- opik/message_processing/message_processors.py +0 -203
- opik/rest_api/types/delta_role.py +0 -7
- opik/rest_api/types/json_object_schema.py +0 -34
- opik-1.6.4.dist-info/METADATA +0 -270
- opik-1.6.4.dist-info/RECORD +0 -507
- /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
- {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
- {opik-1.6.4.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Optional, Union, Any, List, Dict
|
|
4
|
+
|
|
5
|
+
import pydantic
|
|
6
|
+
|
|
7
|
+
from opik.evaluation.metrics.conversation import helpers as conversation_helpers
|
|
8
|
+
from opik.evaluation.metrics.conversation import types as conversation_types
|
|
9
|
+
import opik.exceptions as exceptions
|
|
10
|
+
from opik.evaluation.metrics import score_result
|
|
11
|
+
from opik.evaluation.metrics.conversation.conversation_thread_metric import (
|
|
12
|
+
ConversationThreadMetric,
|
|
13
|
+
)
|
|
14
|
+
from opik.evaluation.metrics.llm_judges import parsing_helpers
|
|
15
|
+
from opik.evaluation.models import base_model, models_factory
|
|
16
|
+
from . import schema, templates
|
|
17
|
+
|
|
18
|
+
# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class UserFrustrationMetric(ConversationThreadMetric):
|
|
22
|
+
"""
|
|
23
|
+
A heuristic score estimating the likelihood that the user experienced confusion, annoyance,
|
|
24
|
+
or disengagement during the session — due to repetition, lack of adaptation, ignored
|
|
25
|
+
intent signals, or failure to smoothly conclude.
|
|
26
|
+
|
|
27
|
+
The ``UserFrustrationMetric`` class integrates with LLM models to analyze
|
|
28
|
+
conversation data in sliding windows and produce a numerical score along with an optional
|
|
29
|
+
reason for the calculated score. It provides both synchronous and asynchronous methods for
|
|
30
|
+
calculation and supports customization through attributes like window size and reason inclusion.
|
|
31
|
+
|
|
32
|
+
This metric can be used to monitor and track user frustration levels during conversations, enabling
|
|
33
|
+
insights into user experience. The metric makes use of LLM models to score conversational
|
|
34
|
+
windows and summarize results. It returns a score between `0.0` and `1.0`. The higher the score,
|
|
35
|
+
the more frustrated the user is likely to be.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
model: The model to use for evaluating the conversation. If a string is provided, it will be used to
|
|
39
|
+
fetch the model from the LiteLLM API. If a base_model.OpikBaseModel is
|
|
40
|
+
provided, it will be used directly. Default is None.
|
|
41
|
+
name: The name of the metric. The default is "user_frustration_score".
|
|
42
|
+
include_reason: Whether to include the reason for the score in the
|
|
43
|
+
result. Default is True.
|
|
44
|
+
track: Whether to track the metric. Default is True.
|
|
45
|
+
project_name: The name of the project to track the metric in.
|
|
46
|
+
Default is None.
|
|
47
|
+
window_size: The window size to use for calculating the score. It defines the
|
|
48
|
+
maximal number of historical turns to include in each window when assessing
|
|
49
|
+
the frustration of the current turn in the conversation. Default is 10.
|
|
50
|
+
temperature: The temperature to use for the model. Defaults to 1e-8.
|
|
51
|
+
|
|
52
|
+
Example:
|
|
53
|
+
>>> from opik.evaluation.metrics import UserFrustrationMetric
|
|
54
|
+
>>> conversation = [
|
|
55
|
+
>>> {"role": "user", "content": "How do I center a div using CSS?"},
|
|
56
|
+
>>> {"role": "assistant", "content": "There are many ways to center elements in CSS."},
|
|
57
|
+
>>> {"role": "user", "content": "Okay... can you show me one?"},
|
|
58
|
+
>>> {"role": "assistant", "content": "Sure. It depends on the context — are you centering horizontally, vertically, or both?"},
|
|
59
|
+
>>> {"role": "user", "content": "Both. Just give me a basic example."},
|
|
60
|
+
>>> ]
|
|
61
|
+
>>> metric = UserFrustrationMetric()
|
|
62
|
+
>>> result = metric.score(conversation)
|
|
63
|
+
>>> if result.scoring_failed:
|
|
64
|
+
>>> print(f"Scoring failed: {result.reason}")
|
|
65
|
+
>>> else:
|
|
66
|
+
>>> print(result.value)
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
def __init__(
    self,
    model: Optional[Union[str, base_model.OpikBaseModel]] = None,
    name: str = "user_frustration_score",
    include_reason: bool = True,
    track: bool = True,
    project_name: Optional[str] = None,
    window_size: int = 10,
    temperature: float = 1e-8,
):
    """
    Set up the metric and the model used to judge conversations.

    Args:
        model: Either a ready ``base_model.OpikBaseModel`` instance or a value
            (model name or None) that is handed to ``models_factory.get``.
        name: Metric name forwarded to the parent class.
        include_reason: Stored flag; when true, the score calculation also
            produces a reason for the score.
        track: Forwarded to the parent class.
        project_name: Forwarded to the parent class.
        window_size: Stored and later passed as ``window_size`` when the
            conversation is split into windows of turns.
        temperature: Forwarded to ``_init_model``.
    """
    super().__init__(name=name, track=track, project_name=project_name)

    # Plain configuration first, then the model, which needs the temperature.
    self._window_size = window_size
    self._include_reason = include_reason
    self._init_model(model, temperature=temperature)
|
|
88
|
+
|
|
89
|
+
def _init_model(
    self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
) -> None:
    """
    Resolve ``model`` into the object stored on ``self._model``.

    Anything that is not already a ``base_model.OpikBaseModel`` instance is
    passed to ``models_factory.get`` as ``model_name`` together with
    ``temperature``. A ready-made instance is stored unchanged, and in that
    case ``temperature`` is not used.
    """
    if not isinstance(model, base_model.OpikBaseModel):
        self._model = models_factory.get(model_name=model, temperature=temperature)
        return

    self._model = model
|
|
96
|
+
|
|
97
|
+
def score(
    self,
    conversation: conversation_types.Conversation,
    **ignored_kwargs: Any,
) -> score_result.ScoreResult:
    """
    Compute the metric synchronously for ``conversation``.

    Additional keyword arguments are accepted but not used. The work is
    delegated to ``_calculate_score`` and its result is returned as-is.
    """
    result = self._calculate_score(conversation=conversation)
    return result
|
|
103
|
+
|
|
104
|
+
async def ascore(
    self,
    conversation: conversation_types.Conversation,
    **ignored_kwargs: Any,
) -> score_result.ScoreResult:
    """
    Compute the metric asynchronously for ``conversation``.

    Additional keyword arguments are accepted but not used. The work is
    delegated to ``_a_calculate_score`` and its awaited result is returned
    as-is.
    """
    result = await self._a_calculate_score(conversation=conversation)
    return result
|
|
110
|
+
|
|
111
|
+
def _calculate_score(
    self,
    conversation: conversation_types.Conversation,
) -> score_result.ScoreResult:
    """
    Score a conversation by evaluating each window of turns in sequence.

    The conversation is split into windows with the configured window size,
    each window is evaluated with ``_evaluate_conversation``, and the
    collected verdicts are reduced to a score by ``_score_from_verdicts``.
    A reason is produced only when ``include_reason`` was enabled.

    Raises:
        exceptions.MetricComputationError: If any step raises an
            ``Exception``; the original error is chained as the cause.
    """
    try:
        windows = conversation_helpers.extract_turns_windows_from_conversation(
            conversation=conversation, window_size=self._window_size
        )

        # One model evaluation per window, in window order.
        verdicts = []
        for window in windows:
            verdicts.append(
                self._evaluate_conversation(conversation_sliding_window=window)
            )

        score = _score_from_verdicts(verdicts=verdicts)

        if self._include_reason:
            reason = self._reason_from_verdicts(score=score, verdicts=verdicts)
        else:
            reason = None

        return score_result.ScoreResult(name=self.name, value=score, reason=reason)
    except Exception as e:
        # The same message is logged and carried by the raised error.
        failure_message = f"Failed to calculate user frustration score: {e}"
        LOGGER.error(failure_message)
        raise exceptions.MetricComputationError(failure_message) from e
|
|
143
|
+
|
|
144
|
+
async def _a_calculate_score(
    self,
    conversation: conversation_types.Conversation,
) -> score_result.ScoreResult:
    """Async counterpart of ``_calculate_score``.

    All sliding windows are judged concurrently through ``asyncio.gather``;
    the verdicts are then aggregated by ``_score_from_verdicts``. A textual
    reason is generated only when ``self._include_reason`` is truthy.

    Args:
        conversation: The conversation to evaluate.

    Returns:
        A ``ScoreResult`` carrying this metric's name, the score and the
        optional reason.

    Raises:
        exceptions.MetricComputationError: If any step raises; the original
            exception is logged and chained as the cause.
    """
    try:
        windows = conversation_helpers.extract_turns_windows_from_conversation(
            conversation=conversation, window_size=self._window_size
        )

        # One coroutine per window; gather returns results in window order.
        pending = [
            self._a_evaluate_conversation(conversation_sliding_window=window)
            for window in windows
        ]
        verdicts = await asyncio.gather(*pending)

        score = _score_from_verdicts(verdicts=verdicts)

        reason = None
        if self._include_reason:
            reason = await self._a_reason_from_verdicts(
                score=score, verdicts=verdicts
            )

        return score_result.ScoreResult(name=self.name, value=score, reason=reason)
    except Exception as error:
        message = f"Failed to calculate user frustration score: {error}"
        LOGGER.error(message)
        raise exceptions.MetricComputationError(message) from error
|
|
178
|
+
|
|
179
|
+
def _evaluate_conversation(
    self,
    conversation_sliding_window: conversation_types.Conversation,
) -> schema.EvaluateUserFrustrationResponse:
    """Ask the judge model for a frustration verdict on one window of turns.

    Args:
        conversation_sliding_window: The window of conversation messages
            rendered into the evaluation prompt.

    Returns:
        The verdict parsed from the model's output.
    """
    prompt = templates.evaluate_conversation(
        sliding_window=conversation_sliding_window
    )
    raw_response = self._model.generate_string(
        input=prompt,
        response_format=schema.EvaluateUserFrustrationResponse,
    )
    return _evaluate_conversation_from_model_output(model_output=raw_response)
|
|
191
|
+
|
|
192
|
+
async def _a_evaluate_conversation(
    self,
    conversation_sliding_window: conversation_types.Conversation,
) -> schema.EvaluateUserFrustrationResponse:
    """Async variant: ask the judge model for a verdict on one window of turns.

    Args:
        conversation_sliding_window: The window of conversation messages
            rendered into the evaluation prompt.

    Returns:
        The verdict parsed from the model's output.
    """
    prompt = templates.evaluate_conversation(
        sliding_window=conversation_sliding_window
    )
    raw_response = await self._model.agenerate_string(
        input=prompt,
        response_format=schema.EvaluateUserFrustrationResponse,
    )
    return _evaluate_conversation_from_model_output(model_output=raw_response)
|
|
204
|
+
|
|
205
|
+
def _reason_from_verdicts(
    self, score: float, verdicts: List[schema.EvaluateUserFrustrationResponse]
) -> str:
    """Ask the judge model to explain the score from the "yes" verdicts.

    Args:
        score: The aggregated frustration score to justify.
        verdicts: Per-window verdicts; only those selected by
            ``_extract_frustrations_from_verdicts`` reach the prompt.

    Returns:
        The reason text parsed from the model's output.
    """
    prompt = templates.generate_reason(
        score=score,
        frustrations=_extract_frustrations_from_verdicts(verdicts),
    )
    raw_response = self._model.generate_string(
        input=prompt, response_format=schema.ScoreReasonResponse
    )
    return _generate_reason_from_model_output(model_output=raw_response)
|
|
218
|
+
|
|
219
|
+
async def _a_reason_from_verdicts(
    self, score: float, verdicts: List[schema.EvaluateUserFrustrationResponse]
) -> str:
    """Async variant: ask the judge model to explain the score.

    Args:
        score: The aggregated frustration score to justify.
        verdicts: Per-window verdicts; only those selected by
            ``_extract_frustrations_from_verdicts`` reach the prompt.

    Returns:
        The reason text parsed from the model's output.
    """
    prompt = templates.generate_reason(
        score=score,
        frustrations=_extract_frustrations_from_verdicts(verdicts),
    )
    raw_response = await self._model.agenerate_string(
        input=prompt, response_format=schema.ScoreReasonResponse
    )
    return _generate_reason_from_model_output(model_output=raw_response)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _evaluate_conversation_from_model_output(
    model_output: str,
) -> schema.EvaluateUserFrustrationResponse:
    """Parse the judge model's raw output into a frustration verdict.

    Args:
        model_output: Raw text returned by the model.

    Returns:
        The validated ``EvaluateUserFrustrationResponse``.

    Raises:
        pydantic.ValidationError: If the extracted content does not match the
            schema; the failure is logged with traceback and re-raised.
            Other exceptions (e.g. from the JSON extraction helper) propagate
            without being logged here.
    """
    try:
        parsed = parsing_helpers.extract_json_content_or_raise(model_output)
        verdict = schema.EvaluateUserFrustrationResponse.model_validate(parsed)
    except pydantic.ValidationError as error:
        LOGGER.warning(
            f"Failed to parse user's frustration evaluation results from the LLM output: {model_output}, reason: {error}",
            exc_info=True,
        )
        raise error
    return verdict
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _score_from_verdicts(
    verdicts: List[schema.EvaluateUserFrustrationResponse],
) -> float:
    """Return the fraction of verdicts that are "yes".

    A verdict counts as "yes" when its ``verdict`` text equals ``"yes"``
    after stripping whitespace and lower-casing.

    Args:
        verdicts: Per-window verdicts.

    Returns:
        ``0.0`` for an empty list, otherwise the share of "yes" verdicts.
    """
    total = len(verdicts)
    if total == 0:
        return 0.0

    frustrated = [
        item for item in verdicts if item.verdict.strip().lower() == "yes"
    ]
    return len(frustrated) / total
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _extract_frustrations_from_verdicts(
    verdicts: List[schema.EvaluateUserFrustrationResponse],
) -> List[Dict[str, str]]:
    """Collect the reasons attached to "yes" verdicts.

    Args:
        verdicts: Per-window verdicts, in window order.

    Returns:
        One dict per "yes" verdict that has a non-``None`` reason, with keys
        ``"message_number"`` (the verdict's 1-based position in ``verdicts``,
        as a string) and ``"reason"``.
    """
    frustrations = []
    for position, item in enumerate(verdicts, start=1):
        if item.verdict.strip().lower() != "yes":
            continue
        if item.reason is None:
            continue
        frustrations.append({"message_number": str(position), "reason": item.reason})
    return frustrations
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _generate_reason_from_model_output(model_output: str) -> str:
    """Parse the judge model's raw output into the score's reason text.

    Args:
        model_output: Raw text returned by the model.

    Returns:
        The ``reason`` field of the validated ``ScoreReasonResponse``.

    Raises:
        pydantic.ValidationError: If the extracted content does not match the
            schema; the failure is logged with traceback and re-raised.
    """
    try:
        parsed = parsing_helpers.extract_json_content_or_raise(model_output)
        response = schema.ScoreReasonResponse.model_validate(parsed)
        return response.reason
    except pydantic.ValidationError as error:
        LOGGER.warning(
            f"Failed to parse frustration score reason from the LLM output: {model_output}, reason: {error}",
            exc_info=True,
        )
        raise error
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
import pydantic
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EvaluateUserFrustrationResponse(pydantic.BaseModel):
    """Structured judge-model answer for one window of conversation turns."""

    # Verdict text returned by the model (the prompt asks for 'yes' or 'no').
    verdict: str
    # Explanation for the verdict; optional, defaults to None.
    reason: Optional[str] = None

    # Hash by object identity so instances are hashable.
    __hash__ = object.__hash__
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ScoreReasonResponse(pydantic.BaseModel):
    """Structured judge-model answer holding the explanation for a score."""

    # Required explanation text.
    reason: str

    # Hash by object identity so instances are hashable.
    __hash__ = object.__hash__
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
from typing import List, Dict
|
|
2
|
+
|
|
3
|
+
from opik.evaluation.metrics.conversation import types as conversation_types
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def evaluate_conversation(sliding_window: conversation_types.Conversation) -> str:
    """Build the prompt asking a judge model whether the last user message shows frustration.

    The prompt contains the guidelines, one worked example and the given
    window of turns; it requests a JSON object with 'verdict' and 'reason'.

    Args:
        sliding_window: The conversation messages to judge; interpolated into
            the prompt with its default string formatting.

    Returns:
        The complete prompt text.
    """
    # NOTE: the worked example must itself be valid JSON, otherwise the model
    # is shown a malformed pattern to imitate.
    return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `user` message is showing that the user experiences confusion, annoyance, or disengagement during the conversation session given in the context of the last messages.

** Guidelines: **
- Make sure to only return in JSON format.
- The JSON must have only 2 fields: 'verdict' and 'reason'.
- The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `user` message is showing that the user experienced confusion, annoyance, or disengagement during the conversation session given in the context of the last messages.
- Provide a 'reason' ONLY if the answer is 'yes'.
- You DON'T have to provide a reason if the answer is 'no'.
- You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on user frustration.
- You MUST ONLY provide a verdict for the LAST message on the list but MUST USE context from the previous messages.
- ONLY provide a 'yes' answer if the LLM response caused the user to express COMPLETE frustration or confusion in his/her input messages.
- Vague LLM responses to vague inputs, such as greetings DOES NOT count as causes of frustration!
- You should mention LLM response instead of `assistant`, and User instead of `user`.

===== Start OF EXAMPLE ======
** Example Turns: **
[
{{
"role": "user",
"content": "Hello!"
}},
{{
"role": "assistant",
"content": "Hi there!"
}},
{{
"role": "user",
"content": "Can you help me write a function to parse dates in Python?"
}},
{{
"role": "assistant",
"content": "Of course! Here's a basic example using `datetime.strptime`:\n\n```python\nfrom datetime import datetime\n\ndef parse_date(date_string):\n return datetime.strptime(date_string, '%Y-%m-%d')\n```"
}},
{{
"role": "user",
"content": "That's not what I meant. I need it to handle multiple formats."
}},
{{
"role": "assistant",
"content": "Ah, got it. Here's an updated version that tries multiple formats:\n\n```python\nfrom datetime import datetime\n\ndef parse_date(date_string):\n for fmt in ('%Y-%m-%d', '%d/%m/%Y', '%m-%d-%Y'):\n try:\n return datetime.strptime(date_string, fmt)\n except ValueError:\n continue\n raise ValueError('No valid date format found')\n```"
}},
{{
"role": "user",
"content": "Why didn’t you just give me this the first time? This is basic."
}},
{{
"role": "assistant",
"content": "Apologies for the oversight. I always aim to provide the most helpful answer based on the initial question."
}},
{{
"role": "user",
"content": "Well, it's kind of frustrating to keep asking for obvious things. Can you at least include error handling too next time?"
}},
{{
"role": "assistant",
"content": "You're right, I’ll be more thorough. Let me know if you'd like to expand the function with logging or timezone support as well."

}},
{{
"role": "user",
"content": "Now you're overcomplicating it. I just wanted a simple, robust function."
}},
{{
"role": "assistant",
"content": "Understood. Here's a simpler version with basic error handling included:\n\n```python\nfrom datetime import datetime\n\ndef parse_date(date_string):\n formats = ('%Y-%m-%d', '%d/%m/%Y', '%m-%d-%Y')\n for fmt in formats:\n try:\n return datetime.strptime(date_string, fmt)\n except ValueError:\n continue\n return None # Or raise an error if preferred\n```"
}},
{{
"role": "user", "content": "You could’ve just started with this. Why is it so hard to get a straight answer?"
}}
]

** Example output JSON **
{{
"verdict": "yes",
"reason": "The User repeatedly clarifies their intent and expresses dissatisfaction with the LLM’s initial responses, indicating a mismatch between the User’s expectations and the LLM’s output. Despite asking a clear question, the LLM provides an overly simplistic solution at first, requiring the User to iterate and request obvious improvements (e.g., support for multiple formats, error handling). The User’s tone becomes increasingly critical, with statements like “Why didn’t you just give me this the first time?” and “Why is it so hard to get a straight answer?”, signaling rising frustration due to perceived inefficiency and lack of responsiveness from the assistant."
}}
===== END OF EXAMPLE ======

** Turns: **
{sliding_window}

** JSON: **
"""
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def generate_reason(score: float, frustrations: List[Dict[str, str]]) -> str:
    """Build the prompt asking a judge model to justify a frustration score.

    Args:
        score: The frustration score; interpolated into the prompt as-is.
        frustrations: Entries shaped like the prompt's example, i.e. dicts
            with "message_number" and "reason"; interpolated with default
            string formatting.

    Returns:
        The complete prompt text, which requests a JSON object with a
        'reason' key.
    """
    return f"""Below is a list of user frustrations drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages caused the 'user' to be frustrated.
Given the frustration score, which is a 0-1 score indicating how frustrating the OVERALL AI 'assistant' messages are in a conversation (higher the more frustrating).

** Guidelines: **
- Make sure to only return in JSON format, with the 'reason' key providing the reason.
- Always quote WHICH MESSAGE and the INFORMATION in the reason in your final reason.
- Be confident in your reasoning, as if you’re aware of the `assistant` messages from the messages in a conversation that led to the frustrations.
- You should CONCISELY summarize the user frustrations to justify the score.
- You should NOT mention concrete frustration in your reason, and make the reason sound convincing.
- You should mention LLM response instead of `assistant`, and User instead of `user`.
- You should format <frustration_score> to use 1 decimal place in the reason.

===== Start OF EXAMPLE ======
** Example frustrations: **
[
{{
"message_number": "1",
"reason": "The User repeatedly clarifies their intent and expresses dissatisfaction with the LLM’s initial responses, indicating a mismatch between the User’s expectations and the LLM’s output."
}},
{{
"message_number": "2",
"reason": "Despite asking a clear question, the LLM provides an overly simplistic solution at first, requiring the User to iterate and request obvious improvements (e.g., support for multiple formats, error handling)."
}},
{{
"message_number": "3",
"reason": "The User’s tone becomes increasingly critical, with statements like “Why didn’t you just give me this the first time?” and “Why is it so hard to get a straight answer?”, signaling rising frustration due to perceived inefficiency and lack of responsiveness from the assistant."
}}
]

** Example output JSON **
{{
"reason": "The score is <frustration_score> because <your_reason>."
}}
===== END OF EXAMPLE ======

** Frustration Score: **
{score}

** Frustrations: **
{frustrations}

** JSON: **
"""
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from typing import Dict, List, Literal, Optional
|
|
2
|
+
|
|
3
|
+
import pydantic
|
|
4
|
+
|
|
5
|
+
# A single chat message: a dict keyed by "role" and "content" with string values.
ConversationDict = Dict[Literal["role", "content"], str]
# A conversation: a list of chat messages.
Conversation = List[ConversationDict]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ConversationTurn(pydantic.BaseModel):
    """
    One exchange within a conversation.

    A turn pairs an input user's message with the output LLM message that
    answers it, so that message exchanges can be handled in a structured
    format.

    Args:
        input: The input message of the conversation turn.
        output: The output message of the conversation turn, or ``None``.

    Example:
        >>> conversation_turn = ConversationTurn(
        ...     input={"role": "user", "content": "Hello!"},
        ...     output={"role": "assistant", "content": "Hi there! How can I help you today?"}
        ... )
    """

    input: ConversationDict
    output: Optional[ConversationDict]

    def as_list(self) -> List[ConversationDict]:
        """Return the turn's messages in order, leaving out a ``None`` output."""
        messages = [self.input]
        if self.output is not None:
            messages.append(self.output)
        return messages
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
__all__ = ["ConversationDict", "Conversation", "ConversationTurn"]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Callable, Optional, Sequence, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from opik.exceptions import MetricComputationError
|
|
6
|
+
from opik.evaluation.metrics import base_metric, score_result
|
|
7
|
+
|
|
8
|
+
try: # pragma: no cover - optional dependency
|
|
9
|
+
from bert_score import score as bert_score_fn
|
|
10
|
+
except ImportError: # pragma: no cover - optional dependency
|
|
11
|
+
bert_score_fn = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Signature of a scorer callable: takes (candidates, references) and returns a
# 3-tuple, which BERTScore.score unpacks as (precision, recall, f1).
BertScoreFn = Callable[
    [Sequence[str], Union[Sequence[str], Sequence[Sequence[str]]]], Tuple[Any, Any, Any]
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class BERTScore(base_metric.BaseMetric):
    """Metric backed by the `bert-score` library.

    The reported value is the F1 returned by the scorer; precision and recall
    are attached as metadata.

    Args:
        scorer_fn: Optional callable compatible with ``bert_score.score`` for
            dependency injection or advanced usage.
        model_type: Model checkpoint to use when loading the scorer. Ignored when
            ``scorer_fn`` is provided.
        lang: Two-letter language code used by the default scorer.
        rescale_with_baseline: Whether to rescale the score using the provided
            baseline statistics.
        device: Optional device string forwarded to ``bert_score`` (e.g., "cpu",
            "cuda").
        name: Name of the metric.
        track: Forwarded to the base metric.
        project_name: Forwarded to the base metric.
        **scorer_kwargs: Extra keyword arguments forwarded to the default scorer.
            Ignored when ``scorer_fn`` is provided.
    """

    def __init__(
        self,
        scorer_fn: Optional[BertScoreFn] = None,
        model_type: Optional[str] = "bert-base-uncased",
        lang: Optional[str] = "en",
        rescale_with_baseline: bool = False,
        device: Optional[str] = None,
        name: str = "bertscore_metric",
        track: bool = True,
        project_name: Optional[str] = None,
        **scorer_kwargs: Any,
    ) -> None:
        super().__init__(name=name, track=track, project_name=project_name)

        # An injected scorer wins; the optional dependency is then not needed.
        if scorer_fn is not None:
            self._scorer_fn = scorer_fn
            return

        if bert_score_fn is None:  # pragma: no cover - optional dependency
            raise ImportError(
                "BERTScore metric requires the optional 'bert-score' package. "
                "Install via `pip install bert-score` or provide `scorer_fn`."
            )

        def _default_scorer(
            candidates: Sequence[str],
            references: Union[Sequence[str], Sequence[Sequence[str]]],
        ) -> Tuple[Any, Any, Any]:
            # Bind the constructor's configuration to the library call.
            return bert_score_fn(
                candidates,
                references,
                model_type=model_type,
                lang=lang,
                rescale_with_baseline=rescale_with_baseline,
                device=device,
                **scorer_kwargs,
            )

        self._scorer_fn = _default_scorer

    @staticmethod
    def _first_as_float(values: Any) -> float:
        """Return the first element of ``values`` as a float, via ``.item()`` if present."""
        head = values[0]
        return float(head.item() if hasattr(head, "item") else head)

    def score(
        self,
        output: str,
        reference: Union[str, Sequence[str], Sequence[Sequence[str]]],
        **ignored_kwargs: Any,
    ) -> score_result.ScoreResult:
        """Score ``output`` against ``reference`` with the configured scorer.

        Args:
            output: Candidate text; must not be blank.
            reference: Reference text. A single string is wrapped in a list;
                any other value is passed to the scorer unchanged.
            **ignored_kwargs: Accepted and not used.

        Returns:
            A ``ScoreResult`` whose value is the F1 score, with precision and
            recall in ``metadata``.

        Raises:
            MetricComputationError: If ``output`` is blank or ``reference`` is
                an empty sequence.
        """
        if not output.strip():
            raise MetricComputationError("Candidate is empty (BERTScore metric).")

        references: Union[Sequence[str], Sequence[Sequence[str]]]
        if isinstance(reference, str):
            references = [reference]
        elif isinstance(reference, Sequence) and len(reference) == 0:
            raise MetricComputationError("Reference is empty (BERTScore metric).")
        else:
            references = reference

        precision, recall, f1 = self._scorer_fn([output], references)

        score_value = self._first_as_float(f1)
        metadata = {
            "precision": self._first_as_float(precision),
            "recall": self._first_as_float(recall),
        }

        return score_result.ScoreResult(
            value=score_value,
            name=self.name,
            reason=f"BERTScore F1: {score_value:.4f}",
            metadata=metadata,
        )
|