PyPI - opik - Versions diffs - 1.6.4__py3-none-any.whl → 1.9.71__py3-none-any.whl - Mend

opik 1.6.4 py3-none-any.whl → 1.9.71 py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

opik/__init__.py +33 -2
opik/anonymizer/__init__.py +5 -0
opik/anonymizer/anonymizer.py +12 -0
opik/anonymizer/factory.py +80 -0
opik/anonymizer/recursive_anonymizer.py +64 -0
opik/anonymizer/rules.py +56 -0
opik/anonymizer/rules_anonymizer.py +35 -0
opik/api_objects/attachment/__init__.py +5 -0
opik/api_objects/attachment/attachment.py +20 -0
opik/api_objects/attachment/attachment_context.py +36 -0
opik/api_objects/attachment/attachments_extractor.py +153 -0
opik/api_objects/attachment/client.py +220 -0
opik/api_objects/attachment/converters.py +51 -0
opik/api_objects/attachment/decoder.py +18 -0
opik/api_objects/attachment/decoder_base64.py +83 -0
opik/api_objects/attachment/decoder_helpers.py +137 -0
opik/api_objects/conversation/__init__.py +0 -0
opik/api_objects/conversation/conversation_factory.py +43 -0
opik/api_objects/conversation/conversation_thread.py +49 -0
opik/api_objects/data_helpers.py +79 -0
opik/api_objects/dataset/dataset.py +107 -45
opik/api_objects/dataset/rest_operations.py +12 -3
opik/api_objects/experiment/experiment.py +81 -45
opik/api_objects/experiment/experiment_item.py +2 -1
opik/api_objects/experiment/experiments_client.py +64 -0
opik/api_objects/experiment/helpers.py +35 -11
opik/api_objects/experiment/rest_operations.py +88 -19
opik/api_objects/helpers.py +104 -7
opik/api_objects/local_recording.py +81 -0
opik/api_objects/opik_client.py +872 -174
opik/api_objects/opik_query_language.py +136 -18
opik/api_objects/optimization/__init__.py +3 -0
opik/api_objects/optimization/optimization.py +39 -0
opik/api_objects/prompt/__init__.py +13 -1
opik/api_objects/prompt/base_prompt.py +69 -0
opik/api_objects/prompt/base_prompt_template.py +29 -0
opik/api_objects/prompt/chat/__init__.py +1 -0
opik/api_objects/prompt/chat/chat_prompt.py +210 -0
opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
opik/api_objects/prompt/client.py +193 -41
opik/api_objects/prompt/text/__init__.py +1 -0
opik/api_objects/prompt/text/prompt.py +174 -0
opik/api_objects/prompt/text/prompt_template.py +55 -0
opik/api_objects/prompt/types.py +29 -0
opik/api_objects/rest_stream_parser.py +98 -0
opik/api_objects/search_helpers.py +89 -0
opik/api_objects/span/span_client.py +165 -45
opik/api_objects/span/span_data.py +136 -25
opik/api_objects/threads/__init__.py +0 -0
opik/api_objects/threads/threads_client.py +185 -0
opik/api_objects/trace/trace_client.py +72 -36
opik/api_objects/trace/trace_data.py +112 -26
opik/api_objects/validation_helpers.py +3 -3
opik/cli/__init__.py +5 -0
opik/cli/__main__.py +6 -0
opik/cli/configure.py +66 -0
opik/cli/exports/__init__.py +131 -0
opik/cli/exports/dataset.py +278 -0
opik/cli/exports/experiment.py +784 -0
opik/cli/exports/project.py +685 -0
opik/cli/exports/prompt.py +578 -0
opik/cli/exports/utils.py +406 -0
opik/cli/harbor.py +39 -0
opik/cli/healthcheck.py +21 -0
opik/cli/imports/__init__.py +439 -0
opik/cli/imports/dataset.py +143 -0
opik/cli/imports/experiment.py +1192 -0
opik/cli/imports/project.py +262 -0
opik/cli/imports/prompt.py +177 -0
opik/cli/imports/utils.py +280 -0
opik/cli/main.py +49 -0
opik/cli/proxy.py +93 -0
opik/cli/usage_report/__init__.py +16 -0
opik/cli/usage_report/charts.py +783 -0
opik/cli/usage_report/cli.py +274 -0
opik/cli/usage_report/constants.py +9 -0
opik/cli/usage_report/extraction.py +749 -0
opik/cli/usage_report/pdf.py +244 -0
opik/cli/usage_report/statistics.py +78 -0
opik/cli/usage_report/utils.py +235 -0
opik/config.py +62 -4
opik/configurator/configure.py +45 -6
opik/configurator/opik_rest_helpers.py +4 -1
opik/context_storage.py +164 -65
opik/datetime_helpers.py +12 -0
opik/decorator/arguments_helpers.py +9 -1
opik/decorator/base_track_decorator.py +298 -146
opik/decorator/context_manager/__init__.py +0 -0
opik/decorator/context_manager/span_context_manager.py +123 -0
opik/decorator/context_manager/trace_context_manager.py +84 -0
opik/decorator/generator_wrappers.py +3 -2
opik/decorator/inspect_helpers.py +11 -0
opik/decorator/opik_args/__init__.py +13 -0
opik/decorator/opik_args/api_classes.py +71 -0
opik/decorator/opik_args/helpers.py +120 -0
opik/decorator/span_creation_handler.py +49 -21
opik/decorator/tracker.py +9 -1
opik/dict_utils.py +3 -3
opik/environment.py +13 -1
opik/error_tracking/api.py +1 -1
opik/error_tracking/before_send.py +6 -5
opik/error_tracking/environment_details.py +29 -7
opik/error_tracking/error_filtering/filter_by_response_status_code.py +42 -0
opik/error_tracking/error_filtering/filter_chain_builder.py +14 -3
opik/evaluation/__init__.py +14 -2
opik/evaluation/engine/engine.py +280 -82
opik/evaluation/engine/evaluation_tasks_executor.py +15 -10
opik/evaluation/engine/helpers.py +34 -9
opik/evaluation/engine/metrics_evaluator.py +237 -0
opik/evaluation/engine/types.py +5 -4
opik/evaluation/evaluation_result.py +169 -2
opik/evaluation/evaluator.py +659 -58
opik/evaluation/metrics/__init__.py +121 -6
opik/evaluation/metrics/aggregated_metric.py +92 -0
opik/evaluation/metrics/arguments_helpers.py +15 -21
opik/evaluation/metrics/arguments_validator.py +38 -0
opik/evaluation/metrics/base_metric.py +20 -10
opik/evaluation/metrics/conversation/__init__.py +48 -0
opik/evaluation/metrics/conversation/conversation_thread_metric.py +79 -0
opik/evaluation/metrics/conversation/conversation_turns_factory.py +39 -0
opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
opik/evaluation/metrics/conversation/helpers.py +84 -0
opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/__init__.py +0 -0
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/metric.py +274 -0
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/schema.py +16 -0
opik/evaluation/metrics/conversation/llm_judges/conversational_coherence/templates.py +95 -0
opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
opik/evaluation/metrics/conversation/llm_judges/session_completeness/__init__.py +0 -0
opik/evaluation/metrics/conversation/llm_judges/session_completeness/metric.py +295 -0
opik/evaluation/metrics/conversation/llm_judges/session_completeness/schema.py +22 -0
opik/evaluation/metrics/conversation/llm_judges/session_completeness/templates.py +139 -0
opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
opik/evaluation/metrics/conversation/llm_judges/user_frustration/metric.py +277 -0
opik/evaluation/metrics/conversation/llm_judges/user_frustration/schema.py +16 -0
opik/evaluation/metrics/conversation/llm_judges/user_frustration/templates.py +135 -0
opik/evaluation/metrics/conversation/types.py +34 -0
opik/evaluation/metrics/conversation_types.py +9 -0
opik/evaluation/metrics/heuristics/bertscore.py +107 -0
opik/evaluation/metrics/heuristics/bleu.py +43 -16
opik/evaluation/metrics/heuristics/chrf.py +127 -0
opik/evaluation/metrics/heuristics/contains.py +50 -11
opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
opik/evaluation/metrics/heuristics/equals.py +4 -1
opik/evaluation/metrics/heuristics/gleu.py +113 -0
opik/evaluation/metrics/heuristics/is_json.py +9 -3
opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
opik/evaluation/metrics/heuristics/levenshtein_ratio.py +6 -5
opik/evaluation/metrics/heuristics/meteor.py +119 -0
opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
opik/evaluation/metrics/heuristics/readability.py +129 -0
opik/evaluation/metrics/heuristics/regex_match.py +4 -1
opik/evaluation/metrics/heuristics/rouge.py +148 -0
opik/evaluation/metrics/heuristics/sentiment.py +98 -0
opik/evaluation/metrics/heuristics/spearman.py +88 -0
opik/evaluation/metrics/heuristics/tone.py +155 -0
opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +27 -30
opik/evaluation/metrics/llm_judges/answer_relevance/parser.py +27 -0
opik/evaluation/metrics/llm_judges/answer_relevance/templates.py +10 -10
opik/evaluation/metrics/llm_judges/context_precision/metric.py +28 -31
opik/evaluation/metrics/llm_judges/context_precision/parser.py +27 -0
opik/evaluation/metrics/llm_judges/context_precision/template.py +7 -7
opik/evaluation/metrics/llm_judges/context_recall/metric.py +27 -31
opik/evaluation/metrics/llm_judges/context_recall/parser.py +27 -0
opik/evaluation/metrics/llm_judges/context_recall/template.py +7 -7
opik/evaluation/metrics/llm_judges/factuality/metric.py +7 -26
opik/evaluation/metrics/llm_judges/factuality/parser.py +35 -0
opik/evaluation/metrics/llm_judges/factuality/template.py +1 -1
opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
opik/evaluation/metrics/llm_judges/g_eval/metric.py +244 -113
opik/evaluation/metrics/llm_judges/g_eval/parser.py +161 -0
opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
opik/evaluation/metrics/llm_judges/hallucination/metric.py +23 -27
opik/evaluation/metrics/llm_judges/hallucination/parser.py +29 -0
opik/evaluation/metrics/llm_judges/hallucination/template.py +2 -4
opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
opik/evaluation/metrics/llm_judges/moderation/metric.py +23 -28
opik/evaluation/metrics/llm_judges/moderation/parser.py +27 -0
opik/evaluation/metrics/llm_judges/moderation/template.py +2 -2
opik/evaluation/metrics/llm_judges/parsing_helpers.py +26 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
opik/evaluation/metrics/llm_judges/trajectory_accuracy/__init__.py +3 -0
opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +171 -0
opik/evaluation/metrics/llm_judges/trajectory_accuracy/parser.py +38 -0
opik/evaluation/metrics/llm_judges/trajectory_accuracy/templates.py +65 -0
opik/evaluation/metrics/llm_judges/usefulness/metric.py +23 -32
opik/evaluation/metrics/llm_judges/usefulness/parser.py +28 -0
opik/evaluation/metrics/ragas_metric.py +112 -0
opik/evaluation/models/__init__.py +10 -0
opik/evaluation/models/base_model.py +140 -18
opik/evaluation/models/langchain/__init__.py +3 -0
opik/evaluation/models/langchain/langchain_chat_model.py +166 -0
opik/evaluation/models/langchain/message_converters.py +106 -0
opik/evaluation/models/langchain/opik_monitoring.py +23 -0
opik/evaluation/models/litellm/litellm_chat_model.py +186 -40
opik/evaluation/models/litellm/opik_monitor.py +24 -21
opik/evaluation/models/litellm/util.py +125 -0
opik/evaluation/models/litellm/warning_filters.py +16 -4
opik/evaluation/models/model_capabilities.py +187 -0
opik/evaluation/models/models_factory.py +25 -3
opik/evaluation/preprocessing.py +92 -0
opik/evaluation/report.py +70 -12
opik/evaluation/rest_operations.py +49 -45
opik/evaluation/samplers/__init__.py +4 -0
opik/evaluation/samplers/base_dataset_sampler.py +40 -0
opik/evaluation/samplers/random_dataset_sampler.py +48 -0
opik/evaluation/score_statistics.py +66 -0
opik/evaluation/scorers/__init__.py +4 -0
opik/evaluation/scorers/scorer_function.py +55 -0
opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
opik/evaluation/test_case.py +3 -2
opik/evaluation/test_result.py +1 -0
opik/evaluation/threads/__init__.py +0 -0
opik/evaluation/threads/context_helper.py +32 -0
opik/evaluation/threads/evaluation_engine.py +181 -0
opik/evaluation/threads/evaluation_result.py +18 -0
opik/evaluation/threads/evaluator.py +120 -0
opik/evaluation/threads/helpers.py +51 -0
opik/evaluation/types.py +9 -1
opik/exceptions.py +116 -3
opik/file_upload/__init__.py +0 -0
opik/file_upload/base_upload_manager.py +39 -0
opik/file_upload/file_upload_monitor.py +14 -0
opik/file_upload/file_uploader.py +141 -0
opik/file_upload/mime_type.py +9 -0
opik/file_upload/s3_multipart_upload/__init__.py +0 -0
opik/file_upload/s3_multipart_upload/file_parts_strategy.py +89 -0
opik/file_upload/s3_multipart_upload/s3_file_uploader.py +86 -0
opik/file_upload/s3_multipart_upload/s3_upload_error.py +29 -0
opik/file_upload/thread_pool.py +17 -0
opik/file_upload/upload_client.py +114 -0
opik/file_upload/upload_manager.py +255 -0
opik/file_upload/upload_options.py +37 -0
opik/format_helpers.py +17 -0
opik/guardrails/__init__.py +4 -0
opik/guardrails/guardrail.py +157 -0
opik/guardrails/guards/__init__.py +5 -0
opik/guardrails/guards/guard.py +17 -0
opik/guardrails/guards/pii.py +47 -0
opik/guardrails/guards/topic.py +76 -0
opik/guardrails/rest_api_client.py +34 -0
opik/guardrails/schemas.py +24 -0
opik/guardrails/tracing.py +61 -0
opik/healthcheck/__init__.py +2 -1
opik/healthcheck/checks.py +2 -2
opik/healthcheck/rich_representation.py +1 -1
opik/hooks/__init__.py +23 -0
opik/hooks/anonymizer_hook.py +36 -0
opik/hooks/httpx_client_hook.py +112 -0
opik/httpx_client.py +75 -4
opik/id_helpers.py +18 -0
opik/integrations/adk/__init__.py +14 -0
opik/integrations/adk/callback_context_info_extractors.py +32 -0
opik/integrations/adk/graph/__init__.py +0 -0
opik/integrations/adk/graph/mermaid_graph_builder.py +128 -0
opik/integrations/adk/graph/nodes.py +101 -0
opik/integrations/adk/graph/subgraph_edges_builders.py +41 -0
opik/integrations/adk/helpers.py +48 -0
opik/integrations/adk/legacy_opik_tracer.py +381 -0
opik/integrations/adk/opik_tracer.py +370 -0
opik/integrations/adk/patchers/__init__.py +4 -0
opik/integrations/adk/patchers/adk_otel_tracer/__init__.py +0 -0
opik/integrations/adk/patchers/adk_otel_tracer/llm_span_helpers.py +30 -0
opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +201 -0
opik/integrations/adk/patchers/litellm_wrappers.py +91 -0
opik/integrations/adk/patchers/llm_response_wrapper.py +105 -0
opik/integrations/adk/patchers/patchers.py +64 -0
opik/integrations/adk/recursive_callback_injector.py +126 -0
opik/integrations/aisuite/aisuite_decorator.py +8 -3
opik/integrations/aisuite/opik_tracker.py +1 -0
opik/integrations/anthropic/messages_create_decorator.py +8 -3
opik/integrations/anthropic/opik_tracker.py +0 -1
opik/integrations/bedrock/converse/__init__.py +0 -0
opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +18 -8
opik/integrations/bedrock/invoke_agent_decorator.py +12 -7
opik/integrations/bedrock/invoke_model/__init__.py +0 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
opik/integrations/bedrock/invoke_model/response_types.py +34 -0
opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
opik/integrations/bedrock/opik_tracker.py +43 -4
opik/integrations/bedrock/types.py +19 -0
opik/integrations/crewai/crewai_decorator.py +34 -56
opik/integrations/crewai/opik_tracker.py +31 -10
opik/integrations/crewai/patchers/__init__.py +5 -0
opik/integrations/crewai/patchers/flow.py +118 -0
opik/integrations/crewai/patchers/litellm_completion.py +30 -0
opik/integrations/crewai/patchers/llm_client.py +207 -0
opik/integrations/dspy/callback.py +246 -84
opik/integrations/dspy/graph.py +88 -0
opik/integrations/dspy/parsers.py +168 -0
opik/integrations/genai/encoder_extension.py +2 -6
opik/integrations/genai/generate_content_decorator.py +20 -13
opik/integrations/guardrails/guardrails_decorator.py +4 -0
opik/integrations/harbor/__init__.py +17 -0
opik/integrations/harbor/experiment_service.py +269 -0
opik/integrations/harbor/opik_tracker.py +528 -0
opik/integrations/haystack/constants.py +35 -0
opik/integrations/haystack/converters.py +1 -2
opik/integrations/haystack/opik_connector.py +28 -6
opik/integrations/haystack/opik_span_bridge.py +284 -0
opik/integrations/haystack/opik_tracer.py +124 -222
opik/integrations/langchain/__init__.py +3 -1
opik/integrations/langchain/helpers.py +96 -0
opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
opik/integrations/langchain/opik_encoder_extension.py +2 -2
opik/integrations/langchain/opik_tracer.py +641 -206
opik/integrations/langchain/provider_usage_extractors/__init__.py +5 -0
opik/integrations/langchain/provider_usage_extractors/anthropic_usage_extractor.py +101 -0
opik/integrations/langchain/provider_usage_extractors/anthropic_vertexai_usage_extractor.py +67 -0
opik/integrations/langchain/provider_usage_extractors/bedrock_usage_extractor.py +94 -0
opik/integrations/langchain/provider_usage_extractors/google_generative_ai_usage_extractor.py +109 -0
opik/integrations/langchain/provider_usage_extractors/groq_usage_extractor.py +92 -0
opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/__init__.py +15 -0
opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/helpers.py +134 -0
opik/integrations/langchain/provider_usage_extractors/langchain_run_helpers/langchain_usage.py +163 -0
opik/integrations/langchain/provider_usage_extractors/openai_usage_extractor.py +124 -0
opik/integrations/langchain/provider_usage_extractors/provider_usage_extractor_protocol.py +29 -0
opik/integrations/langchain/provider_usage_extractors/usage_extractor.py +48 -0
opik/integrations/langchain/provider_usage_extractors/vertexai_usage_extractor.py +109 -0
opik/integrations/litellm/__init__.py +5 -0
opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
opik/integrations/litellm/litellm_completion_decorator.py +242 -0
opik/integrations/litellm/opik_tracker.py +43 -0
opik/integrations/litellm/stream_patchers.py +151 -0
opik/integrations/llama_index/callback.py +179 -78
opik/integrations/llama_index/event_parsing_utils.py +29 -9
opik/integrations/openai/agents/opik_tracing_processor.py +204 -32
opik/integrations/openai/agents/span_data_parsers.py +15 -6
opik/integrations/openai/chat_completion_chunks_aggregator.py +1 -1
opik/integrations/openai/{openai_decorator.py → openai_chat_completions_decorator.py} +45 -35
opik/integrations/openai/openai_responses_decorator.py +158 -0
opik/integrations/openai/opik_tracker.py +94 -13
opik/integrations/openai/response_events_aggregator.py +36 -0
opik/integrations/openai/stream_patchers.py +125 -15
opik/integrations/sagemaker/auth.py +5 -1
opik/jsonable_encoder.py +29 -1
opik/llm_usage/base_original_provider_usage.py +15 -8
opik/llm_usage/bedrock_usage.py +8 -2
opik/llm_usage/google_usage.py +6 -1
opik/llm_usage/llm_usage_info.py +6 -0
opik/llm_usage/{openai_usage.py → openai_chat_completions_usage.py} +2 -12
opik/llm_usage/{openai_agent_usage.py → openai_responses_usage.py} +7 -15
opik/llm_usage/opik_usage.py +36 -10
opik/llm_usage/opik_usage_factory.py +35 -19
opik/logging_messages.py +19 -7
opik/message_processing/arguments_utils.py +22 -0
opik/message_processing/batching/base_batcher.py +45 -17
opik/message_processing/batching/batch_manager.py +22 -10
opik/message_processing/batching/batch_manager_constuctors.py +36 -11
opik/message_processing/batching/batchers.py +167 -44
opik/message_processing/batching/flushing_thread.py +0 -3
opik/message_processing/batching/sequence_splitter.py +50 -5
opik/message_processing/emulation/__init__.py +0 -0
opik/message_processing/emulation/emulator_message_processor.py +578 -0
opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
opik/message_processing/emulation/models.py +162 -0
opik/message_processing/encoder_helpers.py +79 -0
opik/message_processing/message_queue.py +79 -0
opik/message_processing/messages.py +154 -12
opik/message_processing/preprocessing/__init__.py +0 -0
opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
opik/message_processing/preprocessing/constants.py +1 -0
opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
opik/message_processing/preprocessing/preprocessor.py +36 -0
opik/message_processing/processors/__init__.py +0 -0
opik/message_processing/processors/attachments_extraction_processor.py +146 -0
opik/message_processing/processors/message_processors.py +92 -0
opik/message_processing/processors/message_processors_chain.py +96 -0
opik/message_processing/processors/online_message_processor.py +324 -0
opik/message_processing/queue_consumer.py +61 -13
opik/message_processing/streamer.py +102 -31
opik/message_processing/streamer_constructors.py +67 -12
opik/opik_context.py +103 -11
opik/plugins/pytest/decorator.py +2 -2
opik/plugins/pytest/experiment_runner.py +3 -2
opik/plugins/pytest/hooks.py +6 -4
opik/rate_limit/__init__.py +0 -0
opik/rate_limit/rate_limit.py +25 -0
opik/rest_api/__init__.py +643 -11
opik/rest_api/alerts/__init__.py +7 -0
opik/rest_api/alerts/client.py +667 -0
opik/rest_api/alerts/raw_client.py +1015 -0
opik/rest_api/alerts/types/__init__.py +7 -0
opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
opik/rest_api/annotation_queues/__init__.py +4 -0
opik/rest_api/annotation_queues/client.py +668 -0
opik/rest_api/annotation_queues/raw_client.py +1019 -0
opik/rest_api/attachments/__init__.py +17 -0
opik/rest_api/attachments/client.py +752 -0
opik/rest_api/attachments/raw_client.py +1125 -0
opik/rest_api/attachments/types/__init__.py +15 -0
opik/rest_api/attachments/types/attachment_list_request_entity_type.py +5 -0
opik/rest_api/attachments/types/download_attachment_request_entity_type.py +5 -0
opik/rest_api/attachments/types/start_multipart_upload_request_entity_type.py +5 -0
opik/rest_api/attachments/types/upload_attachment_request_entity_type.py +5 -0
opik/rest_api/automation_rule_evaluators/__init__.py +2 -0
opik/rest_api/automation_rule_evaluators/client.py +182 -1162
opik/rest_api/automation_rule_evaluators/raw_client.py +598 -0
opik/rest_api/chat_completions/__init__.py +2 -0
opik/rest_api/chat_completions/client.py +115 -149
opik/rest_api/chat_completions/raw_client.py +339 -0
opik/rest_api/check/__init__.py +2 -0
opik/rest_api/check/client.py +88 -106
opik/rest_api/check/raw_client.py +258 -0
opik/rest_api/client.py +112 -212
opik/rest_api/core/__init__.py +5 -0
opik/rest_api/core/api_error.py +12 -6
opik/rest_api/core/client_wrapper.py +4 -14
opik/rest_api/core/datetime_utils.py +1 -3
opik/rest_api/core/file.py +2 -5
opik/rest_api/core/http_client.py +42 -120
opik/rest_api/core/http_response.py +55 -0
opik/rest_api/core/jsonable_encoder.py +1 -4
opik/rest_api/core/pydantic_utilities.py +79 -147
opik/rest_api/core/query_encoder.py +1 -3
opik/rest_api/core/serialization.py +10 -10
opik/rest_api/dashboards/__init__.py +4 -0
opik/rest_api/dashboards/client.py +462 -0
opik/rest_api/dashboards/raw_client.py +648 -0
opik/rest_api/datasets/__init__.py +5 -0
opik/rest_api/datasets/client.py +1638 -1091
opik/rest_api/datasets/raw_client.py +3389 -0
opik/rest_api/datasets/types/__init__.py +8 -0
opik/rest_api/datasets/types/dataset_update_visibility.py +5 -0
opik/rest_api/datasets/types/dataset_write_visibility.py +5 -0
opik/rest_api/errors/__init__.py +2 -0
opik/rest_api/errors/bad_request_error.py +4 -3
opik/rest_api/errors/conflict_error.py +4 -3
opik/rest_api/errors/forbidden_error.py +4 -2
opik/rest_api/errors/not_found_error.py +4 -3
opik/rest_api/errors/not_implemented_error.py +4 -3
opik/rest_api/errors/unauthorized_error.py +4 -3
opik/rest_api/errors/unprocessable_entity_error.py +4 -3
opik/rest_api/experiments/__init__.py +5 -0
opik/rest_api/experiments/client.py +676 -752
opik/rest_api/experiments/raw_client.py +1872 -0
opik/rest_api/experiments/types/__init__.py +10 -0
opik/rest_api/experiments/types/experiment_update_status.py +5 -0
opik/rest_api/experiments/types/experiment_update_type.py +5 -0
opik/rest_api/experiments/types/experiment_write_status.py +5 -0
opik/rest_api/experiments/types/experiment_write_type.py +5 -0
opik/rest_api/feedback_definitions/__init__.py +2 -0
opik/rest_api/feedback_definitions/client.py +96 -370
opik/rest_api/feedback_definitions/raw_client.py +541 -0
opik/rest_api/feedback_definitions/types/__init__.py +2 -0
opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -3
opik/rest_api/guardrails/__init__.py +4 -0
opik/rest_api/guardrails/client.py +104 -0
opik/rest_api/guardrails/raw_client.py +102 -0
opik/rest_api/llm_provider_key/__init__.py +2 -0
opik/rest_api/llm_provider_key/client.py +166 -440
opik/rest_api/llm_provider_key/raw_client.py +643 -0
opik/rest_api/llm_provider_key/types/__init__.py +2 -0
opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
opik/rest_api/manual_evaluation/__init__.py +4 -0
opik/rest_api/manual_evaluation/client.py +347 -0
opik/rest_api/manual_evaluation/raw_client.py +543 -0
opik/rest_api/open_telemetry_ingestion/__init__.py +2 -0
opik/rest_api/open_telemetry_ingestion/client.py +38 -63
opik/rest_api/open_telemetry_ingestion/raw_client.py +88 -0
opik/rest_api/optimizations/__init__.py +7 -0
opik/rest_api/optimizations/client.py +704 -0
opik/rest_api/optimizations/raw_client.py +920 -0
opik/rest_api/optimizations/types/__init__.py +7 -0
opik/rest_api/optimizations/types/optimization_update_status.py +7 -0
opik/rest_api/projects/__init__.py +10 -1
opik/rest_api/projects/client.py +180 -855
opik/rest_api/projects/raw_client.py +1216 -0
opik/rest_api/projects/types/__init__.py +11 -4
opik/rest_api/projects/types/project_metric_request_public_interval.py +1 -3
opik/rest_api/projects/types/project_metric_request_public_metric_type.py +11 -1
opik/rest_api/projects/types/project_update_visibility.py +5 -0
opik/rest_api/projects/types/project_write_visibility.py +5 -0
opik/rest_api/prompts/__init__.py +4 -2
opik/rest_api/prompts/client.py +381 -970
opik/rest_api/prompts/raw_client.py +1634 -0
opik/rest_api/prompts/types/__init__.py +5 -1
opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
opik/rest_api/raw_client.py +156 -0
opik/rest_api/redirect/__init__.py +4 -0
opik/rest_api/redirect/client.py +375 -0
opik/rest_api/redirect/raw_client.py +566 -0
opik/rest_api/service_toggles/__init__.py +4 -0
opik/rest_api/service_toggles/client.py +91 -0
opik/rest_api/service_toggles/raw_client.py +93 -0
opik/rest_api/spans/__init__.py +2 -0
opik/rest_api/spans/client.py +659 -1354
opik/rest_api/spans/raw_client.py +2383 -0
opik/rest_api/spans/types/__init__.py +2 -0
opik/rest_api/spans/types/find_feedback_score_names_1_request_type.py +1 -3
opik/rest_api/spans/types/get_span_stats_request_type.py +1 -3
opik/rest_api/spans/types/get_spans_by_project_request_type.py +1 -3
opik/rest_api/spans/types/span_search_stream_request_public_type.py +1 -3
opik/rest_api/system_usage/__init__.py +2 -0
opik/rest_api/system_usage/client.py +157 -216
opik/rest_api/system_usage/raw_client.py +455 -0
opik/rest_api/traces/__init__.py +2 -0
opik/rest_api/traces/client.py +2102 -1625
opik/rest_api/traces/raw_client.py +4144 -0
opik/rest_api/types/__init__.py +629 -24
opik/rest_api/types/aggregation_data.py +27 -0
opik/rest_api/types/alert.py +33 -0
opik/rest_api/types/alert_alert_type.py +5 -0
opik/rest_api/types/alert_page_public.py +24 -0
opik/rest_api/types/alert_public.py +33 -0
opik/rest_api/types/alert_public_alert_type.py +5 -0
opik/rest_api/types/alert_trigger.py +27 -0
opik/rest_api/types/alert_trigger_config.py +28 -0
opik/rest_api/types/alert_trigger_config_public.py +28 -0
opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
opik/rest_api/types/alert_trigger_config_type.py +10 -0
opik/rest_api/types/alert_trigger_config_write.py +22 -0
opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
opik/rest_api/types/alert_trigger_event_type.py +19 -0
opik/rest_api/types/alert_trigger_public.py +27 -0
opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
opik/rest_api/types/alert_trigger_write.py +23 -0
opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
opik/rest_api/types/alert_write.py +28 -0
opik/rest_api/types/alert_write_alert_type.py +5 -0
opik/rest_api/types/annotation_queue.py +42 -0
opik/rest_api/types/annotation_queue_batch.py +27 -0
opik/rest_api/types/{json_schema_element.py → annotation_queue_item_ids.py} +5 -7
opik/rest_api/types/annotation_queue_page_public.py +28 -0
opik/rest_api/types/annotation_queue_public.py +38 -0
opik/rest_api/types/annotation_queue_public_scope.py +5 -0
opik/rest_api/types/{workspace_metadata.py → annotation_queue_reviewer.py} +6 -7
opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
opik/rest_api/types/annotation_queue_scope.py +5 -0
opik/rest_api/types/annotation_queue_write.py +31 -0
opik/rest_api/types/annotation_queue_write_scope.py +5 -0
opik/rest_api/types/assistant_message.py +7 -8
opik/rest_api/types/assistant_message_role.py +1 -3
opik/rest_api/types/attachment.py +22 -0
opik/rest_api/types/attachment_page.py +28 -0
opik/rest_api/types/audio_url.py +19 -0
opik/rest_api/types/audio_url_public.py +19 -0
opik/rest_api/types/audio_url_write.py +19 -0
opik/rest_api/types/automation_rule_evaluator.py +160 -0
opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +6 -6
opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +6 -6
opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +6 -6
opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
opik/rest_api/types/automation_rule_evaluator_page_public.py +6 -6
opik/rest_api/types/automation_rule_evaluator_public.py +155 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +22 -0
opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update.py +143 -0
opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +6 -6
opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +22 -0
opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +6 -6
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +6 -6
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +6 -6
opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +6 -6
opik/rest_api/types/automation_rule_evaluator_write.py +143 -0
opik/rest_api/types/avg_value_stat_public.py +3 -5
opik/rest_api/types/batch_delete.py +3 -5
opik/rest_api/types/batch_delete_by_project.py +20 -0
opik/rest_api/types/bi_information.py +3 -5
opik/rest_api/types/bi_information_response.py +4 -6
opik/rest_api/types/boolean_feedback_definition.py +25 -0
opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
opik/rest_api/types/boolean_feedback_detail.py +29 -0
opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
opik/rest_api/types/categorical_feedback_definition.py +5 -7
opik/rest_api/types/categorical_feedback_definition_create.py +4 -6
opik/rest_api/types/categorical_feedback_definition_public.py +5 -7
opik/rest_api/types/categorical_feedback_definition_update.py +4 -6
opik/rest_api/types/categorical_feedback_detail.py +3 -5
opik/rest_api/types/categorical_feedback_detail_create.py +3 -5
opik/rest_api/types/categorical_feedback_detail_public.py +3 -5
opik/rest_api/types/categorical_feedback_detail_update.py +3 -5
opik/rest_api/types/chat_completion_choice.py +4 -6
opik/rest_api/types/chat_completion_response.py +5 -6
opik/rest_api/types/check.py +22 -0
opik/rest_api/types/{json_node_compare.py → check_name.py} +1 -1
opik/rest_api/types/check_public.py +22 -0
opik/rest_api/types/check_public_name.py +5 -0
opik/rest_api/types/check_public_result.py +5 -0
opik/rest_api/types/check_result.py +5 -0
opik/rest_api/types/chunked_output_json_node.py +4 -6
opik/rest_api/types/chunked_output_json_node_public.py +4 -6
opik/rest_api/types/chunked_output_json_node_public_type.py +6 -10
opik/rest_api/types/chunked_output_json_node_type.py +6 -10
opik/rest_api/types/column.py +8 -10
opik/rest_api/types/column_compare.py +8 -10
opik/rest_api/types/column_public.py +8 -10
opik/rest_api/types/column_types_item.py +1 -3
opik/rest_api/types/comment.py +4 -6
opik/rest_api/types/comment_compare.py +4 -6
opik/rest_api/types/comment_public.py +4 -6
opik/rest_api/types/complete_multipart_upload_request.py +33 -0
opik/rest_api/types/complete_multipart_upload_request_entity_type.py +5 -0
opik/rest_api/types/completion_tokens_details.py +3 -5
opik/rest_api/types/count_value_stat_public.py +3 -5
opik/rest_api/types/dashboard_page_public.py +24 -0
opik/rest_api/types/dashboard_public.py +30 -0
opik/rest_api/types/data_point_double.py +21 -0
opik/rest_api/types/data_point_number_public.py +3 -5
opik/rest_api/types/dataset.py +14 -6
opik/rest_api/types/dataset_expansion.py +42 -0
opik/rest_api/types/dataset_expansion_response.py +39 -0
opik/rest_api/types/dataset_item.py +9 -8
opik/rest_api/types/dataset_item_batch.py +3 -5
opik/rest_api/types/dataset_item_changes_public.py +5 -0
opik/rest_api/types/dataset_item_compare.py +9 -8
opik/rest_api/types/dataset_item_compare_source.py +1 -3
opik/rest_api/types/dataset_item_filter.py +27 -0
opik/rest_api/types/dataset_item_filter_operator.py +21 -0
opik/rest_api/types/dataset_item_page_compare.py +10 -7
opik/rest_api/types/dataset_item_page_public.py +10 -7
opik/rest_api/types/dataset_item_public.py +9 -8
opik/rest_api/types/dataset_item_public_source.py +1 -3
opik/rest_api/types/dataset_item_source.py +1 -3
opik/rest_api/types/dataset_item_update.py +39 -0
opik/rest_api/types/dataset_item_write.py +5 -6
opik/rest_api/types/dataset_item_write_source.py +1 -3
opik/rest_api/types/dataset_page_public.py +9 -6
opik/rest_api/types/dataset_public.py +14 -6
opik/rest_api/types/dataset_public_status.py +5 -0
opik/rest_api/types/dataset_public_visibility.py +5 -0
opik/rest_api/types/dataset_status.py +5 -0
opik/rest_api/types/dataset_version_diff.py +22 -0
opik/rest_api/types/dataset_version_diff_stats.py +24 -0
opik/rest_api/types/dataset_version_page_public.py +23 -0
opik/rest_api/types/dataset_version_public.py +59 -0
opik/rest_api/types/dataset_version_summary.py +46 -0
opik/rest_api/types/dataset_version_summary_public.py +46 -0
opik/rest_api/types/dataset_visibility.py +5 -0
opik/rest_api/types/delete_attachments_request.py +23 -0
opik/rest_api/types/delete_attachments_request_entity_type.py +5 -0
opik/rest_api/types/delete_feedback_score.py +4 -5
opik/rest_api/types/delete_ids_holder.py +19 -0
opik/rest_api/types/delta.py +7 -9
opik/rest_api/types/error_count_with_deviation.py +21 -0
opik/rest_api/types/error_count_with_deviation_detailed.py +21 -0
opik/rest_api/types/error_info.py +3 -5
opik/rest_api/types/error_info_experiment_item_bulk_write_view.py +21 -0
opik/rest_api/types/error_info_public.py +3 -5
opik/rest_api/types/error_info_write.py +3 -5
opik/rest_api/types/error_message.py +3 -5
opik/rest_api/types/error_message_detail.py +3 -5
opik/rest_api/types/error_message_detailed.py +3 -5
opik/rest_api/types/error_message_public.py +3 -5
opik/rest_api/types/experiment.py +21 -10
opik/rest_api/types/experiment_group_aggregations_response.py +20 -0
opik/rest_api/types/experiment_group_response.py +22 -0
opik/rest_api/types/experiment_item.py +14 -11
opik/rest_api/types/experiment_item_bulk_record.py +27 -0
opik/rest_api/types/experiment_item_bulk_record_experiment_item_bulk_write_view.py +27 -0
opik/rest_api/types/experiment_item_bulk_upload.py +27 -0
opik/rest_api/types/experiment_item_compare.py +14 -11
opik/rest_api/types/experiment_item_compare_trace_visibility_mode.py +5 -0
opik/rest_api/types/experiment_item_public.py +6 -6
opik/rest_api/types/experiment_item_public_trace_visibility_mode.py +5 -0
opik/rest_api/types/experiment_item_trace_visibility_mode.py +5 -0
opik/rest_api/types/experiment_page_public.py +9 -6
opik/rest_api/types/experiment_public.py +21 -10
opik/rest_api/types/experiment_public_status.py +5 -0
opik/rest_api/types/experiment_public_type.py +5 -0
opik/rest_api/types/experiment_score.py +20 -0
opik/rest_api/types/experiment_score_public.py +20 -0
opik/rest_api/types/experiment_score_write.py +20 -0
opik/rest_api/types/experiment_status.py +5 -0
opik/rest_api/types/experiment_type.py +5 -0
opik/rest_api/types/export_trace_service_request.py +5 -0
opik/rest_api/types/feedback.py +40 -27
opik/rest_api/types/feedback_create.py +27 -13
opik/rest_api/types/feedback_definition_page_public.py +4 -6
opik/rest_api/types/feedback_object_public.py +40 -27
opik/rest_api/types/feedback_public.py +40 -27
opik/rest_api/types/feedback_score.py +7 -7
opik/rest_api/types/feedback_score_average.py +3 -5
opik/rest_api/types/feedback_score_average_detailed.py +3 -5
opik/rest_api/types/feedback_score_average_public.py +3 -5
opik/rest_api/types/feedback_score_batch.py +4 -6
opik/rest_api/types/feedback_score_batch_item.py +6 -6
opik/rest_api/types/feedback_score_batch_item_source.py +1 -3
opik/rest_api/types/feedback_score_batch_item_thread.py +32 -0
opik/rest_api/types/feedback_score_batch_item_thread_source.py +5 -0
opik/rest_api/types/feedback_score_compare.py +7 -7
opik/rest_api/types/feedback_score_compare_source.py +1 -3
opik/rest_api/types/feedback_score_experiment_item_bulk_write_view.py +31 -0
opik/rest_api/types/feedback_score_experiment_item_bulk_write_view_source.py +5 -0
opik/rest_api/types/feedback_score_names.py +4 -6
opik/rest_api/types/feedback_score_public.py +11 -7
opik/rest_api/types/feedback_score_public_source.py +1 -3
opik/rest_api/types/feedback_score_source.py +1 -3
opik/rest_api/types/feedback_update.py +27 -13
opik/rest_api/types/function.py +4 -7
opik/rest_api/types/function_call.py +3 -5
opik/rest_api/types/group_content.py +19 -0
opik/rest_api/types/group_content_with_aggregations.py +21 -0
opik/rest_api/types/group_detail.py +19 -0
opik/rest_api/types/group_details.py +20 -0
opik/rest_api/types/guardrail.py +34 -0
opik/rest_api/types/guardrail_batch.py +20 -0
opik/rest_api/types/guardrail_name.py +5 -0
opik/rest_api/types/guardrail_result.py +5 -0
opik/rest_api/types/guardrail_write.py +33 -0
opik/rest_api/types/guardrail_write_name.py +5 -0
opik/rest_api/types/guardrail_write_result.py +5 -0
opik/rest_api/types/guardrails_validation.py +21 -0
opik/rest_api/types/guardrails_validation_public.py +21 -0
opik/rest_api/types/ids_holder.py +19 -0
opik/rest_api/types/image_url.py +20 -0
opik/rest_api/types/image_url_public.py +20 -0
opik/rest_api/types/image_url_write.py +20 -0
opik/rest_api/types/json_list_string.py +7 -0
opik/rest_api/types/json_list_string_compare.py +7 -0
opik/rest_api/types/json_list_string_experiment_item_bulk_write_view.py +7 -0
opik/rest_api/types/json_list_string_public.py +7 -0
opik/rest_api/types/json_list_string_write.py +7 -0
opik/rest_api/types/json_schema.py +5 -8
opik/rest_api/types/llm_as_judge_code.py +8 -12
opik/rest_api/types/llm_as_judge_code_public.py +8 -12
opik/rest_api/types/llm_as_judge_code_write.py +8 -12
opik/rest_api/types/llm_as_judge_message.py +9 -7
opik/rest_api/types/llm_as_judge_message_content.py +26 -0
opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
opik/rest_api/types/llm_as_judge_message_public.py +9 -7
opik/rest_api/types/llm_as_judge_message_public_role.py +1 -1
opik/rest_api/types/llm_as_judge_message_role.py +1 -1
opik/rest_api/types/llm_as_judge_message_write.py +9 -7
opik/rest_api/types/llm_as_judge_message_write_role.py +1 -1
opik/rest_api/types/llm_as_judge_model_parameters.py +6 -5
opik/rest_api/types/llm_as_judge_model_parameters_public.py +6 -5
opik/rest_api/types/llm_as_judge_model_parameters_write.py +6 -5
opik/rest_api/types/llm_as_judge_output_schema.py +4 -6
opik/rest_api/types/llm_as_judge_output_schema_public.py +4 -6
opik/rest_api/types/llm_as_judge_output_schema_public_type.py +1 -3
opik/rest_api/types/llm_as_judge_output_schema_type.py +1 -3
opik/rest_api/types/llm_as_judge_output_schema_write.py +4 -6
opik/rest_api/types/llm_as_judge_output_schema_write_type.py +1 -3
opik/rest_api/types/log_item.py +5 -7
opik/rest_api/types/log_item_level.py +1 -3
opik/rest_api/types/log_page.py +4 -6
opik/rest_api/types/manual_evaluation_request.py +38 -0
opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
opik/rest_api/types/manual_evaluation_response.py +27 -0
opik/rest_api/types/multipart_upload_part.py +20 -0
opik/rest_api/types/numerical_feedback_definition.py +5 -7
opik/rest_api/types/numerical_feedback_definition_create.py +4 -6
opik/rest_api/types/numerical_feedback_definition_public.py +5 -7
opik/rest_api/types/numerical_feedback_definition_update.py +4 -6
opik/rest_api/types/numerical_feedback_detail.py +3 -5
opik/rest_api/types/numerical_feedback_detail_create.py +3 -5
opik/rest_api/types/numerical_feedback_detail_public.py +3 -5
opik/rest_api/types/numerical_feedback_detail_update.py +3 -5
opik/rest_api/types/optimization.py +37 -0
opik/rest_api/types/optimization_page_public.py +28 -0
opik/rest_api/types/optimization_public.py +37 -0
opik/rest_api/types/optimization_public_status.py +7 -0
opik/rest_api/types/optimization_status.py +7 -0
opik/rest_api/types/optimization_studio_config.py +27 -0
opik/rest_api/types/optimization_studio_config_public.py +27 -0
opik/rest_api/types/optimization_studio_config_write.py +27 -0
opik/rest_api/types/optimization_studio_log.py +22 -0
opik/rest_api/types/optimization_write.py +30 -0
opik/rest_api/types/optimization_write_status.py +7 -0
opik/rest_api/types/page_columns.py +4 -6
opik/rest_api/types/percentage_value_stat_public.py +4 -6
opik/rest_api/types/percentage_values.py +8 -16
opik/rest_api/types/percentage_values_detailed.py +8 -16
opik/rest_api/types/percentage_values_public.py +8 -16
opik/rest_api/types/project.py +12 -7
opik/rest_api/types/project_detailed.py +12 -7
opik/rest_api/types/project_detailed_visibility.py +5 -0
opik/rest_api/types/project_metric_response_public.py +5 -9
opik/rest_api/types/project_metric_response_public_interval.py +1 -3
opik/rest_api/types/project_metric_response_public_metric_type.py +11 -1
opik/rest_api/types/project_page_public.py +8 -10
opik/rest_api/types/project_public.py +6 -6
opik/rest_api/types/project_public_visibility.py +5 -0
opik/rest_api/types/project_reference.py +31 -0
opik/rest_api/types/project_reference_public.py +31 -0
opik/rest_api/types/project_stat_item_object_public.py +8 -17
opik/rest_api/types/project_stats_public.py +4 -6
opik/rest_api/types/project_stats_summary.py +4 -6
opik/rest_api/types/project_stats_summary_item.py +9 -6
opik/rest_api/types/project_visibility.py +5 -0
opik/rest_api/types/prompt.py +12 -7
opik/rest_api/types/prompt_detail.py +12 -7
opik/rest_api/types/prompt_detail_template_structure.py +5 -0
opik/rest_api/types/prompt_page_public.py +9 -6
opik/rest_api/types/prompt_public.py +11 -6
opik/rest_api/types/prompt_public_template_structure.py +5 -0
opik/rest_api/types/prompt_template_structure.py +5 -0
opik/rest_api/types/prompt_tokens_details.py +19 -0
opik/rest_api/types/prompt_version.py +7 -6
opik/rest_api/types/prompt_version_detail.py +7 -6
opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
opik/rest_api/types/prompt_version_link.py +4 -5
opik/rest_api/types/prompt_version_link_public.py +4 -5
opik/rest_api/types/prompt_version_link_write.py +3 -5
opik/rest_api/types/prompt_version_page_public.py +9 -6
opik/rest_api/types/prompt_version_public.py +7 -6
opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
opik/rest_api/types/prompt_version_template_structure.py +5 -0
opik/rest_api/types/prompt_version_update.py +33 -0
opik/rest_api/types/provider_api_key.py +18 -8
opik/rest_api/types/provider_api_key_page_public.py +27 -0
opik/rest_api/types/provider_api_key_provider.py +1 -1
opik/rest_api/types/provider_api_key_public.py +18 -8
opik/rest_api/types/provider_api_key_public_provider.py +1 -1
opik/rest_api/types/response_format.py +5 -7
opik/rest_api/types/response_format_type.py +1 -3
opik/rest_api/types/result.py +21 -0
opik/rest_api/types/results_number_public.py +4 -6
opik/rest_api/types/score_name.py +4 -5
opik/rest_api/types/service_toggles_config.py +44 -0
opik/rest_api/types/span.py +13 -15
opik/rest_api/types/span_batch.py +4 -6
opik/rest_api/types/span_enrichment_options.py +31 -0
opik/rest_api/types/span_experiment_item_bulk_write_view.py +39 -0
opik/rest_api/types/span_experiment_item_bulk_write_view_type.py +5 -0
opik/rest_api/types/span_filter.py +23 -0
opik/rest_api/types/span_filter_operator.py +21 -0
opik/rest_api/types/span_filter_public.py +4 -6
opik/rest_api/types/span_filter_public_operator.py +2 -0
opik/rest_api/types/span_filter_write.py +23 -0
opik/rest_api/types/span_filter_write_operator.py +21 -0
opik/rest_api/types/span_llm_as_judge_code.py +27 -0
opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
opik/rest_api/types/span_page_public.py +9 -6
opik/rest_api/types/span_public.py +19 -16
opik/rest_api/types/span_public_type.py +1 -1
opik/rest_api/types/span_type.py +1 -1
opik/rest_api/types/span_update.py +46 -0
opik/rest_api/types/span_update_type.py +5 -0
opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
opik/rest_api/types/span_write.py +13 -14
opik/rest_api/types/span_write_type.py +1 -1
opik/rest_api/types/spans_count_response.py +20 -0
opik/rest_api/types/start_multipart_upload_response.py +20 -0
opik/rest_api/types/stream_options.py +3 -5
opik/rest_api/types/studio_evaluation.py +20 -0
opik/rest_api/types/studio_evaluation_public.py +20 -0
opik/rest_api/types/studio_evaluation_write.py +20 -0
opik/rest_api/types/studio_llm_model.py +21 -0
opik/rest_api/types/studio_llm_model_public.py +21 -0
opik/rest_api/types/studio_llm_model_write.py +21 -0
opik/rest_api/types/studio_message.py +20 -0
opik/rest_api/types/studio_message_public.py +20 -0
opik/rest_api/types/studio_message_write.py +20 -0
opik/rest_api/types/studio_metric.py +21 -0
opik/rest_api/types/studio_metric_public.py +21 -0
opik/rest_api/types/studio_metric_write.py +21 -0
opik/rest_api/types/studio_optimizer.py +21 -0
opik/rest_api/types/studio_optimizer_public.py +21 -0
opik/rest_api/types/studio_optimizer_write.py +21 -0
opik/rest_api/types/studio_prompt.py +20 -0
opik/rest_api/types/studio_prompt_public.py +20 -0
opik/rest_api/types/studio_prompt_write.py +20 -0
opik/rest_api/types/tool.py +4 -6
opik/rest_api/types/tool_call.py +4 -6
opik/rest_api/types/trace.py +26 -12
opik/rest_api/types/trace_batch.py +4 -6
opik/rest_api/types/trace_count_response.py +4 -6
opik/rest_api/types/trace_enrichment_options.py +32 -0
opik/rest_api/types/trace_experiment_item_bulk_write_view.py +41 -0
opik/rest_api/types/trace_filter.py +23 -0
opik/rest_api/types/trace_filter_operator.py +21 -0
opik/rest_api/types/trace_filter_public.py +23 -0
opik/rest_api/types/trace_filter_public_operator.py +21 -0
opik/rest_api/types/trace_filter_write.py +23 -0
opik/rest_api/types/trace_filter_write_operator.py +21 -0
opik/rest_api/types/trace_page_public.py +8 -10
opik/rest_api/types/trace_public.py +27 -13
opik/rest_api/types/trace_public_visibility_mode.py +5 -0
opik/rest_api/types/trace_thread.py +18 -9
opik/rest_api/types/trace_thread_filter.py +23 -0
opik/rest_api/types/trace_thread_filter_operator.py +21 -0
opik/rest_api/types/trace_thread_filter_public.py +23 -0
opik/rest_api/types/trace_thread_filter_public_operator.py +21 -0
opik/rest_api/types/trace_thread_filter_write.py +23 -0
opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
opik/rest_api/types/trace_thread_identifier.py +22 -0
opik/rest_api/types/trace_thread_llm_as_judge_code.py +26 -0
opik/rest_api/types/trace_thread_llm_as_judge_code_public.py +26 -0
opik/rest_api/types/trace_thread_llm_as_judge_code_write.py +26 -0
opik/rest_api/types/trace_thread_page.py +9 -6
opik/rest_api/types/trace_thread_status.py +5 -0
opik/rest_api/types/trace_thread_update.py +19 -0
opik/rest_api/types/trace_thread_user_defined_metric_python_code.py +19 -0
opik/rest_api/types/trace_thread_user_defined_metric_python_code_public.py +19 -0
opik/rest_api/types/trace_thread_user_defined_metric_python_code_write.py +19 -0
opik/rest_api/types/trace_update.py +39 -0
opik/rest_api/types/trace_visibility_mode.py +5 -0
opik/rest_api/types/trace_write.py +10 -11
opik/rest_api/types/usage.py +6 -6
opik/rest_api/types/user_defined_metric_python_code.py +3 -5
opik/rest_api/types/user_defined_metric_python_code_public.py +3 -5
opik/rest_api/types/user_defined_metric_python_code_write.py +3 -5
opik/rest_api/types/value_entry.py +27 -0
opik/rest_api/types/value_entry_compare.py +27 -0
opik/rest_api/types/value_entry_compare_source.py +5 -0
opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +27 -0
opik/rest_api/types/value_entry_experiment_item_bulk_write_view_source.py +5 -0
opik/rest_api/types/value_entry_public.py +27 -0
opik/rest_api/types/value_entry_public_source.py +5 -0
opik/rest_api/types/value_entry_source.py +5 -0
opik/rest_api/types/video_url.py +19 -0
opik/rest_api/types/video_url_public.py +19 -0
opik/rest_api/types/video_url_write.py +19 -0
opik/rest_api/types/webhook.py +28 -0
opik/rest_api/types/webhook_examples.py +19 -0
opik/rest_api/types/webhook_public.py +28 -0
opik/rest_api/types/webhook_test_result.py +23 -0
opik/rest_api/types/webhook_test_result_status.py +5 -0
opik/rest_api/types/webhook_write.py +23 -0
opik/rest_api/types/welcome_wizard_tracking.py +22 -0
opik/rest_api/types/workspace_configuration.py +27 -0
opik/rest_api/types/workspace_metric_request.py +24 -0
opik/rest_api/types/workspace_metric_response.py +20 -0
opik/rest_api/types/workspace_metrics_summary_request.py +23 -0
opik/rest_api/types/workspace_metrics_summary_response.py +20 -0
opik/rest_api/types/workspace_name_holder.py +19 -0
opik/rest_api/types/workspace_spans_count.py +20 -0
opik/rest_api/types/workspace_trace_count.py +3 -5
opik/rest_api/welcome_wizard/__init__.py +4 -0
opik/rest_api/welcome_wizard/client.py +195 -0
opik/rest_api/welcome_wizard/raw_client.py +208 -0
opik/rest_api/workspaces/__init__.py +2 -0
opik/rest_api/workspaces/client.py +550 -77
opik/rest_api/workspaces/raw_client.py +923 -0
opik/rest_client_configurator/api.py +1 -0
opik/rest_client_configurator/retry_decorator.py +1 -0
opik/s3_httpx_client.py +67 -0
opik/simulation/__init__.py +6 -0
opik/simulation/simulated_user.py +99 -0
opik/simulation/simulator.py +108 -0
opik/synchronization.py +11 -24
opik/tracing_runtime_config.py +48 -0
opik/types.py +48 -2
opik/url_helpers.py +13 -3
opik/validation/chat_prompt_messages.py +241 -0
opik/validation/feedback_score.py +4 -5
opik/validation/parameter.py +122 -0
opik/validation/parameters_validator.py +175 -0
opik/validation/validator.py +30 -2
opik/validation/validator_helpers.py +147 -0
opik-1.9.71.dist-info/METADATA +370 -0
opik-1.9.71.dist-info/RECORD +1110 -0
{opik-1.6.4.dist-info → opik-1.9.71.dist-info}/WHEEL +1 -1
opik-1.9.71.dist-info/licenses/LICENSE +203 -0
opik/api_objects/prompt/prompt.py +0 -107
opik/api_objects/prompt/prompt_template.py +0 -35
opik/cli.py +0 -193
opik/evaluation/metrics/models.py +0 -8
opik/hooks.py +0 -13
opik/integrations/bedrock/chunks_aggregator.py +0 -55
opik/integrations/bedrock/helpers.py +0 -8
opik/integrations/langchain/google_run_helpers.py +0 -75
opik/integrations/langchain/openai_run_helpers.py +0 -122
opik/message_processing/message_processors.py +0 -203
opik/rest_api/types/delta_role.py +0 -7
opik/rest_api/types/json_object_schema.py +0 -34
opik-1.6.4.dist-info/METADATA +0 -270
opik-1.6.4.dist-info/RECORD +0 -507
/opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
{opik-1.6.4.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
{opik-1.6.4.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0

opik/evaluation/metrics/llm_judges/factuality/parser.py ADDED Viewed

@@ -0,0 +1,35 @@
+import logging
+import opik.logging_messages as logging_messages
+import opik.exceptions as exceptions
+from opik.evaluation.metrics import score_result
+from opik.evaluation.metrics.llm_judges import parsing_helpers
+LOGGER = logging.getLogger(__name__)
+def parse_model_output(content: str, name: str) -> score_result.ScoreResult:
+    try:
+        list_content = parsing_helpers.extract_json_content_or_raise(content)
+        reason = ""
+        score = 0.0
+        for claim in list_content:
+            claim_score = float(claim["score"])
+            if not (0.0 <= claim_score <= 1.0):
+                raise exceptions.MetricComputationError(
+                    f"Factuality score must be between 0.0 and 1.0, got {claim_score}"
+                )
+            score += claim_score
+            reason += claim["reason"] + "\n"
+        score /= len(list_content)
+        return score_result.ScoreResult(name=name, value=score, reason=reason)
+    except Exception as e:
+        LOGGER.error(f"Failed to parse model output: {e}", exc_info=True)
+        raise exceptions.MetricComputationError(
+            logging_messages.FACTUALITY_SCORE_CALC_FAILED
+        )

opik/evaluation/metrics/llm_judges/factuality/template.py CHANGED Viewed

@@ -46,7 +46,7 @@ def generate_query(
 ) -> str:
     examples_str = "\n\n".join(
         [
-            f"Example {i+1}:\n"
+            f"Example {i + 1}:\n"
             f"Input: {example['input']}\n"
             f"Output: {example['output']}\n"
             f"Contexts: {example['context']}\n"

opik/evaluation/metrics/llm_judges/g_eval/__init__.py CHANGED Viewed

@@ -0,0 +1,5 @@
+"""Public exports for the GEval metric package."""
+from .metric import GEval
+__all__ = ["GEval"]

opik/evaluation/metrics/llm_judges/g_eval/metric.py CHANGED Viewed

@@ -1,17 +1,13 @@
-import math
-from functools import cached_property
-from typing import Any, Optional, TYPE_CHECKING, Union
+from collections import OrderedDict
+from threading import Lock
+from typing import Any, Dict, Optional, Tuple, Union
 import pydantic
-import json
-if TYPE_CHECKING:
-    from litellm.types.utils import ModelResponse
 from opik.evaluation.metrics import base_metric, score_result
 from opik.evaluation.models import base_model, models_factory
-from opik.logging_messages import GEVAL_SCORE_CALC_FAILED
-from .template import G_EVAL_COT_TEMPLATE, G_EVAL_QUERY_TEMPLATE
-from opik import exceptions
+from opik.evaluation import models
+from . import template, parser
+from .presets import GEVAL_PRESETS
 class GEvalScoreFormat(pydantic.BaseModel):
@@ -19,56 +15,175 @@ class GEvalScoreFormat(pydantic.BaseModel):
     reason: str
+def _freeze_for_cache(value: Any) -> Any:
+    """Convert nested structures into hashable representations for caching."""
+    if isinstance(value, dict):
+        return tuple(
+            sorted((key, _freeze_for_cache(val)) for key, val in value.items())
+        )
+    if isinstance(value, (list, tuple)):
+        return tuple(_freeze_for_cache(item) for item in value)
+    if isinstance(value, set):
+        return tuple(sorted(_freeze_for_cache(item) for item in value))
+    return value
 class GEval(base_metric.BaseMetric):
+    """
+    Generalised evaluation metric that prompts an LLM to grade another LLM output.
+    GEval builds a reusable chain-of-thought using the provided
+    ``task_introduction`` and ``evaluation_criteria`` prompts, then requests a
+    final score and rationale for each evaluated output.
+    Args:
+        task_introduction: Instruction describing the evaluator's persona/purpose.
+        evaluation_criteria: Detailed rubric presented to the evaluator.
+        model: Optional model identifier or ``OpikBaseModel`` for the judge.
+        name: Display name for the metric result. Defaults to ``"g_eval_metric"``.
+        track: Whether to automatically track metric results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the judge model.
+        seed: Optional seed for reproducible generation (if supported by the model).
+    Example:
+        >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEval
+        >>> metric = GEval(
+        ...     task_introduction="You evaluate politeness of responses.",
+        ...     evaluation_criteria="Score from 1 (rude) to 5 (very polite).",
+        ...     model="gpt-4",
+        ... )
+        >>> result = metric.score(output="Thanks so much for your help!")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.9
+    """
+    _CHAIN_OF_THOUGHT_CACHE: "OrderedDict[Tuple[str, str, str, Any], str]" = (
+        OrderedDict()
+    )
+    _CHAIN_OF_THOUGHT_LOCK: Lock = Lock()
+    _MAX_CHAIN_OF_THOUGHT_CACHE = 128
     def __init__(
         self,
         task_introduction: str,
         evaluation_criteria: str,
-        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
+        model: Optional[Union[str, models.base_model.OpikBaseModel]] = None,
         name: str = "g_eval_metric",
         track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+        seed: Optional[int] = None,
     ):
-        """
-        A metric that evaluates an LLM output based on chain-of-thought built with the evaluation criteria provided
-        by the user.
-        For more details see the original paper: https://arxiv.org/pdf/2303.16634
-        Args:
-            task_introduction: An instruction for LLM used to generate an evaluation chain-of-thought and in evaluation call itself.
-                `opik.evaluation.models.LiteLLMChatModel` is used by default.
-            evaluation_criteria: The main task for G-Eval metric written in human language.
-            model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
-            name: The name of the metric.
-            track: Whether to track the metric. Defaults to True.
-        """
         super().__init__(
             name=name,
             track=track,
+            project_name=project_name,
         )
-        self._init_model(model)
         self.task_introduction = task_introduction
         self.evaluation_criteria = evaluation_criteria
+        self._seed = seed
+        self._log_probs_supported = False
+        self._init_model(model, temperature=temperature)
-    @cached_property
     def llm_chain_of_thought(self) -> str:
-        prompt = G_EVAL_COT_TEMPLATE.format(
+        cache_key = self._chain_of_thought_cache_key()
+        cached = self._get_cached_chain_of_thought(cache_key)
+        if cached is not None:
+            return cached
+        prompt = template.G_EVAL_COT_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
         )
-        return self._model.generate_string(input=prompt)
+        generated = self._model.generate_string(input=prompt)
+        self._store_chain_of_thought(cache_key, generated)
+        return generated
+    async def allm_chain_of_thought(self) -> str:
+        cache_key = self._chain_of_thought_cache_key()
+        cached = self._get_cached_chain_of_thought(cache_key)
+        if cached is not None:
+            return cached
+        prompt = template.G_EVAL_COT_TEMPLATE.format(
+            task_introduction=self.task_introduction,
+            evaluation_criteria=self.evaluation_criteria,
+        )
+        generated = await self._model.agenerate_string(input=prompt)
+        self._store_chain_of_thought(cache_key, generated)
+        return generated
     def _init_model(
-        self, model: Optional[Union[str, base_model.OpikBaseModel]]
+        self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
     ) -> None:
         if isinstance(model, base_model.OpikBaseModel):
             self._model = model
         else:
-            self._model = models_factory.get(
-                model_name=model,
-                must_support_arguments=["logprobs", "top_logprobs"],
-            )
+            model_kwargs = {"temperature": temperature}
+            if self._seed is not None:
+                model_kwargs["seed"] = self._seed
+            self._model = models_factory.get(model_name=model, **model_kwargs)
+        if (
+            hasattr(self._model, "supported_params")
+            and "logprobs" in self._model.supported_params
+            and "top_logprobs" in self._model.supported_params
+        ):
+            self._log_probs_supported = True
+    @classmethod
+    def _get_cached_chain_of_thought(
+        cls, cache_key: Tuple[str, str, str, Any]
+    ) -> Optional[str]:
+        with cls._CHAIN_OF_THOUGHT_LOCK:
+            value = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
+            if value is not None:
+                cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
+            return value
+    @classmethod
+    def _store_chain_of_thought(
+        cls, cache_key: Tuple[str, str, str, Any], value: str
+    ) -> None:
+        with cls._CHAIN_OF_THOUGHT_LOCK:
+            existing = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
+            if existing is not None:
+                cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
+                return
+            cls._CHAIN_OF_THOUGHT_CACHE[cache_key] = value
+            cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
+            while len(cls._CHAIN_OF_THOUGHT_CACHE) > cls._MAX_CHAIN_OF_THOUGHT_CACHE:
+                cls._CHAIN_OF_THOUGHT_CACHE.popitem(last=False)
+    def _chain_of_thought_cache_key(self) -> Tuple[str, str, str, Any]:
+        model_name = getattr(self._model, "model_name", "unknown")
+        return (
+            self.task_introduction,
+            self.evaluation_criteria,
+            model_name,
+            self._model_cache_fingerprint(),
+        )
+    def _model_cache_fingerprint(self) -> Any:
+        fingerprint_candidate = getattr(self._model, "cache_fingerprint", None)
+        if callable(fingerprint_candidate):
+            try:
+                fingerprint = fingerprint_candidate()
+            except Exception:
+                fingerprint = None
+            else:
+                return _freeze_for_cache(fingerprint)
+        completion_kwargs = getattr(self._model, "_completion_kwargs", None)
+        if isinstance(completion_kwargs, dict):
+            return _freeze_for_cache(completion_kwargs)
+        return id(self._model)
     def score(
         self,
@@ -86,10 +201,10 @@ class GEval(base_metric.BaseMetric):
             score_result.ScoreResult: A ScoreResult object containing the G-Eval score
             (between 0.0 and 1.0) and a reason for the score.
         """
-        llm_query = G_EVAL_QUERY_TEMPLATE.format(
+        llm_query = template.G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
-            chain_of_thought=self.llm_chain_of_thought,
+            chain_of_thought=self.llm_chain_of_thought(),
             input=output,
         )
@@ -100,33 +215,44 @@ class GEval(base_metric.BaseMetric):
             },
         ]
-        model_output = self._model.generate_provider_response(
-            messages=request,
-            logprobs=True,
-            top_logprobs=20,
-            response_format=GEvalScoreFormat,
+        if isinstance(self._model, models.LiteLLMChatModel):
+            provider_kwargs: Dict[str, Any] = {
+                "response_format": GEvalScoreFormat,
+            }
+            if self._log_probs_supported:
+                provider_kwargs["logprobs"] = True
+                provider_kwargs["top_logprobs"] = 20
+            with base_model.get_provider_response(
+                model_provider=self._model,
+                messages=request,
+                **provider_kwargs,
+            ) as model_output:
+                return parser.parse_litellm_model_output(
+                    content=model_output,
+                    name=self.name,
+                    log_probs_supported=self._log_probs_supported,
+                )
+        model_output_string = self._model.generate_string(
+            input=llm_query, response_format=GEvalScoreFormat
         )
-        return self._parse_model_output(model_output)
+        return parser.parse_model_output_string(model_output_string, self.name)
     async def ascore(
-        self, output: str, **ignored_kwargs: Any
+        self,
+        output: str,
+        **ignored_kwargs: Any,
     ) -> score_result.ScoreResult:
         """
-        Calculate the G-Eval score for the given LLM's output.
-        Args:
-            output: The LLM's output to evaluate.
-            **ignored_kwargs: Additional keyword arguments that are ignored.
-        Returns:
-            score_result.ScoreResult: A ScoreResult object containing the G-Eval score
-            (between 0.0 and 1.0) and a reason for the score.
+        Async variant of :meth:`score`, evaluating the provided LLM output using
+        the configured judge model and returning a ``ScoreResult``.
         """
-        llm_query = G_EVAL_QUERY_TEMPLATE.format(
+        llm_query = template.G_EVAL_QUERY_TEMPLATE.format(
             task_introduction=self.task_introduction,
             evaluation_criteria=self.evaluation_criteria,
-            chain_of_thought=self.llm_chain_of_thought,
+            chain_of_thought=await self.allm_chain_of_thought(),
             input=output,
         )
@@ -137,69 +263,74 @@ class GEval(base_metric.BaseMetric):
             },
         ]
-        model_output = await self._model.agenerate_provider_response(
-            messages=request,
-            logprobs=True,
-            top_logprobs=20,
-            response_format=GEvalScoreFormat,
+        if isinstance(self._model, models.LiteLLMChatModel):
+            provider_kwargs: Dict[str, Any] = {
+                "response_format": GEvalScoreFormat,
+            }
+            if self._log_probs_supported:
+                provider_kwargs["logprobs"] = True
+                provider_kwargs["top_logprobs"] = 20
+            async with base_model.aget_provider_response(
+                model_provider=self._model,
+                messages=request,
+                **provider_kwargs,
+            ) as model_output:
+                return parser.parse_litellm_model_output(
+                    content=model_output,
+                    name=self.name,
+                    log_probs_supported=self._log_probs_supported,
+                )
+        model_output_string = await self._model.agenerate_string(
+            input=llm_query, response_format=GEvalScoreFormat
         )
-        return self._parse_model_output(model_output)
-    def _parse_model_output(self, content: "ModelResponse") -> score_result.ScoreResult:
-        """
-        This method computes the final score based on the model's response. The model's response is a dictionary
-        with a `score` key and a `reason` key. The prompt template also specifies that the score should be an integer
-        between 0 and 10.
-        In order to make the score computation more robust, we look at the top logprobs of the score token and compute
-        a weighted average of the scores. Since we try to enforce the format of the model's response, we can assume that
-        the score token is always the fourth token in the response (first token is `{"`, followed by `score` and `":`).
-        """
-        try:
-            # Compute score using top logprobs
-            score_token_position = 3
-            top_score_logprobs = content.choices[0].model_extra["logprobs"]["content"][
-                score_token_position
-            ]["top_logprobs"]
+        return parser.parse_model_output_string(model_output_string, self.name)
-            linear_probs_sum = 0.0
-            weighted_score_sum = 0.0
-            for token_info in top_score_logprobs:
-                # litellm in v1.60.2 (or earlier) started provide logprobes
-                # as pydantic model, not just dict
-                # we will convert model to dict to provide backward compatability
-                if not isinstance(token_info, dict):
-                    token_info = token_info.model_dump()
+class GEvalPreset(GEval):
+    """
+    Pre-configured GEval variant with author-provided prompt templates.
-                # if not a number
-                if not token_info["token"].isdecimal():
-                    continue
+    Args:
+        preset: Key name from ``GEVAL_PRESETS`` describing the evaluation rubric.
+        model: Optional model identifier or ``OpikBaseModel`` instance.
+        track: Whether to automatically track metric results. Defaults to ``True``.
+        project_name: Optional tracking project name.
+        temperature: Sampling temperature forwarded to the judge model.
+        name: Optional override for the metric name (defaults to preset name).
-                score = int(token_info["token"])
+    Example:
+        >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEvalPreset
+        >>> metric = GEvalPreset(preset="qa_relevance", model="gpt-4")
+        >>> result = metric.score(output="Answer addresses the user's question.")  # doctest: +SKIP
+        >>> result.value  # doctest: +SKIP
+        0.85
+    """
-                # if score value not in scale
-                if not 0 <= score <= 10:
-                    continue
-                log_prob = token_info["logprob"]
-                linear_prob = math.exp(log_prob)
-                linear_probs_sum += linear_prob
-                weighted_score_sum += linear_prob * score
-            final_score: float = weighted_score_sum / linear_probs_sum / 10
-            if not (0.0 <= final_score <= 1.0):
-                raise ValueError
-            # Get the reason
-            reason = json.loads(content.choices[0].message.content)["reason"]
+    def __init__(
+        self,
+        preset: str,
+        model: Optional[Union[str, models.base_model.OpikBaseModel]] = None,
+        track: bool = True,
+        project_name: Optional[str] = None,
+        temperature: float = 0.0,
+        name: Optional[str] = None,
+    ):
+        """
+        Resolve ``preset`` in ``GEVAL_PRESETS`` and initialise the parent metric
+        with the preset's task introduction and evaluation criteria.
+        Raises:
+            ValueError: If ``preset`` is not a key of ``GEVAL_PRESETS``; the message
+                lists the available preset names.
+        """
+        try:
+            definition = GEVAL_PRESETS[preset]
+        except KeyError as error:
+            raise ValueError(
+                f"Unknown GEval preset '{preset}'. Available presets: {list(GEVAL_PRESETS)}"
+            ) from error
-            # Return the score and the reason
-            return score_result.ScoreResult(
-                name=self.name, value=final_score, reason=reason
-            )
-        except Exception:
-            raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED)
+        # An explicit ``name`` wins; otherwise the preset definition's name is used.
+        super().__init__(
+            task_introduction=definition.task_introduction,
+            evaluation_criteria=definition.evaluation_criteria,
+            model=model,
+            name=name or definition.name,
+            track=track,
+            project_name=project_name,
+            temperature=temperature,
+        )

opik/evaluation/metrics/llm_judges/g_eval/parser.py ADDED Viewed

@@ -0,0 +1,161 @@
+import logging
+import json
+import math
+from typing import Any, Dict, TYPE_CHECKING
+import opik.exceptions as exceptions
+from opik.evaluation.metrics import score_result
+from opik.evaluation.metrics.llm_judges import parsing_helpers
+from opik.logging_messages import GEVAL_SCORE_CALC_FAILED
+if TYPE_CHECKING:  # TODO: Daniel check if this is needed
+    from litellm.types.utils import ModelResponse as LiteLLMModelResponse
+LOGGER = logging.getLogger(__name__)
+def parse_model_output_string(
+    content: str, metric_name: str
+) -> score_result.ScoreResult:
+    """
+    Parse a judge model's plain-text answer into a ``ScoreResult``.
+    The text is passed to ``parsing_helpers.extract_json_content_or_raise`` and the
+    resulting mapping must provide a ``score`` (convertible to float, within [0, 10])
+    and a ``reason``. The score is divided by 10 before it is returned.
+    Args:
+        content: Raw text returned by the judge model.
+        metric_name: Name assigned to the returned ``ScoreResult``.
+    Returns:
+        A ``ScoreResult`` whose value is the score scaled to [0.0, 1.0] and whose
+        reason is the model-provided reason converted to ``str``.
+    Raises:
+        MetricComputationError: On any failure (extraction error, missing key,
+            non-numeric or out-of-range score). The original exception is logged
+            and chained as the cause.
+    """
+    try:
+        dict_content = parsing_helpers.extract_json_content_or_raise(content)
+        score_raw = float(dict_content["score"])
+        if not 0 <= score_raw <= 10:
+            raise ValueError(
+                f"LLM returned score outside of [0, 10] range: {score_raw}"
+            )
+        normalised_score = score_raw / 10
+        reason = str(dict_content["reason"])
+        return score_result.ScoreResult(
+            name=metric_name,
+            value=normalised_score,
+            reason=reason,
+        )
+    except Exception as exception:
+        # Every failure mode is reported to callers as the same metric error.
+        LOGGER.error(f"Failed to parse model output: {exception}", exc_info=True)
+        raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED) from exception
+def parse_litellm_model_output(
+    content: "LiteLLMModelResponse", name: str, log_probs_supported: bool
+) -> score_result.ScoreResult:
+    """
+    This function computes the final score based on the model's response. The model's response is a dictionary
+    with a `score` key and a `reason` key. The prompt template also specifies that the score should be an integer
+    between 0 and 10.
+    In order to make the score computation more robust, we look at the top logprobs of the score token and compute
+    a weighted average of the scores. Since we try to enforce the format of the model's response, we can assume that
+    the score token is always the fourth token in the response (first token is `{"`, followed by `score` and `":`).
+    Fallbacks: when `log_probs_supported` is false, or the response carries fewer than four logprob entries,
+    the score is parsed from the message text instead. When none of the top logprob candidates is a usable
+    0-10 integer, the token actually emitted at the score position is used.
+    Args:
+        content: Provider response; only its first choice is inspected.
+        name: Name assigned to the returned ``ScoreResult``.
+        log_probs_supported: Whether logprobs were requested and may be used for scoring.
+    Returns:
+        A ``ScoreResult`` with a value in [0.0, 1.0] and the model-provided reason.
+    Raises:
+        MetricComputationError: On any failure; the original exception is logged and chained as the cause.
+    """
+    try:
+        choice_dict = _normalise_first_choice(content)
+        if not log_probs_supported:
+            return _extract_score_from_text_content(choice_dict, name=name)
+        log_probs = _to_dict(choice_dict.get("logprobs"))
+        entries = log_probs.get("content") or []
+        # Index of the token expected to hold the score (see docstring).
+        score_token_position = 3
+        if len(entries) <= score_token_position:
+            # Too few logprob entries to locate the score token: parse the text.
+            return _extract_score_from_text_content(choice_dict, name=name)
+        entry_dict = _to_dict(entries[score_token_position])
+        top_logprobs = entry_dict.get("top_logprobs") or []
+        token_candidate = str(entry_dict.get("token", ""))
+        linear_probs_sum = 0.0
+        weighted_score_sum = 0.0
+        for candidate in top_logprobs:
+            token_info = _to_dict(candidate)
+            token_str = str(token_info.get("token", ""))
+            # Skip candidates that are not plain decimal integers in [0, 10].
+            if not token_str.isdecimal():
+                continue
+            score = int(token_str)
+            if not 0 <= score <= 10:
+                continue
+            log_prob = token_info.get("logprob")
+            if log_prob is None:
+                continue
+            # Convert the log-probability to a linear probability used as the weight.
+            linear_prob = math.exp(float(log_prob))
+            linear_probs_sum += linear_prob
+            weighted_score_sum += linear_prob * score
+        if linear_probs_sum != 0.0:
+            # Probability-weighted mean of the candidate scores, scaled to [0, 1].
+            final_score: float = weighted_score_sum / linear_probs_sum / 10
+        else:
+            # No usable candidate: fall back to the token emitted at the score position.
+            if not token_candidate.isdecimal():
+                raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED)
+            final_score = int(token_candidate) / 10
+        if not (0.0 <= final_score <= 1.0):
+            raise ValueError(
+                f"Failed to compute final score from log_probs, the value is out of [0, 1] range: {final_score}"
+            )
+        # The reason is read from the JSON message body, not from logprobs.
+        reason_data = json.loads(_extract_message_content(choice_dict))
+        reason = reason_data["reason"]
+        return score_result.ScoreResult(name=name, value=final_score, reason=reason)
+    except Exception as exception:
+        # This also catches MetricComputationError raised above or by the text
+        # fallback and re-raises it with the generic message, chained to the cause.
+        LOGGER.error(f"Failed to parse model output: {exception}", exc_info=True)
+        raise exceptions.MetricComputationError(GEVAL_SCORE_CALC_FAILED) from exception
+def _extract_score_from_text_content(
+    choice: Dict[str, Any], name: str
+) -> score_result.ScoreResult:
+    """
+    Score a response from its message text alone, without logprobs.
+    Extracts the text with ``_extract_message_content`` and delegates to
+    ``parse_model_output_string``; errors raised by either call propagate.
+    """
+    text_content = _extract_message_content(choice)
+    return parse_model_output_string(text_content, name)
+def _extract_message_content(choice: Dict[str, Any]) -> str:
+    """
+    Return the textual ``content`` of the choice's ``message``.
+    Args:
+        choice: Choice mapping whose ``message`` entry is either a dict or an
+            object exposing a ``content`` attribute.
+    Returns:
+        The message content string.
+    Raises:
+        ValueError: If the content is missing or is not a ``str``.
+    """
+    message = choice.get("message")
+    # The message may be a plain dict or an object, depending on how the choice
+    # was converted to a dict.
+    if isinstance(message, dict):
+        content = message.get("content")
+    else:
+        content = getattr(message, "content", None)
+    if not isinstance(content, str):
+        raise ValueError("LLM response is missing textual content")
+    return content
+def _normalise_choice(choice: Any) -> Dict[str, Any]:
+    """
+    Convert a response choice into a dict.
+    Uses ``_to_dict`` first; when that yields an empty dict, builds a minimal
+    mapping from the choice's ``message`` and ``logprobs`` attributes (each
+    ``None`` when the attribute is absent).
+    """
+    choice_dict = _to_dict(choice)
+    if choice_dict:
+        return choice_dict
+    # Fallback for objects that _to_dict could not convert (or converted to {}).
+    return {
+        "message": getattr(choice, "message", None),
+        "logprobs": getattr(choice, "logprobs", None),
+    }
+def _normalise_first_choice(response: Any) -> Dict[str, Any]:
+    """
+    Return the first entry of ``response.choices`` as a dict.
+    Raises:
+        MetricComputationError: If ``response`` has no ``choices`` attribute, or
+            the attribute is not a non-empty ``list``.
+    """
+    choices = getattr(response, "choices", None)
+    # Only a real, non-empty list is accepted; other sequence types are rejected.
+    if not isinstance(choices, list) or not choices:
+        raise exceptions.MetricComputationError(
+            "LLM response did not contain any choices to parse."
+        )
+    return _normalise_choice(choices[0])
+def _to_dict(value: Any) -> Dict[str, Any]:
+    """
+    Best-effort conversion of ``value`` to a dict.
+    Conversion order:
+    1. a dict is returned as-is (the same object, not a copy);
+    2. an object with a callable ``model_dump`` returns the result of calling it;
+    3. an object with ``__dict__`` returns a shallow copy of that mapping;
+    4. anything else (including ``None``) returns an empty dict.
+    """
+    if isinstance(value, dict):
+        return value
+    if hasattr(value, "model_dump") and callable(value.model_dump):
+        try:
+            return value.model_dump()
+        except TypeError:
+            # model_dump could not be called without arguments; try __dict__ next.
+            pass
+    if hasattr(value, "__dict__"):
+        return dict(value.__dict__)
+    return {}

opik 1.6.4__py3-none-any.whl → 1.9.71__py3-none-any.whl

opik 1.6.4__py3-none-any.whl → 1.9.71__py3-none-any.whl