opik 1.8.39__py3-none-any.whl → 1.9.71__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (592)
  1. opik/__init__.py +19 -3
  2. opik/anonymizer/__init__.py +5 -0
  3. opik/anonymizer/anonymizer.py +12 -0
  4. opik/anonymizer/factory.py +80 -0
  5. opik/anonymizer/recursive_anonymizer.py +64 -0
  6. opik/anonymizer/rules.py +56 -0
  7. opik/anonymizer/rules_anonymizer.py +35 -0
  8. opik/api_objects/attachment/attachment_context.py +36 -0
  9. opik/api_objects/attachment/attachments_extractor.py +153 -0
  10. opik/api_objects/attachment/client.py +1 -0
  11. opik/api_objects/attachment/converters.py +2 -0
  12. opik/api_objects/attachment/decoder.py +18 -0
  13. opik/api_objects/attachment/decoder_base64.py +83 -0
  14. opik/api_objects/attachment/decoder_helpers.py +137 -0
  15. opik/api_objects/data_helpers.py +79 -0
  16. opik/api_objects/dataset/dataset.py +64 -4
  17. opik/api_objects/dataset/rest_operations.py +11 -2
  18. opik/api_objects/experiment/experiment.py +57 -57
  19. opik/api_objects/experiment/experiment_item.py +2 -1
  20. opik/api_objects/experiment/experiments_client.py +64 -0
  21. opik/api_objects/experiment/helpers.py +35 -11
  22. opik/api_objects/experiment/rest_operations.py +65 -5
  23. opik/api_objects/helpers.py +8 -5
  24. opik/api_objects/local_recording.py +81 -0
  25. opik/api_objects/opik_client.py +600 -108
  26. opik/api_objects/opik_query_language.py +39 -5
  27. opik/api_objects/prompt/__init__.py +12 -2
  28. opik/api_objects/prompt/base_prompt.py +69 -0
  29. opik/api_objects/prompt/base_prompt_template.py +29 -0
  30. opik/api_objects/prompt/chat/__init__.py +1 -0
  31. opik/api_objects/prompt/chat/chat_prompt.py +210 -0
  32. opik/api_objects/prompt/chat/chat_prompt_template.py +350 -0
  33. opik/api_objects/prompt/chat/content_renderer_registry.py +203 -0
  34. opik/api_objects/prompt/client.py +189 -47
  35. opik/api_objects/prompt/text/__init__.py +1 -0
  36. opik/api_objects/prompt/text/prompt.py +174 -0
  37. opik/api_objects/prompt/{prompt_template.py → text/prompt_template.py} +10 -6
  38. opik/api_objects/prompt/types.py +23 -0
  39. opik/api_objects/search_helpers.py +89 -0
  40. opik/api_objects/span/span_data.py +35 -25
  41. opik/api_objects/threads/threads_client.py +39 -5
  42. opik/api_objects/trace/trace_client.py +52 -2
  43. opik/api_objects/trace/trace_data.py +15 -24
  44. opik/api_objects/validation_helpers.py +3 -3
  45. opik/cli/__init__.py +5 -0
  46. opik/cli/__main__.py +6 -0
  47. opik/cli/configure.py +66 -0
  48. opik/cli/exports/__init__.py +131 -0
  49. opik/cli/exports/dataset.py +278 -0
  50. opik/cli/exports/experiment.py +784 -0
  51. opik/cli/exports/project.py +685 -0
  52. opik/cli/exports/prompt.py +578 -0
  53. opik/cli/exports/utils.py +406 -0
  54. opik/cli/harbor.py +39 -0
  55. opik/cli/healthcheck.py +21 -0
  56. opik/cli/imports/__init__.py +439 -0
  57. opik/cli/imports/dataset.py +143 -0
  58. opik/cli/imports/experiment.py +1192 -0
  59. opik/cli/imports/project.py +262 -0
  60. opik/cli/imports/prompt.py +177 -0
  61. opik/cli/imports/utils.py +280 -0
  62. opik/cli/main.py +49 -0
  63. opik/cli/proxy.py +93 -0
  64. opik/cli/usage_report/__init__.py +16 -0
  65. opik/cli/usage_report/charts.py +783 -0
  66. opik/cli/usage_report/cli.py +274 -0
  67. opik/cli/usage_report/constants.py +9 -0
  68. opik/cli/usage_report/extraction.py +749 -0
  69. opik/cli/usage_report/pdf.py +244 -0
  70. opik/cli/usage_report/statistics.py +78 -0
  71. opik/cli/usage_report/utils.py +235 -0
  72. opik/config.py +13 -7
  73. opik/configurator/configure.py +17 -0
  74. opik/datetime_helpers.py +12 -0
  75. opik/decorator/arguments_helpers.py +9 -1
  76. opik/decorator/base_track_decorator.py +205 -133
  77. opik/decorator/context_manager/span_context_manager.py +123 -0
  78. opik/decorator/context_manager/trace_context_manager.py +84 -0
  79. opik/decorator/opik_args/__init__.py +13 -0
  80. opik/decorator/opik_args/api_classes.py +71 -0
  81. opik/decorator/opik_args/helpers.py +120 -0
  82. opik/decorator/span_creation_handler.py +25 -6
  83. opik/dict_utils.py +3 -3
  84. opik/evaluation/__init__.py +13 -2
  85. opik/evaluation/engine/engine.py +272 -75
  86. opik/evaluation/engine/evaluation_tasks_executor.py +6 -3
  87. opik/evaluation/engine/helpers.py +31 -6
  88. opik/evaluation/engine/metrics_evaluator.py +237 -0
  89. opik/evaluation/evaluation_result.py +168 -2
  90. opik/evaluation/evaluator.py +533 -62
  91. opik/evaluation/metrics/__init__.py +103 -4
  92. opik/evaluation/metrics/aggregated_metric.py +35 -6
  93. opik/evaluation/metrics/base_metric.py +1 -1
  94. opik/evaluation/metrics/conversation/__init__.py +48 -0
  95. opik/evaluation/metrics/conversation/conversation_thread_metric.py +56 -2
  96. opik/evaluation/metrics/conversation/g_eval_wrappers.py +19 -0
  97. opik/evaluation/metrics/conversation/helpers.py +14 -15
  98. opik/evaluation/metrics/conversation/heuristics/__init__.py +14 -0
  99. opik/evaluation/metrics/conversation/heuristics/degeneration/__init__.py +3 -0
  100. opik/evaluation/metrics/conversation/heuristics/degeneration/metric.py +189 -0
  101. opik/evaluation/metrics/conversation/heuristics/degeneration/phrases.py +12 -0
  102. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/__init__.py +3 -0
  103. opik/evaluation/metrics/conversation/heuristics/knowledge_retention/metric.py +172 -0
  104. opik/evaluation/metrics/conversation/llm_judges/__init__.py +32 -0
  105. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/metric.py +22 -17
  106. opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/templates.py +1 -1
  107. opik/evaluation/metrics/conversation/llm_judges/g_eval_wrappers.py +442 -0
  108. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/metric.py +13 -7
  109. opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/templates.py +1 -1
  110. opik/evaluation/metrics/conversation/llm_judges/user_frustration/__init__.py +0 -0
  111. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/metric.py +21 -14
  112. opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/templates.py +1 -1
  113. opik/evaluation/metrics/conversation/types.py +4 -5
  114. opik/evaluation/metrics/conversation_types.py +9 -0
  115. opik/evaluation/metrics/heuristics/bertscore.py +107 -0
  116. opik/evaluation/metrics/heuristics/bleu.py +35 -15
  117. opik/evaluation/metrics/heuristics/chrf.py +127 -0
  118. opik/evaluation/metrics/heuristics/contains.py +47 -11
  119. opik/evaluation/metrics/heuristics/distribution_metrics.py +331 -0
  120. opik/evaluation/metrics/heuristics/gleu.py +113 -0
  121. opik/evaluation/metrics/heuristics/language_adherence.py +123 -0
  122. opik/evaluation/metrics/heuristics/meteor.py +119 -0
  123. opik/evaluation/metrics/heuristics/prompt_injection.py +150 -0
  124. opik/evaluation/metrics/heuristics/readability.py +129 -0
  125. opik/evaluation/metrics/heuristics/rouge.py +26 -9
  126. opik/evaluation/metrics/heuristics/spearman.py +88 -0
  127. opik/evaluation/metrics/heuristics/tone.py +155 -0
  128. opik/evaluation/metrics/heuristics/vader_sentiment.py +77 -0
  129. opik/evaluation/metrics/llm_judges/answer_relevance/metric.py +20 -5
  130. opik/evaluation/metrics/llm_judges/context_precision/metric.py +20 -6
  131. opik/evaluation/metrics/llm_judges/context_recall/metric.py +20 -6
  132. opik/evaluation/metrics/llm_judges/g_eval/__init__.py +5 -0
  133. opik/evaluation/metrics/llm_judges/g_eval/metric.py +219 -68
  134. opik/evaluation/metrics/llm_judges/g_eval/parser.py +102 -52
  135. opik/evaluation/metrics/llm_judges/g_eval/presets.py +209 -0
  136. opik/evaluation/metrics/llm_judges/g_eval_presets/__init__.py +36 -0
  137. opik/evaluation/metrics/llm_judges/g_eval_presets/agent_assessment.py +77 -0
  138. opik/evaluation/metrics/llm_judges/g_eval_presets/bias_classifier.py +181 -0
  139. opik/evaluation/metrics/llm_judges/g_eval_presets/compliance_risk.py +41 -0
  140. opik/evaluation/metrics/llm_judges/g_eval_presets/prompt_uncertainty.py +41 -0
  141. opik/evaluation/metrics/llm_judges/g_eval_presets/qa_suite.py +146 -0
  142. opik/evaluation/metrics/llm_judges/hallucination/metric.py +16 -3
  143. opik/evaluation/metrics/llm_judges/llm_juries/__init__.py +3 -0
  144. opik/evaluation/metrics/llm_judges/llm_juries/metric.py +76 -0
  145. opik/evaluation/metrics/llm_judges/moderation/metric.py +16 -4
  146. opik/evaluation/metrics/llm_judges/structure_output_compliance/__init__.py +0 -0
  147. opik/evaluation/metrics/llm_judges/structure_output_compliance/metric.py +144 -0
  148. opik/evaluation/metrics/llm_judges/structure_output_compliance/parser.py +79 -0
  149. opik/evaluation/metrics/llm_judges/structure_output_compliance/schema.py +15 -0
  150. opik/evaluation/metrics/llm_judges/structure_output_compliance/template.py +50 -0
  151. opik/evaluation/metrics/llm_judges/syc_eval/__init__.py +0 -0
  152. opik/evaluation/metrics/llm_judges/syc_eval/metric.py +252 -0
  153. opik/evaluation/metrics/llm_judges/syc_eval/parser.py +82 -0
  154. opik/evaluation/metrics/llm_judges/syc_eval/template.py +155 -0
  155. opik/evaluation/metrics/llm_judges/trajectory_accuracy/metric.py +20 -5
  156. opik/evaluation/metrics/llm_judges/usefulness/metric.py +16 -4
  157. opik/evaluation/metrics/ragas_metric.py +43 -23
  158. opik/evaluation/models/__init__.py +8 -0
  159. opik/evaluation/models/base_model.py +107 -1
  160. opik/evaluation/models/langchain/langchain_chat_model.py +15 -7
  161. opik/evaluation/models/langchain/message_converters.py +97 -15
  162. opik/evaluation/models/litellm/litellm_chat_model.py +156 -29
  163. opik/evaluation/models/litellm/util.py +125 -0
  164. opik/evaluation/models/litellm/warning_filters.py +16 -4
  165. opik/evaluation/models/model_capabilities.py +187 -0
  166. opik/evaluation/models/models_factory.py +25 -3
  167. opik/evaluation/preprocessing.py +92 -0
  168. opik/evaluation/report.py +70 -12
  169. opik/evaluation/rest_operations.py +49 -45
  170. opik/evaluation/samplers/__init__.py +4 -0
  171. opik/evaluation/samplers/base_dataset_sampler.py +40 -0
  172. opik/evaluation/samplers/random_dataset_sampler.py +48 -0
  173. opik/evaluation/score_statistics.py +66 -0
  174. opik/evaluation/scorers/__init__.py +4 -0
  175. opik/evaluation/scorers/scorer_function.py +55 -0
  176. opik/evaluation/scorers/scorer_wrapper_metric.py +130 -0
  177. opik/evaluation/test_case.py +3 -2
  178. opik/evaluation/test_result.py +1 -0
  179. opik/evaluation/threads/evaluator.py +31 -3
  180. opik/evaluation/threads/helpers.py +3 -2
  181. opik/evaluation/types.py +9 -1
  182. opik/exceptions.py +33 -0
  183. opik/file_upload/file_uploader.py +13 -0
  184. opik/file_upload/upload_options.py +2 -0
  185. opik/hooks/__init__.py +23 -0
  186. opik/hooks/anonymizer_hook.py +36 -0
  187. opik/hooks/httpx_client_hook.py +112 -0
  188. opik/httpx_client.py +12 -9
  189. opik/id_helpers.py +18 -0
  190. opik/integrations/adk/graph/subgraph_edges_builders.py +1 -2
  191. opik/integrations/adk/helpers.py +16 -7
  192. opik/integrations/adk/legacy_opik_tracer.py +7 -4
  193. opik/integrations/adk/opik_tracer.py +14 -1
  194. opik/integrations/adk/patchers/adk_otel_tracer/opik_adk_otel_tracer.py +7 -3
  195. opik/integrations/adk/recursive_callback_injector.py +4 -7
  196. opik/integrations/bedrock/converse/__init__.py +0 -0
  197. opik/integrations/bedrock/converse/chunks_aggregator.py +188 -0
  198. opik/integrations/bedrock/{converse_decorator.py → converse/converse_decorator.py} +4 -3
  199. opik/integrations/bedrock/invoke_agent_decorator.py +5 -4
  200. opik/integrations/bedrock/invoke_model/__init__.py +0 -0
  201. opik/integrations/bedrock/invoke_model/chunks_aggregator/__init__.py +78 -0
  202. opik/integrations/bedrock/invoke_model/chunks_aggregator/api.py +45 -0
  203. opik/integrations/bedrock/invoke_model/chunks_aggregator/base.py +23 -0
  204. opik/integrations/bedrock/invoke_model/chunks_aggregator/claude.py +121 -0
  205. opik/integrations/bedrock/invoke_model/chunks_aggregator/format_detector.py +107 -0
  206. opik/integrations/bedrock/invoke_model/chunks_aggregator/llama.py +108 -0
  207. opik/integrations/bedrock/invoke_model/chunks_aggregator/mistral.py +118 -0
  208. opik/integrations/bedrock/invoke_model/chunks_aggregator/nova.py +99 -0
  209. opik/integrations/bedrock/invoke_model/invoke_model_decorator.py +178 -0
  210. opik/integrations/bedrock/invoke_model/response_types.py +34 -0
  211. opik/integrations/bedrock/invoke_model/stream_wrappers.py +122 -0
  212. opik/integrations/bedrock/invoke_model/usage_converters.py +87 -0
  213. opik/integrations/bedrock/invoke_model/usage_extraction.py +108 -0
  214. opik/integrations/bedrock/opik_tracker.py +42 -4
  215. opik/integrations/bedrock/types.py +19 -0
  216. opik/integrations/crewai/crewai_decorator.py +8 -51
  217. opik/integrations/crewai/opik_tracker.py +31 -10
  218. opik/integrations/crewai/patchers/__init__.py +5 -0
  219. opik/integrations/crewai/patchers/flow.py +118 -0
  220. opik/integrations/crewai/patchers/litellm_completion.py +30 -0
  221. opik/integrations/crewai/patchers/llm_client.py +207 -0
  222. opik/integrations/dspy/callback.py +80 -17
  223. opik/integrations/dspy/parsers.py +168 -0
  224. opik/integrations/harbor/__init__.py +17 -0
  225. opik/integrations/harbor/experiment_service.py +269 -0
  226. opik/integrations/harbor/opik_tracker.py +528 -0
  227. opik/integrations/haystack/opik_connector.py +2 -2
  228. opik/integrations/haystack/opik_tracer.py +3 -7
  229. opik/integrations/langchain/__init__.py +3 -1
  230. opik/integrations/langchain/helpers.py +96 -0
  231. opik/integrations/langchain/langgraph_async_context_bridge.py +131 -0
  232. opik/integrations/langchain/langgraph_tracer_injector.py +88 -0
  233. opik/integrations/langchain/opik_encoder_extension.py +1 -1
  234. opik/integrations/langchain/opik_tracer.py +474 -229
  235. opik/integrations/litellm/__init__.py +5 -0
  236. opik/integrations/litellm/completion_chunks_aggregator.py +115 -0
  237. opik/integrations/litellm/litellm_completion_decorator.py +242 -0
  238. opik/integrations/litellm/opik_tracker.py +43 -0
  239. opik/integrations/litellm/stream_patchers.py +151 -0
  240. opik/integrations/llama_index/callback.py +146 -107
  241. opik/integrations/openai/agents/opik_tracing_processor.py +1 -2
  242. opik/integrations/openai/openai_chat_completions_decorator.py +2 -16
  243. opik/integrations/openai/opik_tracker.py +1 -1
  244. opik/integrations/sagemaker/auth.py +5 -1
  245. opik/llm_usage/google_usage.py +3 -1
  246. opik/llm_usage/opik_usage.py +7 -8
  247. opik/llm_usage/opik_usage_factory.py +4 -2
  248. opik/logging_messages.py +6 -0
  249. opik/message_processing/batching/base_batcher.py +14 -21
  250. opik/message_processing/batching/batch_manager.py +22 -10
  251. opik/message_processing/batching/batch_manager_constuctors.py +10 -0
  252. opik/message_processing/batching/batchers.py +59 -27
  253. opik/message_processing/batching/flushing_thread.py +0 -3
  254. opik/message_processing/emulation/__init__.py +0 -0
  255. opik/message_processing/emulation/emulator_message_processor.py +578 -0
  256. opik/message_processing/emulation/local_emulator_message_processor.py +140 -0
  257. opik/message_processing/emulation/models.py +162 -0
  258. opik/message_processing/encoder_helpers.py +79 -0
  259. opik/message_processing/messages.py +56 -1
  260. opik/message_processing/preprocessing/__init__.py +0 -0
  261. opik/message_processing/preprocessing/attachments_preprocessor.py +70 -0
  262. opik/message_processing/preprocessing/batching_preprocessor.py +53 -0
  263. opik/message_processing/preprocessing/constants.py +1 -0
  264. opik/message_processing/preprocessing/file_upload_preprocessor.py +38 -0
  265. opik/message_processing/preprocessing/preprocessor.py +36 -0
  266. opik/message_processing/processors/__init__.py +0 -0
  267. opik/message_processing/processors/attachments_extraction_processor.py +146 -0
  268. opik/message_processing/processors/message_processors.py +92 -0
  269. opik/message_processing/processors/message_processors_chain.py +96 -0
  270. opik/message_processing/{message_processors.py → processors/online_message_processor.py} +85 -29
  271. opik/message_processing/queue_consumer.py +9 -3
  272. opik/message_processing/streamer.py +71 -33
  273. opik/message_processing/streamer_constructors.py +43 -10
  274. opik/opik_context.py +16 -4
  275. opik/plugins/pytest/hooks.py +5 -3
  276. opik/rest_api/__init__.py +346 -15
  277. opik/rest_api/alerts/__init__.py +7 -0
  278. opik/rest_api/alerts/client.py +667 -0
  279. opik/rest_api/alerts/raw_client.py +1015 -0
  280. opik/rest_api/alerts/types/__init__.py +7 -0
  281. opik/rest_api/alerts/types/get_webhook_examples_request_alert_type.py +5 -0
  282. opik/rest_api/annotation_queues/__init__.py +4 -0
  283. opik/rest_api/annotation_queues/client.py +668 -0
  284. opik/rest_api/annotation_queues/raw_client.py +1019 -0
  285. opik/rest_api/automation_rule_evaluators/client.py +34 -2
  286. opik/rest_api/automation_rule_evaluators/raw_client.py +24 -0
  287. opik/rest_api/client.py +15 -0
  288. opik/rest_api/dashboards/__init__.py +4 -0
  289. opik/rest_api/dashboards/client.py +462 -0
  290. opik/rest_api/dashboards/raw_client.py +648 -0
  291. opik/rest_api/datasets/client.py +1310 -44
  292. opik/rest_api/datasets/raw_client.py +2269 -358
  293. opik/rest_api/experiments/__init__.py +2 -2
  294. opik/rest_api/experiments/client.py +191 -5
  295. opik/rest_api/experiments/raw_client.py +301 -7
  296. opik/rest_api/experiments/types/__init__.py +4 -1
  297. opik/rest_api/experiments/types/experiment_update_status.py +5 -0
  298. opik/rest_api/experiments/types/experiment_update_type.py +5 -0
  299. opik/rest_api/experiments/types/experiment_write_status.py +5 -0
  300. opik/rest_api/feedback_definitions/types/find_feedback_definitions_request_type.py +1 -1
  301. opik/rest_api/llm_provider_key/client.py +20 -0
  302. opik/rest_api/llm_provider_key/raw_client.py +20 -0
  303. opik/rest_api/llm_provider_key/types/provider_api_key_write_provider.py +1 -1
  304. opik/rest_api/manual_evaluation/__init__.py +4 -0
  305. opik/rest_api/manual_evaluation/client.py +347 -0
  306. opik/rest_api/manual_evaluation/raw_client.py +543 -0
  307. opik/rest_api/optimizations/client.py +145 -9
  308. opik/rest_api/optimizations/raw_client.py +237 -13
  309. opik/rest_api/optimizations/types/optimization_update_status.py +3 -1
  310. opik/rest_api/prompts/__init__.py +2 -2
  311. opik/rest_api/prompts/client.py +227 -6
  312. opik/rest_api/prompts/raw_client.py +331 -2
  313. opik/rest_api/prompts/types/__init__.py +3 -1
  314. opik/rest_api/prompts/types/create_prompt_version_detail_template_structure.py +5 -0
  315. opik/rest_api/prompts/types/prompt_write_template_structure.py +5 -0
  316. opik/rest_api/spans/__init__.py +0 -2
  317. opik/rest_api/spans/client.py +238 -76
  318. opik/rest_api/spans/raw_client.py +307 -95
  319. opik/rest_api/spans/types/__init__.py +0 -2
  320. opik/rest_api/traces/client.py +572 -161
  321. opik/rest_api/traces/raw_client.py +736 -229
  322. opik/rest_api/types/__init__.py +352 -17
  323. opik/rest_api/types/aggregation_data.py +1 -0
  324. opik/rest_api/types/alert.py +33 -0
  325. opik/rest_api/types/alert_alert_type.py +5 -0
  326. opik/rest_api/types/alert_page_public.py +24 -0
  327. opik/rest_api/types/alert_public.py +33 -0
  328. opik/rest_api/types/alert_public_alert_type.py +5 -0
  329. opik/rest_api/types/alert_trigger.py +27 -0
  330. opik/rest_api/types/alert_trigger_config.py +28 -0
  331. opik/rest_api/types/alert_trigger_config_public.py +28 -0
  332. opik/rest_api/types/alert_trigger_config_public_type.py +10 -0
  333. opik/rest_api/types/alert_trigger_config_type.py +10 -0
  334. opik/rest_api/types/alert_trigger_config_write.py +22 -0
  335. opik/rest_api/types/alert_trigger_config_write_type.py +10 -0
  336. opik/rest_api/types/alert_trigger_event_type.py +19 -0
  337. opik/rest_api/types/alert_trigger_public.py +27 -0
  338. opik/rest_api/types/alert_trigger_public_event_type.py +19 -0
  339. opik/rest_api/types/alert_trigger_write.py +23 -0
  340. opik/rest_api/types/alert_trigger_write_event_type.py +19 -0
  341. opik/rest_api/types/alert_write.py +28 -0
  342. opik/rest_api/types/alert_write_alert_type.py +5 -0
  343. opik/rest_api/types/annotation_queue.py +42 -0
  344. opik/rest_api/types/annotation_queue_batch.py +27 -0
  345. opik/rest_api/types/annotation_queue_item_ids.py +19 -0
  346. opik/rest_api/types/annotation_queue_page_public.py +28 -0
  347. opik/rest_api/types/annotation_queue_public.py +38 -0
  348. opik/rest_api/types/annotation_queue_public_scope.py +5 -0
  349. opik/rest_api/types/annotation_queue_reviewer.py +20 -0
  350. opik/rest_api/types/annotation_queue_reviewer_public.py +20 -0
  351. opik/rest_api/types/annotation_queue_scope.py +5 -0
  352. opik/rest_api/types/annotation_queue_write.py +31 -0
  353. opik/rest_api/types/annotation_queue_write_scope.py +5 -0
  354. opik/rest_api/types/audio_url.py +19 -0
  355. opik/rest_api/types/audio_url_public.py +19 -0
  356. opik/rest_api/types/audio_url_write.py +19 -0
  357. opik/rest_api/types/automation_rule_evaluator.py +62 -2
  358. opik/rest_api/types/automation_rule_evaluator_llm_as_judge.py +2 -0
  359. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_public.py +2 -0
  360. opik/rest_api/types/automation_rule_evaluator_llm_as_judge_write.py +2 -0
  361. opik/rest_api/types/automation_rule_evaluator_object_object_public.py +155 -0
  362. opik/rest_api/types/automation_rule_evaluator_page_public.py +3 -2
  363. opik/rest_api/types/automation_rule_evaluator_public.py +57 -2
  364. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge.py +22 -0
  365. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_public.py +22 -0
  366. opik/rest_api/types/automation_rule_evaluator_span_llm_as_judge_write.py +22 -0
  367. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python.py +22 -0
  368. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_public.py +22 -0
  369. opik/rest_api/types/automation_rule_evaluator_span_user_defined_metric_python_write.py +22 -0
  370. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge.py +2 -0
  371. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_public.py +2 -0
  372. opik/rest_api/types/automation_rule_evaluator_trace_thread_llm_as_judge_write.py +2 -0
  373. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python.py +2 -0
  374. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_public.py +2 -0
  375. opik/rest_api/types/automation_rule_evaluator_trace_thread_user_defined_metric_python_write.py +2 -0
  376. opik/rest_api/types/automation_rule_evaluator_update.py +51 -1
  377. opik/rest_api/types/automation_rule_evaluator_update_llm_as_judge.py +2 -0
  378. opik/rest_api/types/automation_rule_evaluator_update_span_llm_as_judge.py +22 -0
  379. opik/rest_api/types/automation_rule_evaluator_update_span_user_defined_metric_python.py +22 -0
  380. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_llm_as_judge.py +2 -0
  381. opik/rest_api/types/automation_rule_evaluator_update_trace_thread_user_defined_metric_python.py +2 -0
  382. opik/rest_api/types/automation_rule_evaluator_update_user_defined_metric_python.py +2 -0
  383. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python.py +2 -0
  384. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_public.py +2 -0
  385. opik/rest_api/types/automation_rule_evaluator_user_defined_metric_python_write.py +2 -0
  386. opik/rest_api/types/automation_rule_evaluator_write.py +51 -1
  387. opik/rest_api/types/boolean_feedback_definition.py +25 -0
  388. opik/rest_api/types/boolean_feedback_definition_create.py +20 -0
  389. opik/rest_api/types/boolean_feedback_definition_public.py +25 -0
  390. opik/rest_api/types/boolean_feedback_definition_update.py +20 -0
  391. opik/rest_api/types/boolean_feedback_detail.py +29 -0
  392. opik/rest_api/types/boolean_feedback_detail_create.py +29 -0
  393. opik/rest_api/types/boolean_feedback_detail_public.py +29 -0
  394. opik/rest_api/types/boolean_feedback_detail_update.py +29 -0
  395. opik/rest_api/types/dashboard_page_public.py +24 -0
  396. opik/rest_api/types/dashboard_public.py +30 -0
  397. opik/rest_api/types/dataset.py +4 -0
  398. opik/rest_api/types/dataset_expansion.py +42 -0
  399. opik/rest_api/types/dataset_expansion_response.py +39 -0
  400. opik/rest_api/types/dataset_item.py +2 -0
  401. opik/rest_api/types/dataset_item_changes_public.py +5 -0
  402. opik/rest_api/types/dataset_item_compare.py +2 -0
  403. opik/rest_api/types/dataset_item_filter.py +27 -0
  404. opik/rest_api/types/dataset_item_filter_operator.py +21 -0
  405. opik/rest_api/types/dataset_item_page_compare.py +5 -0
  406. opik/rest_api/types/dataset_item_page_public.py +5 -0
  407. opik/rest_api/types/dataset_item_public.py +2 -0
  408. opik/rest_api/types/dataset_item_update.py +39 -0
  409. opik/rest_api/types/dataset_item_write.py +1 -0
  410. opik/rest_api/types/dataset_public.py +4 -0
  411. opik/rest_api/types/dataset_public_status.py +5 -0
  412. opik/rest_api/types/dataset_status.py +5 -0
  413. opik/rest_api/types/dataset_version_diff.py +22 -0
  414. opik/rest_api/types/dataset_version_diff_stats.py +24 -0
  415. opik/rest_api/types/dataset_version_page_public.py +23 -0
  416. opik/rest_api/types/dataset_version_public.py +59 -0
  417. opik/rest_api/types/dataset_version_summary.py +46 -0
  418. opik/rest_api/types/dataset_version_summary_public.py +46 -0
  419. opik/rest_api/types/experiment.py +7 -2
  420. opik/rest_api/types/experiment_group_response.py +2 -0
  421. opik/rest_api/types/experiment_public.py +7 -2
  422. opik/rest_api/types/experiment_public_status.py +5 -0
  423. opik/rest_api/types/experiment_score.py +20 -0
  424. opik/rest_api/types/experiment_score_public.py +20 -0
  425. opik/rest_api/types/experiment_score_write.py +20 -0
  426. opik/rest_api/types/experiment_status.py +5 -0
  427. opik/rest_api/types/feedback.py +25 -1
  428. opik/rest_api/types/feedback_create.py +20 -1
  429. opik/rest_api/types/feedback_object_public.py +27 -1
  430. opik/rest_api/types/feedback_public.py +25 -1
  431. opik/rest_api/types/feedback_score_batch_item.py +2 -1
  432. opik/rest_api/types/feedback_score_batch_item_thread.py +2 -1
  433. opik/rest_api/types/feedback_score_public.py +4 -0
  434. opik/rest_api/types/feedback_update.py +20 -1
  435. opik/rest_api/types/group_content_with_aggregations.py +1 -0
  436. opik/rest_api/types/group_detail.py +19 -0
  437. opik/rest_api/types/group_details.py +20 -0
  438. opik/rest_api/types/guardrail.py +1 -0
  439. opik/rest_api/types/guardrail_write.py +1 -0
  440. opik/rest_api/types/ids_holder.py +19 -0
  441. opik/rest_api/types/image_url.py +20 -0
  442. opik/rest_api/types/image_url_public.py +20 -0
  443. opik/rest_api/types/image_url_write.py +20 -0
  444. opik/rest_api/types/llm_as_judge_message.py +5 -1
  445. opik/rest_api/types/llm_as_judge_message_content.py +26 -0
  446. opik/rest_api/types/llm_as_judge_message_content_public.py +26 -0
  447. opik/rest_api/types/llm_as_judge_message_content_write.py +26 -0
  448. opik/rest_api/types/llm_as_judge_message_public.py +5 -1
  449. opik/rest_api/types/llm_as_judge_message_write.py +5 -1
  450. opik/rest_api/types/llm_as_judge_model_parameters.py +3 -0
  451. opik/rest_api/types/llm_as_judge_model_parameters_public.py +3 -0
  452. opik/rest_api/types/llm_as_judge_model_parameters_write.py +3 -0
  453. opik/rest_api/types/manual_evaluation_request.py +38 -0
  454. opik/rest_api/types/manual_evaluation_request_entity_type.py +5 -0
  455. opik/rest_api/types/manual_evaluation_response.py +27 -0
  456. opik/rest_api/types/optimization.py +4 -2
  457. opik/rest_api/types/optimization_public.py +4 -2
  458. opik/rest_api/types/optimization_public_status.py +3 -1
  459. opik/rest_api/types/optimization_status.py +3 -1
  460. opik/rest_api/types/optimization_studio_config.py +27 -0
  461. opik/rest_api/types/optimization_studio_config_public.py +27 -0
  462. opik/rest_api/types/optimization_studio_config_write.py +27 -0
  463. opik/rest_api/types/optimization_studio_log.py +22 -0
  464. opik/rest_api/types/optimization_write.py +4 -2
  465. opik/rest_api/types/optimization_write_status.py +3 -1
  466. opik/rest_api/types/project.py +1 -0
  467. opik/rest_api/types/project_detailed.py +1 -0
  468. opik/rest_api/types/project_reference.py +31 -0
  469. opik/rest_api/types/project_reference_public.py +31 -0
  470. opik/rest_api/types/project_stats_summary_item.py +1 -0
  471. opik/rest_api/types/prompt.py +6 -0
  472. opik/rest_api/types/prompt_detail.py +6 -0
  473. opik/rest_api/types/prompt_detail_template_structure.py +5 -0
  474. opik/rest_api/types/prompt_public.py +6 -0
  475. opik/rest_api/types/prompt_public_template_structure.py +5 -0
  476. opik/rest_api/types/prompt_template_structure.py +5 -0
  477. opik/rest_api/types/prompt_version.py +3 -0
  478. opik/rest_api/types/prompt_version_detail.py +3 -0
  479. opik/rest_api/types/prompt_version_detail_template_structure.py +5 -0
  480. opik/rest_api/types/prompt_version_link.py +1 -0
  481. opik/rest_api/types/prompt_version_link_public.py +1 -0
  482. opik/rest_api/types/prompt_version_page_public.py +5 -0
  483. opik/rest_api/types/prompt_version_public.py +3 -0
  484. opik/rest_api/types/prompt_version_public_template_structure.py +5 -0
  485. opik/rest_api/types/prompt_version_template_structure.py +5 -0
  486. opik/rest_api/types/prompt_version_update.py +33 -0
  487. opik/rest_api/types/provider_api_key.py +9 -0
  488. opik/rest_api/types/provider_api_key_provider.py +1 -1
  489. opik/rest_api/types/provider_api_key_public.py +9 -0
  490. opik/rest_api/types/provider_api_key_public_provider.py +1 -1
  491. opik/rest_api/types/score_name.py +1 -0
  492. opik/rest_api/types/service_toggles_config.py +18 -0
  493. opik/rest_api/types/span.py +1 -2
  494. opik/rest_api/types/span_enrichment_options.py +31 -0
  495. opik/rest_api/types/span_experiment_item_bulk_write_view.py +1 -2
  496. opik/rest_api/types/span_filter.py +23 -0
  497. opik/rest_api/types/span_filter_operator.py +21 -0
  498. opik/rest_api/types/span_filter_write.py +23 -0
  499. opik/rest_api/types/span_filter_write_operator.py +21 -0
  500. opik/rest_api/types/span_llm_as_judge_code.py +27 -0
  501. opik/rest_api/types/span_llm_as_judge_code_public.py +27 -0
  502. opik/rest_api/types/span_llm_as_judge_code_write.py +27 -0
  503. opik/rest_api/types/span_public.py +1 -2
  504. opik/rest_api/types/span_update.py +46 -0
  505. opik/rest_api/types/span_user_defined_metric_python_code.py +20 -0
  506. opik/rest_api/types/span_user_defined_metric_python_code_public.py +20 -0
  507. opik/rest_api/types/span_user_defined_metric_python_code_write.py +20 -0
  508. opik/rest_api/types/span_write.py +1 -2
  509. opik/rest_api/types/studio_evaluation.py +20 -0
  510. opik/rest_api/types/studio_evaluation_public.py +20 -0
  511. opik/rest_api/types/studio_evaluation_write.py +20 -0
  512. opik/rest_api/types/studio_llm_model.py +21 -0
  513. opik/rest_api/types/studio_llm_model_public.py +21 -0
  514. opik/rest_api/types/studio_llm_model_write.py +21 -0
  515. opik/rest_api/types/studio_message.py +20 -0
  516. opik/rest_api/types/studio_message_public.py +20 -0
  517. opik/rest_api/types/studio_message_write.py +20 -0
  518. opik/rest_api/types/studio_metric.py +21 -0
  519. opik/rest_api/types/studio_metric_public.py +21 -0
  520. opik/rest_api/types/studio_metric_write.py +21 -0
  521. opik/rest_api/types/studio_optimizer.py +21 -0
  522. opik/rest_api/types/studio_optimizer_public.py +21 -0
  523. opik/rest_api/types/studio_optimizer_write.py +21 -0
  524. opik/rest_api/types/studio_prompt.py +20 -0
  525. opik/rest_api/types/studio_prompt_public.py +20 -0
  526. opik/rest_api/types/studio_prompt_write.py +20 -0
  527. opik/rest_api/types/trace.py +11 -2
  528. opik/rest_api/types/trace_enrichment_options.py +32 -0
  529. opik/rest_api/types/trace_experiment_item_bulk_write_view.py +1 -2
  530. opik/rest_api/types/trace_filter.py +23 -0
  531. opik/rest_api/types/trace_filter_operator.py +21 -0
  532. opik/rest_api/types/trace_filter_write.py +23 -0
  533. opik/rest_api/types/trace_filter_write_operator.py +21 -0
  534. opik/rest_api/types/trace_public.py +11 -2
  535. opik/rest_api/types/trace_thread_filter_write.py +23 -0
  536. opik/rest_api/types/trace_thread_filter_write_operator.py +21 -0
  537. opik/rest_api/types/trace_thread_identifier.py +1 -0
  538. opik/rest_api/types/trace_update.py +39 -0
  539. opik/rest_api/types/trace_write.py +1 -2
  540. opik/rest_api/types/value_entry.py +2 -0
  541. opik/rest_api/types/value_entry_compare.py +2 -0
  542. opik/rest_api/types/value_entry_experiment_item_bulk_write_view.py +2 -0
  543. opik/rest_api/types/value_entry_public.py +2 -0
  544. opik/rest_api/types/video_url.py +19 -0
  545. opik/rest_api/types/video_url_public.py +19 -0
  546. opik/rest_api/types/video_url_write.py +19 -0
  547. opik/rest_api/types/webhook.py +28 -0
  548. opik/rest_api/types/webhook_examples.py +19 -0
  549. opik/rest_api/types/webhook_public.py +28 -0
  550. opik/rest_api/types/webhook_test_result.py +23 -0
  551. opik/rest_api/types/webhook_test_result_status.py +5 -0
  552. opik/rest_api/types/webhook_write.py +23 -0
  553. opik/rest_api/types/welcome_wizard_tracking.py +22 -0
  554. opik/rest_api/types/workspace_configuration.py +5 -0
  555. opik/rest_api/welcome_wizard/__init__.py +4 -0
  556. opik/rest_api/welcome_wizard/client.py +195 -0
  557. opik/rest_api/welcome_wizard/raw_client.py +208 -0
  558. opik/rest_api/workspaces/client.py +14 -2
  559. opik/rest_api/workspaces/raw_client.py +10 -0
  560. opik/s3_httpx_client.py +14 -1
  561. opik/simulation/__init__.py +6 -0
  562. opik/simulation/simulated_user.py +99 -0
  563. opik/simulation/simulator.py +108 -0
  564. opik/synchronization.py +5 -6
  565. opik/{decorator/tracing_runtime_config.py → tracing_runtime_config.py} +6 -7
  566. opik/types.py +36 -0
  567. opik/validation/chat_prompt_messages.py +241 -0
  568. opik/validation/feedback_score.py +3 -3
  569. opik/validation/validator.py +28 -0
  570. opik-1.9.71.dist-info/METADATA +370 -0
  571. opik-1.9.71.dist-info/RECORD +1110 -0
  572. opik/api_objects/prompt/prompt.py +0 -112
  573. opik/cli.py +0 -193
  574. opik/hooks.py +0 -13
  575. opik/integrations/bedrock/chunks_aggregator.py +0 -55
  576. opik/integrations/bedrock/helpers.py +0 -8
  577. opik/rest_api/types/automation_rule_evaluator_object_public.py +0 -100
  578. opik/rest_api/types/json_node_experiment_item_bulk_write_view.py +0 -5
  579. opik-1.8.39.dist-info/METADATA +0 -339
  580. opik-1.8.39.dist-info/RECORD +0 -790
  581. /opik/{evaluation/metrics/conversation/conversational_coherence → decorator/context_manager}/__init__.py +0 -0
  582. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/conversational_coherence}/__init__.py +0 -0
  583. /opik/evaluation/metrics/conversation/{conversational_coherence → llm_judges/conversational_coherence}/schema.py +0 -0
  584. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/session_completeness}/__init__.py +0 -0
  585. /opik/evaluation/metrics/conversation/{session_completeness → llm_judges/session_completeness}/schema.py +0 -0
  586. /opik/evaluation/metrics/conversation/{user_frustration → llm_judges/user_frustration}/schema.py +0 -0
  587. /opik/integrations/bedrock/{stream_wrappers.py → converse/stream_wrappers.py} +0 -0
  588. /opik/rest_api/{spans/types → types}/span_update_type.py +0 -0
  589. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/WHEEL +0 -0
  590. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/entry_points.txt +0 -0
  591. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/licenses/LICENSE +0 -0
  592. {opik-1.8.39.dist-info → opik-1.9.71.dist-info}/top_level.txt +0 -0
@@ -32,6 +32,8 @@ class AnswerRelevance(base_metric.BaseMetric):
32
32
  require_context: if set to False, execution in no-context mode is allowed. Default is True.
33
33
  track: Whether to track the metric. Defaults to True.
34
34
  project_name: Optional project name to track the metric in for the cases when there are no parent span/trace to inherit project name from.
35
+ seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
36
+ temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
35
37
 
36
38
  Example:
37
39
  >>> from opik.evaluation.metrics import AnswerRelevance
@@ -56,6 +58,8 @@ class AnswerRelevance(base_metric.BaseMetric):
56
58
  require_context: bool = True,
57
59
  track: bool = True,
58
60
  project_name: Optional[str] = None,
61
+ seed: Optional[int] = None,
62
+ temperature: Optional[float] = None,
59
63
  ):
60
64
  super().__init__(
61
65
  name=name,
@@ -63,19 +67,28 @@ class AnswerRelevance(base_metric.BaseMetric):
63
67
  project_name=project_name,
64
68
  )
65
69
  self._require_context = require_context
66
- self._init_model(model)
70
+ self._seed = seed
71
+ self._init_model(model, temperature=temperature)
67
72
  self._init_few_shot_examples(
68
73
  few_shot_examples_with_context=few_shot_examples,
69
74
  few_shot_examples_no_context=few_shot_examples_no_context,
70
75
  )
71
76
 
72
77
  def _init_model(
73
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
78
+ self,
79
+ model: Optional[Union[str, base_model.OpikBaseModel]],
80
+ temperature: Optional[float],
74
81
  ) -> None:
75
82
  if isinstance(model, base_model.OpikBaseModel):
76
83
  self._model = model
77
84
  else:
78
- self._model = models_factory.get(model_name=model)
85
+ model_kwargs = {}
86
+ if temperature is not None:
87
+ model_kwargs["temperature"] = temperature
88
+ if self._seed is not None:
89
+ model_kwargs["seed"] = self._seed
90
+
91
+ self._model = models_factory.get(model_name=model, **model_kwargs)
79
92
 
80
93
  def _init_few_shot_examples(
81
94
  self,
@@ -124,7 +137,8 @@ class AnswerRelevance(base_metric.BaseMetric):
124
137
  )
125
138
 
126
139
  model_output = self._model.generate_string(
127
- input=llm_query, response_format=AnswerRelevanceResponseFormat
140
+ input=llm_query,
141
+ response_format=AnswerRelevanceResponseFormat,
128
142
  )
129
143
  return parser.parse_model_output(content=model_output, name=self.name)
130
144
 
@@ -154,7 +168,8 @@ class AnswerRelevance(base_metric.BaseMetric):
154
168
  input=input, output=output, context=context
155
169
  )
156
170
  model_output = await self._model.agenerate_string(
157
- input=llm_query, response_format=AnswerRelevanceResponseFormat
171
+ input=llm_query,
172
+ response_format=AnswerRelevanceResponseFormat,
158
173
  )
159
174
 
160
175
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -28,6 +28,8 @@ class ContextPrecision(base_metric.BaseMetric):
28
28
  track: Whether to track the metric. Defaults to True.
29
29
  project_name: Optional project name to track the metric in for the cases when
30
30
  there are no parent span/trace to inherit project name from.
31
+ seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
32
+ temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
31
33
 
32
34
  Example:
33
35
  >>> from opik.evaluation.metrics import ContextPrecision
@@ -48,23 +50,33 @@ class ContextPrecision(base_metric.BaseMetric):
48
50
  ] = None,
49
51
  track: bool = True,
50
52
  project_name: Optional[str] = None,
53
+ seed: Optional[int] = None,
54
+ temperature: Optional[float] = None,
51
55
  ):
52
56
  super().__init__(
53
57
  name=name,
54
58
  track=track,
55
59
  project_name=project_name,
56
60
  )
57
-
58
- self._init_model(model)
61
+ self._seed = seed
62
+ self._init_model(model, temperature=temperature)
59
63
  self.few_shot_examples = few_shot_examples or template.FEW_SHOT_EXAMPLES
60
64
 
61
65
  def _init_model(
62
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
66
+ self,
67
+ model: Optional[Union[str, base_model.OpikBaseModel]],
68
+ temperature: Optional[float],
63
69
  ) -> None:
64
70
  if isinstance(model, base_model.OpikBaseModel):
65
71
  self._model = model
66
72
  else:
67
- self._model = models_factory.get(model_name=model)
73
+ model_kwargs = {}
74
+ if temperature is not None:
75
+ model_kwargs["temperature"] = temperature
76
+ if self._seed is not None:
77
+ model_kwargs["seed"] = self._seed
78
+
79
+ self._model = models_factory.get(model_name=model, **model_kwargs)
68
80
 
69
81
  def score(
70
82
  self,
@@ -96,7 +108,8 @@ class ContextPrecision(base_metric.BaseMetric):
96
108
  few_shot_examples=self.few_shot_examples,
97
109
  )
98
110
  model_output = self._model.generate_string(
99
- input=llm_query, response_format=ContextPrecisionResponseFormat
111
+ input=llm_query,
112
+ response_format=ContextPrecisionResponseFormat,
100
113
  )
101
114
 
102
115
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -133,7 +146,8 @@ class ContextPrecision(base_metric.BaseMetric):
133
146
  few_shot_examples=self.few_shot_examples,
134
147
  )
135
148
  model_output = await self._model.agenerate_string(
136
- input=llm_query, response_format=ContextPrecisionResponseFormat
149
+ input=llm_query,
150
+ response_format=ContextPrecisionResponseFormat,
137
151
  )
138
152
 
139
153
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -28,6 +28,8 @@ class ContextRecall(base_metric.BaseMetric):
28
28
  track: Whether to track the metric. Defaults to True.
29
29
  project_name: Optional project name to track the metric in for the cases when
30
30
  there are no parent span/trace to inherit project name from.
31
+ seed: Optional seed value for reproducible model generation. If provided, this seed will be passed to the model for deterministic outputs.
32
+ temperature: Optional temperature value for model generation. If provided, this temperature will be passed to the model. If not provided, the model's default temperature will be used.
31
33
 
32
34
  Example:
33
35
  >>> from opik.evaluation.metrics import ContextRecall
@@ -46,23 +48,33 @@ class ContextRecall(base_metric.BaseMetric):
46
48
  few_shot_examples: Optional[List[template.FewShotExampleContextRecall]] = None,
47
49
  track: bool = True,
48
50
  project_name: Optional[str] = None,
51
+ seed: Optional[int] = None,
52
+ temperature: Optional[float] = None,
49
53
  ):
50
54
  super().__init__(
51
55
  name=name,
52
56
  track=track,
53
57
  project_name=project_name,
54
58
  )
55
-
56
- self._init_model(model)
59
+ self._seed = seed
60
+ self._init_model(model, temperature=temperature)
57
61
  self.few_shot_examples = few_shot_examples or template.FEW_SHOT_EXAMPLES
58
62
 
59
63
  def _init_model(
60
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
64
+ self,
65
+ model: Optional[Union[str, base_model.OpikBaseModel]],
66
+ temperature: Optional[float],
61
67
  ) -> None:
62
68
  if isinstance(model, base_model.OpikBaseModel):
63
69
  self._model = model
64
70
  else:
65
- self._model = models_factory.get(model_name=model)
71
+ model_kwargs = {}
72
+ if temperature is not None:
73
+ model_kwargs["temperature"] = temperature
74
+ if self._seed is not None:
75
+ model_kwargs["seed"] = self._seed
76
+
77
+ self._model = models_factory.get(model_name=model, **model_kwargs)
66
78
 
67
79
  def score(
68
80
  self,
@@ -94,7 +106,8 @@ class ContextRecall(base_metric.BaseMetric):
94
106
  few_shot_examples=self.few_shot_examples,
95
107
  )
96
108
  model_output = self._model.generate_string(
97
- input=llm_query, response_format=ContextRecallResponseFormat
109
+ input=llm_query,
110
+ response_format=ContextRecallResponseFormat,
98
111
  )
99
112
 
100
113
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -131,7 +144,8 @@ class ContextRecall(base_metric.BaseMetric):
131
144
  few_shot_examples=self.few_shot_examples,
132
145
  )
133
146
  model_output = await self._model.agenerate_string(
134
- input=llm_query, response_format=ContextRecallResponseFormat
147
+ input=llm_query,
148
+ response_format=ContextRecallResponseFormat,
135
149
  )
136
150
 
137
151
  return parser.parse_model_output(content=model_output, name=self.name)
@@ -0,0 +1,5 @@
1
+ """Public exports for the GEval metric package."""
2
+
3
+ from .metric import GEval
4
+
5
+ __all__ = ["GEval"]
@@ -1,10 +1,13 @@
1
- from typing import Any, Optional, Union
1
+ from collections import OrderedDict
2
+ from threading import Lock
3
+ from typing import Any, Dict, Optional, Tuple, Union
2
4
  import pydantic
3
5
 
4
6
  from opik.evaluation.metrics import base_metric, score_result
5
7
  from opik.evaluation.models import base_model, models_factory
6
8
  from opik.evaluation import models
7
9
  from . import template, parser
10
+ from .presets import GEVAL_PRESETS
8
11
 
9
12
 
10
13
  class GEvalScoreFormat(pydantic.BaseModel):
@@ -12,7 +15,56 @@ class GEvalScoreFormat(pydantic.BaseModel):
12
15
  reason: str
13
16
 
14
17
 
18
+ def _freeze_for_cache(value: Any) -> Any:
19
+ """Convert nested structures into hashable representations for caching."""
20
+
21
+ if isinstance(value, dict):
22
+ return tuple(
23
+ sorted((key, _freeze_for_cache(val)) for key, val in value.items())
24
+ )
25
+ if isinstance(value, (list, tuple)):
26
+ return tuple(_freeze_for_cache(item) for item in value)
27
+ if isinstance(value, set):
28
+ return tuple(sorted(_freeze_for_cache(item) for item in value))
29
+ return value
30
+
31
+
15
32
  class GEval(base_metric.BaseMetric):
33
+ """
34
+ Generalised evaluation metric that prompts an LLM to grade another LLM output.
35
+
36
+ GEval builds a reusable chain-of-thought using the provided
37
+ ``task_introduction`` and ``evaluation_criteria`` prompts, then requests a
38
+ final score and rationale for each evaluated output.
39
+
40
+ Args:
41
+ task_introduction: Instruction describing the evaluator's persona/purpose.
42
+ evaluation_criteria: Detailed rubric presented to the evaluator.
43
+ model: Optional model identifier or ``OpikBaseModel`` for the judge.
44
+ name: Display name for the metric result. Defaults to ``"g_eval_metric"``.
45
+ track: Whether to automatically track metric results. Defaults to ``True``.
46
+ project_name: Optional tracking project name.
47
+ temperature: Sampling temperature forwarded to the judge model.
48
+ seed: Optional seed for reproducible generation (if supported by the model).
49
+
50
+ Example:
51
+ >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEval
52
+ >>> metric = GEval(
53
+ ... task_introduction="You evaluate politeness of responses.",
54
+ ... evaluation_criteria="Score from 1 (rude) to 5 (very polite).",
55
+ ... model="gpt-4",
56
+ ... )
57
+ >>> result = metric.score(output="Thanks so much for your help!") # doctest: +SKIP
58
+ >>> result.value # doctest: +SKIP
59
+ 0.9
60
+ """
61
+
62
+ _CHAIN_OF_THOUGHT_CACHE: "OrderedDict[Tuple[str, str, str, Any], str]" = (
63
+ OrderedDict()
64
+ )
65
+ _CHAIN_OF_THOUGHT_LOCK: Lock = Lock()
66
+ _MAX_CHAIN_OF_THOUGHT_CACHE = 128
67
+
16
68
  def __init__(
17
69
  self,
18
70
  task_introduction: str,
@@ -21,65 +73,61 @@ class GEval(base_metric.BaseMetric):
21
73
  name: str = "g_eval_metric",
22
74
  track: bool = True,
23
75
  project_name: Optional[str] = None,
76
+ temperature: float = 0.0,
77
+ seed: Optional[int] = None,
24
78
  ):
25
- """
26
- A metric that evaluates an LLM output based on chain-of-thought built with the evaluation criteria provided
27
- by the user.
28
-
29
- For more details see the original paper: https://arxiv.org/pdf/2303.16634
30
-
31
- Args:
32
- task_introduction: An instruction for LLM used to generate an evaluation chain-of-thought and in evaluation call itself.
33
- `opik.evaluation.models.LiteLLMChatModel` is used by default.
34
- evaluation_criteria: The main task for G-Eval metric written in human language.
35
- model: The LLM to use for evaluation. Can be a string (model name) or an `opik.evaluation.models.OpikBaseModel` subclass instance.
36
- name: The name of the metric.
37
- track: Whether to track the metric. Defaults to True.
38
- project_name: Optional project name to track the metric in for the cases when
39
- there are no parent span/trace to inherit project name from.
40
- """
41
79
  super().__init__(
42
80
  name=name,
43
81
  track=track,
44
82
  project_name=project_name,
45
83
  )
46
- self._init_model(model)
47
-
48
84
  self.task_introduction = task_introduction
49
85
  self.evaluation_criteria = evaluation_criteria
86
+ self._seed = seed
87
+
50
88
  self._log_probs_supported = False
51
89
 
52
- self._chain_of_thought_response: Optional[str] = None
90
+ self._init_model(model, temperature=temperature)
53
91
 
54
92
  def llm_chain_of_thought(self) -> str:
55
- if self._chain_of_thought_response is None:
56
- prompt = template.G_EVAL_COT_TEMPLATE.format(
57
- task_introduction=self.task_introduction,
58
- evaluation_criteria=self.evaluation_criteria,
59
- )
60
- self._chain_of_thought_response = self._model.generate_string(input=prompt)
93
+ cache_key = self._chain_of_thought_cache_key()
94
+ cached = self._get_cached_chain_of_thought(cache_key)
95
+ if cached is not None:
96
+ return cached
61
97
 
62
- return self._chain_of_thought_response
98
+ prompt = template.G_EVAL_COT_TEMPLATE.format(
99
+ task_introduction=self.task_introduction,
100
+ evaluation_criteria=self.evaluation_criteria,
101
+ )
102
+ generated = self._model.generate_string(input=prompt)
103
+ self._store_chain_of_thought(cache_key, generated)
104
+ return generated
63
105
 
64
106
  async def allm_chain_of_thought(self) -> str:
65
- if not self._chain_of_thought_response:
66
- prompt = template.G_EVAL_COT_TEMPLATE.format(
67
- task_introduction=self.task_introduction,
68
- evaluation_criteria=self.evaluation_criteria,
69
- )
70
- self._chain_of_thought_response = await self._model.agenerate_string(
71
- input=prompt
72
- )
107
+ cache_key = self._chain_of_thought_cache_key()
108
+ cached = self._get_cached_chain_of_thought(cache_key)
109
+ if cached is not None:
110
+ return cached
73
111
 
74
- return self._chain_of_thought_response
112
+ prompt = template.G_EVAL_COT_TEMPLATE.format(
113
+ task_introduction=self.task_introduction,
114
+ evaluation_criteria=self.evaluation_criteria,
115
+ )
116
+ generated = await self._model.agenerate_string(input=prompt)
117
+ self._store_chain_of_thought(cache_key, generated)
118
+ return generated
75
119
 
76
120
  def _init_model(
77
- self, model: Optional[Union[str, base_model.OpikBaseModel]]
121
+ self, model: Optional[Union[str, base_model.OpikBaseModel]], temperature: float
78
122
  ) -> None:
79
123
  if isinstance(model, base_model.OpikBaseModel):
80
124
  self._model = model
81
125
  else:
82
- self._model = models_factory.get(model_name=model)
126
+ model_kwargs = {"temperature": temperature}
127
+ if self._seed is not None:
128
+ model_kwargs["seed"] = self._seed
129
+
130
+ self._model = models_factory.get(model_name=model, **model_kwargs)
83
131
 
84
132
  if (
85
133
  hasattr(self._model, "supported_params")
@@ -88,6 +136,55 @@ class GEval(base_metric.BaseMetric):
88
136
  ):
89
137
  self._log_probs_supported = True
90
138
 
139
+ @classmethod
140
+ def _get_cached_chain_of_thought(
141
+ cls, cache_key: Tuple[str, str, str, Any]
142
+ ) -> Optional[str]:
143
+ with cls._CHAIN_OF_THOUGHT_LOCK:
144
+ value = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
145
+ if value is not None:
146
+ cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
147
+ return value
148
+
149
+ @classmethod
150
+ def _store_chain_of_thought(
151
+ cls, cache_key: Tuple[str, str, str, Any], value: str
152
+ ) -> None:
153
+ with cls._CHAIN_OF_THOUGHT_LOCK:
154
+ existing = cls._CHAIN_OF_THOUGHT_CACHE.get(cache_key)
155
+ if existing is not None:
156
+ cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
157
+ return
158
+ cls._CHAIN_OF_THOUGHT_CACHE[cache_key] = value
159
+ cls._CHAIN_OF_THOUGHT_CACHE.move_to_end(cache_key)
160
+ while len(cls._CHAIN_OF_THOUGHT_CACHE) > cls._MAX_CHAIN_OF_THOUGHT_CACHE:
161
+ cls._CHAIN_OF_THOUGHT_CACHE.popitem(last=False)
162
+
163
+ def _chain_of_thought_cache_key(self) -> Tuple[str, str, str, Any]:
164
+ model_name = getattr(self._model, "model_name", "unknown")
165
+ return (
166
+ self.task_introduction,
167
+ self.evaluation_criteria,
168
+ model_name,
169
+ self._model_cache_fingerprint(),
170
+ )
171
+
172
+ def _model_cache_fingerprint(self) -> Any:
173
+ fingerprint_candidate = getattr(self._model, "cache_fingerprint", None)
174
+ if callable(fingerprint_candidate):
175
+ try:
176
+ fingerprint = fingerprint_candidate()
177
+ except Exception:
178
+ fingerprint = None
179
+ else:
180
+ return _freeze_for_cache(fingerprint)
181
+
182
+ completion_kwargs = getattr(self._model, "_completion_kwargs", None)
183
+ if isinstance(completion_kwargs, dict):
184
+ return _freeze_for_cache(completion_kwargs)
185
+
186
+ return id(self._model)
187
+
91
188
  def score(
92
189
  self,
93
190
  output: str,
@@ -119,17 +216,23 @@ class GEval(base_metric.BaseMetric):
119
216
  ]
120
217
 
121
218
  if isinstance(self._model, models.LiteLLMChatModel):
122
- model_output = self._model.generate_provider_response(
219
+ provider_kwargs: Dict[str, Any] = {
220
+ "response_format": GEvalScoreFormat,
221
+ }
222
+ if self._log_probs_supported:
223
+ provider_kwargs["logprobs"] = True
224
+ provider_kwargs["top_logprobs"] = 20
225
+
226
+ with base_model.get_provider_response(
227
+ model_provider=self._model,
123
228
  messages=request,
124
- logprobs=self._log_probs_supported,
125
- top_logprobs=20 if self._log_probs_supported else None,
126
- response_format=GEvalScoreFormat,
127
- )
128
- return parser.parse_litellm_model_output(
129
- content=model_output,
130
- name=self.name,
131
- log_probs_supported=self._log_probs_supported,
132
- )
229
+ **provider_kwargs,
230
+ ) as model_output:
231
+ return parser.parse_litellm_model_output(
232
+ content=model_output,
233
+ name=self.name,
234
+ log_probs_supported=self._log_probs_supported,
235
+ )
133
236
 
134
237
  model_output_string = self._model.generate_string(
135
238
  input=llm_query, response_format=GEvalScoreFormat
@@ -138,18 +241,13 @@ class GEval(base_metric.BaseMetric):
138
241
  return parser.parse_model_output_string(model_output_string, self.name)
139
242
 
140
243
  async def ascore(
141
- self, output: str, **ignored_kwargs: Any
244
+ self,
245
+ output: str,
246
+ **ignored_kwargs: Any,
142
247
  ) -> score_result.ScoreResult:
143
248
  """
144
- Calculate the G-Eval score for the given LLM's output.
145
-
146
- Args:
147
- output: The LLM's output to evaluate.
148
- **ignored_kwargs: Additional keyword arguments that are ignored.
149
-
150
- Returns:
151
- score_result.ScoreResult: A ScoreResult object containing the G-Eval score
152
- (between 0.0 and 1.0) and a reason for the score.
249
+ Async variant of :meth:`score`, evaluating the provided LLM output using
250
+ the configured judge model and returning a ``ScoreResult``.
153
251
  """
154
252
  llm_query = template.G_EVAL_QUERY_TEMPLATE.format(
155
253
  task_introduction=self.task_introduction,
@@ -166,20 +264,73 @@ class GEval(base_metric.BaseMetric):
166
264
  ]
167
265
 
168
266
  if isinstance(self._model, models.LiteLLMChatModel):
169
- model_output = await self._model.agenerate_provider_response(
267
+ provider_kwargs: Dict[str, Any] = {
268
+ "response_format": GEvalScoreFormat,
269
+ }
270
+ if self._log_probs_supported:
271
+ provider_kwargs["logprobs"] = True
272
+ provider_kwargs["top_logprobs"] = 20
273
+
274
+ async with base_model.aget_provider_response(
275
+ model_provider=self._model,
170
276
  messages=request,
171
- logprobs=self._log_probs_supported,
172
- top_logprobs=20 if self._log_probs_supported else None,
173
- response_format=GEvalScoreFormat,
174
- )
175
- return parser.parse_litellm_model_output(
176
- content=model_output,
177
- name=self.name,
178
- log_probs_supported=self._log_probs_supported,
179
- )
277
+ **provider_kwargs,
278
+ ) as model_output:
279
+ return parser.parse_litellm_model_output(
280
+ content=model_output,
281
+ name=self.name,
282
+ log_probs_supported=self._log_probs_supported,
283
+ )
180
284
 
181
285
  model_output_string = await self._model.agenerate_string(
182
286
  input=llm_query, response_format=GEvalScoreFormat
183
287
  )
184
288
 
185
289
  return parser.parse_model_output_string(model_output_string, self.name)
290
+
291
+
292
+ class GEvalPreset(GEval):
293
+ """
294
+ Pre-configured GEval variant with author-provided prompt templates.
295
+
296
+ Args:
297
+ preset: Key name from ``GEVAL_PRESETS`` describing the evaluation rubric.
298
+ model: Optional model identifier or ``OpikBaseModel`` instance.
299
+ track: Whether to automatically track metric results. Defaults to ``True``.
300
+ project_name: Optional tracking project name.
301
+ temperature: Sampling temperature forwarded to the judge model.
302
+ name: Optional override for the metric name (defaults to preset name).
303
+
304
+ Example:
305
+ >>> from opik.evaluation.metrics.llm_judges.g_eval.metric import GEvalPreset
306
+ >>> metric = GEvalPreset(preset="qa_relevance", model="gpt-4")
307
+ >>> result = metric.score(output="Answer addresses the user's question.") # doctest: +SKIP
308
+ >>> result.value # doctest: +SKIP
309
+ 0.85
310
+ """
311
+
312
+ def __init__(
313
+ self,
314
+ preset: str,
315
+ model: Optional[Union[str, models.base_model.OpikBaseModel]] = None,
316
+ track: bool = True,
317
+ project_name: Optional[str] = None,
318
+ temperature: float = 0.0,
319
+ name: Optional[str] = None,
320
+ ):
321
+ try:
322
+ definition = GEVAL_PRESETS[preset]
323
+ except KeyError as error:
324
+ raise ValueError(
325
+ f"Unknown GEval preset '{preset}'. Available presets: {list(GEVAL_PRESETS)}"
326
+ ) from error
327
+
328
+ super().__init__(
329
+ task_introduction=definition.task_introduction,
330
+ evaluation_criteria=definition.evaluation_criteria,
331
+ model=model,
332
+ name=name or definition.name,
333
+ track=track,
334
+ project_name=project_name,
335
+ temperature=temperature,
336
+ )