deepeval 3.5.8.tar.gz → 3.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (605)
  1. {deepeval-3.5.8 → deepeval-3.8.0}/PKG-INFO +12 -14
  2. {deepeval-3.5.8 → deepeval-3.8.0}/README.md +9 -8
  3. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/__init__.py +42 -14
  4. deepeval-3.8.0/deepeval/_version.py +1 -0
  5. deepeval-3.8.0/deepeval/anthropic/__init__.py +19 -0
  6. deepeval-3.8.0/deepeval/anthropic/extractors.py +94 -0
  7. deepeval-3.8.0/deepeval/anthropic/patch.py +169 -0
  8. deepeval-3.8.0/deepeval/anthropic/utils.py +225 -0
  9. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/drop.py +45 -16
  10. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  11. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/human_eval.py +2 -1
  12. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/ifeval/ifeval.py +2 -2
  13. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/mmlu.py +6 -4
  14. deepeval-3.8.0/deepeval/cli/main.py +3109 -0
  15. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/test.py +1 -1
  16. deepeval-3.8.0/deepeval/cli/utils.py +353 -0
  17. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/confident/api.py +10 -1
  18. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/confident/types.py +4 -2
  19. deepeval-3.8.0/deepeval/config/dotenv_handler.py +19 -0
  20. deepeval-3.8.0/deepeval/config/logging.py +33 -0
  21. deepeval-3.8.0/deepeval/config/settings.py +1589 -0
  22. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/config/settings_manager.py +5 -1
  23. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/config/utils.py +14 -1
  24. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/constants.py +9 -1
  25. deepeval-3.8.0/deepeval/contextvars.py +25 -0
  26. deepeval-3.8.0/deepeval/dataset/__init__.py +11 -0
  27. deepeval-3.8.0/deepeval/dataset/api.py +50 -0
  28. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/dataset.py +207 -54
  29. deepeval-3.8.0/deepeval/dataset/golden.py +197 -0
  30. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/test_run_tracer.py +4 -6
  31. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/utils.py +44 -14
  32. deepeval-3.8.0/deepeval/errors.py +24 -0
  33. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/compare.py +219 -4
  34. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/configs.py +1 -1
  35. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/evaluate.py +29 -14
  36. deepeval-3.8.0/deepeval/evaluate/execute.py +3184 -0
  37. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/types.py +11 -1
  38. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/utils.py +107 -166
  39. deepeval-3.8.0/deepeval/integrations/crewai/__init__.py +9 -0
  40. deepeval-3.8.0/deepeval/integrations/crewai/handler.py +232 -0
  41. deepeval-3.8.0/deepeval/integrations/crewai/subs.py +51 -0
  42. deepeval-3.8.0/deepeval/integrations/crewai/tool.py +71 -0
  43. deepeval-3.8.0/deepeval/integrations/crewai/wrapper.py +127 -0
  44. deepeval-3.8.0/deepeval/integrations/langchain/callback.py +542 -0
  45. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/langchain/utils.py +31 -8
  46. deepeval-3.8.0/deepeval/integrations/llama_index/__init__.py +6 -0
  47. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/llama_index/handler.py +77 -24
  48. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/llama_index/utils.py +24 -0
  49. deepeval-3.8.0/deepeval/integrations/pydantic_ai/__init__.py +5 -0
  50. deepeval-3.8.0/deepeval/integrations/pydantic_ai/agent.py +38 -0
  51. deepeval-3.8.0/deepeval/integrations/pydantic_ai/instrumentator.py +325 -0
  52. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/pydantic_ai/otel.py +13 -3
  53. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/key_handler.py +133 -52
  54. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/__init__.py +32 -16
  55. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/answer_relevancy.py +128 -117
  56. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/template.py +26 -7
  57. deepeval-3.8.0/deepeval/metrics/api.py +281 -0
  58. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/arena_g_eval.py +103 -97
  59. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/template.py +17 -1
  60. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/utils.py +5 -5
  61. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/argument_correctness/argument_correctness.py +93 -89
  62. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/argument_correctness/template.py +21 -4
  63. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/base_metric.py +20 -44
  64. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/bias.py +112 -109
  65. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/template.py +17 -5
  66. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_precision/contextual_precision.py +115 -98
  67. deepeval-3.8.0/deepeval/metrics/contextual_precision/template.py +133 -0
  68. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_recall/contextual_recall.py +105 -86
  69. deepeval-3.8.0/deepeval/metrics/contextual_recall/template.py +126 -0
  70. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +98 -85
  71. deepeval-3.8.0/deepeval/metrics/contextual_relevancy/template.py +106 -0
  72. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversation_completeness/conversation_completeness.py +113 -119
  73. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversation_completeness/template.py +25 -5
  74. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/conversational_dag.py +24 -8
  75. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/nodes.py +78 -127
  76. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/templates.py +20 -4
  77. deepeval-3.8.0/deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  78. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +157 -132
  79. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/template.py +4 -3
  80. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/dag.py +22 -0
  81. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/nodes.py +75 -130
  82. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/schema.py +1 -1
  83. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/templates.py +19 -5
  84. deepeval-3.8.0/deepeval/metrics/exact_match/exact_match.py +102 -0
  85. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/faithfulness/faithfulness.py +158 -150
  86. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/faithfulness/schema.py +1 -1
  87. deepeval-3.8.0/deepeval/metrics/faithfulness/template.py +225 -0
  88. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/g_eval.py +161 -86
  89. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/template.py +18 -1
  90. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/utils.py +73 -7
  91. deepeval-3.8.0/deepeval/metrics/goal_accuracy/__init__.py +1 -0
  92. deepeval-3.8.0/deepeval/metrics/goal_accuracy/goal_accuracy.py +364 -0
  93. deepeval-3.8.0/deepeval/metrics/goal_accuracy/schema.py +17 -0
  94. deepeval-3.8.0/deepeval/metrics/goal_accuracy/template.py +253 -0
  95. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/hallucination.py +79 -83
  96. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/template.py +17 -4
  97. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/indicator.py +43 -16
  98. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/json_correctness/json_correctness.py +52 -39
  99. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/json_correctness/template.py +10 -0
  100. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/knowledge_retention/knowledge_retention.py +72 -97
  101. deepeval-3.8.0/deepeval/metrics/knowledge_retention/schema.py +21 -0
  102. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/knowledge_retention/template.py +12 -0
  103. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/mcp_task_completion.py +90 -43
  104. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +122 -81
  105. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/schema.py +4 -0
  106. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/template.py +59 -0
  107. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +72 -66
  108. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp_use_metric/template.py +12 -0
  109. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/misuse.py +89 -98
  110. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/template.py +17 -2
  111. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/__init__.py +5 -0
  112. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +62 -53
  113. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +82 -95
  114. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +62 -53
  115. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +62 -53
  116. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  117. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/non_advice.py +91 -105
  118. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/template.py +14 -2
  119. deepeval-3.8.0/deepeval/metrics/pattern_match/pattern_match.py +111 -0
  120. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/pii_leakage.py +87 -107
  121. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/template.py +16 -2
  122. deepeval-3.8.0/deepeval/metrics/plan_adherence/__init__.py +1 -0
  123. deepeval-3.8.0/deepeval/metrics/plan_adherence/plan_adherence.py +266 -0
  124. deepeval-3.8.0/deepeval/metrics/plan_adherence/schema.py +11 -0
  125. deepeval-3.8.0/deepeval/metrics/plan_adherence/template.py +181 -0
  126. deepeval-3.8.0/deepeval/metrics/plan_quality/__init__.py +1 -0
  127. deepeval-3.8.0/deepeval/metrics/plan_quality/plan_quality.py +268 -0
  128. deepeval-3.8.0/deepeval/metrics/plan_quality/schema.py +11 -0
  129. deepeval-3.8.0/deepeval/metrics/plan_quality/template.py +110 -0
  130. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/prompt_alignment/prompt_alignment.py +103 -82
  131. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/prompt_alignment/template.py +16 -4
  132. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/ragas.py +3 -3
  133. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_adherence/role_adherence.py +60 -71
  134. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_adherence/template.py +14 -0
  135. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/role_violation.py +87 -108
  136. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/template.py +14 -2
  137. deepeval-3.8.0/deepeval/metrics/step_efficiency/__init__.py +1 -0
  138. deepeval-3.8.0/deepeval/metrics/step_efficiency/schema.py +11 -0
  139. deepeval-3.8.0/deepeval/metrics/step_efficiency/step_efficiency.py +224 -0
  140. deepeval-3.8.0/deepeval/metrics/step_efficiency/template.py +267 -0
  141. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/summarization.py +127 -184
  142. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/template.py +19 -0
  143. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/task_completion/task_completion.py +80 -75
  144. deepeval-3.8.0/deepeval/metrics/tool_correctness/schema.py +6 -0
  145. deepeval-3.8.0/deepeval/metrics/tool_correctness/template.py +88 -0
  146. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/tool_correctness/tool_correctness.py +240 -27
  147. deepeval-3.8.0/deepeval/metrics/tool_use/__init__.py +1 -0
  148. deepeval-3.8.0/deepeval/metrics/tool_use/schema.py +23 -0
  149. deepeval-3.8.0/deepeval/metrics/tool_use/template.py +234 -0
  150. deepeval-3.8.0/deepeval/metrics/tool_use/tool_use.py +436 -0
  151. deepeval-3.8.0/deepeval/metrics/topic_adherence/__init__.py +1 -0
  152. deepeval-3.8.0/deepeval/metrics/topic_adherence/schema.py +20 -0
  153. deepeval-3.8.0/deepeval/metrics/topic_adherence/template.py +182 -0
  154. deepeval-3.8.0/deepeval/metrics/topic_adherence/topic_adherence.py +342 -0
  155. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/template.py +17 -4
  156. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/toxicity.py +92 -99
  157. deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  158. deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/template.py +194 -0
  159. deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  160. deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  161. deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/template.py +185 -0
  162. deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +525 -0
  163. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy}/schema.py +7 -1
  164. deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/template.py +168 -0
  165. deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +532 -0
  166. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.8.0/deepeval/metrics/turn_faithfulness}/schema.py +11 -3
  167. deepeval-3.8.0/deepeval/metrics/turn_faithfulness/template.py +225 -0
  168. deepeval-3.8.0/deepeval/metrics/turn_faithfulness/turn_faithfulness.py +573 -0
  169. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/turn_relevancy/template.py +16 -2
  170. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/turn_relevancy/turn_relevancy.py +68 -69
  171. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/utils.py +175 -121
  172. deepeval-3.8.0/deepeval/model_integrations/types.py +20 -0
  173. deepeval-3.8.0/deepeval/model_integrations/utils.py +116 -0
  174. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/__init__.py +4 -10
  175. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/base_model.py +52 -34
  176. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/embedding_models/__init__.py +7 -0
  177. deepeval-3.8.0/deepeval/models/embedding_models/azure_embedding_model.py +166 -0
  178. deepeval-3.8.0/deepeval/models/embedding_models/local_embedding_model.py +132 -0
  179. deepeval-3.8.0/deepeval/models/embedding_models/ollama_embedding_model.py +113 -0
  180. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/embedding_models/openai_embedding_model.py +61 -34
  181. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/llms/__init__.py +4 -0
  182. deepeval-3.8.0/deepeval/models/llms/amazon_bedrock_model.py +316 -0
  183. deepeval-3.8.0/deepeval/models/llms/anthropic_model.py +298 -0
  184. deepeval-3.8.0/deepeval/models/llms/azure_model.py +458 -0
  185. deepeval-3.8.0/deepeval/models/llms/constants.py +2055 -0
  186. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/llms/deepseek_model.py +114 -52
  187. deepeval-3.8.0/deepeval/models/llms/gemini_model.py +430 -0
  188. deepeval-3.8.0/deepeval/models/llms/grok_model.py +312 -0
  189. deepeval-3.8.0/deepeval/models/llms/kimi_model.py +294 -0
  190. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/llms/litellm_model.py +190 -56
  191. deepeval-3.8.0/deepeval/models/llms/local_model.py +242 -0
  192. deepeval-3.8.0/deepeval/models/llms/ollama_model.py +237 -0
  193. deepeval-3.8.0/deepeval/models/llms/openai_model.py +488 -0
  194. deepeval-3.8.0/deepeval/models/llms/openrouter_model.py +398 -0
  195. deepeval-3.8.0/deepeval/models/llms/portkey_model.py +191 -0
  196. deepeval-3.8.0/deepeval/models/llms/utils.py +49 -0
  197. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/retry_policy.py +311 -26
  198. deepeval-3.8.0/deepeval/models/utils.py +173 -0
  199. deepeval-3.8.0/deepeval/openai/__init__.py +21 -0
  200. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/openai/extractors.py +82 -47
  201. deepeval-3.8.0/deepeval/openai/patch.py +295 -0
  202. deepeval-3.8.0/deepeval/openai/utils.py +211 -0
  203. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/openai_agents/__init__.py +4 -3
  204. deepeval-3.8.0/deepeval/openai_agents/agent.py +36 -0
  205. deepeval-3.8.0/deepeval/openai_agents/callback_handler.py +151 -0
  206. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/openai_agents/extractors.py +83 -7
  207. deepeval-3.8.0/deepeval/openai_agents/patch.py +309 -0
  208. deepeval-3.8.0/deepeval/openai_agents/runner.py +348 -0
  209. deepeval-3.8.0/deepeval/optimizer/__init__.py +5 -0
  210. deepeval-3.8.0/deepeval/optimizer/algorithms/__init__.py +6 -0
  211. deepeval-3.8.0/deepeval/optimizer/algorithms/base.py +29 -0
  212. deepeval-3.8.0/deepeval/optimizer/algorithms/configs.py +18 -0
  213. deepeval-3.8.0/deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  214. deepeval-3.8.0/deepeval/optimizer/algorithms/copro/copro.py +836 -0
  215. deepeval-3.8.0/deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  216. deepeval-3.8.0/deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  217. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  218. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  219. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  220. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  221. deepeval-3.8.0/deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  222. deepeval-3.8.0/deepeval/optimizer/algorithms/simba/simba.py +999 -0
  223. deepeval-3.8.0/deepeval/optimizer/algorithms/simba/types.py +15 -0
  224. deepeval-3.8.0/deepeval/optimizer/configs.py +31 -0
  225. deepeval-3.8.0/deepeval/optimizer/policies.py +227 -0
  226. deepeval-3.8.0/deepeval/optimizer/prompt_optimizer.py +263 -0
  227. deepeval-3.8.0/deepeval/optimizer/rewriter/__init__.py +5 -0
  228. deepeval-3.8.0/deepeval/optimizer/rewriter/rewriter.py +124 -0
  229. deepeval-3.8.0/deepeval/optimizer/rewriter/utils.py +214 -0
  230. deepeval-3.8.0/deepeval/optimizer/scorer/__init__.py +5 -0
  231. deepeval-3.8.0/deepeval/optimizer/scorer/base.py +86 -0
  232. deepeval-3.8.0/deepeval/optimizer/scorer/scorer.py +316 -0
  233. deepeval-3.8.0/deepeval/optimizer/scorer/utils.py +30 -0
  234. deepeval-3.8.0/deepeval/optimizer/types.py +148 -0
  235. deepeval-3.8.0/deepeval/optimizer/utils.py +480 -0
  236. deepeval-3.8.0/deepeval/prompt/__init__.py +21 -0
  237. deepeval-3.8.0/deepeval/prompt/api.py +234 -0
  238. deepeval-3.8.0/deepeval/prompt/prompt.py +837 -0
  239. deepeval-3.8.0/deepeval/prompt/utils.py +221 -0
  240. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/conversation_simulator.py +74 -20
  241. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/template.py +17 -2
  242. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/chunking/context_generator.py +217 -152
  243. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  244. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/config.py +9 -0
  245. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/schema.py +23 -0
  246. deepeval-3.8.0/deepeval/synthesizer/synthesizer.py +2751 -0
  247. deepeval-3.8.0/deepeval/synthesizer/templates/__init__.py +12 -0
  248. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/templates/template.py +554 -1
  249. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/templates/template_extraction.py +32 -0
  250. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/templates/template_prompt.py +262 -0
  251. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/telemetry.py +3 -3
  252. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/__init__.py +3 -4
  253. deepeval-3.8.0/deepeval/test_case/api.py +112 -0
  254. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/arena_test_case.py +21 -5
  255. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/conversational_test_case.py +68 -1
  256. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/llm_test_case.py +215 -2
  257. deepeval-3.8.0/deepeval/test_case/utils.py +20 -0
  258. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/__init__.py +3 -1
  259. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/api.py +22 -16
  260. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/cache.py +37 -13
  261. deepeval-3.8.0/deepeval/test_run/hyperparameters.py +109 -0
  262. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/test_run.py +437 -227
  263. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/__init__.py +3 -0
  264. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/api.py +11 -8
  265. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/context.py +4 -0
  266. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/otel/exporter.py +248 -176
  267. deepeval-3.8.0/deepeval/tracing/otel/test_exporter.py +35 -0
  268. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/otel/utils.py +258 -23
  269. deepeval-3.8.0/deepeval/tracing/patchers.py +190 -0
  270. deepeval-3.8.0/deepeval/tracing/trace_context.py +107 -0
  271. deepeval-3.8.0/deepeval/tracing/trace_test_manager.py +19 -0
  272. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/tracing.py +129 -23
  273. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/types.py +29 -11
  274. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/utils.py +68 -84
  275. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/utils.py +357 -11
  276. {deepeval-3.5.8 → deepeval-3.8.0}/pyproject.toml +26 -10
  277. deepeval-3.5.8/deepeval/_version.py +0 -1
  278. deepeval-3.5.8/deepeval/cli/main.py +0 -1629
  279. deepeval-3.5.8/deepeval/cli/utils.py +0 -181
  280. deepeval-3.5.8/deepeval/config/settings.py +0 -671
  281. deepeval-3.5.8/deepeval/dataset/__init__.py +0 -5
  282. deepeval-3.5.8/deepeval/dataset/api.py +0 -28
  283. deepeval-3.5.8/deepeval/dataset/golden.py +0 -60
  284. deepeval-3.5.8/deepeval/errors.py +0 -6
  285. deepeval-3.5.8/deepeval/evaluate/execute.py +0 -2242
  286. deepeval-3.5.8/deepeval/integrations/crewai/__init__.py +0 -4
  287. deepeval-3.5.8/deepeval/integrations/crewai/agent.py +0 -98
  288. deepeval-3.5.8/deepeval/integrations/crewai/handler.py +0 -124
  289. deepeval-3.5.8/deepeval/integrations/crewai/patch.py +0 -41
  290. deepeval-3.5.8/deepeval/integrations/langchain/callback.py +0 -345
  291. deepeval-3.5.8/deepeval/integrations/llama_index/__init__.py +0 -10
  292. deepeval-3.5.8/deepeval/integrations/llama_index/agent/patched.py +0 -68
  293. deepeval-3.5.8/deepeval/integrations/pydantic_ai/__init__.py +0 -5
  294. deepeval-3.5.8/deepeval/integrations/pydantic_ai/agent.py +0 -339
  295. deepeval-3.5.8/deepeval/integrations/pydantic_ai/patcher.py +0 -484
  296. deepeval-3.5.8/deepeval/integrations/pydantic_ai/utils.py +0 -323
  297. deepeval-3.5.8/deepeval/metrics/contextual_precision/template.py +0 -84
  298. deepeval-3.5.8/deepeval/metrics/contextual_recall/template.py +0 -75
  299. deepeval-3.5.8/deepeval/metrics/contextual_relevancy/template.py +0 -77
  300. deepeval-3.5.8/deepeval/metrics/faithfulness/template.py +0 -140
  301. deepeval-3.5.8/deepeval/metrics/knowledge_retention/schema.py +0 -15
  302. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/__init__.py +0 -24
  303. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -338
  304. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  305. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  306. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -288
  307. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  308. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  309. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -282
  310. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  311. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  312. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -279
  313. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  314. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -353
  315. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  316. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -379
  317. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  318. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  319. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  320. deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -285
  321. deepeval-3.5.8/deepeval/models/embedding_models/azure_embedding_model.py +0 -106
  322. deepeval-3.5.8/deepeval/models/embedding_models/local_embedding_model.py +0 -102
  323. deepeval-3.5.8/deepeval/models/embedding_models/ollama_embedding_model.py +0 -80
  324. deepeval-3.5.8/deepeval/models/llms/amazon_bedrock_model.py +0 -186
  325. deepeval-3.5.8/deepeval/models/llms/anthropic_model.py +0 -170
  326. deepeval-3.5.8/deepeval/models/llms/azure_model.py +0 -287
  327. deepeval-3.5.8/deepeval/models/llms/gemini_model.py +0 -232
  328. deepeval-3.5.8/deepeval/models/llms/grok_model.py +0 -237
  329. deepeval-3.5.8/deepeval/models/llms/kimi_model.py +0 -236
  330. deepeval-3.5.8/deepeval/models/llms/local_model.py +0 -130
  331. deepeval-3.5.8/deepeval/models/llms/ollama_model.py +0 -104
  332. deepeval-3.5.8/deepeval/models/llms/openai_model.py +0 -518
  333. deepeval-3.5.8/deepeval/models/llms/utils.py +0 -22
  334. deepeval-3.5.8/deepeval/models/mlllms/__init__.py +0 -3
  335. deepeval-3.5.8/deepeval/models/mlllms/gemini_model.py +0 -284
  336. deepeval-3.5.8/deepeval/models/mlllms/ollama_model.py +0 -144
  337. deepeval-3.5.8/deepeval/models/mlllms/openai_model.py +0 -258
  338. deepeval-3.5.8/deepeval/models/utils.py +0 -31
  339. deepeval-3.5.8/deepeval/openai/__init__.py +0 -37
  340. deepeval-3.5.8/deepeval/openai/patch.py +0 -204
  341. deepeval-3.5.8/deepeval/openai/utils.py +0 -86
  342. deepeval-3.5.8/deepeval/openai_agents/agent.py +0 -194
  343. deepeval-3.5.8/deepeval/openai_agents/callback_handler.py +0 -134
  344. deepeval-3.5.8/deepeval/openai_agents/patch.py +0 -115
  345. deepeval-3.5.8/deepeval/openai_agents/runner.py +0 -335
  346. deepeval-3.5.8/deepeval/prompt/__init__.py +0 -3
  347. deepeval-3.5.8/deepeval/prompt/api.py +0 -70
  348. deepeval-3.5.8/deepeval/prompt/prompt.py +0 -434
  349. deepeval-3.5.8/deepeval/prompt/utils.py +0 -50
  350. deepeval-3.5.8/deepeval/synthesizer/synthesizer.py +0 -1502
  351. deepeval-3.5.8/deepeval/synthesizer/templates/__init__.py +0 -3
  352. deepeval-3.5.8/deepeval/test_case/mllm_test_case.py +0 -147
  353. deepeval-3.5.8/deepeval/test_case/utils.py +0 -24
  354. deepeval-3.5.8/deepeval/test_run/hyperparameters.py +0 -66
  355. deepeval-3.5.8/deepeval/tracing/patchers.py +0 -84
  356. {deepeval-3.5.8 → deepeval-3.8.0}/LICENSE.md +0 -0
  357. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/annotation/__init__.py +0 -0
  358. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/annotation/annotation.py +0 -0
  359. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/annotation/api.py +0 -0
  360. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/__init__.py +0 -0
  361. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/__init__.py +0 -0
  362. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/arc.py +0 -0
  363. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/mode.py +0 -0
  364. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/template.py +0 -0
  365. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/base_benchmark.py +0 -0
  366. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/__init__.py +0 -0
  367. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/bbq.py +0 -0
  368. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/task.py +0 -0
  369. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/template.py +0 -0
  370. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  371. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
  372. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  373. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  374. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  375. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  376. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  377. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  378. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  379. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  380. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  381. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  382. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  383. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  384. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  385. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  386. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  387. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  388. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  389. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  390. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  391. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  392. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  393. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  394. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  395. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  396. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  397. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  398. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  399. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  400. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  401. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  402. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  403. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  404. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  405. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  406. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  407. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  408. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  409. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  410. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  411. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  412. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  413. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  414. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  415. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  416. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  417. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  418. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  419. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  420. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  421. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  422. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  423. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  424. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  425. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  426. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  427. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  428. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  429. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  430. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  431. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
  432. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bool_q/template.py +0 -0
  433. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/__init__.py +0 -0
  434. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/task.py +0 -0
  435. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/template.py +0 -0
  436. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  437. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  438. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  439. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  440. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
  441. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/template.py +0 -0
  442. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  443. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
  444. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/task.py +0 -0
  445. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/template.py +0 -0
  446. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  447. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/task.py +0 -0
  448. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/template.py +0 -0
  449. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  450. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/ifeval/template.py +0 -0
  451. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/lambada/__init__.py +0 -0
  452. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/lambada/lambada.py +0 -0
  453. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/lambada/template.py +0 -0
  454. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  455. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
  456. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/task.py +0 -0
  457. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/template.py +0 -0
  458. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  459. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
  460. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/task.py +0 -0
  461. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/template.py +0 -0
  462. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  463. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/task.py +0 -0
  464. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/template.py +0 -0
  465. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/modes/__init__.py +0 -0
  466. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/results.py +0 -0
  467. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/schema.py +0 -0
  468. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/__init__.py +0 -0
  469. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/squad.py +0 -0
  470. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/task.py +0 -0
  471. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/template.py +0 -0
  472. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/tasks/__init__.py +0 -0
  473. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  474. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  475. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  476. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  477. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
  478. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/utils.py +0 -0
  479. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  480. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/winogrande/template.py +0 -0
  481. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
  482. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/__init__.py +0 -0
  483. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/dotenv_handler.py +0 -0
  484. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/server.py +0 -0
  485. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/types.py +0 -0
  486. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/confident/__init__.py +0 -0
  487. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/config/__init__.py +0 -0
  488. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/types.py +0 -0
  489. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/__init__.py +0 -0
  490. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/api.py +0 -0
  491. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/__init__.py +0 -0
  492. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/__init__.py +0 -0
  493. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/callback.py +0 -0
  494. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  495. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  496. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/utils.py +0 -0
  497. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/langchain/__init__.py +0 -0
  498. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/langchain/patch.py +0 -0
  499. deepeval-3.5.8/deepeval/metrics/argument_correctness/__init__.py → deepeval-3.8.0/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  500. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  501. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  502. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  503. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  504. {deepeval-3.5.8/deepeval/metrics/conversation_completeness → deepeval-3.8.0/deepeval/metrics/argument_correctness}/__init__.py +0 -0
  505. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/argument_correctness/schema.py +0 -0
  506. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/__init__.py +0 -0
  507. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/schema.py +0 -0
  508. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  509. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_precision/schema.py +0 -0
  510. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  511. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_recall/schema.py +0 -0
  512. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  513. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  514. {deepeval-3.5.8/deepeval/metrics/conversational_g_eval → deepeval-3.8.0/deepeval/metrics/conversation_completeness}/__init__.py +0 -0
  515. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  516. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/__init__.py +0 -0
  517. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  518. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/__init__.py +0 -0
  519. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/graph.py +0 -0
  520. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/utils.py +0 -0
  521. {deepeval-3.5.8/deepeval/metrics/json_correctness → deepeval-3.8.0/deepeval/metrics/exact_match}/__init__.py +0 -0
  522. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/faithfulness/__init__.py +0 -0
  523. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/__init__.py +0 -0
  524. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/schema.py +0 -0
  525. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/__init__.py +0 -0
  526. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/schema.py +0 -0
  527. {deepeval-3.5.8/deepeval/metrics/knowledge_retention → deepeval-3.8.0/deepeval/metrics/json_correctness}/__init__.py +0 -0
  528. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/json_correctness/schema.py +0 -0
  529. {deepeval-3.5.8/deepeval/metrics/mcp → deepeval-3.8.0/deepeval/metrics/knowledge_retention}/__init__.py +0 -0
  530. {deepeval-3.5.8/deepeval/metrics/mcp_use_metric → deepeval-3.8.0/deepeval/metrics/mcp}/__init__.py +0 -0
  531. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_coherence → deepeval-3.8.0/deepeval/metrics/mcp_use_metric}/__init__.py +0 -0
  532. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  533. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/__init__.py +0 -0
  534. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/schema.py +0 -0
  535. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_editing → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_coherence}/__init__.py +0 -0
  536. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  537. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  538. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_helpfulness → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_editing}/__init__.py +0 -0
  539. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  540. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  541. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_reference → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_helpfulness}/__init__.py +0 -0
  542. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  543. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  544. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_reference}/__init__.py +0 -0
  545. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  546. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  547. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/text_to_image}/__init__.py +0 -0
  548. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  549. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  550. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/__init__.py +0 -0
  551. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/schema.py +0 -0
  552. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall → deepeval-3.8.0/deepeval/metrics/pattern_match}/__init__.py +0 -0
  553. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  554. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/schema.py +0 -0
  555. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.8.0/deepeval/metrics/prompt_alignment}/__init__.py +0 -0
  556. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  557. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.8.0/deepeval/metrics/role_adherence}/__init__.py +0 -0
  558. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_adherence/schema.py +0 -0
  559. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/__init__.py +0 -0
  560. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/schema.py +0 -0
  561. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/__init__.py +0 -0
  562. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/schema.py +0 -0
  563. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval → deepeval-3.8.0/deepeval/metrics/task_completion}/__init__.py +0 -0
  564. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/task_completion/schema.py +0 -0
  565. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/task_completion/template.py +0 -0
  566. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness → deepeval-3.8.0/deepeval/metrics/tool_correctness}/__init__.py +0 -0
  567. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/__init__.py +0 -0
  568. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/schema.py +0 -0
  569. {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/text_to_image → deepeval-3.8.0/deepeval/metrics/turn_contextual_precision}/__init__.py +0 -0
  570. {deepeval-3.5.8/deepeval/metrics/prompt_alignment → deepeval-3.8.0/deepeval/metrics/turn_contextual_recall}/__init__.py +0 -0
  571. {deepeval-3.5.8/deepeval/metrics/role_adherence → deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy}/__init__.py +0 -0
  572. {deepeval-3.5.8/deepeval/metrics/task_completion → deepeval-3.8.0/deepeval/metrics/turn_faithfulness}/__init__.py +0 -0
  573. {deepeval-3.5.8/deepeval/metrics/tool_correctness → deepeval-3.8.0/deepeval/metrics/turn_relevancy}/__init__.py +0 -0
  574. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  575. {deepeval-3.5.8/deepeval/metrics/turn_relevancy → deepeval-3.8.0/deepeval/model_integrations}/__init__.py +0 -0
  576. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/_summac_model.py +0 -0
  577. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/answer_relevancy_model.py +0 -0
  578. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/detoxify_model.py +0 -0
  579. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/hallucination_model.py +0 -0
  580. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/summac_model.py +0 -0
  581. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/unbias_model.py +0 -0
  582. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/plugins/__init__.py +0 -0
  583. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/plugins/plugin.py +0 -0
  584. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/progress_context.py +0 -0
  585. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/py.typed +0 -0
  586. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/red_teaming/README.md +0 -0
  587. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/scorer/__init__.py +0 -0
  588. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/scorer/scorer.py +0 -0
  589. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/__init__.py +0 -0
  590. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/schema.py +0 -0
  591. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/singleton.py +0 -0
  592. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/__init__.py +0 -0
  593. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/base_synthesizer.py +0 -0
  594. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/chunking/__init__.py +0 -0
  595. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/types.py +0 -0
  596. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/utils.py +0 -0
  597. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/mcp.py +0 -0
  598. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/hooks.py +0 -0
  599. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/__init__.py +0 -0
  600. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/api.py +0 -0
  601. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/span.py +0 -0
  602. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/thread.py +0 -0
  603. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/trace.py +0 -0
  604. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/otel/__init__.py +0 -0
  605. {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/perf_epoch_bridge.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.5.8
3
+ Version: 3.8.0
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -13,26 +13,23 @@ Classifier: Programming Language :: Python :: 3.9
13
13
  Classifier: Programming Language :: Python :: 3.10
14
14
  Classifier: Programming Language :: Python :: 3.11
15
15
  Requires-Dist: aiohttp
16
- Requires-Dist: anthropic
17
16
  Requires-Dist: click (>=8.0.0,<8.3.0)
18
- Requires-Dist: google-genai (>=1.9.0,<2.0.0)
19
17
  Requires-Dist: grpcio (>=1.67.1,<2.0.0)
20
18
  Requires-Dist: jinja2
21
19
  Requires-Dist: nest_asyncio
22
- Requires-Dist: ollama
23
20
  Requires-Dist: openai
24
21
  Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
25
22
  Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
26
23
  Requires-Dist: opentelemetry-sdk (>=1.24.0,<2.0.0)
27
24
  Requires-Dist: portalocker
28
- Requires-Dist: posthog (>=6.3.0,<7.0.0)
25
+ Requires-Dist: posthog (>=5.4.0,<6.0.0)
29
26
  Requires-Dist: pydantic (>=2.11.7,<3.0.0)
30
27
  Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
31
28
  Requires-Dist: pyfiglet
32
29
  Requires-Dist: pytest
33
30
  Requires-Dist: pytest-asyncio
34
31
  Requires-Dist: pytest-repeat
35
- Requires-Dist: pytest-rerunfailures (>=12.0,<13.0)
32
+ Requires-Dist: pytest-rerunfailures
36
33
  Requires-Dist: pytest-xdist
37
34
  Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
38
35
  Requires-Dist: requests (>=2.31.0,<3.0.0)
@@ -103,9 +100,9 @@ Description-Content-Type: text/markdown
103
100
  <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
104
101
  </p>
105
102
 
106
- **DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, hallucination, answer relevancy, RAGAS, etc., which uses LLMs and various other NLP models that runs **locally on your machine** for evaluation.
103
+ **DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which uses LLM-as-a-judge and other NLP models that run **locally on your machine** for evaluation.
107
104
 
108
- Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemented via LangChain or LlamaIndex, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
105
+ Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
109
106
 
110
107
  > [!IMPORTANT]
111
108
  > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.
@@ -118,10 +115,10 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
118
115
 
119
116
  # 🔥 Metrics and Features
120
117
 
121
- > 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)'s infrastructure
118
+ > 🥳 You can now share DeepEval's test results on the cloud, directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)
122
119
 
123
120
  - Supports both end-to-end and component-level LLM evaluation.
124
- - Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that runs **locally on your machine**:
121
+ - Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that run **locally on your machine**:
125
122
  - G-Eval
126
123
  - DAG ([deep acyclic graph](https://deepeval.com/docs/metrics-dag))
127
124
  - **RAG metrics:**
@@ -161,7 +158,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
161
158
  - TruthfulQA
162
159
  - HumanEval
163
160
  - GSM8K
164
- - [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation lifecycle:
161
+ - [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation & observability lifecycle:
165
162
  - Curate/annotate evaluation datasets on the cloud
166
163
  - Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
167
164
  - Fine-tune metrics for custom results
@@ -170,7 +167,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
170
167
  - Repeat until perfection
171
168
 
172
169
  > [!NOTE]
173
- > Confident AI is the DeepEval platform. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)
170
+ > DeepEval is available on Confident AI, an LLM evals platform for AI observability and quality. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)
174
171
 
175
172
  <br />
176
173
 
@@ -359,7 +356,7 @@ for golden in dataset.goldens:
359
356
 
360
357
  @pytest.mark.parametrize(
361
358
  "test_case",
362
- dataset,
359
+ dataset.test_cases,
363
360
  )
364
361
  def test_customer_chatbot(test_case: LLMTestCase):
365
362
  answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
@@ -397,7 +394,7 @@ cp .env.example .env.local
397
394
 
398
395
  # DeepEval With Confident AI
399
396
 
400
- DeepEval's cloud platform, [Confident AI](https://confident-ai.com?utm_source=Github), allows you to:
397
+ DeepEval is available on [Confident AI](https://confident-ai.com?utm_source=Github), an evals & observability platform that allows you to:
401
398
 
402
399
  1. Curate/annotate evaluation datasets on the cloud
403
400
  2. Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
@@ -439,6 +436,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
439
436
  ```bash
440
437
  cp .env.example .env.local
441
438
  # then edit .env.local (ignored by git)
439
+ ```
442
440
 
443
441
  <br />
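To make the metric list in the README above concrete, here is a minimal, hedged sketch of scoring a single test case with one of the ready-to-use metrics. The input/output strings are invented for illustration, and an evaluation model (for example an OpenAI key loaded from `.env`) is assumed to be configured.

```python
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Illustrative test case; in practice actual_output comes from your LLM app.
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
)
metric = AnswerRelevancyMetric(threshold=0.5)

# Score a single case directly and inspect the verdict...
metric.measure(test_case)
print(metric.score, metric.reason)

# ...or run a full evaluation pass over a list of cases.
evaluate(test_cases=[test_case], metrics=[metric])
```

The same pattern applies to G-Eval, the RAG metrics, and the other metrics listed above.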
444
442
 
@@ -53,9 +53,9 @@
53
53
  <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
54
54
  </p>
55
55
 
56
- **DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, hallucination, answer relevancy, RAGAS, etc., which uses LLMs and various other NLP models that runs **locally on your machine** for evaluation.
56
+ **DeepEval** is a simple-to-use, open-source LLM evaluation framework for evaluating and testing large language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which use LLM-as-a-judge and other NLP models that run **locally on your machine** for evaluation.
57
57
 
58
- Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemented via LangChain or LlamaIndex, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
58
+ Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipelines and agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
59
59
 
60
60
  > [!IMPORTANT]
61
61
  > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.
@@ -68,10 +68,10 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
68
68
 
69
69
  # 🔥 Metrics and Features
70
70
 
71
- > 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)'s infrastructure
71
+ > 🥳 You can now share DeepEval's test results on the cloud, directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)
72
72
 
73
73
  - Supports both end-to-end and component-level LLM evaluation.
74
- - Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that runs **locally on your machine**:
74
+ - Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that run **locally on your machine**:
75
75
  - G-Eval
76
76
  - DAG ([deep acyclic graph](https://deepeval.com/docs/metrics-dag))
77
77
  - **RAG metrics:**
@@ -111,7 +111,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
111
111
  - TruthfulQA
112
112
  - HumanEval
113
113
  - GSM8K
114
- - [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation lifecycle:
114
+ - [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation & observability lifecycle:
115
115
  - Curate/annotate evaluation datasets on the cloud
116
116
  - Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
117
117
  - Fine-tune metrics for custom results
@@ -120,7 +120,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
120
120
  - Repeat until perfection
121
121
 
122
122
  > [!NOTE]
123
- > Confident AI is the DeepEval platform. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)
123
+ > DeepEval is available on Confident AI, an LLM evals platform for AI observability and quality. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)
124
124
 
125
125
  <br />
126
126
 
@@ -309,7 +309,7 @@ for golden in dataset.goldens:
309
309
 
310
310
  @pytest.mark.parametrize(
311
311
  "test_case",
312
- dataset,
312
+ dataset.test_cases,
313
313
  )
314
314
  def test_customer_chatbot(test_case: LLMTestCase):
315
315
  answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
@@ -347,7 +347,7 @@ cp .env.example .env.local
347
347
 
348
348
  # DeepEval With Confident AI
349
349
 
350
- DeepEval's cloud platform, [Confident AI](https://confident-ai.com?utm_source=Github), allows you to:
350
+ DeepEval is available on [Confident AI](https://confident-ai.com?utm_source=Github), an evals & observability platform that allows you to:
351
351
 
352
352
  1. Curate/annotate evaluation datasets on the cloud
353
353
  2. Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
@@ -389,6 +389,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
389
389
  ```bash
390
390
  cp .env.example .env.local
391
391
  # then edit .env.local (ignored by git)
392
+ ```
392
393
 
393
394
  <br />
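The `dataset` → `dataset.test_cases` change in the `@pytest.mark.parametrize` hunks above is easy to miss, so the sketch below spells out the updated pattern end to end. The golden and the hard-coded answer are placeholders; in practice `actual_output` comes from your own LLM app.

```python
import pytest
from deepeval import assert_test
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# A one-golden dataset; real goldens are usually curated or pulled elsewhere.
dataset = EvaluationDataset(goldens=[Golden(input="What's your refund policy?")])

for golden in dataset.goldens:
    # Replace the hard-coded string with a call to your actual LLM app.
    dataset.add_test_case(
        LLMTestCase(input=golden.input, actual_output="Refunds within 30 days.")
    )


# Parametrize over dataset.test_cases, not the dataset object itself.
@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_customer_chatbot(test_case: LLMTestCase):
    assert_test(test_case, [AnswerRelevancyMetric(threshold=0.5)])
```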
394
395
 
@@ -1,24 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
1
4
  import os
2
- import warnings
3
5
  import re
6
+ import warnings
4
7
 
5
- # load environment variables before other imports
8
+ # IMPORTANT: load environment variables before other imports
6
9
  from deepeval.config.settings import autoload_dotenv, get_settings
7
10
 
11
+ logging.getLogger("deepeval").addHandler(logging.NullHandler())
8
12
  autoload_dotenv()
9
13
 
10
- from ._version import __version__
11
- from deepeval.evaluate import evaluate, assert_test
12
- from deepeval.evaluate.compare import compare
13
- from deepeval.test_run import on_test_run_end, log_hyperparameters
14
- from deepeval.utils import login
15
- from deepeval.telemetry import *
14
+
15
+ def _expose_public_api() -> None:
16
+ # All other imports must happen after env is loaded
17
+ # Do not do this at module level or ruff will complain with E402
18
+ global __version__, evaluate, assert_test, compare
19
+ global on_test_run_end, log_hyperparameters, login, telemetry
20
+
21
+ from ._version import __version__ as _version
22
+ from deepeval.evaluate import (
23
+ evaluate as _evaluate,
24
+ assert_test as _assert_test,
25
+ )
26
+ from deepeval.evaluate.compare import compare as _compare
27
+ from deepeval.test_run import (
28
+ on_test_run_end as _on_end,
29
+ log_hyperparameters as _log_hparams,
30
+ )
31
+ from deepeval.utils import login as _login
32
+ import deepeval.telemetry as _telemetry
33
+
34
+ __version__ = _version
35
+ evaluate = _evaluate
36
+ assert_test = _assert_test
37
+ compare = _compare
38
+ on_test_run_end = _on_end
39
+ log_hyperparameters = _log_hparams
40
+ login = _login
41
+ telemetry = _telemetry
42
+
43
+
44
+ _expose_public_api()
16
45
 
17
46
 
18
47
  settings = get_settings()
48
+
19
49
  if not settings.DEEPEVAL_GRPC_LOGGING:
20
- os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
21
- os.environ.setdefault("GRPC_TRACE", "")
50
+ if os.getenv("GRPC_VERBOSITY") is None:
51
+ os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
52
+ if os.getenv("GRPC_TRACE") is None:
53
+ os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""
22
54
 
23
55
 
24
56
  __all__ = [
@@ -70,9 +102,5 @@ def update_warning_opt_in():
70
102
  return os.getenv("DEEPEVAL_UPDATE_WARNING_OPT_IN") == "1"
71
103
 
72
104
 
73
- def is_read_only_env():
74
- return os.getenv("DEEPEVAL_FILE_SYSTEM") == "READ_ONLY"
75
-
76
-
77
105
  if update_warning_opt_in():
78
106
  check_for_update()
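For context on the reworked `deepeval/__init__.py` above: `autoload_dotenv()` runs before anything else, then `_expose_public_api()` rebinds the usual top-level names, so nothing changes for callers. A small sketch of typical usage under that assumption:

```python
import deepeval
from deepeval import assert_test, compare, evaluate, login

# Environment variables were loaded by autoload_dotenv() at import time,
# and _expose_public_api() re-exported the public entry points.
print(deepeval.__version__)  # "3.8.0"
print(all(callable(f) for f in (evaluate, assert_test, compare, login)))
```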
@@ -0,0 +1 @@
1
+ __version__: str = "3.8.0"
@@ -0,0 +1,19 @@
1
+ try:
2
+ import anthropic # noqa: F401
3
+ except ImportError:
4
+ raise ModuleNotFoundError(
5
+ "Please install anthropic to use this feature: 'pip install anthropic'"
6
+ )
7
+
8
+ try:
9
+ from anthropic import Anthropic, AsyncAnthropic # noqa: F401
10
+ except ImportError:
11
+ Anthropic = None # type: ignore
12
+ AsyncAnthropic = None # type: ignore
13
+
14
+ if Anthropic or AsyncAnthropic:
15
+ from deepeval.anthropic.patch import patch_anthropic_classes
16
+ from deepeval.telemetry import capture_tracing_integration
17
+
18
+ with capture_tracing_integration("anthropic"):
19
+ patch_anthropic_classes()
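A hedged usage sketch for the new `deepeval.anthropic` subpackage: importing it patches `Messages.create` (and its async counterpart) so each call is traced as an LLM span. The model name and prompt below are placeholders, and `ANTHROPIC_API_KEY` is assumed to be set in the environment.

```python
import deepeval.anthropic  # side effect: patch_anthropic_classes()
from anthropic import Anthropic

client = Anthropic()  # reads ANTHROPIC_API_KEY from the environment
response = client.messages.create(  # now traced as an "llm" span
    model="claude-3-5-sonnet-latest",
    max_tokens=256,
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.content[0].text)
```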
@@ -0,0 +1,94 @@
1
+ from anthropic.types.message import Message
2
+ from anthropic.types import ToolUseBlock
3
+ from typing import Any, Dict
4
+
5
+ from deepeval.anthropic.utils import (
6
+ render_messages_anthropic,
7
+ stringify_anthropic_content,
8
+ )
9
+ from deepeval.model_integrations.types import InputParameters, OutputParameters
10
+ from deepeval.test_case.llm_test_case import ToolCall
11
+
12
+
13
+ def safe_extract_input_parameters(kwargs: Dict[str, Any]) -> InputParameters:
14
+ # guarding against errors to be compatible with legacy APIs
15
+ try:
16
+ return extract_messages_api_input_parameters(kwargs)
17
+ except Exception:
18
+ return InputParameters(model="NA")
19
+
20
+
21
+ def extract_messages_api_input_parameters(
22
+ kwargs: Dict[str, Any],
23
+ ) -> InputParameters:
24
+ model = kwargs.get("model")
25
+ tools = kwargs.get("tools")
26
+ messages = kwargs.get("messages")
27
+ tool_descriptions = (
28
+ {tool["name"]: tool["description"] for tool in tools}
29
+ if tools is not None
30
+ else None
31
+ )
32
+
33
+ input_argument = ""
34
+ user_messages = []
35
+ for message in messages:
36
+ role = message["role"]
37
+ if role == "user":
38
+ user_messages.append(message["content"])
39
+ if len(user_messages) > 0:
40
+ input_argument = user_messages[0]
41
+
42
+ return InputParameters(
43
+ model=model,
44
+ input=stringify_anthropic_content(input_argument),
45
+ messages=render_messages_anthropic(messages),
46
+ tools=tools,
47
+ tool_descriptions=tool_descriptions,
48
+ )
49
+
50
+
51
+ def safe_extract_output_parameters(
52
+ message_response: Message,
53
+ input_parameters: InputParameters,
54
+ ) -> OutputParameters:
55
+ # guarding against errors to be compatible with legacy APIs
56
+ try:
57
+ return extract_messages_api_output_parameters(
58
+ message_response, input_parameters
59
+ )
60
+ except Exception:
61
+ return OutputParameters()
62
+
63
+
64
+ def extract_messages_api_output_parameters(
65
+ message_response: Message,
66
+ input_parameters: InputParameters,
67
+ ) -> OutputParameters:
68
+ output = str(message_response.content[0].text)
69
+ prompt_tokens = message_response.usage.input_tokens
70
+ completion_tokens = message_response.usage.output_tokens
71
+
72
+ tools_called = None
73
+ anthropic_tool_calls = [
74
+ block
75
+ for block in message_response.content
76
+ if isinstance(block, ToolUseBlock)
77
+ ]
78
+ if anthropic_tool_calls:
79
+ tools_called = []
80
+ tool_descriptions = input_parameters.tool_descriptions or {}
81
+ for tool_call in anthropic_tool_calls:
82
+ tools_called.append(
83
+ ToolCall(
84
+ name=tool_call.name,
85
+ input_parameters=tool_call.input,
86
+ description=tool_descriptions.get(tool_call.name),
87
+ )
88
+ )
89
+ return OutputParameters(
90
+ output=output,
91
+ prompt_tokens=prompt_tokens,
92
+ completion_tokens=completion_tokens,
93
+ tools_called=tools_called,
94
+ )
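To show what the extractors above pull out of a `Messages.create` call, here is a small sketch with made-up kwargs; the model name and tool schema are illustrative only, and the printed values reflect the fields populated by `extract_messages_api_input_parameters`.

```python
from deepeval.anthropic.extractors import safe_extract_input_parameters

kwargs = {
    "model": "claude-3-5-sonnet-latest",
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [
        {
            "name": "get_weather",
            "description": "Look up current weather for a city.",
            "input_schema": {"type": "object", "properties": {}},
        }
    ],
}

params = safe_extract_input_parameters(kwargs)
print(params.model)              # "claude-3-5-sonnet-latest"
print(params.input)              # first user message, stringified
print(params.tool_descriptions)  # {"get_weather": "Look up current weather for a city."}
```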
@@ -0,0 +1,169 @@
1
+ from typing import Callable
2
+ from functools import wraps
3
+
4
+ from deepeval.anthropic.extractors import (
5
+ safe_extract_input_parameters,
6
+ safe_extract_output_parameters,
7
+ InputParameters,
8
+ )
9
+ from deepeval.model_integrations.utils import _update_all_attributes
10
+ from deepeval.tracing import observe
11
+ from deepeval.tracing.trace_context import current_llm_context
12
+
13
+ _ORIGINAL_METHODS = {}
14
+ _ANTHROPIC_PATCHED = False
15
+
16
+
17
+ def patch_anthropic_classes():
18
+ """
19
+ Monkey patch Anthropic resource classes directly.
20
+ """
21
+ global _ANTHROPIC_PATCHED
22
+
23
+ # Single guard - if already patched, return immediately
24
+ if _ANTHROPIC_PATCHED:
25
+ return
26
+
27
+ try:
28
+ from anthropic.resources.messages import Messages, AsyncMessages
29
+
30
+ # Store original methods before patching
31
+ if hasattr(Messages, "create"):
32
+ _ORIGINAL_METHODS["Messages.create"] = Messages.create
33
+ Messages.create = _create_sync_wrapper(Messages.create)
34
+
35
+ if hasattr(AsyncMessages, "create"):
36
+ _ORIGINAL_METHODS["AsyncMessages.create"] = AsyncMessages.create
37
+ AsyncMessages.create = _create_async_wrapper(AsyncMessages.create)
38
+
39
+ except ImportError:
40
+ pass
41
+
42
+ _ANTHROPIC_PATCHED = True
43
+
44
+
45
+ def _create_sync_wrapper(original_method):
46
+ """
47
+ Create a wrapper for sync methods - called ONCE during patching.
48
+ """
49
+
50
+ @wraps(original_method)
51
+ def method_wrapper(self, *args, **kwargs):
52
+ bound_method = original_method.__get__(self, type(self))
53
+ patched = _patch_sync_anthropic_client_method(
54
+ original_method=bound_method
55
+ )
56
+ return patched(*args, **kwargs)
57
+
58
+ return method_wrapper
59
+
60
+
61
+ def _create_async_wrapper(original_method):
62
+ """
63
+ Create a wrapper for async methods - called ONCE during patching.
64
+ """
65
+
66
+ @wraps(original_method)
67
+ def method_wrapper(self, *args, **kwargs):
68
+ bound_method = original_method.__get__(self, type(self))
69
+ patched = _patch_async_anthropic_client_method(
70
+ original_method=bound_method
71
+ )
72
+ return patched(*args, **kwargs)
73
+
74
+ return method_wrapper
75
+
76
+
77
+ def _patch_sync_anthropic_client_method(original_method: Callable):
78
+ @wraps(original_method)
79
+ def patched_sync_anthropic_method(*args, **kwargs):
80
+ input_parameters: InputParameters = safe_extract_input_parameters(
81
+ kwargs
82
+ )
83
+ llm_context = current_llm_context.get()
84
+
85
+ @observe(
86
+ type="llm",
87
+ model=input_parameters.model,
88
+ metrics=llm_context.metrics,
89
+ metric_collection=llm_context.metric_collection,
90
+ )
91
+ def llm_generation(*args, **kwargs):
92
+ messages_api_response = original_method(*args, **kwargs)
93
+ output_parameters = safe_extract_output_parameters(
94
+ messages_api_response, input_parameters
95
+ )
96
+ _update_all_attributes(
97
+ input_parameters,
98
+ output_parameters,
99
+ llm_context.expected_tools,
100
+ llm_context.expected_output,
101
+ llm_context.context,
102
+ llm_context.retrieval_context,
103
+ )
104
+ return messages_api_response
105
+
106
+ return llm_generation(*args, **kwargs)
107
+
108
+ return patched_sync_anthropic_method
109
+
110
+
111
+ def _patch_async_anthropic_client_method(original_method: Callable):
112
+ @wraps(original_method)
113
+ async def patched_async_anthropic_method(*args, **kwargs):
114
+ input_parameters: InputParameters = safe_extract_input_parameters(
115
+ kwargs
116
+ )
117
+ llm_context = current_llm_context.get()
118
+
119
+ @observe(
120
+ type="llm",
121
+ model=input_parameters.model,
122
+ metrics=llm_context.metrics,
123
+ metric_collection=llm_context.metric_collection,
124
+ )
125
+ async def llm_generation(*args, **kwargs):
126
+ messages_api_response = await original_method(*args, **kwargs)
127
+ output_parameters = safe_extract_output_parameters(
128
+ messages_api_response, input_parameters
129
+ )
130
+ _update_all_attributes(
131
+ input_parameters,
132
+ output_parameters,
133
+ llm_context.expected_tools,
134
+ llm_context.expected_output,
135
+ llm_context.context,
136
+ llm_context.retrieval_context,
137
+ )
138
+ return messages_api_response
139
+
140
+ return await llm_generation(*args, **kwargs)
141
+
142
+ return patched_async_anthropic_method
143
+
144
+
145
+ def unpatch_anthropic_classes():
146
+ """
147
+ Restore Anthropic resource classes to their original state.
148
+ """
149
+ global _ANTHROPIC_PATCHED
150
+
151
+ # If not patched, nothing to do
152
+ if not _ANTHROPIC_PATCHED:
153
+ return
154
+
155
+ try:
156
+ from anthropic.resources.messages import Messages, AsyncMessages
157
+
158
+ # Restore original methods for Messages
159
+ if hasattr(Messages, "create"):
160
+ Messages.create = _ORIGINAL_METHODS["Messages.create"]
161
+
162
+ if hasattr(AsyncMessages, "create"):
163
+ AsyncMessages.create = _ORIGINAL_METHODS["AsyncMessages.create"]
164
+
165
+ except ImportError:
166
+ pass
167
+
168
+ # Reset the patched flag
169
+ _ANTHROPIC_PATCHED = False
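Finally, a hedged sketch of the patch lifecycle defined above: patch, call the Anthropic client inside an `@observe`-decorated component so the traced LLM span nests under it, then unpatch to restore the original methods. The span type, model, and prompt are assumptions for illustration.

```python
from anthropic import Anthropic
from deepeval.anthropic.patch import (
    patch_anthropic_classes,
    unpatch_anthropic_classes,
)
from deepeval.tracing import observe

patch_anthropic_classes()  # idempotent: guarded by _ANTHROPIC_PATCHED


@observe(type="agent")
def answer(question: str) -> str:
    client = Anthropic()
    response = client.messages.create(  # wrapped: recorded as an "llm" span
        model="claude-3-5-sonnet-latest",
        max_tokens=128,
        messages=[{"role": "user", "content": question}],
    )
    return response.content[0].text


answer("Summarize what monkey patching is in one sentence.")
unpatch_anthropic_classes()  # restore the original Messages.create methods
```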