deepeval 2.0.4__tar.gz → 3.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. deepeval-3.8.0/PKG-INFO +473 -0
  2. deepeval-3.8.0/README.md +425 -0
  3. deepeval-3.8.0/deepeval/__init__.py +106 -0
  4. deepeval-3.8.0/deepeval/_version.py +1 -0
  5. deepeval-3.8.0/deepeval/annotation/__init__.py +3 -0
  6. deepeval-3.8.0/deepeval/annotation/annotation.py +76 -0
  7. deepeval-3.8.0/deepeval/annotation/api.py +54 -0
  8. deepeval-3.8.0/deepeval/anthropic/__init__.py +19 -0
  9. deepeval-3.8.0/deepeval/anthropic/extractors.py +94 -0
  10. deepeval-3.8.0/deepeval/anthropic/patch.py +169 -0
  11. deepeval-3.8.0/deepeval/anthropic/utils.py +225 -0
  12. deepeval-3.8.0/deepeval/benchmarks/__init__.py +37 -0
  13. deepeval-3.8.0/deepeval/benchmarks/arc/arc.py +182 -0
  14. deepeval-3.8.0/deepeval/benchmarks/arc/mode.py +6 -0
  15. deepeval-3.8.0/deepeval/benchmarks/arc/template.py +94 -0
  16. deepeval-3.8.0/deepeval/benchmarks/base_benchmark.py +32 -0
  17. deepeval-3.8.0/deepeval/benchmarks/bbq/bbq.py +218 -0
  18. deepeval-3.8.0/deepeval/benchmarks/bbq/task.py +15 -0
  19. deepeval-3.8.0/deepeval/benchmarks/bbq/template.py +163 -0
  20. deepeval-3.8.0/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +329 -0
  21. deepeval-3.8.0/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +30 -0
  22. deepeval-3.8.0/deepeval/benchmarks/big_bench_hard/template.py +42 -0
  23. deepeval-3.8.0/deepeval/benchmarks/bool_q/bool_q.py +165 -0
  24. deepeval-3.8.0/deepeval/benchmarks/bool_q/template.py +32 -0
  25. deepeval-3.8.0/deepeval/benchmarks/drop/drop.py +363 -0
  26. deepeval-3.8.0/deepeval/benchmarks/drop/template.py +40 -0
  27. deepeval-3.8.0/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +166 -0
  28. deepeval-3.8.0/deepeval/benchmarks/equity_med_qa/task.py +13 -0
  29. deepeval-3.8.0/deepeval/benchmarks/equity_med_qa/template.py +7 -0
  30. deepeval-3.8.0/deepeval/benchmarks/gsm8k/gsm8k.py +210 -0
  31. deepeval-3.8.0/deepeval/benchmarks/hellaswag/hellaswag.py +324 -0
  32. deepeval-3.8.0/deepeval/benchmarks/hellaswag/task.py +196 -0
  33. deepeval-3.8.0/deepeval/benchmarks/human_eval/human_eval.py +247 -0
  34. deepeval-3.8.0/deepeval/benchmarks/ifeval/ifeval.py +619 -0
  35. deepeval-3.8.0/deepeval/benchmarks/ifeval/template.py +131 -0
  36. deepeval-3.8.0/deepeval/benchmarks/lambada/lambada.py +165 -0
  37. deepeval-3.8.0/deepeval/benchmarks/lambada/template.py +64 -0
  38. deepeval-3.8.0/deepeval/benchmarks/logi_qa/logi_qa.py +303 -0
  39. deepeval-3.8.0/deepeval/benchmarks/logi_qa/task.py +9 -0
  40. deepeval-3.8.0/deepeval/benchmarks/logi_qa/template.py +42 -0
  41. deepeval-3.8.0/deepeval/benchmarks/math_qa/math_qa.py +291 -0
  42. deepeval-3.8.0/deepeval/benchmarks/math_qa/task.py +10 -0
  43. deepeval-3.8.0/deepeval/benchmarks/math_qa/template.py +75 -0
  44. deepeval-3.8.0/deepeval/benchmarks/mmlu/mmlu.py +317 -0
  45. deepeval-3.8.0/deepeval/benchmarks/mmlu/template.py +39 -0
  46. deepeval-3.8.0/deepeval/benchmarks/modes/__init__.py +7 -0
  47. deepeval-3.8.0/deepeval/benchmarks/results.py +2 -0
  48. deepeval-3.8.0/deepeval/benchmarks/schema.py +171 -0
  49. deepeval-3.8.0/deepeval/benchmarks/squad/squad.py +214 -0
  50. deepeval-3.8.0/deepeval/benchmarks/squad/task.py +54 -0
  51. deepeval-3.8.0/deepeval/benchmarks/squad/template.py +70 -0
  52. deepeval-3.8.0/deepeval/benchmarks/tasks/__init__.py +10 -0
  53. deepeval-3.8.0/deepeval/benchmarks/truthful_qa/truthful_qa.py +362 -0
  54. deepeval-3.8.0/deepeval/benchmarks/utils.py +13 -0
  55. deepeval-3.8.0/deepeval/benchmarks/winogrande/template.py +63 -0
  56. deepeval-3.8.0/deepeval/benchmarks/winogrande/winogrande.py +167 -0
  57. deepeval-3.8.0/deepeval/cli/dotenv_handler.py +71 -0
  58. deepeval-3.8.0/deepeval/cli/main.py +3109 -0
  59. deepeval-3.8.0/deepeval/cli/server.py +51 -0
  60. deepeval-3.8.0/deepeval/cli/test.py +181 -0
  61. deepeval-3.8.0/deepeval/cli/types.py +13 -0
  62. deepeval-3.8.0/deepeval/cli/utils.py +353 -0
  63. deepeval-3.8.0/deepeval/confident/__init__.py +1 -0
  64. deepeval-3.8.0/deepeval/confident/api.py +260 -0
  65. deepeval-3.8.0/deepeval/confident/types.py +22 -0
  66. deepeval-3.8.0/deepeval/config/dotenv_handler.py +19 -0
  67. deepeval-3.8.0/deepeval/config/logging.py +33 -0
  68. deepeval-3.8.0/deepeval/config/settings.py +1589 -0
  69. deepeval-3.8.0/deepeval/config/settings_manager.py +137 -0
  70. deepeval-3.8.0/deepeval/config/utils.py +152 -0
  71. deepeval-3.8.0/deepeval/constants.py +49 -0
  72. deepeval-3.8.0/deepeval/contextvars.py +25 -0
  73. deepeval-3.8.0/deepeval/dataset/__init__.py +11 -0
  74. deepeval-3.8.0/deepeval/dataset/api.py +50 -0
  75. deepeval-3.8.0/deepeval/dataset/dataset.py +1452 -0
  76. deepeval-3.8.0/deepeval/dataset/golden.py +197 -0
  77. deepeval-3.8.0/deepeval/dataset/test_run_tracer.py +80 -0
  78. deepeval-3.8.0/deepeval/dataset/types.py +25 -0
  79. deepeval-3.8.0/deepeval/dataset/utils.py +234 -0
  80. deepeval-3.8.0/deepeval/errors.py +24 -0
  81. deepeval-3.8.0/deepeval/evaluate/__init__.py +14 -0
  82. deepeval-3.8.0/deepeval/evaluate/api.py +11 -0
  83. deepeval-3.8.0/deepeval/evaluate/compare.py +530 -0
  84. deepeval-3.8.0/deepeval/evaluate/configs.py +38 -0
  85. deepeval-3.8.0/deepeval/evaluate/evaluate.py +322 -0
  86. deepeval-3.8.0/deepeval/evaluate/execute.py +3184 -0
  87. deepeval-3.8.0/deepeval/evaluate/types.py +37 -0
  88. deepeval-3.8.0/deepeval/evaluate/utils.py +574 -0
  89. deepeval-3.8.0/deepeval/integrations/crewai/__init__.py +9 -0
  90. deepeval-3.8.0/deepeval/integrations/crewai/handler.py +232 -0
  91. deepeval-3.8.0/deepeval/integrations/crewai/subs.py +51 -0
  92. deepeval-3.8.0/deepeval/integrations/crewai/tool.py +71 -0
  93. deepeval-3.8.0/deepeval/integrations/crewai/wrapper.py +127 -0
  94. deepeval-3.8.0/deepeval/integrations/hugging_face/callback.py +231 -0
  95. deepeval-3.8.0/deepeval/integrations/hugging_face/tests/test_callbacks.py +158 -0
  96. deepeval-3.8.0/deepeval/integrations/langchain/__init__.py +4 -0
  97. deepeval-3.8.0/deepeval/integrations/langchain/callback.py +542 -0
  98. deepeval-3.8.0/deepeval/integrations/langchain/patch.py +43 -0
  99. deepeval-3.8.0/deepeval/integrations/langchain/utils.py +337 -0
  100. deepeval-3.8.0/deepeval/integrations/llama_index/__init__.py +6 -0
  101. deepeval-3.8.0/deepeval/integrations/llama_index/handler.py +301 -0
  102. deepeval-3.8.0/deepeval/integrations/llama_index/utils.py +107 -0
  103. deepeval-3.8.0/deepeval/integrations/pydantic_ai/__init__.py +5 -0
  104. deepeval-3.8.0/deepeval/integrations/pydantic_ai/agent.py +38 -0
  105. deepeval-3.8.0/deepeval/integrations/pydantic_ai/instrumentator.py +325 -0
  106. deepeval-3.8.0/deepeval/integrations/pydantic_ai/otel.py +64 -0
  107. deepeval-3.8.0/deepeval/key_handler.py +283 -0
  108. deepeval-3.8.0/deepeval/metrics/__init__.py +133 -0
  109. deepeval-3.8.0/deepeval/metrics/answer_relevancy/__init__.py +1 -0
  110. deepeval-3.8.0/deepeval/metrics/answer_relevancy/answer_relevancy.py +319 -0
  111. deepeval-3.8.0/deepeval/metrics/answer_relevancy/schema.py +19 -0
  112. deepeval-3.8.0/deepeval/metrics/answer_relevancy/template.py +129 -0
  113. deepeval-3.8.0/deepeval/metrics/api.py +281 -0
  114. deepeval-3.8.0/deepeval/metrics/arena_g_eval/__init__.py +1 -0
  115. deepeval-3.8.0/deepeval/metrics/arena_g_eval/arena_g_eval.py +316 -0
  116. deepeval-3.8.0/deepeval/metrics/arena_g_eval/schema.py +20 -0
  117. deepeval-3.8.0/deepeval/metrics/arena_g_eval/template.py +133 -0
  118. deepeval-3.8.0/deepeval/metrics/arena_g_eval/utils.py +161 -0
  119. deepeval-3.8.0/deepeval/metrics/argument_correctness/argument_correctness.py +282 -0
  120. deepeval-3.8.0/deepeval/metrics/argument_correctness/schema.py +15 -0
  121. deepeval-3.8.0/deepeval/metrics/argument_correctness/template.py +141 -0
  122. deepeval-3.8.0/deepeval/metrics/base_metric.py +139 -0
  123. deepeval-3.8.0/deepeval/metrics/bias/__init__.py +1 -0
  124. deepeval-3.8.0/deepeval/metrics/bias/bias.py +295 -0
  125. deepeval-3.8.0/deepeval/metrics/bias/schema.py +20 -0
  126. deepeval-3.8.0/deepeval/metrics/bias/template.py +127 -0
  127. deepeval-3.8.0/deepeval/metrics/contextual_precision/__init__.py +1 -0
  128. deepeval-3.8.0/deepeval/metrics/contextual_precision/contextual_precision.py +305 -0
  129. deepeval-3.8.0/deepeval/metrics/contextual_precision/schema.py +15 -0
  130. deepeval-3.8.0/deepeval/metrics/contextual_precision/template.py +133 -0
  131. deepeval-3.8.0/deepeval/metrics/contextual_recall/__init__.py +1 -0
  132. deepeval-3.8.0/deepeval/metrics/contextual_recall/contextual_recall.py +293 -0
  133. deepeval-3.8.0/deepeval/metrics/contextual_recall/schema.py +15 -0
  134. deepeval-3.8.0/deepeval/metrics/contextual_recall/template.py +126 -0
  135. deepeval-3.8.0/deepeval/metrics/contextual_relevancy/__init__.py +1 -0
  136. deepeval-3.8.0/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +288 -0
  137. deepeval-3.8.0/deepeval/metrics/contextual_relevancy/schema.py +16 -0
  138. deepeval-3.8.0/deepeval/metrics/contextual_relevancy/template.py +106 -0
  139. deepeval-3.8.0/deepeval/metrics/conversation_completeness/conversation_completeness.py +304 -0
  140. deepeval-3.8.0/deepeval/metrics/conversation_completeness/schema.py +15 -0
  141. deepeval-3.8.0/deepeval/metrics/conversation_completeness/template.py +157 -0
  142. deepeval-3.8.0/deepeval/metrics/conversational_dag/__init__.py +7 -0
  143. deepeval-3.8.0/deepeval/metrics/conversational_dag/conversational_dag.py +155 -0
  144. deepeval-3.8.0/deepeval/metrics/conversational_dag/nodes.py +882 -0
  145. deepeval-3.8.0/deepeval/metrics/conversational_dag/templates.py +133 -0
  146. deepeval-3.8.0/deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  147. deepeval-3.8.0/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +455 -0
  148. deepeval-3.8.0/deepeval/metrics/conversational_g_eval/template.py +77 -0
  149. deepeval-3.8.0/deepeval/metrics/dag/__init__.py +8 -0
  150. deepeval-3.8.0/deepeval/metrics/dag/dag.py +161 -0
  151. deepeval-3.8.0/deepeval/metrics/dag/graph.py +77 -0
  152. deepeval-3.8.0/deepeval/metrics/dag/nodes.py +714 -0
  153. deepeval-3.8.0/deepeval/metrics/dag/schema.py +20 -0
  154. deepeval-3.8.0/deepeval/metrics/dag/templates.py +108 -0
  155. deepeval-3.8.0/deepeval/metrics/dag/utils.py +188 -0
  156. deepeval-3.8.0/deepeval/metrics/exact_match/exact_match.py +102 -0
  157. deepeval-3.8.0/deepeval/metrics/faithfulness/__init__.py +1 -0
  158. deepeval-3.8.0/deepeval/metrics/faithfulness/faithfulness.py +355 -0
  159. deepeval-3.8.0/deepeval/metrics/faithfulness/schema.py +23 -0
  160. deepeval-3.8.0/deepeval/metrics/faithfulness/template.py +225 -0
  161. deepeval-3.8.0/deepeval/metrics/g_eval/__init__.py +5 -0
  162. deepeval-3.8.0/deepeval/metrics/g_eval/g_eval.py +450 -0
  163. deepeval-3.8.0/deepeval/metrics/g_eval/schema.py +17 -0
  164. deepeval-3.8.0/deepeval/metrics/g_eval/template.py +141 -0
  165. deepeval-3.8.0/deepeval/metrics/g_eval/utils.py +328 -0
  166. deepeval-3.8.0/deepeval/metrics/goal_accuracy/__init__.py +1 -0
  167. deepeval-3.8.0/deepeval/metrics/goal_accuracy/goal_accuracy.py +364 -0
  168. deepeval-3.8.0/deepeval/metrics/goal_accuracy/schema.py +17 -0
  169. deepeval-3.8.0/deepeval/metrics/goal_accuracy/template.py +253 -0
  170. deepeval-3.8.0/deepeval/metrics/hallucination/__init__.py +1 -0
  171. deepeval-3.8.0/deepeval/metrics/hallucination/hallucination.py +267 -0
  172. deepeval-3.8.0/deepeval/metrics/hallucination/schema.py +15 -0
  173. deepeval-3.8.0/deepeval/metrics/hallucination/template.py +80 -0
  174. deepeval-3.8.0/deepeval/metrics/indicator.py +302 -0
  175. deepeval-3.8.0/deepeval/metrics/json_correctness/json_correctness.py +223 -0
  176. deepeval-3.8.0/deepeval/metrics/json_correctness/schema.py +5 -0
  177. deepeval-3.8.0/deepeval/metrics/json_correctness/template.py +41 -0
  178. deepeval-3.8.0/deepeval/metrics/knowledge_retention/knowledge_retention.py +334 -0
  179. deepeval-3.8.0/deepeval/metrics/knowledge_retention/schema.py +21 -0
  180. deepeval-3.8.0/deepeval/metrics/knowledge_retention/template.py +197 -0
  181. deepeval-3.8.0/deepeval/metrics/mcp/mcp_task_completion.py +293 -0
  182. deepeval-3.8.0/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +386 -0
  183. deepeval-3.8.0/deepeval/metrics/mcp/schema.py +26 -0
  184. deepeval-3.8.0/deepeval/metrics/mcp/template.py +184 -0
  185. deepeval-3.8.0/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +397 -0
  186. deepeval-3.8.0/deepeval/metrics/mcp_use_metric/schema.py +11 -0
  187. deepeval-3.8.0/deepeval/metrics/mcp_use_metric/template.py +126 -0
  188. deepeval-3.8.0/deepeval/metrics/misuse/__init__.py +1 -0
  189. deepeval-3.8.0/deepeval/metrics/misuse/misuse.py +290 -0
  190. deepeval-3.8.0/deepeval/metrics/misuse/schema.py +19 -0
  191. deepeval-3.8.0/deepeval/metrics/misuse/template.py +94 -0
  192. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/__init__.py +5 -0
  193. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +351 -0
  194. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +7 -0
  195. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_coherence/template.py +44 -0
  196. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +295 -0
  197. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +352 -0
  198. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +7 -0
  199. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +44 -0
  200. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +352 -0
  201. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_reference/schema.py +7 -0
  202. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_reference/template.py +43 -0
  203. deepeval-3.8.0/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +297 -0
  204. deepeval-3.8.0/deepeval/metrics/non_advice/__init__.py +1 -0
  205. deepeval-3.8.0/deepeval/metrics/non_advice/non_advice.py +296 -0
  206. deepeval-3.8.0/deepeval/metrics/non_advice/schema.py +19 -0
  207. deepeval-3.8.0/deepeval/metrics/non_advice/template.py +95 -0
  208. deepeval-3.8.0/deepeval/metrics/pattern_match/pattern_match.py +111 -0
  209. deepeval-3.8.0/deepeval/metrics/pii_leakage/__init__.py +1 -0
  210. deepeval-3.8.0/deepeval/metrics/pii_leakage/pii_leakage.py +281 -0
  211. deepeval-3.8.0/deepeval/metrics/pii_leakage/schema.py +19 -0
  212. deepeval-3.8.0/deepeval/metrics/pii_leakage/template.py +86 -0
  213. deepeval-3.8.0/deepeval/metrics/plan_adherence/__init__.py +1 -0
  214. deepeval-3.8.0/deepeval/metrics/plan_adherence/plan_adherence.py +266 -0
  215. deepeval-3.8.0/deepeval/metrics/plan_adherence/schema.py +11 -0
  216. deepeval-3.8.0/deepeval/metrics/plan_adherence/template.py +181 -0
  217. deepeval-3.8.0/deepeval/metrics/plan_quality/__init__.py +1 -0
  218. deepeval-3.8.0/deepeval/metrics/plan_quality/plan_quality.py +268 -0
  219. deepeval-3.8.0/deepeval/metrics/plan_quality/schema.py +11 -0
  220. deepeval-3.8.0/deepeval/metrics/plan_quality/template.py +110 -0
  221. deepeval-3.8.0/deepeval/metrics/prompt_alignment/prompt_alignment.py +283 -0
  222. deepeval-3.8.0/deepeval/metrics/prompt_alignment/schema.py +15 -0
  223. deepeval-3.8.0/deepeval/metrics/prompt_alignment/template.py +99 -0
  224. deepeval-3.8.0/deepeval/metrics/ragas.py +559 -0
  225. deepeval-3.8.0/deepeval/metrics/role_adherence/role_adherence.py +265 -0
  226. deepeval-3.8.0/deepeval/metrics/role_adherence/schema.py +16 -0
  227. deepeval-3.8.0/deepeval/metrics/role_adherence/template.py +112 -0
  228. deepeval-3.8.0/deepeval/metrics/role_violation/__init__.py +1 -0
  229. deepeval-3.8.0/deepeval/metrics/role_violation/role_violation.py +296 -0
  230. deepeval-3.8.0/deepeval/metrics/role_violation/schema.py +19 -0
  231. deepeval-3.8.0/deepeval/metrics/role_violation/template.py +86 -0
  232. deepeval-3.8.0/deepeval/metrics/step_efficiency/__init__.py +1 -0
  233. deepeval-3.8.0/deepeval/metrics/step_efficiency/schema.py +11 -0
  234. deepeval-3.8.0/deepeval/metrics/step_efficiency/step_efficiency.py +224 -0
  235. deepeval-3.8.0/deepeval/metrics/step_efficiency/template.py +267 -0
  236. deepeval-3.8.0/deepeval/metrics/summarization/__init__.py +1 -0
  237. deepeval-3.8.0/deepeval/metrics/summarization/schema.py +36 -0
  238. deepeval-3.8.0/deepeval/metrics/summarization/summarization.py +526 -0
  239. deepeval-3.8.0/deepeval/metrics/summarization/template.py +143 -0
  240. deepeval-3.8.0/deepeval/metrics/task_completion/schema.py +12 -0
  241. deepeval-3.8.0/deepeval/metrics/task_completion/task_completion.py +253 -0
  242. deepeval-3.8.0/deepeval/metrics/task_completion/template.py +232 -0
  243. deepeval-3.8.0/deepeval/metrics/tool_correctness/schema.py +6 -0
  244. deepeval-3.8.0/deepeval/metrics/tool_correctness/template.py +88 -0
  245. deepeval-3.8.0/deepeval/metrics/tool_correctness/tool_correctness.py +525 -0
  246. deepeval-3.8.0/deepeval/metrics/tool_use/__init__.py +1 -0
  247. deepeval-3.8.0/deepeval/metrics/tool_use/schema.py +23 -0
  248. deepeval-3.8.0/deepeval/metrics/tool_use/template.py +234 -0
  249. deepeval-3.8.0/deepeval/metrics/tool_use/tool_use.py +436 -0
  250. deepeval-3.8.0/deepeval/metrics/topic_adherence/__init__.py +1 -0
  251. deepeval-3.8.0/deepeval/metrics/topic_adherence/schema.py +20 -0
  252. deepeval-3.8.0/deepeval/metrics/topic_adherence/template.py +182 -0
  253. deepeval-3.8.0/deepeval/metrics/topic_adherence/topic_adherence.py +342 -0
  254. deepeval-3.8.0/deepeval/metrics/toxicity/__init__.py +1 -0
  255. deepeval-3.8.0/deepeval/metrics/toxicity/schema.py +20 -0
  256. deepeval-3.8.0/deepeval/metrics/toxicity/template.py +133 -0
  257. deepeval-3.8.0/deepeval/metrics/toxicity/toxicity.py +296 -0
  258. deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  259. deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/template.py +194 -0
  260. deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  261. deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  262. deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/template.py +185 -0
  263. deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +525 -0
  264. deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/schema.py +22 -0
  265. deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/template.py +168 -0
  266. deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +532 -0
  267. deepeval-3.8.0/deepeval/metrics/turn_faithfulness/schema.py +31 -0
  268. deepeval-3.8.0/deepeval/metrics/turn_faithfulness/template.py +225 -0
  269. deepeval-3.8.0/deepeval/metrics/turn_faithfulness/turn_faithfulness.py +573 -0
  270. deepeval-3.8.0/deepeval/metrics/turn_relevancy/schema.py +12 -0
  271. deepeval-3.8.0/deepeval/metrics/turn_relevancy/template.py +89 -0
  272. deepeval-3.8.0/deepeval/metrics/turn_relevancy/turn_relevancy.py +270 -0
  273. deepeval-3.8.0/deepeval/metrics/utils.py +632 -0
  274. deepeval-3.8.0/deepeval/model_integrations/types.py +20 -0
  275. deepeval-3.8.0/deepeval/model_integrations/utils.py +116 -0
  276. deepeval-3.8.0/deepeval/models/__init__.py +49 -0
  277. deepeval-3.8.0/deepeval/models/_summac_model.py +593 -0
  278. deepeval-3.8.0/deepeval/models/answer_relevancy_model.py +75 -0
  279. deepeval-3.8.0/deepeval/models/base_model.py +176 -0
  280. deepeval-3.8.0/deepeval/models/embedding_models/__init__.py +11 -0
  281. deepeval-3.8.0/deepeval/models/embedding_models/azure_embedding_model.py +166 -0
  282. deepeval-3.8.0/deepeval/models/embedding_models/local_embedding_model.py +132 -0
  283. deepeval-3.8.0/deepeval/models/embedding_models/ollama_embedding_model.py +113 -0
  284. deepeval-3.8.0/deepeval/models/embedding_models/openai_embedding_model.py +135 -0
  285. deepeval-3.8.0/deepeval/models/llms/__init__.py +29 -0
  286. deepeval-3.8.0/deepeval/models/llms/amazon_bedrock_model.py +316 -0
  287. deepeval-3.8.0/deepeval/models/llms/anthropic_model.py +298 -0
  288. deepeval-3.8.0/deepeval/models/llms/azure_model.py +458 -0
  289. deepeval-3.8.0/deepeval/models/llms/constants.py +2055 -0
  290. deepeval-3.8.0/deepeval/models/llms/deepseek_model.py +244 -0
  291. deepeval-3.8.0/deepeval/models/llms/gemini_model.py +430 -0
  292. deepeval-3.8.0/deepeval/models/llms/grok_model.py +312 -0
  293. deepeval-3.8.0/deepeval/models/llms/kimi_model.py +294 -0
  294. deepeval-3.8.0/deepeval/models/llms/litellm_model.py +467 -0
  295. deepeval-3.8.0/deepeval/models/llms/local_model.py +242 -0
  296. deepeval-3.8.0/deepeval/models/llms/ollama_model.py +237 -0
  297. deepeval-3.8.0/deepeval/models/llms/openai_model.py +488 -0
  298. deepeval-3.8.0/deepeval/models/llms/openrouter_model.py +398 -0
  299. deepeval-3.8.0/deepeval/models/llms/portkey_model.py +191 -0
  300. deepeval-3.8.0/deepeval/models/llms/utils.py +49 -0
  301. deepeval-3.8.0/deepeval/models/retry_policy.py +1049 -0
  302. deepeval-3.8.0/deepeval/models/utils.py +173 -0
  303. deepeval-3.8.0/deepeval/openai/__init__.py +21 -0
  304. deepeval-3.8.0/deepeval/openai/extractors.py +198 -0
  305. deepeval-3.8.0/deepeval/openai/patch.py +295 -0
  306. deepeval-3.8.0/deepeval/openai/utils.py +211 -0
  307. deepeval-3.8.0/deepeval/openai_agents/__init__.py +7 -0
  308. deepeval-3.8.0/deepeval/openai_agents/agent.py +36 -0
  309. deepeval-3.8.0/deepeval/openai_agents/callback_handler.py +151 -0
  310. deepeval-3.8.0/deepeval/openai_agents/extractors.py +443 -0
  311. deepeval-3.8.0/deepeval/openai_agents/patch.py +309 -0
  312. deepeval-3.8.0/deepeval/openai_agents/runner.py +348 -0
  313. deepeval-3.8.0/deepeval/optimizer/__init__.py +5 -0
  314. deepeval-3.8.0/deepeval/optimizer/algorithms/__init__.py +6 -0
  315. deepeval-3.8.0/deepeval/optimizer/algorithms/base.py +29 -0
  316. deepeval-3.8.0/deepeval/optimizer/algorithms/configs.py +18 -0
  317. deepeval-3.8.0/deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  318. deepeval-3.8.0/deepeval/optimizer/algorithms/copro/copro.py +836 -0
  319. deepeval-3.8.0/deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  320. deepeval-3.8.0/deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  321. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  322. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  323. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  324. deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  325. deepeval-3.8.0/deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  326. deepeval-3.8.0/deepeval/optimizer/algorithms/simba/simba.py +999 -0
  327. deepeval-3.8.0/deepeval/optimizer/algorithms/simba/types.py +15 -0
  328. deepeval-3.8.0/deepeval/optimizer/configs.py +31 -0
  329. deepeval-3.8.0/deepeval/optimizer/policies.py +227 -0
  330. deepeval-3.8.0/deepeval/optimizer/prompt_optimizer.py +263 -0
  331. deepeval-3.8.0/deepeval/optimizer/rewriter/__init__.py +5 -0
  332. deepeval-3.8.0/deepeval/optimizer/rewriter/rewriter.py +124 -0
  333. deepeval-3.8.0/deepeval/optimizer/rewriter/utils.py +214 -0
  334. deepeval-3.8.0/deepeval/optimizer/scorer/__init__.py +5 -0
  335. deepeval-3.8.0/deepeval/optimizer/scorer/base.py +86 -0
  336. deepeval-3.8.0/deepeval/optimizer/scorer/scorer.py +316 -0
  337. deepeval-3.8.0/deepeval/optimizer/scorer/utils.py +30 -0
  338. deepeval-3.8.0/deepeval/optimizer/types.py +148 -0
  339. deepeval-3.8.0/deepeval/optimizer/utils.py +480 -0
  340. deepeval-3.8.0/deepeval/plugins/plugin.py +50 -0
  341. deepeval-3.8.0/deepeval/progress_context.py +108 -0
  342. deepeval-3.8.0/deepeval/prompt/__init__.py +21 -0
  343. deepeval-3.8.0/deepeval/prompt/api.py +234 -0
  344. deepeval-3.8.0/deepeval/prompt/prompt.py +837 -0
  345. deepeval-3.8.0/deepeval/prompt/utils.py +221 -0
  346. deepeval-3.8.0/deepeval/red_teaming/README.md +3 -0
  347. deepeval-3.8.0/deepeval/scorer/scorer.py +474 -0
  348. deepeval-3.8.0/deepeval/simulator/__init__.py +4 -0
  349. deepeval-3.8.0/deepeval/simulator/conversation_simulator.py +725 -0
  350. deepeval-3.8.0/deepeval/simulator/schema.py +15 -0
  351. deepeval-3.8.0/deepeval/simulator/template.py +140 -0
  352. deepeval-3.8.0/deepeval/synthesizer/__init__.py +8 -0
  353. deepeval-3.8.0/deepeval/synthesizer/chunking/context_generator.py +999 -0
  354. deepeval-3.8.0/deepeval/synthesizer/chunking/doc_chunker.py +274 -0
  355. deepeval-3.8.0/deepeval/synthesizer/config.py +70 -0
  356. deepeval-3.8.0/deepeval/synthesizer/schema.py +83 -0
  357. deepeval-3.8.0/deepeval/synthesizer/synthesizer.py +2751 -0
  358. deepeval-3.8.0/deepeval/synthesizer/templates/__init__.py +12 -0
  359. deepeval-3.8.0/deepeval/synthesizer/templates/template.py +1281 -0
  360. deepeval-3.8.0/deepeval/synthesizer/templates/template_extraction.py +78 -0
  361. deepeval-3.8.0/deepeval/synthesizer/templates/template_prompt.py +546 -0
  362. deepeval-3.8.0/deepeval/synthesizer/utils.py +36 -0
  363. deepeval-3.8.0/deepeval/telemetry.py +643 -0
  364. deepeval-3.8.0/deepeval/test_case/__init__.py +37 -0
  365. deepeval-3.8.0/deepeval/test_case/api.py +112 -0
  366. deepeval-3.8.0/deepeval/test_case/arena_test_case.py +48 -0
  367. deepeval-3.8.0/deepeval/test_case/conversational_test_case.py +266 -0
  368. deepeval-3.8.0/deepeval/test_case/llm_test_case.py +542 -0
  369. deepeval-3.8.0/deepeval/test_case/mcp.py +59 -0
  370. deepeval-3.8.0/deepeval/test_case/utils.py +20 -0
  371. deepeval-3.8.0/deepeval/test_run/__init__.py +34 -0
  372. deepeval-3.8.0/deepeval/test_run/api.py +157 -0
  373. deepeval-3.8.0/deepeval/test_run/cache.py +391 -0
  374. deepeval-3.8.0/deepeval/test_run/hyperparameters.py +109 -0
  375. deepeval-3.8.0/deepeval/test_run/test_run.py +1131 -0
  376. deepeval-3.8.0/deepeval/tracing/__init__.py +26 -0
  377. deepeval-3.8.0/deepeval/tracing/api.py +146 -0
  378. deepeval-3.8.0/deepeval/tracing/context.py +158 -0
  379. deepeval-3.8.0/deepeval/tracing/offline_evals/__init__.py +3 -0
  380. deepeval-3.8.0/deepeval/tracing/offline_evals/api.py +16 -0
  381. deepeval-3.8.0/deepeval/tracing/offline_evals/span.py +59 -0
  382. deepeval-3.8.0/deepeval/tracing/offline_evals/thread.py +67 -0
  383. deepeval-3.8.0/deepeval/tracing/offline_evals/trace.py +63 -0
  384. deepeval-3.8.0/deepeval/tracing/otel/__init__.py +5 -0
  385. deepeval-3.8.0/deepeval/tracing/otel/exporter.py +753 -0
  386. deepeval-3.8.0/deepeval/tracing/otel/test_exporter.py +35 -0
  387. deepeval-3.8.0/deepeval/tracing/otel/utils.py +544 -0
  388. deepeval-3.8.0/deepeval/tracing/patchers.py +190 -0
  389. deepeval-3.8.0/deepeval/tracing/perf_epoch_bridge.py +53 -0
  390. deepeval-3.8.0/deepeval/tracing/trace_context.py +107 -0
  391. deepeval-3.8.0/deepeval/tracing/trace_test_manager.py +19 -0
  392. deepeval-3.8.0/deepeval/tracing/tracing.py +1235 -0
  393. deepeval-3.8.0/deepeval/tracing/types.py +193 -0
  394. deepeval-3.8.0/deepeval/tracing/utils.py +202 -0
  395. deepeval-3.8.0/deepeval/utils.py +946 -0
  396. deepeval-3.8.0/pyproject.toml +98 -0
  397. deepeval-2.0.4/MANIFEST.in +0 -1
  398. deepeval-2.0.4/PKG-INFO +0 -35
  399. deepeval-2.0.4/README.md +0 -318
  400. deepeval-2.0.4/deepeval/__init__.py +0 -68
  401. deepeval-2.0.4/deepeval/_version.py +0 -1
  402. deepeval-2.0.4/deepeval/benchmarks/__init__.py +0 -7
  403. deepeval-2.0.4/deepeval/benchmarks/base_benchmark.py +0 -19
  404. deepeval-2.0.4/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -205
  405. deepeval-2.0.4/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -30
  406. deepeval-2.0.4/deepeval/benchmarks/big_bench_hard/template.py +0 -42
  407. deepeval-2.0.4/deepeval/benchmarks/drop/drop.py +0 -251
  408. deepeval-2.0.4/deepeval/benchmarks/drop/template.py +0 -40
  409. deepeval-2.0.4/deepeval/benchmarks/gsm8k/gsm8k.py +0 -116
  410. deepeval-2.0.4/deepeval/benchmarks/hellaswag/hellaswag.py +0 -232
  411. deepeval-2.0.4/deepeval/benchmarks/hellaswag/task.py +0 -196
  412. deepeval-2.0.4/deepeval/benchmarks/human_eval/human_eval.py +0 -136
  413. deepeval-2.0.4/deepeval/benchmarks/mmlu/mmlu.py +0 -216
  414. deepeval-2.0.4/deepeval/benchmarks/mmlu/template.py +0 -38
  415. deepeval-2.0.4/deepeval/benchmarks/modes/__init__.py +0 -1
  416. deepeval-2.0.4/deepeval/benchmarks/schema.py +0 -180
  417. deepeval-2.0.4/deepeval/benchmarks/tasks/__init__.py +0 -6
  418. deepeval-2.0.4/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -269
  419. deepeval-2.0.4/deepeval/benchmarks/utils.py +0 -17
  420. deepeval-2.0.4/deepeval/cli/main.py +0 -240
  421. deepeval-2.0.4/deepeval/cli/test.py +0 -156
  422. deepeval-2.0.4/deepeval/confident/__init__.py +0 -2
  423. deepeval-2.0.4/deepeval/confident/api.py +0 -147
  424. deepeval-2.0.4/deepeval/confident/evaluate.py +0 -94
  425. deepeval-2.0.4/deepeval/confident/types.py +0 -15
  426. deepeval-2.0.4/deepeval/confident/utils.py +0 -7
  427. deepeval-2.0.4/deepeval/constants.py +0 -5
  428. deepeval-2.0.4/deepeval/dataset/__init__.py +0 -2
  429. deepeval-2.0.4/deepeval/dataset/api.py +0 -25
  430. deepeval-2.0.4/deepeval/dataset/dataset.py +0 -820
  431. deepeval-2.0.4/deepeval/dataset/golden.py +0 -40
  432. deepeval-2.0.4/deepeval/dataset/utils.py +0 -65
  433. deepeval-2.0.4/deepeval/errors.py +0 -2
  434. deepeval-2.0.4/deepeval/evaluate.py +0 -1166
  435. deepeval-2.0.4/deepeval/event/__init__.py +0 -1
  436. deepeval-2.0.4/deepeval/event/api.py +0 -5
  437. deepeval-2.0.4/deepeval/event/event.py +0 -47
  438. deepeval-2.0.4/deepeval/guardrails/__init__.py +0 -2
  439. deepeval-2.0.4/deepeval/guardrails/api.py +0 -26
  440. deepeval-2.0.4/deepeval/guardrails/guardrails.py +0 -97
  441. deepeval-2.0.4/deepeval/guardrails/types.py +0 -77
  442. deepeval-2.0.4/deepeval/integrations/__init__.py +0 -4
  443. deepeval-2.0.4/deepeval/integrations/harness/__init__.py +0 -1
  444. deepeval-2.0.4/deepeval/integrations/harness/callback.py +0 -26
  445. deepeval-2.0.4/deepeval/integrations/hugging_face/callback.py +0 -231
  446. deepeval-2.0.4/deepeval/integrations/integrations.py +0 -33
  447. deepeval-2.0.4/deepeval/integrations/langchain/__init__.py +0 -25
  448. deepeval-2.0.4/deepeval/integrations/langchain/callback.py +0 -720
  449. deepeval-2.0.4/deepeval/integrations/llama_index/__init__.py +0 -9
  450. deepeval-2.0.4/deepeval/integrations/llama_index/callback.py +0 -832
  451. deepeval-2.0.4/deepeval/integrations/llama_index/evaluators.py +0 -323
  452. deepeval-2.0.4/deepeval/integrations/llama_index/utils.py +0 -10
  453. deepeval-2.0.4/deepeval/key_handler.py +0 -77
  454. deepeval-2.0.4/deepeval/metrics/__init__.py +0 -32
  455. deepeval-2.0.4/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -295
  456. deepeval-2.0.4/deepeval/metrics/answer_relevancy/schema.py +0 -19
  457. deepeval-2.0.4/deepeval/metrics/answer_relevancy/template.py +0 -93
  458. deepeval-2.0.4/deepeval/metrics/base_metric.py +0 -122
  459. deepeval-2.0.4/deepeval/metrics/bias/bias.py +0 -276
  460. deepeval-2.0.4/deepeval/metrics/bias/schema.py +0 -20
  461. deepeval-2.0.4/deepeval/metrics/bias/template.py +0 -112
  462. deepeval-2.0.4/deepeval/metrics/contextual_precision/contextual_precision.py +0 -277
  463. deepeval-2.0.4/deepeval/metrics/contextual_precision/schema.py +0 -15
  464. deepeval-2.0.4/deepeval/metrics/contextual_precision/template.py +0 -76
  465. deepeval-2.0.4/deepeval/metrics/contextual_recall/contextual_recall.py +0 -264
  466. deepeval-2.0.4/deepeval/metrics/contextual_recall/schema.py +0 -15
  467. deepeval-2.0.4/deepeval/metrics/contextual_recall/template.py +0 -69
  468. deepeval-2.0.4/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -256
  469. deepeval-2.0.4/deepeval/metrics/contextual_relevancy/schema.py +0 -16
  470. deepeval-2.0.4/deepeval/metrics/contextual_relevancy/template.py +0 -76
  471. deepeval-2.0.4/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -284
  472. deepeval-2.0.4/deepeval/metrics/conversation_completeness/schema.py +0 -15
  473. deepeval-2.0.4/deepeval/metrics/conversation_completeness/template.py +0 -134
  474. deepeval-2.0.4/deepeval/metrics/conversation_relevancy/conversation_relevancy.py +0 -251
  475. deepeval-2.0.4/deepeval/metrics/conversation_relevancy/schema.py +0 -12
  476. deepeval-2.0.4/deepeval/metrics/conversation_relevancy/template.py +0 -68
  477. deepeval-2.0.4/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -407
  478. deepeval-2.0.4/deepeval/metrics/conversational_g_eval/template.py +0 -41
  479. deepeval-2.0.4/deepeval/metrics/faithfulness/faithfulness.py +0 -337
  480. deepeval-2.0.4/deepeval/metrics/faithfulness/schema.py +0 -23
  481. deepeval-2.0.4/deepeval/metrics/faithfulness/template.py +0 -144
  482. deepeval-2.0.4/deepeval/metrics/g_eval/g_eval.py +0 -436
  483. deepeval-2.0.4/deepeval/metrics/g_eval/schema.py +0 -11
  484. deepeval-2.0.4/deepeval/metrics/g_eval/template.py +0 -40
  485. deepeval-2.0.4/deepeval/metrics/hallucination/hallucination.py +0 -259
  486. deepeval-2.0.4/deepeval/metrics/hallucination/schema.py +0 -15
  487. deepeval-2.0.4/deepeval/metrics/hallucination/template.py +0 -62
  488. deepeval-2.0.4/deepeval/metrics/image_editing/image_editing.py +0 -290
  489. deepeval-2.0.4/deepeval/metrics/indicator.py +0 -239
  490. deepeval-2.0.4/deepeval/metrics/json_correctness/json_correctness.py +0 -196
  491. deepeval-2.0.4/deepeval/metrics/json_correctness/schema.py +0 -5
  492. deepeval-2.0.4/deepeval/metrics/json_correctness/template.py +0 -31
  493. deepeval-2.0.4/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -323
  494. deepeval-2.0.4/deepeval/metrics/knowledge_retention/schema.py +0 -16
  495. deepeval-2.0.4/deepeval/metrics/knowledge_retention/template.py +0 -174
  496. deepeval-2.0.4/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -251
  497. deepeval-2.0.4/deepeval/metrics/prompt_alignment/schema.py +0 -15
  498. deepeval-2.0.4/deepeval/metrics/prompt_alignment/template.py +0 -87
  499. deepeval-2.0.4/deepeval/metrics/ragas.py +0 -534
  500. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/__init__.py +0 -22
  501. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bfla/bfla.py +0 -198
  502. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bfla/schema.py +0 -10
  503. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bfla/template.py +0 -53
  504. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bias/bias.py +0 -190
  505. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bias/schema.py +0 -10
  506. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bias/template.py +0 -52
  507. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bola/bola.py +0 -194
  508. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bola/schema.py +0 -11
  509. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bola/template.py +0 -65
  510. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/competitors/competitors.py +0 -193
  511. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/competitors/schema.py +0 -10
  512. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/competitors/template.py +0 -64
  513. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/contracts/contracts.py +0 -149
  514. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/contracts/schema.py +0 -6
  515. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/contracts/template.py +0 -27
  516. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/debug_access/debug_access.py +0 -150
  517. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/debug_access/schema.py +0 -6
  518. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/debug_access/template.py +0 -26
  519. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/excessive_agency/excessive_agency.py +0 -150
  520. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/excessive_agency/schema.py +0 -6
  521. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/excessive_agency/template.py +0 -27
  522. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hallucination/hallucination.py +0 -196
  523. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hallucination/schema.py +0 -10
  524. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hallucination/template.py +0 -54
  525. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/harm/harm.py +0 -154
  526. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/harm/schema.py +0 -6
  527. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/harm/template.py +0 -35
  528. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hijacking/hijacking.py +0 -193
  529. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hijacking/schema.py +0 -10
  530. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hijacking/template.py +0 -46
  531. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/imitation/imitation.py +0 -194
  532. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/imitation/schema.py +0 -11
  533. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/imitation/template.py +0 -62
  534. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/overreliance/overreliance.py +0 -193
  535. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/overreliance/schema.py +0 -10
  536. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/overreliance/template.py +0 -68
  537. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/pii/pii.py +0 -236
  538. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/pii/schema.py +0 -15
  539. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/pii/template.py +0 -72
  540. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/politics/politics.py +0 -150
  541. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/politics/schema.py +0 -6
  542. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/politics/template.py +0 -26
  543. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/prompt_extraction/prompt_extraction.py +0 -193
  544. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/prompt_extraction/schema.py +0 -10
  545. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/prompt_extraction/template.py +0 -47
  546. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/rbac/rbac.py +0 -198
  547. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/rbac/schema.py +0 -10
  548. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/rbac/template.py +0 -57
  549. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/religion/religion.py +0 -150
  550. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/religion/schema.py +0 -6
  551. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/religion/template.py +0 -26
  552. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/shell_injection/schema.py +0 -6
  553. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/shell_injection/shell_injection.py +0 -150
  554. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/shell_injection/template.py +0 -27
  555. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/sql_injection/schema.py +0 -6
  556. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/sql_injection/sql_injection.py +0 -148
  557. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/sql_injection/template.py +0 -27
  558. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/ssrf/schema.py +0 -10
  559. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/ssrf/ssrf.py +0 -193
  560. deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/ssrf/template.py +0 -55
  561. deepeval-2.0.4/deepeval/metrics/role_adherence/role_adherence.py +0 -258
  562. deepeval-2.0.4/deepeval/metrics/role_adherence/schema.py +0 -10
  563. deepeval-2.0.4/deepeval/metrics/role_adherence/template.py +0 -84
  564. deepeval-2.0.4/deepeval/metrics/summarization/schema.py +0 -36
  565. deepeval-2.0.4/deepeval/metrics/summarization/summarization.py +0 -573
  566. deepeval-2.0.4/deepeval/metrics/summarization/template.py +0 -124
  567. deepeval-2.0.4/deepeval/metrics/text_to_image/text_to_image.py +0 -286
  568. deepeval-2.0.4/deepeval/metrics/tool_correctness/__init__.py +0 -0
  569. deepeval-2.0.4/deepeval/metrics/tool_correctness/tool_correctness.py +0 -128
  570. deepeval-2.0.4/deepeval/metrics/toxicity/__init__.py +0 -0
  571. deepeval-2.0.4/deepeval/metrics/toxicity/schema.py +0 -20
  572. deepeval-2.0.4/deepeval/metrics/toxicity/template.py +0 -89
  573. deepeval-2.0.4/deepeval/metrics/toxicity/toxicity.py +0 -284
  574. deepeval-2.0.4/deepeval/metrics/utils.py +0 -335
  575. deepeval-2.0.4/deepeval/models/__init__.py +0 -22
  576. deepeval-2.0.4/deepeval/models/_summac_model.py +0 -582
  577. deepeval-2.0.4/deepeval/models/answer_relevancy_model.py +0 -74
  578. deepeval-2.0.4/deepeval/models/base_model.py +0 -156
  579. deepeval-2.0.4/deepeval/models/gpt_model.py +0 -417
  580. deepeval-2.0.4/deepeval/models/gpt_model_schematic.py +0 -104
  581. deepeval-2.0.4/deepeval/models/openai_embedding_model.py +0 -108
  582. deepeval-2.0.4/deepeval/monitor/__init__.py +0 -3
  583. deepeval-2.0.4/deepeval/monitor/api.py +0 -56
  584. deepeval-2.0.4/deepeval/monitor/feedback.py +0 -82
  585. deepeval-2.0.4/deepeval/monitor/monitor.py +0 -150
  586. deepeval-2.0.4/deepeval/monitor/utils.py +0 -42
  587. deepeval-2.0.4/deepeval/plugins/__init__.py +0 -0
  588. deepeval-2.0.4/deepeval/plugins/plugin.py +0 -40
  589. deepeval-2.0.4/deepeval/progress_context.py +0 -60
  590. deepeval-2.0.4/deepeval/prompt/__init__.py +0 -1
  591. deepeval-2.0.4/deepeval/prompt/api.py +0 -6
  592. deepeval-2.0.4/deepeval/prompt/prompt.py +0 -64
  593. deepeval-2.0.4/deepeval/red_teaming/__init__.py +0 -2
  594. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/__init__.py +0 -11
  595. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/base.py +0 -12
  596. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/base64/__init__.py +0 -1
  597. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/base64/base64.py +0 -8
  598. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/gray_box/__init__.py +0 -1
  599. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/gray_box/gray_box.py +0 -142
  600. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/gray_box/schema.py +0 -13
  601. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/gray_box/template.py +0 -91
  602. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_crescendo/__init__.py +0 -1
  603. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_crescendo/jailbreaking_crescendo.py +0 -391
  604. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_crescendo/schema.py +0 -30
  605. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_crescendo/template.py +0 -245
  606. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_linear/__init__.py +0 -1
  607. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_linear/jailbreaking_linear.py +0 -214
  608. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_linear/schema.py +0 -19
  609. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_linear/template.py +0 -190
  610. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_tree/__init__.py +0 -1
  611. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_tree/jailbreaking_tree.py +0 -346
  612. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_tree/schema.py +0 -19
  613. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/jailbreaking_tree/template.py +0 -190
  614. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/leetspeak/__init__.py +0 -1
  615. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/leetspeak/leetspeak.py +0 -23
  616. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/math_problem/__init__.py +0 -1
  617. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/math_problem/math_problem.py +0 -152
  618. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/math_problem/schema.py +0 -13
  619. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/math_problem/template.py +0 -112
  620. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/multilingual/__init__.py +0 -1
  621. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/multilingual/multilingual.py +0 -145
  622. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/multilingual/schema.py +0 -13
  623. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/multilingual/template.py +0 -111
  624. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_injection/__init__.py +0 -1
  625. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_injection/prompt_injection.py +0 -15
  626. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_injection/template.py +0 -25
  627. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_probing/__init__.py +0 -1
  628. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_probing/prompt_probing.py +0 -144
  629. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_probing/schema.py +0 -13
  630. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/prompt_probing/template.py +0 -97
  631. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/rot13/__init__.py +0 -1
  632. deepeval-2.0.4/deepeval/red_teaming/attack_enhancements/rot13/rot13.py +0 -12
  633. deepeval-2.0.4/deepeval/red_teaming/attack_synthesizer.py +0 -702
  634. deepeval-2.0.4/deepeval/red_teaming/red_teamer.py +0 -656
  635. deepeval-2.0.4/deepeval/red_teaming/schema.py +0 -35
  636. deepeval-2.0.4/deepeval/red_teaming/template.py +0 -1476
  637. deepeval-2.0.4/deepeval/red_teaming/types.py +0 -196
  638. deepeval-2.0.4/deepeval/red_teaming/utils.py +0 -61
  639. deepeval-2.0.4/deepeval/scorer/scorer.py +0 -427
  640. deepeval-2.0.4/deepeval/synthesizer/__init__.py +0 -5
  641. deepeval-2.0.4/deepeval/synthesizer/chunking/__init__.py +0 -0
  642. deepeval-2.0.4/deepeval/synthesizer/chunking/context_generator.py +0 -587
  643. deepeval-2.0.4/deepeval/synthesizer/chunking/doc_chunker.py +0 -173
  644. deepeval-2.0.4/deepeval/synthesizer/config.py +0 -59
  645. deepeval-2.0.4/deepeval/synthesizer/schema.py +0 -54
  646. deepeval-2.0.4/deepeval/synthesizer/synthesizer.py +0 -1115
  647. deepeval-2.0.4/deepeval/synthesizer/templates/__init__.py +0 -0
  648. deepeval-2.0.4/deepeval/synthesizer/templates/template.py +0 -728
  649. deepeval-2.0.4/deepeval/synthesizer/templates/template_prompt.py +0 -284
  650. deepeval-2.0.4/deepeval/telemetry.py +0 -241
  651. deepeval-2.0.4/deepeval/test_case/__init__.py +0 -3
  652. deepeval-2.0.4/deepeval/test_case/conversational_test_case.py +0 -29
  653. deepeval-2.0.4/deepeval/test_case/llm_test_case.py +0 -68
  654. deepeval-2.0.4/deepeval/test_case/mllm_test_case.py +0 -44
  655. deepeval-2.0.4/deepeval/test_case/utils.py +0 -24
  656. deepeval-2.0.4/deepeval/test_run/__init__.py +0 -11
  657. deepeval-2.0.4/deepeval/test_run/api.py +0 -146
  658. deepeval-2.0.4/deepeval/test_run/cache.py +0 -366
  659. deepeval-2.0.4/deepeval/test_run/hyperparameters.py +0 -56
  660. deepeval-2.0.4/deepeval/test_run/test_run.py +0 -762
  661. deepeval-2.0.4/deepeval/tracing/__init__.py +0 -50
  662. deepeval-2.0.4/deepeval/tracing/tracer.py +0 -592
  663. deepeval-2.0.4/deepeval/tracing/tracing.py +0 -254
  664. deepeval-2.0.4/deepeval/types.py +0 -6
  665. deepeval-2.0.4/deepeval/utils.py +0 -465
  666. deepeval-2.0.4/deepeval.egg-info/PKG-INFO +0 -35
  667. deepeval-2.0.4/deepeval.egg-info/SOURCES.txt +0 -437
  668. deepeval-2.0.4/deepeval.egg-info/entry_points.txt +0 -5
  669. deepeval-2.0.4/deepeval.egg-info/requires.txt +0 -27
  670. deepeval-2.0.4/deepeval.egg-info/top_level.txt +0 -2
  671. deepeval-2.0.4/pyproject.toml +0 -65
  672. deepeval-2.0.4/setup.cfg +0 -4
  673. deepeval-2.0.4/setup.py +0 -52
  674. deepeval-2.0.4/tests/__init__.py +0 -0
  675. deepeval-2.0.4/tests/custom_judge.py +0 -25
  676. deepeval-2.0.4/tests/test_answer_relevancy.py +0 -95
  677. deepeval-2.0.4/tests/test_benchmarks.py +0 -118
  678. deepeval-2.0.4/tests/test_bias.py +0 -53
  679. deepeval-2.0.4/tests/test_cache.py +0 -56
  680. deepeval-2.0.4/tests/test_cli.py +0 -27
  681. deepeval-2.0.4/tests/test_context_generator.py +0 -26
  682. deepeval-2.0.4/tests/test_contextual_precision.py +0 -88
  683. deepeval-2.0.4/tests/test_contextual_recall.py +0 -66
  684. deepeval-2.0.4/tests/test_contextual_relevancy.py +0 -47
  685. deepeval-2.0.4/tests/test_copy_metrics.py +0 -36
  686. deepeval-2.0.4/tests/test_custom_metric.py +0 -42
  687. deepeval-2.0.4/tests/test_dataset.py +0 -82
  688. deepeval-2.0.4/tests/test_deployment.py +0 -57
  689. deepeval-2.0.4/tests/test_everything.py +0 -248
  690. deepeval-2.0.4/tests/test_faithfulness.py +0 -85
  691. deepeval-2.0.4/tests/test_g_eval.py +0 -1241
  692. deepeval-2.0.4/tests/test_guardrails.py +0 -16
  693. deepeval-2.0.4/tests/test_hallucination.py +0 -61
  694. deepeval-2.0.4/tests/test_hybrid_tracing.py +0 -206
  695. deepeval-2.0.4/tests/test_image_metrics.py +0 -75
  696. deepeval-2.0.4/tests/test_json_metrics.py +0 -520
  697. deepeval-2.0.4/tests/test_rag_metrics.py +0 -38
  698. deepeval-2.0.4/tests/test_ragas.py +0 -69
  699. deepeval-2.0.4/tests/test_red_team_synthesizer.py +0 -207
  700. deepeval-2.0.4/tests/test_scoring.py +0 -225
  701. deepeval-2.0.4/tests/test_stateless.py +0 -341
  702. deepeval-2.0.4/tests/test_summarization.py +0 -42
  703. deepeval-2.0.4/tests/test_synthesizer.py +0 -403
  704. deepeval-2.0.4/tests/test_toxic.py +0 -34
  705. deepeval-2.0.4/tests/test_utils.py +0 -27
  706. {deepeval-2.0.4 → deepeval-3.8.0}/LICENSE.md +0 -0
  707. {deepeval-2.0.4/deepeval/benchmarks/big_bench_hard → deepeval-3.8.0/deepeval/benchmarks/arc}/__init__.py +0 -0
  708. {deepeval-2.0.4/deepeval/benchmarks/big_bench_hard/cot_prompts → deepeval-3.8.0/deepeval/benchmarks/bbq}/__init__.py +0 -0
  709. {deepeval-2.0.4/deepeval/benchmarks/big_bench_hard/shot_prompts → deepeval-3.8.0/deepeval/benchmarks/big_bench_hard}/__init__.py +0 -0
  710. {deepeval-2.0.4/deepeval/benchmarks/drop → deepeval-3.8.0/deepeval/benchmarks/big_bench_hard/cot_prompts}/__init__.py +0 -0
  711. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  712. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  713. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  714. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  715. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  716. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  717. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  718. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  719. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  720. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  721. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  722. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  723. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  724. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  725. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  726. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  727. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  728. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  729. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  730. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  731. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  732. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  733. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  734. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  735. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  736. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  737. {deepeval-2.0.4/deepeval/benchmarks/gsm8k → deepeval-3.8.0/deepeval/benchmarks/big_bench_hard/shot_prompts}/__init__.py +0 -0
  738. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  739. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  740. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  741. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  742. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  743. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  744. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  745. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  746. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  747. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  748. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  749. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  750. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  751. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  752. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  753. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  754. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  755. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  756. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  757. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  758. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  759. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  760. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  761. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  762. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  763. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  764. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  765. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  766. {deepeval-2.0.4/deepeval/benchmarks/hellaswag → deepeval-3.8.0/deepeval/benchmarks/bool_q}/__init__.py +0 -0
  767. {deepeval-2.0.4/deepeval/benchmarks/human_eval → deepeval-3.8.0/deepeval/benchmarks/drop}/__init__.py +0 -0
  768. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/drop/task.py +0 -0
  769. {deepeval-2.0.4/deepeval/benchmarks/mmlu → deepeval-3.8.0/deepeval/benchmarks/equity_med_qa}/__init__.py +0 -0
  770. {deepeval-2.0.4/deepeval/benchmarks/truthful_qa → deepeval-3.8.0/deepeval/benchmarks/gsm8k}/__init__.py +0 -0
  771. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/template.py +0 -0
  772. {deepeval-2.0.4/deepeval/cli → deepeval-3.8.0/deepeval/benchmarks/hellaswag}/__init__.py +0 -0
  773. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/template.py +0 -0
  774. {deepeval-2.0.4/deepeval/metrics/answer_relevancy → deepeval-3.8.0/deepeval/benchmarks/human_eval}/__init__.py +0 -0
  775. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/task.py +0 -0
  776. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/template.py +0 -0
  777. /deepeval-2.0.4/deepeval.egg-info/dependency_links.txt → /deepeval-3.8.0/deepeval/benchmarks/ifeval/__init__.py +0 -0
  778. {deepeval-2.0.4/deepeval/metrics/bias → deepeval-3.8.0/deepeval/benchmarks/lambada}/__init__.py +0 -0
  779. {deepeval-2.0.4/deepeval/metrics/contextual_precision → deepeval-3.8.0/deepeval/benchmarks/logi_qa}/__init__.py +0 -0
  780. {deepeval-2.0.4/deepeval/metrics/contextual_recall → deepeval-3.8.0/deepeval/benchmarks/math_qa}/__init__.py +0 -0
  781. {deepeval-2.0.4/deepeval/metrics/contextual_relevancy → deepeval-3.8.0/deepeval/benchmarks/mmlu}/__init__.py +0 -0
  782. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/task.py +0 -0
  783. {deepeval-2.0.4/deepeval/metrics/conversation_completeness → deepeval-3.8.0/deepeval/benchmarks/squad}/__init__.py +0 -0
  784. {deepeval-2.0.4/deepeval/metrics/conversation_relevancy → deepeval-3.8.0/deepeval/benchmarks/truthful_qa}/__init__.py +0 -0
  785. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  786. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  787. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  788. {deepeval-2.0.4/deepeval/metrics/conversational_g_eval → deepeval-3.8.0/deepeval/benchmarks/winogrande}/__init__.py +0 -0
  789. {deepeval-2.0.4/deepeval/metrics/faithfulness → deepeval-3.8.0/deepeval/cli}/__init__.py +0 -0
  790. {deepeval-2.0.4/deepeval/metrics/g_eval → deepeval-3.8.0/deepeval/config}/__init__.py +0 -0
  791. {deepeval-2.0.4/deepeval/metrics/hallucination → deepeval-3.8.0/deepeval/integrations}/__init__.py +0 -0
  792. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/integrations/hugging_face/__init__.py +0 -0
  793. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  794. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/integrations/hugging_face/utils.py +0 -0
  795. /deepeval-2.0.4/deepeval/metrics/image_editing/__init__.py → /deepeval-3.8.0/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  796. {deepeval-2.0.4/deepeval/metrics/json_correctness → deepeval-3.8.0/deepeval/metrics/argument_correctness}/__init__.py +0 -0
  797. {deepeval-2.0.4/deepeval/metrics/knowledge_retention → deepeval-3.8.0/deepeval/metrics/conversation_completeness}/__init__.py +0 -0
  798. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  799. {deepeval-2.0.4/deepeval/metrics/prompt_alignment → deepeval-3.8.0/deepeval/metrics/exact_match}/__init__.py +0 -0
  800. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bfla → deepeval-3.8.0/deepeval/metrics/json_correctness}/__init__.py +0 -0
  801. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bias → deepeval-3.8.0/deepeval/metrics/knowledge_retention}/__init__.py +0 -0
  802. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/bola → deepeval-3.8.0/deepeval/metrics/mcp}/__init__.py +0 -0
  803. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/competitors → deepeval-3.8.0/deepeval/metrics/mcp_use_metric}/__init__.py +0 -0
  804. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/contracts → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_coherence}/__init__.py +0 -0
  805. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/debug_access → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_editing}/__init__.py +0 -0
  806. {deepeval-2.0.4/deepeval/metrics → deepeval-3.8.0/deepeval/metrics/multimodal_metrics}/image_editing/schema.py +0 -0
  807. {deepeval-2.0.4/deepeval/metrics → deepeval-3.8.0/deepeval/metrics/multimodal_metrics}/image_editing/template.py +0 -0
  808. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/excessive_agency → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_helpfulness}/__init__.py +0 -0
  809. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hallucination → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_reference}/__init__.py +0 -0
  810. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/harm → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/text_to_image}/__init__.py +0 -0
  811. {deepeval-2.0.4/deepeval/metrics → deepeval-3.8.0/deepeval/metrics/multimodal_metrics}/text_to_image/schema.py +0 -0
  812. {deepeval-2.0.4/deepeval/metrics → deepeval-3.8.0/deepeval/metrics/multimodal_metrics}/text_to_image/template.py +0 -0
  813. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/hijacking → deepeval-3.8.0/deepeval/metrics/pattern_match}/__init__.py +0 -0
  814. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/imitation → deepeval-3.8.0/deepeval/metrics/prompt_alignment}/__init__.py +0 -0
  815. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/overreliance → deepeval-3.8.0/deepeval/metrics/role_adherence}/__init__.py +0 -0
  816. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/pii → deepeval-3.8.0/deepeval/metrics/task_completion}/__init__.py +0 -0
  817. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/politics → deepeval-3.8.0/deepeval/metrics/tool_correctness}/__init__.py +0 -0
  818. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/prompt_extraction → deepeval-3.8.0/deepeval/metrics/turn_contextual_precision}/__init__.py +0 -0
  819. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/rbac → deepeval-3.8.0/deepeval/metrics/turn_contextual_recall}/__init__.py +0 -0
  820. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/religion → deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy}/__init__.py +0 -0
  821. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/shell_injection → deepeval-3.8.0/deepeval/metrics/turn_faithfulness}/__init__.py +0 -0
  822. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/sql_injection → deepeval-3.8.0/deepeval/metrics/turn_relevancy}/__init__.py +0 -0
  823. {deepeval-2.0.4/deepeval/metrics/red_teaming_metrics/ssrf → deepeval-3.8.0/deepeval/model_integrations}/__init__.py +0 -0
  824. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/models/detoxify_model.py +0 -0
  825. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/models/hallucination_model.py +0 -0
  826. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/models/summac_model.py +0 -0
  827. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/models/unbias_model.py +0 -0
  828. {deepeval-2.0.4/deepeval/metrics/role_adherence → deepeval-3.8.0/deepeval/plugins}/__init__.py +0 -0
  829. /deepeval-2.0.4/deepeval/metrics/summarization/__init__.py → /deepeval-3.8.0/deepeval/py.typed +0 -0
  830. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/scorer/__init__.py +0 -0
  831. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/singleton.py +0 -0
  832. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/synthesizer/base_synthesizer.py +0 -0
  833. {deepeval-2.0.4/deepeval/metrics/text_to_image → deepeval-3.8.0/deepeval/synthesizer/chunking}/__init__.py +0 -0
  834. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/synthesizer/types.py +0 -0
  835. {deepeval-2.0.4 → deepeval-3.8.0}/deepeval/test_run/hooks.py +0 -0
@@ -0,0 +1,473 @@
1
+ Metadata-Version: 2.1
2
+ Name: deepeval
3
+ Version: 3.8.0
4
+ Summary: The LLM Evaluation Framework
5
+ Home-page: https://github.com/confident-ai/deepeval
6
+ License: Apache-2.0
7
+ Author: Jeffrey Ip
8
+ Author-email: jeffreyip@confident-ai.com
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Requires-Dist: aiohttp
16
+ Requires-Dist: click (>=8.0.0,<8.3.0)
17
+ Requires-Dist: grpcio (>=1.67.1,<2.0.0)
18
+ Requires-Dist: jinja2
19
+ Requires-Dist: nest_asyncio
20
+ Requires-Dist: openai
21
+ Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
22
+ Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
23
+ Requires-Dist: opentelemetry-sdk (>=1.24.0,<2.0.0)
24
+ Requires-Dist: portalocker
25
+ Requires-Dist: posthog (>=5.4.0,<6.0.0)
26
+ Requires-Dist: pydantic (>=2.11.7,<3.0.0)
27
+ Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
28
+ Requires-Dist: pyfiglet
29
+ Requires-Dist: pytest
30
+ Requires-Dist: pytest-asyncio
31
+ Requires-Dist: pytest-repeat
32
+ Requires-Dist: pytest-rerunfailures
33
+ Requires-Dist: pytest-xdist
34
+ Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
35
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
36
+ Requires-Dist: rich (>=13.6.0,<15.0.0)
37
+ Requires-Dist: sentry-sdk
38
+ Requires-Dist: setuptools
39
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
40
+ Requires-Dist: tenacity (>=8.0.0,<=10.0.0)
41
+ Requires-Dist: tqdm (>=4.66.1,<5.0.0)
42
+ Requires-Dist: typer (>=0.9,<1.0.0)
43
+ Requires-Dist: wheel
44
+ Project-URL: Documentation, https://deepeval.com
45
+ Project-URL: Repository, https://github.com/confident-ai/deepeval
46
+ Description-Content-Type: text/markdown
47
+
48
+ <p align="center">
49
+ <img src="https://github.com/confident-ai/deepeval/blob/main/docs/static/img/deepeval.png" alt="DeepEval Logo" width="100%">
50
+ </p>
51
+
52
+ <p align="center">
53
+ <h1 align="center">The LLM Evaluation Framework</h1>
54
+ </p>
55
+
56
+ <p align="center">
57
+ <a href="https://trendshift.io/repositories/5917" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5917" alt="confident-ai%2Fdeepeval | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
58
+ </p>
59
+
60
+ <p align="center">
61
+ <a href="https://discord.gg/3SEyvpgu2f">
62
+ <img alt="discord-invite" src="https://dcbadge.vercel.app/api/server/3SEyvpgu2f?style=flat">
63
+ </a>
64
+ </p>
65
+
66
+ <h4 align="center">
67
+ <p>
68
+ <a href="https://deepeval.com/docs/getting-started?utm_source=GitHub">Documentation</a> |
69
+ <a href="#-metrics-and-features">Metrics and Features</a> |
70
+ <a href="#-quickstart">Getting Started</a> |
71
+ <a href="#-integrations">Integrations</a> |
72
+ <a href="https://confident-ai.com?utm_source=GitHub">DeepEval Platform</a>
73
+ <p>
74
+ </h4>
75
+
76
+ <p align="center">
77
+ <a href="https://github.com/confident-ai/deepeval/releases">
78
+ <img alt="GitHub release" src="https://img.shields.io/github/release/confident-ai/deepeval.svg?color=violet">
79
+ </a>
80
+ <a href="https://colab.research.google.com/drive/1PPxYEBa6eu__LquGoFFJZkhYgWVYE6kh?usp=sharing">
81
+ <img alt="Try Quickstart in Colab" src="https://colab.research.google.com/assets/colab-badge.svg">
82
+ </a>
83
+ <a href="https://github.com/confident-ai/deepeval/blob/master/LICENSE.md">
84
+ <img alt="License" src="https://img.shields.io/github/license/confident-ai/deepeval.svg?color=yellow">
85
+ </a>
86
+ <a href="https://x.com/deepeval">
87
+ <img alt="Twitter Follow" src="https://img.shields.io/twitter/follow/deepeval?style=social&logo=x">
88
+ </a>
89
+ </p>
90
+
91
+ <p align="center">
92
+ <!-- Keep these links. Translations will automatically update with the README. -->
93
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=de">Deutsch</a> |
94
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=es">Español</a> |
95
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=fr">français</a> |
96
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=ja">日本語</a> |
97
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=ko">한국어</a> |
98
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=pt">Português</a> |
99
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=ru">Русский</a> |
100
+ <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
101
+ </p>
102
+
103
+ **DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which uses LLM-as-a-judge and other NLP models that run **locally on your machine** for evaluation.
104
+
105
+ Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.
106
+
107
+ > [!IMPORTANT]
108
+ > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.
109
+ >
110
+ > ![Demo GIF](assets/demo.gif)
111
+
112
+ > Want to talk LLM evaluation, need help picking metrics, or just to say hi? [Come join our discord.](https://discord.com/invite/3SEyvpgu2f)
113
+
114
+ <br />
115
+
116
+ # 🔥 Metrics and Features
117
+
118
+ > 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)
119
+
120
+ - Supports both end-to-end and component-level LLM evaluation.
121
+ - Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that run **locally on your machine**:
122
+ - G-Eval
123
+ - DAG ([deep acyclic graph](https://deepeval.com/docs/metrics-dag))
124
+ - **RAG metrics:**
125
+ - Answer Relevancy
126
+ - Faithfulness
127
+ - Contextual Recall
128
+ - Contextual Precision
129
+ - Contextual Relevancy
130
+ - RAGAS
131
+ - **Agentic metrics:**
132
+ - Task Completion
133
+ - Tool Correctness
134
+ - **Others:**
135
+ - Hallucination
136
+ - Summarization
137
+ - Bias
138
+ - Toxicity
139
+ - **Conversational metrics:**
140
+ - Knowledge Retention
141
+ - Conversation Completeness
142
+ - Conversation Relevancy
143
+ - Role Adherence
144
+ - etc.
145
+ - Build your own custom metrics that are automatically integrated with DeepEval's ecosystem.
146
+ - Generate synthetic datasets for evaluation.
147
+ - Integrates seamlessly with **ANY** CI/CD environment.
148
+ - [Red team your LLM application](https://deepeval.com/docs/red-teaming-introduction) for 40+ safety vulnerabilities in a few lines of code, including:
149
+ - Toxicity
150
+ - Bias
151
+ - SQL Injection
152
+ - etc., using 10+ advanced attack enhancement strategies such as prompt injections.
153
+ - Easily benchmark **ANY** LLM on popular LLM benchmarks in [under 10 lines of code](https://deepeval.com/docs/benchmarks-introduction?utm_source=GitHub), which include:
154
+ - MMLU
155
+ - HellaSwag
156
+ - DROP
157
+ - BIG-Bench Hard
158
+ - TruthfulQA
159
+ - HumanEval
160
+ - GSM8K
161
+ - [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation & observability lifecycle:
162
+ - Curate/annotate evaluation datasets on the cloud
163
+ - Benchmark your LLM app using datasets, and compare with previous iterations to determine which models/prompts work best
164
+ - Fine-tune metrics for custom results
165
+ - Debug evaluation results via LLM traces
166
+ - Monitor & evaluate LLM responses in production to improve datasets with real-world data
167
+ - Repeat until perfection
168
+
169
+ > [!NOTE]
170
+ > DeepEval is available on Confident AI, an LLM evals platform for AI observability and quality. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)
171
+
172
+ <br />
173
+
174
+ # 🔌 Integrations
175
+
176
+ - 🦄 LlamaIndex, to [**unit test RAG applications in CI/CD**](https://www.deepeval.com/integrations/frameworks/llamaindex?utm_source=GitHub)
177
+ - 🤗 Hugging Face, to [**enable real-time evaluations during LLM fine-tuning**](https://www.deepeval.com/integrations/frameworks/huggingface?utm_source=GitHub)
178
+
179
+ <br />
180
+
181
+ # 🚀 QuickStart
182
+
183
+ Let's pretend your LLM application is a RAG based customer support chatbot; here's how DeepEval can help test what you've built.
184
+
185
+ ## Installation
186
+
187
+ DeepEval works with **Python 3.9+**.
188
+
189
+ ```
190
+ pip install -U deepeval
191
+ ```
192
+
193
+ ## Create an account (highly recommended)
194
+
195
+ Using the `deepeval` platform will allow you to generate shareable testing reports on the cloud. It is free, takes no additional code to set up, and we highly recommend giving it a try.
196
+
197
+ To login, run:
198
+
199
+ ```
200
+ deepeval login
201
+ ```
202
+
203
+ Follow the instructions in the CLI to create an account, copy your API key, and paste it into the CLI. All test cases will automatically be logged (find more information on data privacy [here](https://deepeval.com/docs/data-privacy?utm_source=GitHub)).
204
+
205
+ ## Writing your first test case
206
+
207
+ Create a test file:
208
+
209
+ ```bash
210
+ touch test_chatbot.py
211
+ ```
212
+
213
+ Open `test_chatbot.py` and write your first test case to run an **end-to-end** evaluation using DeepEval, which treats your LLM app as a black-box:
214
+
215
+ ```python
216
+ import pytest
217
+ from deepeval import assert_test
218
+ from deepeval.metrics import GEval
219
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
220
+
221
+ def test_case():
222
+ correctness_metric = GEval(
223
+ name="Correctness",
224
+ criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
225
+ evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
226
+ threshold=0.5
227
+ )
228
+ test_case = LLMTestCase(
229
+ input="What if these shoes don't fit?",
230
+ # Replace this with the actual output from your LLM application
231
+ actual_output="You have 30 days to get a full refund at no extra cost.",
232
+ expected_output="We offer a 30-day full refund at no extra costs.",
233
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
234
+ )
235
+ assert_test(test_case, [correctness_metric])
236
+ ```
237
+
238
+ Set your `OPENAI_API_KEY` as an environment variable (you can also evaluate using your own custom model, for more details visit [this part of our docs](https://deepeval.com/docs/metrics-introduction#using-a-custom-llm?utm_source=GitHub)):
239
+
240
+ ```
241
+ export OPENAI_API_KEY="..."
242
+ ```
243
+
244
+ And finally, run `test_chatbot.py` in the CLI:
245
+
246
+ ```
247
+ deepeval test run test_chatbot.py
248
+ ```
249
+
250
+ **Congratulations! Your test case should have passed ✅** Let's break down what happened.
251
+
252
+ - The variable `input` mimics a user input, and `actual_output` is a placeholder for what your application's supposed to output based on this input.
253
+ - The variable `expected_output` represents the ideal answer for a given `input`, and [`GEval`](https://deepeval.com/docs/metrics-llm-evals) is a research-backed metric provided by `deepeval` for you to evaluate your LLM outputs on any custom criteria with human-like accuracy.
254
+ - In this example, the metric `criteria` is correctness of the `actual_output` based on the provided `expected_output`.
255
+ - All metric scores range from 0 - 1, where the `threshold=0.5` threshold ultimately determines whether your test has passed or not.
256
+
257
+ [Read our documentation](https://deepeval.com/docs/getting-started?utm_source=GitHub) for more information on more options to run end-to-end evaluation, how to use additional metrics, create your own custom metrics, and tutorials on how to integrate with other tools like LangChain and LlamaIndex.
258
+
259
+ <br />
260
+
261
+ ## Evaluating Nested Components
262
+
263
+ If you wish to evaluate individual components within your LLM app, you need to run **component-level** evals - a powerful way to evaluate any component within an LLM system.
264
+
265
+ Simply trace "components" such as LLM calls, retrievers, tool calls, and agents within your LLM application using the `@observe` decorator to apply metrics on a component-level. Tracing with `deepeval` is non-intrusive (learn more [here](https://deepeval.com/docs/evaluation-llm-tracing#dont-be-worried-about-tracing)) and helps you avoid rewriting your codebase just for evals:
266
+
267
+ ```python
268
+ from deepeval.tracing import observe, update_current_span
269
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
270
+ from deepeval.dataset import Golden
271
+ from deepeval.metrics import GEval
272
+ from deepeval import evaluate
273
+
274
+ correctness = GEval(name="Correctness", criteria="Determine if the 'actual output' is correct based on the 'expected output'.", evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT])
275
+
276
+ @observe(metrics=[correctness])
277
+ def inner_component():
278
+ # Component can be anything from an LLM call, retrieval, agent, tool use, etc.
279
+ update_current_span(test_case=LLMTestCase(input="...", actual_output="..."))
280
+ return
281
+
282
+ @observe
283
+ def llm_app(input: str):
284
+ inner_component()
285
+ return
286
+
287
+ evaluate(observed_callback=llm_app, goldens=[Golden(input="Hi!")])
288
+ ```
289
+
290
+ You can learn everything about component-level evaluations [here.](https://www.deepeval.com/docs/evaluation-component-level-llm-evals)
291
+
292
+ <br />
293
+
294
+ ## Evaluating Without Pytest Integration
295
+
296
+ Alternatively, you can evaluate without Pytest, which is more suited for a notebook environment.
297
+
298
+ ```python
299
+ from deepeval import evaluate
300
+ from deepeval.metrics import AnswerRelevancyMetric
301
+ from deepeval.test_case import LLMTestCase
302
+
303
+ answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.7)
304
+ test_case = LLMTestCase(
305
+ input="What if these shoes don't fit?",
306
+ # Replace this with the actual output from your LLM application
307
+ actual_output="We offer a 30-day full refund at no extra costs.",
308
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
309
+ )
310
+ evaluate([test_case], [answer_relevancy_metric])
311
+ ```
312
+
313
+ ## Using Standalone Metrics
314
+
315
+ DeepEval is extremely modular, making it easy for anyone to use any of our metrics. Continuing from the previous example:
316
+
317
+ ```python
318
+ from deepeval.metrics import AnswerRelevancyMetric
319
+ from deepeval.test_case import LLMTestCase
320
+
321
+ answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.7)
322
+ test_case = LLMTestCase(
323
+ input="What if these shoes don't fit?",
324
+ # Replace this with the actual output from your LLM application
325
+ actual_output="We offer a 30-day full refund at no extra costs.",
326
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra costs."]
327
+ )
328
+
329
+ answer_relevancy_metric.measure(test_case)
330
+ print(answer_relevancy_metric.score)
331
+ # All metrics also offer an explanation
332
+ print(answer_relevancy_metric.reason)
333
+ ```
334
+
335
+ Note that some metrics are for RAG pipelines, while others are for fine-tuning. Make sure to use our docs to pick the right one for your use case.
336
+
337
+ ## Evaluating a Dataset / Test Cases in Bulk
338
+
339
+ In DeepEval, a dataset is simply a collection of test cases. Here is how you can evaluate these in bulk:
340
+
341
+ ```python
342
+ import pytest
343
+ from deepeval import assert_test
344
+ from deepeval.dataset import EvaluationDataset, Golden
345
+ from deepeval.metrics import AnswerRelevancyMetric
346
+ from deepeval.test_case import LLMTestCase
347
+
348
+ dataset = EvaluationDataset(goldens=[Golden(input="What's the weather like today?")])
349
+
350
+ for golden in dataset.goldens:
351
+ test_case = LLMTestCase(
352
+ input=golden.input,
353
+ actual_output=your_llm_app(golden.input)
354
+ )
355
+ dataset.add_test_case(test_case)
356
+
357
+ @pytest.mark.parametrize(
358
+ "test_case",
359
+ dataset.test_cases,
360
+ )
361
+ def test_customer_chatbot(test_case: LLMTestCase):
362
+ answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
363
+ assert_test(test_case, [answer_relevancy_metric])
364
+ ```
365
+
366
+ ```bash
367
+ # Run this in the CLI, you can also add an optional -n flag to run tests in parallel
368
+ deepeval test run test_<filename>.py -n 4
369
+ ```
370
+
371
+ <br/>
372
+
373
+ Alternatively, although we recommend using `deepeval test run`, you can evaluate a dataset/test cases without using our Pytest integration:
374
+
375
+ ```python
376
+ from deepeval import evaluate
377
+ ...
378
+
379
+ evaluate(dataset, [answer_relevancy_metric])
380
+ # or
381
+ dataset.evaluate([answer_relevancy_metric])
382
+ ```
383
+
384
+ ## A Note on Env Variables (.env / .env.local)
385
+
386
+ DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
387
+ **Precedence:** process env -> `.env.local` -> `.env`.
388
+ Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
389
+
390
+ ```bash
391
+ cp .env.example .env.local
392
+ # then edit .env.local (ignored by git)
393
+ ```
394
+
395
+ # DeepEval With Confident AI
396
+
397
+ DeepEval is available on [Confident AI](https://confident-ai.com?utm_source=Github), an evals & observability platform that allows you to:
398
+
399
+ 1. Curate/annotate evaluation datasets on the cloud
400
+ 2. Benchmark your LLM app using datasets, and compare with previous iterations to determine which models/prompts work best
401
+ 3. Fine-tune metrics for custom results
402
+ 4. Debug evaluation results via LLM traces
403
+ 5. Monitor & evaluate LLM responses in production to improve datasets with real-world data
404
+ 6. Repeat until perfection
405
+
406
+ Everything on Confident AI, including how to use Confident AI, is available [here](https://www.confident-ai.com/docs?utm_source=GitHub).
407
+
408
+ To begin, login from the CLI:
409
+
410
+ ```bash
411
+ deepeval login
412
+ ```
413
+
414
+ Follow the instructions to log in, create your account, and paste your API key into the CLI.
415
+
416
+ Now, run your test file again:
417
+
418
+ ```bash
419
+ deepeval test run test_chatbot.py
420
+ ```
421
+
422
+ You should see a link displayed in the CLI once the test has finished running. Paste it into your browser to view the results!
423
+
424
+ ![Demo GIF](assets/demo.gif)
425
+
426
+ <br />
427
+
428
+ ## Configuration
429
+
430
+ ### Environment variables via .env files
431
+
432
+ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses your existing environment variables. When present, dotenv environment variables are auto-loaded at import time (unless you set `DEEPEVAL_DISABLE_DOTENV=1`).
433
+
434
+ **Precedence:** process env -> `.env.local` -> `.env`
435
+
436
+ ```bash
437
+ cp .env.example .env.local
438
+ # then edit .env.local (ignored by git)
439
+ ```
440
+
441
+ <br />
442
+
443
+ # Contributing
444
+
445
+ Please read [CONTRIBUTING.md](https://github.com/confident-ai/deepeval/blob/main/CONTRIBUTING.md) for details on our code of conduct, and the process for submitting pull requests to us.
446
+
447
+ <br />
448
+
449
+ # Roadmap
450
+
451
+ Features:
452
+
453
+ - [x] Integration with Confident AI
454
+ - [x] Implement G-Eval
455
+ - [x] Implement RAG metrics
456
+ - [x] Implement Conversational metrics
457
+ - [x] Evaluation Dataset Creation
458
+ - [x] Red-Teaming
459
+ - [ ] DAG custom metrics
460
+ - [ ] Guardrails
461
+
462
+ <br />
463
+
464
+ # Authors
465
+
466
+ Built by the founders of Confident AI. Contact jeffreyip@confident-ai.com for all enquiries.
467
+
468
+ <br />
469
+
470
+ # License
471
+
472
+ DeepEval is licensed under Apache 2.0 - see the [LICENSE.md](https://github.com/confident-ai/deepeval/blob/main/LICENSE.md) file for details.
473
+