deepeval 3.4.1.tar.gz → 3.4.3.tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (454)
  1. {deepeval-3.4.1 → deepeval-3.4.3}/PKG-INFO +1 -1
  2. deepeval-3.4.3/deepeval/_version.py +1 -0
  3. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/annotation.py +4 -0
  4. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/api.py +1 -0
  5. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/arc.py +11 -6
  6. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/base_benchmark.py +8 -1
  7. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/bbq.py +11 -6
  8. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +9 -6
  9. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/bool_q.py +11 -4
  10. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/drop.py +9 -4
  11. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +11 -6
  12. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/gsm8k.py +11 -4
  13. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/hellaswag.py +9 -4
  14. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/human_eval.py +9 -4
  15. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/ifeval/ifeval.py +27 -12
  16. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/lambada/lambada.py +11 -6
  17. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/logi_qa.py +8 -3
  18. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/math_qa.py +13 -5
  19. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/mmlu.py +15 -6
  20. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/template.py +3 -3
  21. deepeval-3.4.3/deepeval/benchmarks/results.py +2 -0
  22. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/squad.py +11 -4
  23. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/truthful_qa.py +8 -3
  24. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/winogrande/winogrande.py +11 -6
  25. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/dataset.py +254 -112
  26. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/utils.py +70 -1
  27. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/execute.py +57 -52
  28. deepeval-3.4.3/deepeval/integrations/pydantic_ai/agent.py +274 -0
  29. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/pydantic_ai/setup.py +0 -5
  30. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/template.py +2 -1
  31. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/nodes.py +22 -10
  32. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/utils.py +3 -2
  33. deepeval-3.4.3/deepeval/metrics/g_eval/__init__.py +5 -0
  34. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/g_eval.py +25 -15
  35. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/amazon_bedrock_model.py +5 -4
  36. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/anthropic_model.py +4 -0
  37. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/deepseek_model.py +6 -0
  38. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/gemini_model.py +7 -1
  39. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/grok_model.py +4 -0
  40. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/kimi_model.py +6 -0
  41. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/litellm_model.py +33 -5
  42. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/local_model.py +4 -0
  43. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/ollama_model.py +10 -2
  44. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/synthesizer.py +4 -0
  45. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/telemetry.py +10 -1
  46. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/api.py +1 -0
  47. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/thread.py +8 -2
  48. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/otel/exporter.py +32 -42
  49. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/otel/utils.py +18 -1
  50. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/tracing.py +2 -6
  51. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/utils.py +7 -3
  52. {deepeval-3.4.1 → deepeval-3.4.3}/pyproject.toml +2 -1
  53. deepeval-3.4.1/deepeval/_version.py +0 -1
  54. deepeval-3.4.1/deepeval/integrations/pydantic_ai/agent.py +0 -34
  55. deepeval-3.4.1/deepeval/integrations/pydantic_ai/patch.py +0 -161
  56. deepeval-3.4.1/deepeval/metrics/g_eval/__init__.py +0 -4
  57. {deepeval-3.4.1 → deepeval-3.4.3}/LICENSE.md +0 -0
  58. {deepeval-3.4.1 → deepeval-3.4.3}/README.md +0 -0
  59. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/__init__.py +0 -0
  60. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/__init__.py +0 -0
  61. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/__init__.py +0 -0
  62. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/__init__.py +0 -0
  63. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/mode.py +0 -0
  64. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/template.py +0 -0
  65. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/__init__.py +0 -0
  66. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/task.py +0 -0
  67. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/template.py +0 -0
  68. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  69. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  70. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  71. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  72. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  73. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  74. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  75. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  76. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  77. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  78. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  79. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  80. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  81. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  82. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  83. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  84. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  85. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  86. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  87. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  88. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  89. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  90. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  91. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  92. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  93. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  94. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  95. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  96. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  97. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  98. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  99. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  100. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  101. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  102. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  103. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  104. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  105. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  106. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  107. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  108. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  109. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  110. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  111. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  112. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  113. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  114. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  115. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  116. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  117. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  118. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  119. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  120. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  121. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  122. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  123. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  124. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  125. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  126. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  127. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  128. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/template.py +0 -0
  129. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/__init__.py +0 -0
  130. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/task.py +0 -0
  131. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/template.py +0 -0
  132. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  133. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  134. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  135. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  136. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/template.py +0 -0
  137. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  138. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/task.py +0 -0
  139. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/template.py +0 -0
  140. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  141. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/task.py +0 -0
  142. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/template.py +0 -0
  143. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  144. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/ifeval/template.py +0 -0
  145. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/lambada/__init__.py +0 -0
  146. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/lambada/template.py +0 -0
  147. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  148. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/task.py +0 -0
  149. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/template.py +0 -0
  150. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  151. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/task.py +0 -0
  152. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/template.py +0 -0
  153. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  154. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/task.py +0 -0
  155. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/modes/__init__.py +0 -0
  156. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/schema.py +0 -0
  157. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/__init__.py +0 -0
  158. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/task.py +0 -0
  159. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/template.py +0 -0
  160. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/tasks/__init__.py +0 -0
  161. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  162. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  163. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  164. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  165. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/utils.py +0 -0
  166. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  167. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/winogrande/template.py +0 -0
  168. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/__init__.py +0 -0
  169. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/main.py +0 -0
  170. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/server.py +0 -0
  171. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/test.py +0 -0
  172. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/types.py +0 -0
  173. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/utils.py +0 -0
  174. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/confident/__init__.py +0 -0
  175. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/confident/api.py +0 -0
  176. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/confident/types.py +0 -0
  177. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/constants.py +0 -0
  178. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/__init__.py +0 -0
  179. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/api.py +0 -0
  180. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/golden.py +0 -0
  181. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/types.py +0 -0
  182. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/errors.py +0 -0
  183. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/__init__.py +0 -0
  184. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/api.py +0 -0
  185. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/compare.py +0 -0
  186. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/configs.py +0 -0
  187. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/evaluate.py +0 -0
  188. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/types.py +0 -0
  189. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/utils.py +0 -0
  190. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/__init__.py +0 -0
  191. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/__init__.py +0 -0
  192. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/agent.py +0 -0
  193. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/handler.py +0 -0
  194. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/patch.py +0 -0
  195. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/__init__.py +0 -0
  196. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/callback.py +0 -0
  197. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  198. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  199. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/utils.py +0 -0
  200. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/langchain/__init__.py +0 -0
  201. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/langchain/callback.py +0 -0
  202. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/langchain/utils.py +0 -0
  203. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/__init__.py +0 -0
  204. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/agent/patched.py +0 -0
  205. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/handler.py +0 -0
  206. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/utils.py +0 -0
  207. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
  208. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/key_handler.py +0 -0
  209. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/__init__.py +0 -0
  210. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  211. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
  212. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  213. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/template.py +0 -0
  214. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  215. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
  216. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  217. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/template.py +0 -0
  218. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/utils.py +0 -0
  219. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/__init__.py +0 -0
  220. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
  221. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/schema.py +0 -0
  222. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/template.py +0 -0
  223. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/base_metric.py +0 -0
  224. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/__init__.py +0 -0
  225. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/bias.py +0 -0
  226. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/schema.py +0 -0
  227. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/template.py +0 -0
  228. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  229. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
  230. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/schema.py +0 -0
  231. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/template.py +0 -0
  232. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  233. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
  234. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/schema.py +0 -0
  235. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/template.py +0 -0
  236. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  237. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
  238. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  239. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
  240. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
  241. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  242. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/template.py +0 -0
  243. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
  244. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
  245. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  246. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/template.py +0 -0
  247. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/__init__.py +0 -0
  248. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/dag.py +0 -0
  249. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/graph.py +0 -0
  250. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/schema.py +0 -0
  251. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/templates.py +0 -0
  252. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/__init__.py +0 -0
  253. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
  254. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/schema.py +0 -0
  255. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/template.py +0 -0
  256. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/schema.py +0 -0
  257. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/template.py +0 -0
  258. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/utils.py +0 -0
  259. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/__init__.py +0 -0
  260. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/hallucination.py +0 -0
  261. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/schema.py +0 -0
  262. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/template.py +0 -0
  263. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/indicator.py +0 -0
  264. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/__init__.py +0 -0
  265. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
  266. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/schema.py +0 -0
  267. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/template.py +0 -0
  268. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
  269. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
  270. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/schema.py +0 -0
  271. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/template.py +0 -0
  272. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/__init__.py +0 -0
  273. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
  274. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
  275. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/schema.py +0 -0
  276. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/template.py +0 -0
  277. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
  278. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
  279. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  280. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/template.py +0 -0
  281. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/__init__.py +0 -0
  282. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/misuse.py +0 -0
  283. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/schema.py +0 -0
  284. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/template.py +0 -0
  285. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
  286. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
  287. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
  288. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  289. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  290. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
  291. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
  292. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  293. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  294. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
  295. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
  296. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  297. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  298. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
  299. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
  300. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  301. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  302. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
  303. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
  304. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
  305. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
  306. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
  307. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
  308. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
  309. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
  310. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
  311. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
  312. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
  313. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
  314. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
  315. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
  316. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
  317. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
  318. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  319. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
  320. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
  321. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
  322. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  323. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
  324. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
  325. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
  326. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
  327. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  328. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
  329. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
  330. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  331. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  332. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
  333. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/__init__.py +0 -0
  334. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/non_advice.py +0 -0
  335. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/schema.py +0 -0
  336. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/template.py +0 -0
  337. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  338. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
  339. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/schema.py +0 -0
  340. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/template.py +0 -0
  341. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
  342. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
  343. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  344. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/template.py +0 -0
  345. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/ragas.py +0 -0
  346. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/__init__.py +0 -0
  347. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
  348. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/schema.py +0 -0
  349. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/template.py +0 -0
  350. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/__init__.py +0 -0
  351. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/role_violation.py +0 -0
  352. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/schema.py +0 -0
  353. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/template.py +0 -0
  354. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/__init__.py +0 -0
  355. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/schema.py +0 -0
  356. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/summarization.py +0 -0
  357. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/template.py +0 -0
  358. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/__init__.py +0 -0
  359. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/schema.py +0 -0
  360. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/task_completion.py +0 -0
  361. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/template.py +0 -0
  362. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/tool_correctness/__init__.py +0 -0
  363. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
  364. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/__init__.py +0 -0
  365. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/schema.py +0 -0
  366. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/template.py +0 -0
  367. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/toxicity.py +0 -0
  368. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
  369. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  370. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/template.py +0 -0
  371. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
  372. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/utils.py +0 -0
  373. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/__init__.py +0 -0
  374. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/_summac_model.py +0 -0
  375. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/answer_relevancy_model.py +0 -0
  376. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/base_model.py +0 -0
  377. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/detoxify_model.py +0 -0
  378. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/__init__.py +0 -0
  379. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
  380. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
  381. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
  382. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
  383. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/hallucination_model.py +0 -0
  384. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/__init__.py +0 -0
  385. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/azure_model.py +0 -0
  386. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/openai_model.py +0 -0
  387. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/utils.py +0 -0
  388. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/__init__.py +0 -0
  389. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/gemini_model.py +0 -0
  390. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/ollama_model.py +0 -0
  391. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/openai_model.py +0 -0
  392. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/summac_model.py +0 -0
  393. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/unbias_model.py +0 -0
  394. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/utils.py +0 -0
  395. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/__init__.py +0 -0
  396. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/extractors.py +0 -0
  397. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/patch.py +0 -0
  398. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/utils.py +0 -0
  399. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai_agents/__init__.py +0 -0
  400. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai_agents/callback_handler.py +0 -0
  401. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai_agents/extractors.py +0 -0
  402. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/plugins/__init__.py +0 -0
  403. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/plugins/plugin.py +0 -0
  404. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/progress_context.py +0 -0
  405. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/__init__.py +0 -0
  406. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/api.py +0 -0
  407. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/prompt.py +0 -0
  408. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/utils.py +0 -0
  409. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/py.typed +0 -0
  410. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/red_teaming/README.md +0 -0
  411. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/scorer/__init__.py +0 -0
  412. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/scorer/scorer.py +0 -0
  413. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/__init__.py +0 -0
  414. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/conversation_simulator.py +0 -0
  415. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/schema.py +0 -0
  416. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/template.py +0 -0
  417. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/singleton.py +0 -0
  418. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/__init__.py +0 -0
  419. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/base_synthesizer.py +0 -0
  420. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/chunking/__init__.py +0 -0
  421. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/chunking/context_generator.py +0 -0
  422. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
  423. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/config.py +0 -0
  424. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/schema.py +0 -0
  425. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/__init__.py +0 -0
  426. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/template.py +0 -0
  427. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/template_extraction.py +0 -0
  428. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/template_prompt.py +0 -0
  429. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/types.py +0 -0
  430. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/utils.py +0 -0
  431. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/__init__.py +0 -0
  432. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/arena_test_case.py +0 -0
  433. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/conversational_test_case.py +0 -0
  434. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/llm_test_case.py +0 -0
  435. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/mcp.py +0 -0
  436. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/mllm_test_case.py +0 -0
  437. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/utils.py +0 -0
  438. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/__init__.py +0 -0
  439. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/api.py +0 -0
  440. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/cache.py +0 -0
  441. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/hooks.py +0 -0
  442. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/hyperparameters.py +0 -0
  443. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/test_run.py +0 -0
  444. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/__init__.py +0 -0
  445. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/api.py +0 -0
  446. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/context.py +0 -0
  447. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/__init__.py +0 -0
  448. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/span.py +0 -0
  449. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/trace.py +0 -0
  450. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/otel/__init__.py +0 -0
  451. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/patchers.py +0 -0
  452. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/perf_epoch_bridge.py +0 -0
  453. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/types.py +0 -0
  454. {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/utils.py +0 -0
{deepeval-3.4.1 → deepeval-3.4.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.4.1
+Version: 3.4.3
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
deepeval-3.4.3/deepeval/_version.py (new file)
@@ -0,0 +1 @@
+__version__: str = "3.4.3"
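Assuming the top-level package re-exports this attribute from its __init__ (an assumption; deepeval/__init__.py itself is unchanged in this diff), the installed version can be checked with:

    import deepeval

    # Expected to print "3.4.3" after upgrading, if __version__ is
    # re-exported from deepeval/_version.py as assumed above.
    print(deepeval.__version__)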
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/annotation.py
@@ -12,6 +12,7 @@ def send_annotation(
     expected_output: Optional[str] = None,
     expected_outcome: Optional[str] = None,
     explanation: Optional[str] = None,
+    user_id: Optional[str] = None,
     type: Optional[AnnotationType] = AnnotationType.THUMBS_RATING,
 ) -> str:
     api_annotation = APIAnnotation(
@@ -23,6 +24,7 @@ def send_annotation(
         expectedOutcome=expected_outcome,
         explanation=explanation,
         type=type,
+        userId=user_id,
     )
     api = Api()
     try:
@@ -47,6 +49,7 @@ async def a_send_annotation(
     expected_outcome: Optional[str] = None,
     explanation: Optional[str] = None,
     type: Optional[AnnotationType] = AnnotationType.THUMBS_RATING,
+    user_id: Optional[str] = None,
 ) -> str:
     api_annotation = APIAnnotation(
         rating=rating,
@@ -57,6 +60,7 @@
         expectedOutcome=expected_outcome,
         explanation=explanation,
         type=type,
+        userId=user_id,
     )
     api = Api()
     try:
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/api.py
@@ -17,6 +17,7 @@ class APIAnnotation(BaseModel):
     expected_outcome: Optional[str] = Field(None, alias="expectedOutcome")
     explanation: Optional[str] = Field(None)
     type: Optional[AnnotationType] = Field(None, alias="type")
+    user_id: Optional[str] = Field(None, alias="userId")

     @model_validator(mode="before")
     def validate_input(cls, data):
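Both annotation entry points and the API model gain an optional user_id, serialized to the backend as userId. A minimal usage sketch, assuming the import path from deepeval/annotation/__init__.py; only arguments visible in this diff are shown, and any other required identifiers in the full send_annotation signature are elided:

    from deepeval.annotation import send_annotation  # import path assumed

    send_annotation(
        rating=1,
        explanation="Helpful, well-grounded answer",
        user_id="user-1234",  # new in 3.4.3: attributes feedback to an end user
        # ...remaining arguments of the full signature elided here
    )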
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/arc.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.arc.mode import ARCMode
 from deepeval.benchmarks.arc.template import ARCTemplate
@@ -48,7 +51,9 @@ class ARC(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("ARC", self.n_problems):
@@ -90,7 +95,9 @@ class ARC(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -129,9 +136,7 @@ class ARC(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(mode)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "ai2_arc", mode.value, trust_remote_code=True
-                )
+                dataset = load_dataset("ai2_arc", mode.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
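The same two changes repeat across the remaining benchmark files below: evaluate() now returns a DeepEvalBaseBenchmarkResult instead of a bare accuracy value, and trust_remote_code=True is dropped from every load_dataset call, presumably for compatibility with newer releases of the Hugging Face datasets library, which deprecated script-based dataset loading.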
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/base_benchmark.py
@@ -1,10 +1,15 @@
 from deepeval.models.base_model import DeepEvalBaseLLM
 from abc import ABC, abstractmethod
 from typing import List, TypeVar, Generic, List, Optional
+from pydantic import BaseModel

 from deepeval.dataset import Golden


+class DeepEvalBaseBenchmarkResult(BaseModel):
+    overall_accuracy: float
+
+
 T = TypeVar("T")


@@ -21,5 +26,7 @@ class DeepEvalBaseBenchmark(ABC, Generic[T]):
         raise NotImplementedError

     @abstractmethod
-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         raise NotImplementedError
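For callers, the practical effect is that benchmark scores are now read off a typed pydantic model rather than a raw number. A minimal sketch, assuming the ARC import path and constructor; the model object is a placeholder:

    from deepeval.benchmarks import ARC  # import path assumed

    benchmark = ARC()  # constructor arguments elided; unchanged in this release
    result = benchmark.evaluate(model=my_llm)  # my_llm: any DeepEvalBaseLLM
    # .overall_accuracy holds the float that evaluate() previously returned
    print(result.overall_accuracy)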
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/bbq.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.bbq.task import BBQTask
 from deepeval.benchmarks.bbq.template import BBQTemplate
@@ -39,7 +42,9 @@ class BBQ(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("BBQ", len(self.tasks)):
@@ -115,7 +120,9 @@ class BBQ(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -164,9 +171,7 @@ class BBQ(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(task)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "heegyu/bbq", task.value, trust_remote_code=True
-                )
+                dataset = load_dataset("heegyu/bbq", task.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.big_bench_hard.task import BigBenchHardTask
 from deepeval.benchmarks.big_bench_hard.template import BigBenchHardTemplate
@@ -81,7 +84,7 @@ class BigBenchHard(DeepEvalBaseBenchmark):
         *args,
         batch_size: Optional[int] = None,
         **kwargs,
-    ) -> Dict:
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("Big Bench Hard", len(self.tasks)):
@@ -189,7 +192,9 @@ class BigBenchHard(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(
         self, model: DeepEvalBaseLLM, task: BigBenchHardTask, golden: Golden
@@ -279,9 +284,7 @@ class BigBenchHard(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(task)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "lukaemon/bbh", task.value, trust_remote_code=True
-                )
+                dataset = load_dataset("lukaemon/bbh", task.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/bool_q.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.bool_q.template import BoolQTemplate
 from deepeval.benchmarks.schema import AffirmationSchema
@@ -37,7 +40,9 @@ class BoolQ(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("BoolQ", self.n_problems):
@@ -77,7 +82,9 @@ class BoolQ(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -113,7 +120,7 @@ class BoolQ(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("boolq", "default", trust_remote_code=True)
+            dataset = load_dataset("boolq", "default")
             self.dataset = dataset

         # Construct test set
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/drop.py
@@ -3,7 +3,10 @@ from tqdm import tqdm
 from typing import Union

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.drop.task import DROPTask
 from deepeval.benchmarks.drop.template import DROPTemplate
@@ -49,7 +52,7 @@ class DROP(DeepEvalBaseBenchmark):
         *args,
         batch_size: int | None = None,
         **kwargs,
-    ) -> Dict:
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("DROP", len(self.tasks)):
@@ -155,7 +158,9 @@ class DROP(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -263,7 +268,7 @@ class DROP(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("ucinlp/drop", trust_remote_code=True)
+            dataset = load_dataset("ucinlp/drop")
             self.dataset = dataset

         # construct example dataset
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py
@@ -4,7 +4,10 @@ from tqdm import tqdm
 from deepeval.dataset import Golden
 from deepeval.test_case import LLMTestCase
 from deepeval.metrics import BiasMetric
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.equity_med_qa.task import EquityMedQATask
 from deepeval.benchmarks.equity_med_qa.template import EquityMedQATemplate
@@ -34,7 +37,9 @@ class EquityMedQA(DeepEvalBaseBenchmark):
             initialize_model(model)
         )

-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("EquityMedQA", len(self.tasks)):
@@ -97,7 +102,9 @@ class EquityMedQA(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         prediction = model.generate(golden.input)
@@ -143,9 +150,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(task)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "katielink/EquityMedQA", task.value, trust_remote_code=True
-                )
+                dataset = load_dataset("katielink/EquityMedQA", task.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
{deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/gsm8k.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict, Union
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.gsm8k.template import GSM8KTemplate
 from deepeval.benchmarks.schema import NumberSchema
@@ -39,7 +42,9 @@ class GSM8K(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("GSM8K", len(self.tasks)):
@@ -82,7 +87,9 @@ class GSM8K(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -150,7 +157,7 @@ class GSM8K(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("gsm8k", "main", trust_remote_code=True)
+            dataset = load_dataset("gsm8k", "main")
             self.dataset = dataset

         # Construct example dataset for n_shot inference
@@ -2,7 +2,10 @@ from typing import List, Dict, Optional
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.hellaswag.task import HellaSwagTask
 from deepeval.benchmarks.hellaswag.template import HellaSwagTemplate
@@ -50,7 +53,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
         *args,
         batch_size: int | None = None,
         **kwargs,
-    ) -> Dict:
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
 
         with capture_benchmark_run("HellaSwag", len(self.tasks)):
@@ -160,7 +163,9 @@ class HellaSwag(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy
 
-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )
 
     def predict(
         self, model: DeepEvalBaseLLM, task: HellaSwagTask, golden: Golden
@@ -253,7 +258,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("Rowan/hellaswag", trust_remote_code=True)
+            dataset = load_dataset("Rowan/hellaswag")
             self.dataset = dataset
 
         # If dataset has not been previously loaded, construct
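
HellaSwag only changes its return annotation here; the batch_size keyword is untouched, so batched inference works as before. A hedged usage sketch, where the import path and the n_shots argument follow deepeval's documented benchmark API and my_llm is again a placeholder:

    from deepeval.benchmarks import HellaSwag  # assumed export path

    benchmark = HellaSwag(n_shots=5)
    result = benchmark.evaluate(model=my_llm, batch_size=16)  # my_llm: placeholder
    print(result.overall_accuracy)
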
@@ -1,7 +1,10 @@
 from typing import List, Optional, Dict
 
 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.human_eval.task import HumanEvalTask
 from deepeval.benchmarks.human_eval.template import HumanEvalTemplate
@@ -93,7 +96,7 @@ class HumanEval(DeepEvalBaseBenchmark):
 
     def evaluate(
         self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
-    ) -> Dict:
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
 
         with capture_benchmark_run("HumanEval", len(self.tasks)):
@@ -157,7 +160,9 @@ class HumanEval(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy
 
-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )
 
     def predict(
         self,
@@ -201,7 +206,7 @@ class HumanEval(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("openai_humaneval", trust_remote_code=True)
+            dataset = load_dataset("openai_humaneval")
             self.dataset = dataset
 
         # Filter tasks
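
HumanEval keeps its keyword-only pass@k parameter; as elsewhere, only the return type moves to the shared result object. A sketch under the same assumptions as the examples above:

    from deepeval.benchmarks import HumanEval  # assumed export path

    benchmark = HumanEval()
    result = benchmark.evaluate(model=my_llm, k=10)  # pass@10; my_llm: placeholder
    print(result.overall_accuracy)
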
@@ -1,15 +1,29 @@
+from pydantic.config import ConfigDict
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from typing import List, Optional, Dict, Any, Tuple
 from tqdm import tqdm
 import re
 import json
 
 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.schema import StringSchema
 from deepeval.telemetry import capture_benchmark_run
 
 
+class IFEvalResult(DeepEvalBaseBenchmarkResult):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    instruction_breakdown: dict[str, Any]
+    predictions: "pd.DataFrame"
+
+
 class IFEvalInstructionVerifier:
     """
     Verifies instruction compliance for IFEval benchmark.
@@ -394,16 +408,17 @@ class IFEval(DeepEvalBaseBenchmark):
         **kwargs,
     ):
         from deepeval.scorer import Scorer
+        import pandas as pd
 
         super().__init__(**kwargs)
         self.scorer = Scorer()
         self.n_problems = n_problems
         self.verbose_mode = verbose_mode
-        self.predictions = None
-        self.overall_score = None
+        self.predictions: Optional[pd.DataFrame] = None
+        self.overall_score: Optional[float] = None
         self.instruction_breakdown = None
 
-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> IFEvalResult:
         import pandas as pd
 
         with capture_benchmark_run("IFEval", self.n_problems or "all"):
@@ -459,8 +474,7 @@ class IFEval(DeepEvalBaseBenchmark):
                 print(
                     f"Instruction '{instruction_id}' Accuracy: {accuracy:.4f}"
                 )
-
-            self.predictions = pd.DataFrame(
+            predictions: pd.DataFrame = pd.DataFrame(
                 predictions_row,
                 columns=[
                     "Input",
@@ -468,14 +482,15 @@ class IFEval(DeepEvalBaseBenchmark):
                     "All_Instructions_Correct",
                 ],
            )
+            self.predictions = predictions
            self.overall_score = overall_accuracy
            self.instruction_breakdown = instruction_accuracies
 
-            return {
-                "overall_accuracy": overall_accuracy,
-                "instruction_breakdown": instruction_accuracies,
-                "predictions": self.predictions,
-            }
+            return IFEvalResult(
+                overall_accuracy=overall_accuracy,
+                instruction_breakdown=instruction_accuracies,
+                predictions=predictions,
+            )
 
     def predict(
         self, model: DeepEvalBaseLLM, golden: Golden
@@ -531,7 +546,7 @@ class IFEval(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("google/IFEval", trust_remote_code=True)
+            dataset = load_dataset("google/IFEval")
             self.dataset = dataset
 
         goldens: List[Golden] = []
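
IFEval is the one benchmark whose old dict return carried extra keys, so it gains a dedicated IFEvalResult subclass rather than reusing the base class. Because the predictions field holds a pandas DataFrame, which Pydantic cannot validate natively, the model opts into arbitrary_types_allowed; the string annotation "pd.DataFrame" is a forward reference, since pandas is imported lazily inside __init__. A self-contained sketch of the same pattern, with the classes renamed and the values invented for illustration:

    from typing import Any

    import pandas as pd
    from pydantic import BaseModel, ConfigDict

    class BenchmarkResult(BaseModel):  # stand-in for DeepEvalBaseBenchmarkResult
        overall_accuracy: float

    class IFEvalStyleResult(BenchmarkResult):
        # DataFrame is not a Pydantic-native type, hence the config override
        model_config = ConfigDict(arbitrary_types_allowed=True)
        instruction_breakdown: dict[str, Any]
        predictions: pd.DataFrame

    result = IFEvalStyleResult(
        overall_accuracy=0.87,  # invented numbers, for illustration only
        instruction_breakdown={"punctuation:no_comma": 0.92},
        predictions=pd.DataFrame({"Input": ["..."], "Correct": [True]}),
    )
    print(result.instruction_breakdown)

Without arbitrary_types_allowed=True, declaring the predictions field would raise a PydanticSchemaGenerationError at class-definition time.
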
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm
 
 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.lambada.template import LAMBADATemplate
 from deepeval.benchmarks.schema import StringSchema
@@ -37,7 +40,9 @@ class LAMBADA(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions
 
-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> Dict:
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
 
         with capture_benchmark_run("LAMBADA", self.n_problems):
@@ -77,7 +82,9 @@ class LAMBADA(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy
 
-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )
 
     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -113,9 +120,7 @@ class LAMBADA(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset(
-                "EleutherAI/lambada_openai", "default", trust_remote_code=True
-            )
+            dataset = load_dataset("EleutherAI/lambada_openai", "default")
             self.dataset = dataset
 
         # Construct test set
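
One backward-compatibility detail: every benchmark still assigns self.overall_score before returning, so code that inspected the attribute after evaluate() rather than the return value keeps working. Sketch, with the import path and n_problems argument assumed and my_llm a placeholder:

    from deepeval.benchmarks import LAMBADA  # assumed export path

    benchmark = LAMBADA(n_problems=50)
    benchmark.evaluate(model=my_llm)  # the return value may be ignored...
    print(benchmark.overall_score)    # ...the attribute is still populated
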
@@ -4,7 +4,10 @@ import requests
 import json
 
 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import DeepEvalBaseBenchmark
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.logi_qa.task import LogiQATask
 from deepeval.benchmarks.logi_qa.template import LogiQATemplate
@@ -51,7 +54,7 @@ class LogiQA(DeepEvalBaseBenchmark):
         *args,
         batch_size: int | None = None,
         **kwargs,
-    ) -> Dict:
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd
 
         with capture_benchmark_run("LogiQA", len(self.tasks)):
@@ -157,7 +160,9 @@ class LogiQA(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy
 
-            return overall_accuracy
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )
 
     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
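
Taken together, the 3.4.3 convention is: evaluate() returns DeepEvalBaseBenchmarkResult when overall accuracy is the only output, and a subclass (as with IFEvalResult) when there is more to report. A hedged template for a custom benchmark following the same shape; MyBenchmark and its scoring helper are hypothetical, and abstract members of the base class are omitted:

    from deepeval.benchmarks.base_benchmark import (
        DeepEvalBaseBenchmark,
        DeepEvalBaseBenchmarkResult,
    )
    from deepeval.models import DeepEvalBaseLLM

    class MyBenchmark(DeepEvalBaseBenchmark):
        def evaluate(
            self, model: DeepEvalBaseLLM, *args, **kwargs
        ) -> DeepEvalBaseBenchmarkResult:
            overall_accuracy = self._score_all_goldens(model)  # hypothetical helper
            self.overall_score = overall_accuracy  # keep the legacy attribute in sync
            return DeepEvalBaseBenchmarkResult(
                overall_accuracy=overall_accuracy
            )
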