deepeval 3.7.1__tar.gz → 3.7.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (514)
  1. {deepeval-3.7.1 → deepeval-3.7.3}/PKG-INFO +2 -1
  2. {deepeval-3.7.1 → deepeval-3.7.3}/README.md +1 -0
  3. deepeval-3.7.3/deepeval/_version.py +1 -0
  4. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/human_eval/human_eval.py +2 -1
  5. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/dataset.py +35 -11
  6. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/utils.py +2 -0
  7. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/compare.py +6 -2
  8. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/utils.py +3 -0
  9. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/__init__.py +2 -0
  10. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/mlllms/__init__.py +1 -0
  11. deepeval-3.7.3/deepeval/models/mlllms/azure_model.py +334 -0
  12. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/config.py +9 -0
  13. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/schema.py +23 -0
  14. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/synthesizer.py +1137 -2
  15. deepeval-3.7.3/deepeval/synthesizer/templates/__init__.py +12 -0
  16. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/templates/template.py +554 -1
  17. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/templates/template_extraction.py +32 -0
  18. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/templates/template_prompt.py +262 -0
  19. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/context.py +3 -0
  20. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/tracing.py +22 -11
  21. {deepeval-3.7.1 → deepeval-3.7.3}/pyproject.toml +7 -7
  22. deepeval-3.7.1/deepeval/_version.py +0 -1
  23. deepeval-3.7.1/deepeval/synthesizer/templates/__init__.py +0 -3
  24. {deepeval-3.7.1 → deepeval-3.7.3}/LICENSE.md +0 -0
  25. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/__init__.py +0 -0
  26. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/annotation/__init__.py +0 -0
  27. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/annotation/annotation.py +0 -0
  28. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/annotation/api.py +0 -0
  29. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/anthropic/__init__.py +0 -0
  30. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/anthropic/extractors.py +0 -0
  31. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/anthropic/patch.py +0 -0
  32. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/anthropic/utils.py +0 -0
  33. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/__init__.py +0 -0
  34. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/arc/__init__.py +0 -0
  35. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/arc/arc.py +0 -0
  36. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/arc/mode.py +0 -0
  37. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/arc/template.py +0 -0
  38. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/base_benchmark.py +0 -0
  39. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bbq/__init__.py +0 -0
  40. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bbq/bbq.py +0 -0
  41. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bbq/task.py +0 -0
  42. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bbq/template.py +0 -0
  43. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  44. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
  45. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  46. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  47. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  48. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  49. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  50. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  51. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  52. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  53. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  54. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  55. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  56. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  57. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  58. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  59. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  60. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  61. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  62. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  63. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  64. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  65. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  66. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  67. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  68. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  69. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  70. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  71. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  72. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  73. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  74. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  75. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  76. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  77. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  78. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  79. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  80. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  81. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  82. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  83. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  84. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  85. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  86. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  87. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  88. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  89. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  90. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  91. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  92. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  93. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  94. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  95. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  96. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  97. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  98. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  99. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  100. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  101. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  102. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  103. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  104. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
  105. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/bool_q/template.py +0 -0
  106. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/drop/__init__.py +0 -0
  107. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/drop/drop.py +0 -0
  108. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/drop/task.py +0 -0
  109. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/drop/template.py +0 -0
  110. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  111. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
  112. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  113. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  114. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  115. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
  116. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/gsm8k/template.py +0 -0
  117. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  118. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
  119. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/hellaswag/task.py +0 -0
  120. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/hellaswag/template.py +0 -0
  121. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  122. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/human_eval/task.py +0 -0
  123. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/human_eval/template.py +0 -0
  124. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  125. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
  126. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/ifeval/template.py +0 -0
  127. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/lambada/__init__.py +0 -0
  128. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/lambada/lambada.py +0 -0
  129. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/lambada/template.py +0 -0
  130. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  131. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
  132. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/logi_qa/task.py +0 -0
  133. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/logi_qa/template.py +0 -0
  134. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  135. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
  136. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/math_qa/task.py +0 -0
  137. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/math_qa/template.py +0 -0
  138. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  139. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
  140. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/mmlu/task.py +0 -0
  141. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/mmlu/template.py +0 -0
  142. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/modes/__init__.py +0 -0
  143. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/results.py +0 -0
  144. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/schema.py +0 -0
  145. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/squad/__init__.py +0 -0
  146. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/squad/squad.py +0 -0
  147. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/squad/task.py +0 -0
  148. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/squad/template.py +0 -0
  149. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/tasks/__init__.py +0 -0
  150. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  151. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  152. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  153. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  154. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
  155. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/utils.py +0 -0
  156. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  157. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/winogrande/template.py +0 -0
  158. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
  159. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/__init__.py +0 -0
  160. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/dotenv_handler.py +0 -0
  161. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/main.py +0 -0
  162. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/server.py +0 -0
  163. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/test.py +0 -0
  164. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/types.py +0 -0
  165. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/cli/utils.py +0 -0
  166. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/confident/__init__.py +0 -0
  167. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/confident/api.py +0 -0
  168. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/confident/types.py +0 -0
  169. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/config/__init__.py +0 -0
  170. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/config/logging.py +0 -0
  171. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/config/settings.py +0 -0
  172. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/config/settings_manager.py +0 -0
  173. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/config/utils.py +0 -0
  174. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/constants.py +0 -0
  175. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/contextvars.py +0 -0
  176. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/__init__.py +0 -0
  177. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/api.py +0 -0
  178. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/golden.py +0 -0
  179. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/test_run_tracer.py +0 -0
  180. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/dataset/types.py +0 -0
  181. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/errors.py +0 -0
  182. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/__init__.py +0 -0
  183. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/api.py +0 -0
  184. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/configs.py +0 -0
  185. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/evaluate.py +0 -0
  186. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/execute.py +0 -0
  187. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/types.py +0 -0
  188. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/evaluate/utils.py +0 -0
  189. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/__init__.py +0 -0
  190. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/crewai/__init__.py +0 -0
  191. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/crewai/handler.py +0 -0
  192. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/crewai/subs.py +0 -0
  193. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/crewai/tool.py +0 -0
  194. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/crewai/wrapper.py +0 -0
  195. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/hugging_face/__init__.py +0 -0
  196. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/hugging_face/callback.py +0 -0
  197. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  198. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  199. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/hugging_face/utils.py +0 -0
  200. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/langchain/__init__.py +0 -0
  201. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/langchain/callback.py +0 -0
  202. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/langchain/patch.py +0 -0
  203. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/langchain/utils.py +0 -0
  204. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/llama_index/__init__.py +0 -0
  205. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/llama_index/handler.py +0 -0
  206. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/llama_index/utils.py +0 -0
  207. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
  208. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/pydantic_ai/agent.py +0 -0
  209. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/pydantic_ai/instrumentator.py +0 -0
  210. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/pydantic_ai/otel.py +0 -0
  211. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  212. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/key_handler.py +0 -0
  213. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/__init__.py +0 -0
  214. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  215. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
  216. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  217. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/answer_relevancy/template.py +0 -0
  218. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/api.py +0 -0
  219. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  220. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
  221. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  222. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/arena_g_eval/template.py +0 -0
  223. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/arena_g_eval/utils.py +0 -0
  224. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/argument_correctness/__init__.py +0 -0
  225. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
  226. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/argument_correctness/schema.py +0 -0
  227. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/argument_correctness/template.py +0 -0
  228. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/base_metric.py +0 -0
  229. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/bias/__init__.py +0 -0
  230. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/bias/bias.py +0 -0
  231. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/bias/schema.py +0 -0
  232. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/bias/template.py +0 -0
  233. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  234. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
  235. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_precision/schema.py +0 -0
  236. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_precision/template.py +0 -0
  237. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  238. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
  239. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_recall/schema.py +0 -0
  240. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_recall/template.py +0 -0
  241. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  242. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
  243. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  244. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/contextual_relevancy/template.py +0 -0
  245. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
  246. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
  247. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  248. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversation_completeness/template.py +0 -0
  249. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_dag/__init__.py +0 -0
  250. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
  251. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_dag/nodes.py +0 -0
  252. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_dag/templates.py +0 -0
  253. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
  254. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
  255. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  256. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/conversational_g_eval/template.py +0 -0
  257. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/__init__.py +0 -0
  258. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/dag.py +0 -0
  259. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/graph.py +0 -0
  260. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/nodes.py +0 -0
  261. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/schema.py +0 -0
  262. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/templates.py +0 -0
  263. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/dag/utils.py +0 -0
  264. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/exact_match/__init__.py +0 -0
  265. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/exact_match/exact_match.py +0 -0
  266. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/faithfulness/__init__.py +0 -0
  267. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
  268. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/faithfulness/schema.py +0 -0
  269. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/faithfulness/template.py +0 -0
  270. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/g_eval/__init__.py +0 -0
  271. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/g_eval/g_eval.py +0 -0
  272. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/g_eval/schema.py +0 -0
  273. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/g_eval/template.py +0 -0
  274. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/g_eval/utils.py +0 -0
  275. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/goal_accuracy/__init__.py +0 -0
  276. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/goal_accuracy/goal_accuracy.py +0 -0
  277. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/goal_accuracy/schema.py +0 -0
  278. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/goal_accuracy/template.py +0 -0
  279. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/hallucination/__init__.py +0 -0
  280. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/hallucination/hallucination.py +0 -0
  281. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/hallucination/schema.py +0 -0
  282. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/hallucination/template.py +0 -0
  283. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/indicator.py +0 -0
  284. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/json_correctness/__init__.py +0 -0
  285. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
  286. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/json_correctness/schema.py +0 -0
  287. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/json_correctness/template.py +0 -0
  288. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
  289. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
  290. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/knowledge_retention/schema.py +0 -0
  291. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/knowledge_retention/template.py +0 -0
  292. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp/__init__.py +0 -0
  293. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
  294. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
  295. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp/schema.py +0 -0
  296. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp/template.py +0 -0
  297. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
  298. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
  299. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  300. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/mcp_use_metric/template.py +0 -0
  301. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/misuse/__init__.py +0 -0
  302. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/misuse/misuse.py +0 -0
  303. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/misuse/schema.py +0 -0
  304. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/misuse/template.py +0 -0
  305. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
  306. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
  307. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
  308. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  309. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  310. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
  311. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
  312. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  313. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  314. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
  315. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
  316. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  317. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  318. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
  319. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
  320. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  321. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  322. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
  323. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
  324. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
  325. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
  326. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
  327. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
  328. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
  329. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
  330. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
  331. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
  332. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
  333. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
  334. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
  335. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
  336. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
  337. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
  338. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  339. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
  340. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
  341. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
  342. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  343. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
  344. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
  345. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
  346. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
  347. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  348. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
  349. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
  350. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  351. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  352. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
  353. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/non_advice/__init__.py +0 -0
  354. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/non_advice/non_advice.py +0 -0
  355. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/non_advice/schema.py +0 -0
  356. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/non_advice/template.py +0 -0
  357. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/pattern_match/__init__.py +0 -0
  358. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/pattern_match/pattern_match.py +0 -0
  359. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  360. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
  361. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/pii_leakage/schema.py +0 -0
  362. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/pii_leakage/template.py +0 -0
  363. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_adherence/__init__.py +0 -0
  364. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_adherence/plan_adherence.py +0 -0
  365. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_adherence/schema.py +0 -0
  366. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_adherence/template.py +0 -0
  367. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_quality/__init__.py +0 -0
  368. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_quality/plan_quality.py +0 -0
  369. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_quality/schema.py +0 -0
  370. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/plan_quality/template.py +0 -0
  371. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
  372. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
  373. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  374. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/prompt_alignment/template.py +0 -0
  375. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/ragas.py +0 -0
  376. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_adherence/__init__.py +0 -0
  377. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
  378. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_adherence/schema.py +0 -0
  379. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_adherence/template.py +0 -0
  380. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_violation/__init__.py +0 -0
  381. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_violation/role_violation.py +0 -0
  382. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_violation/schema.py +0 -0
  383. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/role_violation/template.py +0 -0
  384. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/step_efficiency/__init__.py +0 -0
  385. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/step_efficiency/schema.py +0 -0
  386. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/step_efficiency/step_efficiency.py +0 -0
  387. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/step_efficiency/template.py +0 -0
  388. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/summarization/__init__.py +0 -0
  389. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/summarization/schema.py +0 -0
  390. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/summarization/summarization.py +0 -0
  391. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/summarization/template.py +0 -0
  392. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/task_completion/__init__.py +0 -0
  393. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/task_completion/schema.py +0 -0
  394. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/task_completion/task_completion.py +0 -0
  395. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/task_completion/template.py +0 -0
  396. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_correctness/__init__.py +0 -0
  397. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_correctness/schema.py +0 -0
  398. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_correctness/template.py +0 -0
  399. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
  400. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_use/__init__.py +0 -0
  401. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_use/schema.py +0 -0
  402. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_use/template.py +0 -0
  403. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/tool_use/tool_use.py +0 -0
  404. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/topic_adherence/__init__.py +0 -0
  405. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/topic_adherence/schema.py +0 -0
  406. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/topic_adherence/template.py +0 -0
  407. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/topic_adherence/topic_adherence.py +0 -0
  408. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/toxicity/__init__.py +0 -0
  409. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/toxicity/schema.py +0 -0
  410. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/toxicity/template.py +0 -0
  411. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/toxicity/toxicity.py +0 -0
  412. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
  413. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  414. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/turn_relevancy/template.py +0 -0
  415. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
  416. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/model_integrations/__init__.py +0 -0
  417. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/model_integrations/types.py +0 -0
  418. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/model_integrations/utils.py +0 -0
  419. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/_summac_model.py +0 -0
  420. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/answer_relevancy_model.py +0 -0
  421. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/base_model.py +0 -0
  422. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/detoxify_model.py +0 -0
  423. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/embedding_models/__init__.py +0 -0
  424. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
  425. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
  426. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
  427. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
  428. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/hallucination_model.py +0 -0
  429. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/__init__.py +0 -0
  430. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
  431. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/anthropic_model.py +0 -0
  432. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/azure_model.py +0 -0
  433. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/deepseek_model.py +0 -0
  434. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/gemini_model.py +0 -0
  435. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/grok_model.py +0 -0
  436. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/kimi_model.py +0 -0
  437. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/litellm_model.py +0 -0
  438. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/local_model.py +0 -0
  439. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/ollama_model.py +0 -0
  440. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/openai_model.py +0 -0
  441. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/llms/utils.py +0 -0
  442. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/mlllms/gemini_model.py +0 -0
  443. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/mlllms/ollama_model.py +0 -0
  444. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/mlllms/openai_model.py +0 -0
  445. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/retry_policy.py +0 -0
  446. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/summac_model.py +0 -0
  447. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/unbias_model.py +0 -0
  448. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/models/utils.py +0 -0
  449. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai/__init__.py +0 -0
  450. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai/extractors.py +0 -0
  451. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai/patch.py +0 -0
  452. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai/utils.py +0 -0
  453. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai_agents/__init__.py +0 -0
  454. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai_agents/agent.py +0 -0
  455. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai_agents/callback_handler.py +0 -0
  456. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai_agents/extractors.py +0 -0
  457. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai_agents/patch.py +0 -0
  458. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/openai_agents/runner.py +0 -0
  459. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/plugins/__init__.py +0 -0
  460. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/plugins/plugin.py +0 -0
  461. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/progress_context.py +0 -0
  462. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/prompt/__init__.py +0 -0
  463. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/prompt/api.py +0 -0
  464. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/prompt/prompt.py +0 -0
  465. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/prompt/utils.py +0 -0
  466. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/py.typed +0 -0
  467. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/red_teaming/README.md +0 -0
  468. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/scorer/__init__.py +0 -0
  469. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/scorer/scorer.py +0 -0
  470. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/simulator/__init__.py +0 -0
  471. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/simulator/conversation_simulator.py +0 -0
  472. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/simulator/schema.py +0 -0
  473. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/simulator/template.py +0 -0
  474. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/singleton.py +0 -0
  475. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/__init__.py +0 -0
  476. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/base_synthesizer.py +0 -0
  477. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/chunking/__init__.py +0 -0
  478. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/chunking/context_generator.py +0 -0
  479. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
  480. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/types.py +0 -0
  481. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/synthesizer/utils.py +0 -0
  482. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/telemetry.py +0 -0
  483. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/__init__.py +0 -0
  484. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/api.py +0 -0
  485. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/arena_test_case.py +0 -0
  486. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/conversational_test_case.py +0 -0
  487. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/llm_test_case.py +0 -0
  488. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/mcp.py +0 -0
  489. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/mllm_test_case.py +0 -0
  490. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_case/utils.py +0 -0
  491. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_run/__init__.py +0 -0
  492. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_run/api.py +0 -0
  493. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_run/cache.py +0 -0
  494. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_run/hooks.py +0 -0
  495. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_run/hyperparameters.py +0 -0
  496. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/test_run/test_run.py +0 -0
  497. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/__init__.py +0 -0
  498. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/api.py +0 -0
  499. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/offline_evals/__init__.py +0 -0
  500. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/offline_evals/api.py +0 -0
  501. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/offline_evals/span.py +0 -0
  502. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/offline_evals/thread.py +0 -0
  503. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/offline_evals/trace.py +0 -0
  504. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/otel/__init__.py +0 -0
  505. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/otel/exporter.py +0 -0
  506. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/otel/test_exporter.py +0 -0
  507. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/otel/utils.py +0 -0
  508. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/patchers.py +0 -0
  509. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/perf_epoch_bridge.py +0 -0
  510. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/trace_context.py +0 -0
  511. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/trace_test_manager.py +0 -0
  512. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/types.py +0 -0
  513. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/tracing/utils.py +0 -0
  514. {deepeval-3.7.1 → deepeval-3.7.3}/deepeval/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.7.1
3
+ Version: 3.7.3
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -439,6 +439,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
439
439
  ```bash
440
440
  cp .env.example .env.local
441
441
  # then edit .env.local (ignored by git)
442
+ ```
442
443
 
443
444
  <br />
444
445
 
@@ -389,6 +389,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
389
389
  ```bash
390
390
  cp .env.example .env.local
391
391
  # then edit .env.local (ignored by git)
392
+ ```
392
393
 
393
394
  <br />
394
395
 
@@ -0,0 +1 @@
1
+ __version__: str = "3.7.3"
@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
92
92
  self.predictions: Optional[pd.DataFrame] = None
93
93
  self.task_scores: Optional[pd.DataFrame] = None
94
94
  self.overall_score: Optional[float] = None
95
- self.verbose_mode: bool = (False,)
95
+ self.verbose_mode: bool = verbose_mode
96
96
 
97
97
  def evaluate(
98
98
  self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
123
123
  task.value,
124
124
  golden.input,
125
125
  prediction,
126
+ task_correct,
126
127
  golden.expected_output,
127
128
  score,
128
129
  )
@@ -189,17 +189,35 @@ class EvaluationDataset:
189
189
  test_case._dataset_alias = self._alias
190
190
  test_case._dataset_id = self._id
191
191
  if isinstance(test_case, LLMTestCase):
192
+ if self._conversational_goldens or self._conversational_test_cases:
193
+ raise TypeError(
194
+ "You cannot add 'LLMTestCase' to a multi-turn dataset."
195
+ )
192
196
  test_case._dataset_rank = len(self._llm_test_cases)
193
197
  self._llm_test_cases.append(test_case)
194
198
  elif isinstance(test_case, ConversationalTestCase):
199
+ if self._goldens or self._llm_test_cases:
200
+ raise TypeError(
201
+ "You cannot add 'ConversationalTestCase' to a single-turn dataset."
202
+ )
203
+ self._multi_turn = True
195
204
  test_case._dataset_rank = len(self._conversational_test_cases)
196
205
  self._conversational_test_cases.append(test_case)
197
206
 
198
207
  def add_golden(self, golden: Union[Golden, ConversationalGolden]):
199
- if self._multi_turn:
200
- self._add_conversational_golden(golden)
201
- else:
208
+ if isinstance(golden, Golden):
209
+ if self._conversational_goldens or self._conversational_test_cases:
210
+ raise TypeError(
211
+ "You cannot add 'Golden' to a multi-turn dataset."
212
+ )
202
213
  self._add_golden(golden)
214
+ else:
215
+ if self._goldens or self._llm_test_cases:
216
+ raise TypeError(
217
+ "You cannot add 'ConversationalGolden' to a single-turn dataset."
218
+ )
219
+ self._multi_turn = True
220
+ self._add_conversational_golden(golden)
203
221
 
204
222
  def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
205
223
  if isinstance(golden, Golden):
@@ -224,16 +242,16 @@ class EvaluationDataset:
224
242
  file_path: str,
225
243
  input_col_name: str,
226
244
  actual_output_col_name: str,
227
- expected_output_col_name: Optional[str] = None,
228
- context_col_name: Optional[str] = None,
245
+ expected_output_col_name: Optional[str] = "expected_output",
246
+ context_col_name: Optional[str] = "context",
229
247
  context_col_delimiter: str = ";",
230
- retrieval_context_col_name: Optional[str] = None,
248
+ retrieval_context_col_name: Optional[str] = "retrieval_context",
231
249
  retrieval_context_col_delimiter: str = ";",
232
- tools_called_col_name: Optional[str] = None,
250
+ tools_called_col_name: Optional[str] = "tools_called",
233
251
  tools_called_col_delimiter: str = ";",
234
- expected_tools_col_name: Optional[str] = None,
252
+ expected_tools_col_name: Optional[str] = "expected_tools",
235
253
  expected_tools_col_delimiter: str = ";",
236
- additional_metadata_col_name: Optional[str] = None,
254
+ additional_metadata_col_name: Optional[str] = "additional_metadata",
237
255
  ):
238
256
  """
239
257
  Load test cases from a CSV file.
@@ -379,6 +397,7 @@ class EvaluationDataset:
379
397
  retrieval_context_key_name: Optional[str] = None,
380
398
  tools_called_key_name: Optional[str] = None,
381
399
  expected_tools_key_name: Optional[str] = None,
400
+ addtional_metadata_key_name: Optional[str] = None,
382
401
  encoding_type: str = "utf-8",
383
402
  ):
384
403
  """
@@ -431,6 +450,7 @@ class EvaluationDataset:
431
450
  tools_called = [ToolCall(**tool) for tool in tools_called_data]
432
451
  expected_tools_data = json_obj.get(expected_tools_key_name, [])
433
452
  expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
453
+ # additional_metadata = json_obj.get(addtional_metadata_key_name)
434
454
 
435
455
  self.add_test_case(
436
456
  LLMTestCase(
@@ -441,6 +461,7 @@ class EvaluationDataset:
441
461
  retrieval_context=retrieval_context,
442
462
  tools_called=tools_called,
443
463
  expected_tools=expected_tools,
464
+ # additional_metadata=additional_metadata,
444
465
  )
445
466
  )
446
467
 
@@ -460,8 +481,8 @@ class EvaluationDataset:
460
481
  expected_tools_col_delimiter: str = ";",
461
482
  comments_key_name: str = "comments",
462
483
  name_key_name: str = "name",
463
- source_file_col_name: Optional[str] = None,
464
- additional_metadata_col_name: Optional[str] = None,
484
+ source_file_col_name: Optional[str] = "source_file",
485
+ additional_metadata_col_name: Optional[str] = "additional_metadata",
465
486
  scenario_col_name: Optional[str] = "scenario",
466
487
  turns_col_name: Optional[str] = "turns",
467
488
  expected_outcome_col_name: Optional[str] = "expected_outcome",
@@ -587,6 +608,7 @@ class EvaluationDataset:
587
608
  context=context,
588
609
  comments=comments,
589
610
  name=name,
611
+ additional_metadata=additional_metadata,
590
612
  )
591
613
  )
592
614
  else:
@@ -645,6 +667,7 @@ class EvaluationDataset:
645
667
  comments = json_obj.get(comments_key_name)
646
668
  name = json_obj.get(name_key_name)
647
669
  parsed_turns = parse_turns(turns) if turns else []
670
+ additional_metadata = json_obj.get(additional_metadata_key_name)
648
671
 
649
672
  self._multi_turn = True
650
673
  self.goldens.append(
@@ -656,6 +679,7 @@ class EvaluationDataset:
656
679
  context=context,
657
680
  comments=comments,
658
681
  name=name,
682
+ additional_metadata=additional_metadata,
659
683
  )
660
684
  )
661
685
  else:
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
24
24
  "retrieval_context": test_case.retrieval_context,
25
25
  "tools_called": test_case.tools_called,
26
26
  "expected_tools": test_case.expected_tools,
27
+ "additional_metadata": test_case.additional_metadata,
27
28
  }
28
29
  goldens.append(Golden(**golden))
29
30
  return goldens
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
70
71
  "expected_outcome": test_case.expected_outcome,
71
72
  "user_description": test_case.user_description,
72
73
  "context": test_case.context,
74
+ "additional_metadata": test_case.additional_metadata,
73
75
  }
74
76
  goldens.append(ConversationalGolden(**golden))
75
77
  return goldens
@@ -502,10 +502,14 @@ def wrap_up_experiment(
502
502
 
503
503
  try:
504
504
  api = Api()
505
- experiment_request = PostExperimentRequest(testRuns=test_runs, name=name)
505
+ experiment_request = PostExperimentRequest(
506
+ testRuns=test_runs, name=name
507
+ )
506
508
 
507
509
  try:
508
- body = experiment_request.model_dump(by_alias=True, exclude_none=True)
510
+ body = experiment_request.model_dump(
511
+ by_alias=True, exclude_none=True
512
+ )
509
513
  except AttributeError:
510
514
  body = experiment_request.dict(by_alias=True, exclude_none=True)
511
515
  json_str = json.dumps(body, cls=TestRunEncoder)
@@ -25,6 +25,7 @@ from deepeval.models import (
25
25
  MultimodalOpenAIModel,
26
26
  MultimodalGeminiModel,
27
27
  MultimodalOllamaModel,
28
+ MultimodalAzureOpenAIMLLMModel,
28
29
  AmazonBedrockModel,
29
30
  LiteLLMModel,
30
31
  KimiModel,
@@ -514,6 +515,8 @@ def initialize_multimodal_model(
514
515
  return MultimodalGeminiModel(), True
515
516
  if should_use_ollama_model():
516
517
  return MultimodalOllamaModel(), True
518
+ elif should_use_azure_openai():
519
+ return MultimodalAzureOpenAIMLLMModel(model_name=model), True
517
520
  elif isinstance(model, str) or model is None:
518
521
  return MultimodalOpenAIModel(model=model), True
519
522
  raise TypeError(
@@ -21,6 +21,7 @@ from deepeval.models.mlllms import (
21
21
  MultimodalOpenAIModel,
22
22
  MultimodalOllamaModel,
23
23
  MultimodalGeminiModel,
24
+ MultimodalAzureOpenAIMLLMModel,
24
25
  )
25
26
  from deepeval.models.embedding_models import (
26
27
  OpenAIEmbeddingModel,
@@ -48,6 +49,7 @@ __all__ = [
48
49
  "MultimodalOpenAIModel",
49
50
  "MultimodalOllamaModel",
50
51
  "MultimodalGeminiModel",
52
+ "MultimodalAzureOpenAIMLLMModel",
51
53
  "OpenAIEmbeddingModel",
52
54
  "AzureOpenAIEmbeddingModel",
53
55
  "LocalEmbeddingModel",
@@ -1,3 +1,4 @@
1
1
  from .openai_model import MultimodalOpenAIModel
2
2
  from .ollama_model import MultimodalOllamaModel
3
3
  from .gemini_model import MultimodalGeminiModel
4
+ from .azure_model import MultimodalAzureOpenAIMLLMModel
@@ -0,0 +1,334 @@
1
+ from openai.types.chat.chat_completion import ChatCompletion
2
+ from openai import AzureOpenAI, AsyncAzureOpenAI
3
+ from typing import Optional, Tuple, Union, Dict, List
4
+ from pydantic import BaseModel
5
+ from io import BytesIO
6
+ import base64
7
+
8
+ from deepeval.models import DeepEvalBaseMLLM
9
+ from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
10
+ from deepeval.test_case import MLLMImage
11
+ from deepeval.models.llms.openai_model import (
12
+ structured_outputs_models,
13
+ json_mode_models,
14
+ model_pricing,
15
+ )
16
+ from deepeval.models.retry_policy import (
17
+ create_retry_decorator,
18
+ sdk_retries_for,
19
+ )
20
+
21
+ from deepeval.models.llms.utils import trim_and_load_json
22
+ from deepeval.models.utils import parse_model_name
23
+ from deepeval.constants import ProviderSlug as PS
24
+
25
+
26
+ retry_azure = create_retry_decorator(PS.AZURE)
27
+
28
+
29
+ class MultimodalAzureOpenAIMLLMModel(DeepEvalBaseMLLM):
30
+ def __init__(
31
+ self,
32
+ deployment_name: Optional[str] = None,
33
+ model_name: Optional[str] = None,
34
+ azure_openai_api_key: Optional[str] = None,
35
+ openai_api_version: Optional[str] = None,
36
+ azure_endpoint: Optional[str] = None,
37
+ temperature: float = 0,
38
+ generation_kwargs: Optional[Dict] = None,
39
+ **kwargs,
40
+ ):
41
+ # fetch Azure deployment parameters
42
+ model_name = model_name or KEY_FILE_HANDLER.fetch_data(
43
+ ModelKeyValues.AZURE_MODEL_NAME
44
+ )
45
+ self.deployment_name = deployment_name or KEY_FILE_HANDLER.fetch_data(
46
+ ModelKeyValues.AZURE_DEPLOYMENT_NAME
47
+ )
48
+ self.azure_openai_api_key = (
49
+ azure_openai_api_key
50
+ or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.AZURE_OPENAI_API_KEY)
51
+ )
52
+ self.openai_api_version = (
53
+ openai_api_version
54
+ or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_VERSION)
55
+ )
56
+ self.azure_endpoint = azure_endpoint or KEY_FILE_HANDLER.fetch_data(
57
+ ModelKeyValues.AZURE_OPENAI_ENDPOINT
58
+ )
59
+ if temperature < 0:
60
+ raise ValueError("Temperature must be >= 0.")
61
+ self.temperature = temperature
62
+
63
+ # args and kwargs will be passed to the underlying model, in load_model function
64
+ self.kwargs = kwargs
65
+ self.generation_kwargs = generation_kwargs or {}
66
+ super().__init__(parse_model_name(model_name))
67
+
68
+ ###############################################
69
+ # Generate functions
70
+ ###############################################
71
+
72
+ @retry_azure
73
+ def generate(
74
+ self,
75
+ multimodal_input: List[Union[str, MLLMImage]],
76
+ schema: Optional[BaseModel] = None,
77
+ ) -> Tuple[Union[str, BaseModel], float]:
78
+ client = self.load_model(async_mode=False)
79
+ prompt = self.generate_prompt(multimodal_input)
80
+
81
+ if schema:
82
+ if self.model_name in structured_outputs_models:
83
+ messages = [{"role": "user", "content": prompt}]
84
+ completion = client.beta.chat.completions.parse(
85
+ model=self.deployment_name,
86
+ messages=messages,
87
+ response_format=schema,
88
+ temperature=self.temperature,
89
+ )
90
+ structured_output: BaseModel = completion.choices[
91
+ 0
92
+ ].message.parsed
93
+ cost = self.calculate_cost(
94
+ completion.usage.prompt_tokens,
95
+ completion.usage.completion_tokens,
96
+ )
97
+ return structured_output, cost
98
+ if self.model_name in json_mode_models:
99
+ messages = [{"role": "user", "content": prompt}]
100
+ completion = client.beta.chat.completions.parse(
101
+ model=self.deployment_name,
102
+ messages=messages,
103
+ response_format={"type": "json_object"},
104
+ temperature=self.temperature,
105
+ )
106
+ json_output = trim_and_load_json(
107
+ completion.choices[0].message.content
108
+ )
109
+ cost = self.calculate_cost(
110
+ completion.usage.prompt_tokens,
111
+ completion.usage.completion_tokens,
112
+ )
113
+ return schema.model_validate(json_output), cost
114
+ print("Loading model client:")
115
+ print(client.base_url)
116
+ completion = client.chat.completions.create(
117
+ model=self.deployment_name,
118
+ messages=[{"role": "user", "content": prompt}],
119
+ temperature=self.temperature,
120
+ **self.generation_kwargs,
121
+ )
122
+ output = completion.choices[0].message.content
123
+ cost = self.calculate_cost(
124
+ completion.usage.prompt_tokens, completion.usage.completion_tokens
125
+ )
126
+ if schema:
127
+ json_output = trim_and_load_json(output)
128
+ return schema.model_validate(json_output), cost
129
+ else:
130
+ return output, cost
131
+
132
+ @retry_azure
133
+ async def a_generate(
134
+ self,
135
+ multimodal_input: List[Union[str, MLLMImage]],
136
+ schema: Optional[BaseModel] = None,
137
+ ) -> Tuple[Union[str, BaseModel], float]:
138
+ client = self.load_model(async_mode=True)
139
+ prompt = self.generate_prompt(multimodal_input)
140
+
141
+ if schema:
142
+ if self.model_name in structured_outputs_models:
143
+ messages = [{"role": "user", "content": prompt}]
144
+ completion = await client.beta.chat.completions.parse(
145
+ model=self.deployment_name,
146
+ messages=messages,
147
+ response_format=schema,
148
+ temperature=self.temperature,
149
+ )
150
+ structured_output: BaseModel = completion.choices[
151
+ 0
152
+ ].message.parsed
153
+ cost = self.calculate_cost(
154
+ completion.usage.prompt_tokens,
155
+ completion.usage.completion_tokens,
156
+ )
157
+ return structured_output, cost
158
+ if self.model_name in json_mode_models:
159
+ messages = [{"role": "user", "content": prompt}]
160
+ completion = await client.beta.chat.completions.parse(
161
+ model=self.deployment_name,
162
+ messages=messages,
163
+ response_format={"type": "json_object"},
164
+ temperature=self.temperature,
165
+ **self.generation_kwargs,
166
+ )
167
+ json_output = trim_and_load_json(
168
+ completion.choices[0].message.content
169
+ )
170
+ cost = self.calculate_cost(
171
+ completion.usage.prompt_tokens,
172
+ completion.usage.completion_tokens,
173
+ )
174
+ return schema.model_validate(json_output), cost
175
+
176
+ completion = await client.chat.completions.create(
177
+ model=self.deployment_name,
178
+ messages=[{"role": "user", "content": prompt}],
179
+ temperature=self.temperature,
180
+ **self.generation_kwargs,
181
+ )
182
+ output = completion.choices[0].message.content
183
+ cost = self.calculate_cost(
184
+ completion.usage.prompt_tokens,
185
+ completion.usage.completion_tokens,
186
+ )
187
+ if schema:
188
+ json_output = trim_and_load_json(output)
189
+ return schema.model_validate(json_output), cost
190
+ else:
191
+ return output, cost
192
+
193
+ ###############################################
194
+ # Other generate functions
195
+ ###############################################
196
+
197
@retry_azure
def generate_raw_response(
    self,
    multimodal_input: List[Union[str, MLLMImage]],
    top_logprobs: int = 5,
) -> Tuple[ChatCompletion, float]:
    """Request a chat completion with log-probabilities, synchronously.

    Args:
        multimodal_input: Strings and/or ``MLLMImage`` items; turned into
            message content by ``generate_prompt``.
        top_logprobs: Forwarded unchanged as the ``top_logprobs`` request
            argument.

    Returns:
        A ``(completion, cost)`` pair: the completion object returned by the
        client, and the value ``calculate_cost`` derives from its prompt and
        completion token counts.
    """
    sync_client = self.load_model(async_mode=False)
    user_message = {
        "role": "user",
        "content": self.generate_prompt(multimodal_input),
    }

    # generation_kwargs is splatted last, directly into the call, so a key
    # that collides with an explicit argument still raises TypeError.
    response = sync_client.chat.completions.create(
        model=self.deployment_name,
        messages=[user_message],
        temperature=self.temperature,
        logprobs=True,
        top_logprobs=top_logprobs,
        **self.generation_kwargs,
    )

    usage = response.usage
    return response, self.calculate_cost(
        usage.prompt_tokens, usage.completion_tokens
    )
222
+
223
@retry_azure
async def a_generate_raw_response(
    self,
    multimodal_input: List[Union[str, MLLMImage]],
    top_logprobs: int = 5,
) -> Tuple[ChatCompletion, float]:
    """Request a chat completion with log-probabilities, asynchronously.

    Args:
        multimodal_input: Strings and/or ``MLLMImage`` items; turned into
            message content by ``generate_prompt``.
        top_logprobs: Forwarded unchanged as the ``top_logprobs`` request
            argument.

    Returns:
        A ``(completion, cost)`` pair: the awaited completion object, and the
        value ``calculate_cost`` derives from its prompt and completion
        token counts.
    """
    async_client = self.load_model(async_mode=True)
    user_message = {
        "role": "user",
        "content": self.generate_prompt(multimodal_input),
    }

    # generation_kwargs is splatted last, directly into the call, so a key
    # that collides with an explicit argument still raises TypeError.
    response = await async_client.chat.completions.create(
        model=self.deployment_name,
        messages=[user_message],
        temperature=self.temperature,
        logprobs=True,
        top_logprobs=top_logprobs,
        **self.generation_kwargs,
    )

    usage = response.usage
    return response, self.calculate_cost(
        usage.prompt_tokens, usage.completion_tokens
    )
248
+
249
+ ###############################################
250
+ # Utilities
251
+ ###############################################
252
+
253
def generate_prompt(
    self, multimodal_input: List[Union[str, MLLMImage]] = []
):
    """Convert multimodal input into the proper message format for Azure OpenAI.

    Args:
        multimodal_input: Items to convert, in order. The default list is
            only iterated, never mutated, so sharing it across calls is safe.

    Returns:
        A list of content parts: ``{"type": "text", "text": ...}`` for each
        string and ``{"type": "image_url", "image_url": {"url": ...}}`` for
        each ``MLLMImage``. Local images are inlined as a base64 JPEG data
        URL; other images pass their ``url`` through. Items of any other
        type are skipped.
    """
    prompt = []
    for ele in multimodal_input:
        if isinstance(ele, str):
            prompt.append({"type": "text", "text": ele})
        elif isinstance(ele, MLLMImage):
            if ele.local:
                # Imported lazily so Pillow is only needed for local images.
                import PIL.Image

                # Image.open keeps the underlying file open; the context
                # manager closes it once the pixels have been encoded,
                # instead of leaking one handle per local image.
                with PIL.Image.open(ele.url) as image:
                    encoded = self.encode_pil_image(image)
                image_url = {"url": f"data:image/jpeg;base64,{encoded}"}
            else:
                image_url = {"url": ele.url}
            prompt.append({"type": "image_url", "image_url": image_url})
    return prompt
279
+
280
def encode_pil_image(self, pil_image):
    """Encode a PIL image to a base64 string of its JPEG bytes.

    Args:
        pil_image: An image object exposing ``mode``, ``convert`` and
            ``save`` (a ``PIL.Image.Image``).

    Returns:
        The JPEG-encoded image as a base64 ``str``.
    """
    # Modes Pillow's JPEG writer accepts as-is. Anything else (RGBA, LA, P,
    # but also PA, I;16, F, ...) would make save() fail with
    # "cannot write mode ... as JPEG", so it is converted to RGB first.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_writable_modes:
        pil_image = pil_image.convert("RGB")

    image_buffer = BytesIO()
    pil_image.save(image_buffer, format="JPEG")
    return base64.b64encode(image_buffer.getvalue()).decode("utf-8")
289
+
290
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
    """Return the cost of a call from its input and output token counts.

    Rates are looked up in ``model_pricing`` under ``self.model_name``; when
    the model has no entry, the ``"gpt-4.1"`` rates are used instead.
    """
    rates = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
    return input_tokens * rates["input"] + output_tokens * rates["output"]
295
+
296
+ ###############################################
297
+ # Model
298
+ ###############################################
299
+
300
def get_model_name(self):
    """Return the display name: the model name wrapped in an "Azure OpenAI" label."""
    name = self.model_name
    return f"Azure OpenAI ({name})"
302
+
303
def load_model(self, async_mode: bool = False):
    """Build and return a client via ``_build_client``.

    Args:
        async_mode: When truthy, the client is built from
            ``AsyncAzureOpenAI``; otherwise from ``AzureOpenAI``.
    """
    client_cls = AsyncAzureOpenAI if async_mode else AzureOpenAI
    return self._build_client(client_cls)
307
+
308
def _client_kwargs(self) -> Dict:
    """
    Return a copy of the user-supplied client kwargs, adjusted for retries.

    When Tenacity owns retrying, OpenAI SDK retries are forced off
    (``max_retries=0``) so requests are not retried twice. When the user has
    opted into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
    their retry settings are left untouched.
    """
    # Copy so the caller-provided mapping on self.kwargs is never modified.
    client_options = dict(self.kwargs or {})
    if sdk_retries_for(PS.AZURE):
        return client_options
    client_options["max_retries"] = 0
    return client_options
318
+
319
def _build_client(self, cls):
    """Instantiate ``cls`` with this model's connection settings.

    Args:
        cls: The client class (or any callable accepting keyword arguments)
            to instantiate.

    Returns:
        The object returned by ``cls(**options)``.

    Raises:
        TypeError: Re-raised from ``cls`` unless its message mentions
            ``max_retries``; in that case the option is dropped and
            construction is attempted once more.
    """
    options = dict(
        api_key=self.azure_openai_api_key,
        api_version=self.openai_api_version,
        azure_endpoint=self.azure_endpoint,
        azure_deployment=self.deployment_name,
        **self._client_kwargs(),
    )
    try:
        client = cls(**options)
    except TypeError as exc:
        if "max_retries" not in str(exc):
            raise
        # Older OpenAI SDKs may not accept max_retries: drop it, retry once.
        options.pop("max_retries", None)
        client = cls(**options)
    return client
@@ -41,6 +41,15 @@ class StylingConfig:
41
41
  expected_output_format: Optional[str] = None
42
42
 
43
43
 
44
# Styling options for conversational generation. Every field is an optional
# string that defaults to None, so the config can be built with no arguments.
# NOTE(review): the per-field notes below are inferred from the field names
# only — confirm against the code that consumes this config.
@dataclass
class ConversationalStylingConfig:
    # presumably the setting/background the conversation takes place in
    scenario_context: Optional[str] = None
    # presumably what the conversation is meant to accomplish
    conversational_task: Optional[str] = None
    # presumably who takes part in the conversation
    participant_roles: Optional[str] = None
    # presumably the desired format of generated scenarios
    scenario_format: Optional[str] = None
    # presumably the desired format of the expected outcome
    expected_outcome_format: Optional[str] = None
51
+
52
+
44
53
  @dataclass
45
54
  class ContextConstructionConfig:
46
55
  embedder: Optional[Union[str, DeepEvalBaseEmbeddingModel]] = None
@@ -58,3 +58,26 @@ class PromptStyling(BaseModel):
58
58
  scenario: str
59
59
  task: str
60
60
  input_format: str
61
+
62
+
63
# Schema with a single required string field, `scenario`.
class ConversationalScenario(BaseModel):
    scenario: str
65
+
66
+
67
# Wrapper schema: `data` holds a list of ConversationalScenario items.
class ConversationalScenarioList(BaseModel):
    data: List[ConversationalScenario]
69
+
70
+
71
# Schema with a single required string field, `rewritten_scenario`.
class RewrittenScenario(BaseModel):
    rewritten_scenario: str
73
+
74
+
75
# Schema pairing a numeric `score` with textual `feedback`.
# NOTE(review): the score's range/scale is not visible here — confirm against
# the prompt template that produces it.
class ScenarioFeedback(BaseModel):
    score: float
    feedback: str
78
+
79
+
80
# Schema with three required string fields. The field names match the first
# three fields of ConversationalStylingConfig in the synthesizer config.
class ConversationalPromptStyling(BaseModel):
    scenario_context: str
    conversational_task: str
    participant_roles: str