deepeval 3.5.4__tar.gz → 3.5.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (471)
  1. {deepeval-3.5.4 → deepeval-3.5.5}/PKG-INFO +3 -1
  2. {deepeval-3.5.4 → deepeval-3.5.5}/README.md +2 -0
  3. deepeval-3.5.5/deepeval/_version.py +1 -0
  4. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/config/settings.py +14 -0
  5. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/constants.py +2 -1
  6. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/dataset/dataset.py +11 -4
  7. deepeval-3.5.5/deepeval/dataset/types.py +25 -0
  8. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/dataset/utils.py +31 -3
  9. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/execute.py +216 -17
  10. deepeval-3.5.5/deepeval/openai_agents/agent.py +195 -0
  11. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai_agents/callback_handler.py +21 -30
  12. deepeval-3.5.5/deepeval/openai_agents/runner.py +331 -0
  13. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/tracing.py +1 -3
  14. {deepeval-3.5.4 → deepeval-3.5.5}/pyproject.toml +1 -1
  15. deepeval-3.5.4/deepeval/_version.py +0 -1
  16. deepeval-3.5.4/deepeval/dataset/types.py +0 -17
  17. deepeval-3.5.4/deepeval/openai_agents/agent.py +0 -186
  18. deepeval-3.5.4/deepeval/openai_agents/runner.py +0 -114
  19. {deepeval-3.5.4 → deepeval-3.5.5}/LICENSE.md +0 -0
  20. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/__init__.py +0 -0
  21. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/annotation/__init__.py +0 -0
  22. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/annotation/annotation.py +0 -0
  23. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/annotation/api.py +0 -0
  24. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/__init__.py +0 -0
  25. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/arc/__init__.py +0 -0
  26. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/arc/arc.py +0 -0
  27. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/arc/mode.py +0 -0
  28. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/arc/template.py +0 -0
  29. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/base_benchmark.py +0 -0
  30. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bbq/__init__.py +0 -0
  31. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bbq/bbq.py +0 -0
  32. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bbq/task.py +0 -0
  33. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bbq/template.py +0 -0
  34. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  35. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
  36. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  37. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  38. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  39. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  40. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  41. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  42. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  43. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  44. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  45. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  46. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  47. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  48. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  49. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  50. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  51. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  52. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  53. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  54. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  55. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  56. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  57. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  58. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  59. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  60. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  61. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  62. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  63. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  64. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  65. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  66. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  67. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  68. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  69. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  70. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  71. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  72. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  73. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  74. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  75. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  76. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  77. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  78. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  79. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  80. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  81. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  82. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  83. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  84. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  85. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  86. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  87. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  88. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  89. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  90. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  91. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  92. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  93. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  94. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  95. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
  96. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/bool_q/template.py +0 -0
  97. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/drop/__init__.py +0 -0
  98. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/drop/drop.py +0 -0
  99. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/drop/task.py +0 -0
  100. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/drop/template.py +0 -0
  101. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  102. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
  103. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  104. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  105. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  106. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
  107. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/gsm8k/template.py +0 -0
  108. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  109. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
  110. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/hellaswag/task.py +0 -0
  111. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/hellaswag/template.py +0 -0
  112. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  113. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
  114. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/human_eval/task.py +0 -0
  115. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/human_eval/template.py +0 -0
  116. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  117. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
  118. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/ifeval/template.py +0 -0
  119. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/lambada/__init__.py +0 -0
  120. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/lambada/lambada.py +0 -0
  121. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/lambada/template.py +0 -0
  122. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  123. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
  124. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/logi_qa/task.py +0 -0
  125. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/logi_qa/template.py +0 -0
  126. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  127. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
  128. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/math_qa/task.py +0 -0
  129. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/math_qa/template.py +0 -0
  130. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  131. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
  132. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/mmlu/task.py +0 -0
  133. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/mmlu/template.py +0 -0
  134. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/modes/__init__.py +0 -0
  135. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/results.py +0 -0
  136. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/schema.py +0 -0
  137. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/squad/__init__.py +0 -0
  138. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/squad/squad.py +0 -0
  139. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/squad/task.py +0 -0
  140. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/squad/template.py +0 -0
  141. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/tasks/__init__.py +0 -0
  142. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  143. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  144. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  145. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  146. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
  147. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/utils.py +0 -0
  148. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  149. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/winogrande/template.py +0 -0
  150. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
  151. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/__init__.py +0 -0
  152. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/dotenv_handler.py +0 -0
  153. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/main.py +0 -0
  154. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/server.py +0 -0
  155. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/test.py +0 -0
  156. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/types.py +0 -0
  157. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/cli/utils.py +0 -0
  158. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/confident/__init__.py +0 -0
  159. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/confident/api.py +0 -0
  160. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/confident/types.py +0 -0
  161. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/config/__init__.py +0 -0
  162. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/config/settings_manager.py +0 -0
  163. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/config/utils.py +0 -0
  164. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/dataset/__init__.py +0 -0
  165. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/dataset/api.py +0 -0
  166. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/dataset/golden.py +0 -0
  167. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/dataset/test_run_tracer.py +0 -0
  168. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/errors.py +0 -0
  169. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/__init__.py +0 -0
  170. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/api.py +0 -0
  171. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/compare.py +0 -0
  172. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/configs.py +0 -0
  173. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/evaluate.py +0 -0
  174. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/types.py +0 -0
  175. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/evaluate/utils.py +0 -0
  176. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/__init__.py +0 -0
  177. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/crewai/__init__.py +0 -0
  178. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/crewai/agent.py +0 -0
  179. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/crewai/handler.py +0 -0
  180. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/crewai/patch.py +0 -0
  181. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/hugging_face/__init__.py +0 -0
  182. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/hugging_face/callback.py +0 -0
  183. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  184. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  185. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/hugging_face/utils.py +0 -0
  186. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/langchain/__init__.py +0 -0
  187. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/langchain/callback.py +0 -0
  188. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/langchain/patch.py +0 -0
  189. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/langchain/utils.py +0 -0
  190. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/llama_index/__init__.py +0 -0
  191. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/llama_index/agent/patched.py +0 -0
  192. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/llama_index/handler.py +0 -0
  193. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/llama_index/utils.py +0 -0
  194. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
  195. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/pydantic_ai/agent.py +0 -0
  196. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/pydantic_ai/otel.py +0 -0
  197. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/pydantic_ai/patcher.py +0 -0
  198. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/integrations/pydantic_ai/utils.py +0 -0
  199. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/key_handler.py +0 -0
  200. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/__init__.py +0 -0
  201. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  202. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
  203. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  204. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/answer_relevancy/template.py +0 -0
  205. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  206. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
  207. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  208. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/arena_g_eval/template.py +0 -0
  209. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/arena_g_eval/utils.py +0 -0
  210. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/argument_correctness/__init__.py +0 -0
  211. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
  212. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/argument_correctness/schema.py +0 -0
  213. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/argument_correctness/template.py +0 -0
  214. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/base_metric.py +0 -0
  215. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/bias/__init__.py +0 -0
  216. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/bias/bias.py +0 -0
  217. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/bias/schema.py +0 -0
  218. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/bias/template.py +0 -0
  219. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  220. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
  221. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_precision/schema.py +0 -0
  222. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_precision/template.py +0 -0
  223. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  224. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
  225. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_recall/schema.py +0 -0
  226. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_recall/template.py +0 -0
  227. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  228. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
  229. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  230. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/contextual_relevancy/template.py +0 -0
  231. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
  232. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
  233. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  234. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversation_completeness/template.py +0 -0
  235. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_dag/__init__.py +0 -0
  236. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
  237. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_dag/nodes.py +0 -0
  238. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_dag/templates.py +0 -0
  239. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
  240. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
  241. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  242. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/conversational_g_eval/template.py +0 -0
  243. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/__init__.py +0 -0
  244. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/dag.py +0 -0
  245. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/graph.py +0 -0
  246. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/nodes.py +0 -0
  247. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/schema.py +0 -0
  248. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/templates.py +0 -0
  249. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/dag/utils.py +0 -0
  250. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/faithfulness/__init__.py +0 -0
  251. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
  252. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/faithfulness/schema.py +0 -0
  253. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/faithfulness/template.py +0 -0
  254. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/g_eval/__init__.py +0 -0
  255. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/g_eval/g_eval.py +0 -0
  256. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/g_eval/schema.py +0 -0
  257. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/g_eval/template.py +0 -0
  258. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/g_eval/utils.py +0 -0
  259. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/hallucination/__init__.py +0 -0
  260. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/hallucination/hallucination.py +0 -0
  261. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/hallucination/schema.py +0 -0
  262. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/hallucination/template.py +0 -0
  263. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/indicator.py +0 -0
  264. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/json_correctness/__init__.py +0 -0
  265. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
  266. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/json_correctness/schema.py +0 -0
  267. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/json_correctness/template.py +0 -0
  268. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
  269. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
  270. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/knowledge_retention/schema.py +0 -0
  271. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/knowledge_retention/template.py +0 -0
  272. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp/__init__.py +0 -0
  273. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
  274. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
  275. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp/schema.py +0 -0
  276. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp/template.py +0 -0
  277. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
  278. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
  279. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  280. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/mcp_use_metric/template.py +0 -0
  281. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/misuse/__init__.py +0 -0
  282. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/misuse/misuse.py +0 -0
  283. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/misuse/schema.py +0 -0
  284. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/misuse/template.py +0 -0
  285. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
  286. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
  287. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
  288. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  289. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  290. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
  291. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
  292. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  293. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  294. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
  295. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
  296. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  297. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  298. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
  299. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
  300. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  301. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  302. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
  303. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
  304. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
  305. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
  306. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
  307. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
  308. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
  309. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
  310. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
  311. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
  312. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
  313. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
  314. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
  315. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
  316. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
  317. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
  318. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  319. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
  320. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
  321. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
  322. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  323. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
  324. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
  325. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
  326. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
  327. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  328. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
  329. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
  330. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  331. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  332. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
  333. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/non_advice/__init__.py +0 -0
  334. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/non_advice/non_advice.py +0 -0
  335. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/non_advice/schema.py +0 -0
  336. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/non_advice/template.py +0 -0
  337. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  338. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
  339. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/pii_leakage/schema.py +0 -0
  340. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/pii_leakage/template.py +0 -0
  341. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
  342. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
  343. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  344. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/prompt_alignment/template.py +0 -0
  345. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/ragas.py +0 -0
  346. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_adherence/__init__.py +0 -0
  347. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
  348. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_adherence/schema.py +0 -0
  349. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_adherence/template.py +0 -0
  350. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_violation/__init__.py +0 -0
  351. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_violation/role_violation.py +0 -0
  352. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_violation/schema.py +0 -0
  353. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/role_violation/template.py +0 -0
  354. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/summarization/__init__.py +0 -0
  355. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/summarization/schema.py +0 -0
  356. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/summarization/summarization.py +0 -0
  357. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/summarization/template.py +0 -0
  358. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/task_completion/__init__.py +0 -0
  359. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/task_completion/schema.py +0 -0
  360. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/task_completion/task_completion.py +0 -0
  361. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/task_completion/template.py +0 -0
  362. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/tool_correctness/__init__.py +0 -0
  363. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
  364. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/toxicity/__init__.py +0 -0
  365. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/toxicity/schema.py +0 -0
  366. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/toxicity/template.py +0 -0
  367. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/toxicity/toxicity.py +0 -0
  368. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
  369. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  370. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/turn_relevancy/template.py +0 -0
  371. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
  372. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/metrics/utils.py +0 -0
  373. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/__init__.py +0 -0
  374. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/_summac_model.py +0 -0
  375. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/answer_relevancy_model.py +0 -0
  376. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/base_model.py +0 -0
  377. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/detoxify_model.py +0 -0
  378. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/embedding_models/__init__.py +0 -0
  379. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
  380. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
  381. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
  382. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
  383. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/hallucination_model.py +0 -0
  384. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/__init__.py +0 -0
  385. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
  386. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/anthropic_model.py +0 -0
  387. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/azure_model.py +0 -0
  388. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/deepseek_model.py +0 -0
  389. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/gemini_model.py +0 -0
  390. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/grok_model.py +0 -0
  391. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/kimi_model.py +0 -0
  392. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/litellm_model.py +0 -0
  393. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/local_model.py +0 -0
  394. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/ollama_model.py +0 -0
  395. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/openai_model.py +0 -0
  396. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/llms/utils.py +0 -0
  397. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/mlllms/__init__.py +0 -0
  398. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/mlllms/gemini_model.py +0 -0
  399. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/mlllms/ollama_model.py +0 -0
  400. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/mlllms/openai_model.py +0 -0
  401. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/retry_policy.py +0 -0
  402. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/summac_model.py +0 -0
  403. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/unbias_model.py +0 -0
  404. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/models/utils.py +0 -0
  405. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai/__init__.py +0 -0
  406. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai/extractors.py +0 -0
  407. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai/patch.py +0 -0
  408. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai/utils.py +0 -0
  409. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai_agents/__init__.py +0 -0
  410. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai_agents/extractors.py +0 -0
  411. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/openai_agents/patch.py +0 -0
  412. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/plugins/__init__.py +0 -0
  413. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/plugins/plugin.py +0 -0
  414. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/progress_context.py +0 -0
  415. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/prompt/__init__.py +0 -0
  416. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/prompt/api.py +0 -0
  417. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/prompt/prompt.py +0 -0
  418. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/prompt/utils.py +0 -0
  419. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/py.typed +0 -0
  420. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/red_teaming/README.md +0 -0
  421. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/scorer/__init__.py +0 -0
  422. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/scorer/scorer.py +0 -0
  423. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/simulator/__init__.py +0 -0
  424. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/simulator/conversation_simulator.py +0 -0
  425. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/simulator/schema.py +0 -0
  426. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/simulator/template.py +0 -0
  427. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/singleton.py +0 -0
  428. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/__init__.py +0 -0
  429. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/base_synthesizer.py +0 -0
  430. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/chunking/__init__.py +0 -0
  431. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/chunking/context_generator.py +0 -0
  432. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
  433. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/config.py +0 -0
  434. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/schema.py +0 -0
  435. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/synthesizer.py +0 -0
  436. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/templates/__init__.py +0 -0
  437. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/templates/template.py +0 -0
  438. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/templates/template_extraction.py +0 -0
  439. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/templates/template_prompt.py +0 -0
  440. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/types.py +0 -0
  441. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/synthesizer/utils.py +0 -0
  442. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/telemetry.py +0 -0
  443. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/__init__.py +0 -0
  444. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/arena_test_case.py +0 -0
  445. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/conversational_test_case.py +0 -0
  446. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/llm_test_case.py +0 -0
  447. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/mcp.py +0 -0
  448. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/mllm_test_case.py +0 -0
  449. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_case/utils.py +0 -0
  450. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_run/__init__.py +0 -0
  451. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_run/api.py +0 -0
  452. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_run/cache.py +0 -0
  453. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_run/hooks.py +0 -0
  454. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_run/hyperparameters.py +0 -0
  455. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/test_run/test_run.py +0 -0
  456. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/__init__.py +0 -0
  457. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/api.py +0 -0
  458. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/context.py +0 -0
  459. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/offline_evals/__init__.py +0 -0
  460. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/offline_evals/api.py +0 -0
  461. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/offline_evals/span.py +0 -0
  462. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/offline_evals/thread.py +0 -0
  463. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/offline_evals/trace.py +0 -0
  464. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/otel/__init__.py +0 -0
  465. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/otel/exporter.py +0 -0
  466. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/otel/utils.py +0 -0
  467. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/patchers.py +0 -0
  468. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/perf_epoch_bridge.py +0 -0
  469. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/types.py +0 -0
  470. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/tracing/utils.py +0 -0
  471. {deepeval-3.5.4 → deepeval-3.5.5}/deepeval/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.5.4
3
+ Version: 3.5.5
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -186,6 +186,8 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
186
186
 
187
187
  ## Installation
188
188
 
189
+ Deepeval works with **Python 3.9+**.
190
+
189
191
  ```
190
192
  pip install -U deepeval
191
193
  ```
@@ -137,6 +137,8 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
137
137
 
138
138
  ## Installation
139
139
 
140
+ Deepeval works with **Python 3.9+**.
141
+
140
142
  ```
141
143
  pip install -U deepeval
142
144
  ```
@@ -0,0 +1 @@
1
+ __version__: str = "3.5.5"
@@ -281,6 +281,7 @@ class Settings(BaseSettings):
281
281
  #
282
282
  # Telemetry and Debug
283
283
  #
284
+ DEEPEVAL_DEBUG_ASYNC: Optional[bool] = None
284
285
  DEEPEVAL_TELEMETRY_OPT_OUT: Optional[bool] = None
285
286
  DEEPEVAL_UPDATE_WARNING_OPT_IN: Optional[bool] = None
286
287
  DEEPEVAL_GRPC_LOGGING: Optional[bool] = None
@@ -303,6 +304,19 @@ class Settings(BaseSettings):
303
304
  MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
304
305
  MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
305
306
 
307
+ #
308
+ # Async Task Configuration
309
+ #
310
+
311
+ # Maximum time allowed for a single task to complete
312
+ DEEPEVAL_PER_TASK_TIMEOUT_SECONDS: int = (
313
+ 300 # Set to float('inf') to disable timeout (NOTE(review): field is annotated int; confirm a non-finite value validates)
314
+ )
315
+
316
+ # Buffer time for gathering results from all tasks, added to DEEPEVAL_PER_TASK_TIMEOUT_SECONDS to form the overall gather timeout
317
+ # Increase if many tasks are running concurrently
318
+ DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: int = 60
319
+
306
320
  ##############
307
321
  # Validators #
308
322
  ##############
@@ -1,4 +1,5 @@
1
1
  from enum import Enum
2
+ from typing import Union
2
3
 
3
4
  KEY_FILE: str = ".deepeval"
4
5
  HIDDEN_DIR: str = ".deepeval"
@@ -29,7 +30,7 @@ class ProviderSlug(str, Enum):
29
30
  OLLAMA = "ollama"
30
31
 
31
32
 
32
- def slugify(value: str | ProviderSlug) -> str:
33
+ def slugify(value: Union[str, ProviderSlug]) -> str:
33
34
  return (
34
35
  value.value
35
36
  if isinstance(value, ProviderSlug)
@@ -1,5 +1,5 @@
1
1
  from asyncio import Task
2
- from typing import Iterator, List, Optional, Union, Literal
2
+ from typing import TYPE_CHECKING, Iterator, List, Optional, Union, Literal
3
3
  from dataclasses import dataclass, field
4
4
  from opentelemetry.trace import Tracer
5
5
  from opentelemetry.context import Context, attach, detach
@@ -7,7 +7,6 @@ from rich.console import Console
7
7
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
8
8
  import json
9
9
  import csv
10
- import webbrowser
11
10
  import os
12
11
  import datetime
13
12
  import time
@@ -17,6 +16,7 @@ from opentelemetry import baggage
17
16
 
18
17
  from deepeval.confident.api import Api, Endpoints, HttpMethods
19
18
  from deepeval.dataset.utils import (
19
+ coerce_to_task,
20
20
  convert_test_cases_to_goldens,
21
21
  convert_goldens_to_test_cases,
22
22
  convert_convo_goldens_to_convo_test_cases,
@@ -49,11 +49,18 @@ from deepeval.utils import (
49
49
  from deepeval.test_run import (
50
50
  global_test_run_manager,
51
51
  )
52
- from deepeval.dataset.types import global_evaluation_tasks
53
52
  from deepeval.openai.utils import openai_test_case_pairs
54
53
  from deepeval.tracing import trace_manager
55
54
  from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME
56
55
 
56
+ if TYPE_CHECKING:
57
+ from deepeval.evaluate.configs import (
58
+ AsyncConfig,
59
+ DisplayConfig,
60
+ CacheConfig,
61
+ ErrorConfig,
62
+ )
63
+
57
64
 
58
65
  valid_file_types = ["csv", "json", "jsonl"]
59
66
 
@@ -1230,7 +1237,7 @@ class EvaluationDataset:
1230
1237
  )
1231
1238
 
1232
1239
  def evaluate(self, task: Task):
1233
- global_evaluation_tasks.append(task)
1240
+ coerce_to_task(task)
1234
1241
 
1235
1242
  def _start_otel_test_run(self, tracer: Optional[Tracer] = None) -> Context:
1236
1243
  _tracer = check_tracer(tracer)
@@ -0,0 +1,25 @@
1
+ import asyncio
2
+
3
+ from typing import Any
4
+ from deepeval.dataset.utils import coerce_to_task
5
+
6
+
7
+ class EvaluationTasks:
8
+
9
+ def __init__(self):
10
+ self._tasks: list[asyncio.Future] = []
11
+
12
+ def append(self, obj: Any):
13
+ self._tasks.append(coerce_to_task(obj))
14
+
15
+ def get_tasks(self) -> list[asyncio.Future]:
16
+ return list(self._tasks)
17
+
18
+ def num_tasks(self):
19
+ return len(self._tasks)
20
+
21
+ def clear_tasks(self) -> None:
22
+ for t in self._tasks:
23
+ if not t.done():
24
+ t.cancel()
25
+ self._tasks.clear()
@@ -1,10 +1,10 @@
1
- from typing import List, Optional, Any
1
+ import asyncio
2
+ import inspect
2
3
  import json
3
4
  import re
4
5
 
6
+ from typing import List, Optional, Any
5
7
  from opentelemetry.trace import Tracer
6
- from opentelemetry import trace
7
- from opentelemetry.trace import NoOpTracerProvider
8
8
 
9
9
  from deepeval.dataset.api import Golden
10
10
  from deepeval.dataset.golden import ConversationalGolden
@@ -174,3 +174,31 @@ def check_tracer(tracer: Optional[Tracer] = None) -> Tracer:
174
174
  )
175
175
 
176
176
  return GLOBAL_TEST_RUN_TRACER
177
+
178
+
179
+ def coerce_to_task(obj: Any) -> asyncio.Future[Any]:
180
+ # already a Task so just return it
181
+ if isinstance(obj, asyncio.Task):
182
+ return obj
183
+
184
+ # If it is a future, it is already scheduled, so just return it
185
+ if asyncio.isfuture(obj):
186
+ # type: ignore[return-value] # it is an awaitable, gather accepts it
187
+ return obj
188
+
189
+ # bare coroutine must be explicitly scheduled using create_task to bind to loop & track
190
+ if asyncio.iscoroutine(obj):
191
+ return asyncio.create_task(obj)
192
+
193
+ # generic awaitable (any object with __await__) will need to be wrapped so create_task accepts it
194
+ if inspect.isawaitable(obj):
195
+
196
+ async def _wrap(awaitable):
197
+ return await awaitable
198
+
199
+ return asyncio.create_task(_wrap(obj))
200
+
201
+ # not awaitable, so time to sound the alarm!
202
+ raise TypeError(
203
+ f"Expected Task/Future/coroutine/awaitable, got {type(obj).__name__}"
204
+ )
@@ -1,3 +1,5 @@
1
+ import logging
2
+
1
3
  from rich.progress import (
2
4
  Progress,
3
5
  TextColumn,
@@ -40,7 +42,6 @@ from deepeval.tracing.api import (
40
42
  BaseApiSpan,
41
43
  )
42
44
  from deepeval.dataset import Golden
43
- from deepeval.dataset.types import global_evaluation_tasks
44
45
  from deepeval.errors import MissingTestCaseParamsError
45
46
  from deepeval.metrics.utils import copy_metrics
46
47
  from deepeval.utils import (
@@ -87,6 +88,17 @@ from deepeval.evaluate.utils import (
87
88
  from deepeval.utils import add_pbar, update_pbar, custom_console
88
89
  from deepeval.openai.utils import openai_test_case_pairs
89
90
  from deepeval.tracing.types import TestCaseMetricPair
91
+ from deepeval.config.settings import get_settings
92
+
93
+
94
+ logger = logging.getLogger(__name__)
95
+ settings = get_settings()
96
+
97
+
98
+ async def _snapshot_tasks():
99
+ cur = asyncio.current_task()
100
+ # `all_tasks` returns tasks for the current running loop only
101
+ return {t for t in asyncio.all_tasks() if t is not cur}
90
102
 
91
103
 
92
104
  ###########################################
@@ -112,7 +124,7 @@ def execute_test_cases(
112
124
  _is_assert_test: bool = False,
113
125
  ) -> List[TestResult]:
114
126
  global_test_run_cache_manager.disable_write_cache = (
115
- cache_config.write_cache == False
127
+ cache_config.write_cache is False
116
128
  )
117
129
 
118
130
  if test_run_manager is None:
@@ -357,7 +369,7 @@ async def a_execute_test_cases(
357
369
  return await func(*args, **kwargs)
358
370
 
359
371
  global_test_run_cache_manager.disable_write_cache = (
360
- cache_config.write_cache == False
372
+ cache_config.write_cache is False
361
373
  )
362
374
  if test_run_manager is None:
363
375
  test_run_manager = global_test_run_manager
@@ -1041,7 +1053,7 @@ def execute_agentic_test_cases(
1041
1053
  with progress:
1042
1054
  pbar_id = add_pbar(
1043
1055
  progress,
1044
- f"Running Component-Level Evals (sync)",
1056
+ "Running Component-Level Evals (sync)",
1045
1057
  total=len(goldens) * 2,
1046
1058
  )
1047
1059
  evaluate_test_cases(progress=progress, pbar_id=pbar_id)
@@ -1551,7 +1563,7 @@ def execute_agentic_test_cases_from_loop(
1551
1563
  tools_called=span.tools_called,
1552
1564
  expected_tools=span.expected_tools,
1553
1565
  )
1554
- if span.metrics == None or llm_test_case == None:
1566
+ if span.metrics is None or llm_test_case is None:
1555
1567
  return
1556
1568
 
1557
1569
  has_task_completion = any(
@@ -1692,7 +1704,7 @@ def execute_agentic_test_cases_from_loop(
1692
1704
  with progress:
1693
1705
  pbar_id = add_pbar(
1694
1706
  progress,
1695
- f"Running Component-Level Evals (sync)",
1707
+ "Running Component-Level Evals (sync)",
1696
1708
  total=len(goldens) * 2,
1697
1709
  )
1698
1710
  yield from evaluate_test_cases(
@@ -1722,6 +1734,11 @@ def a_execute_agentic_test_cases_from_loop(
1722
1734
  _is_assert_test: bool = False,
1723
1735
  ) -> Iterator[TestResult]:
1724
1736
 
1737
+ GATHER_TIMEOUT_SECONDS = (
1738
+ settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
1739
+ + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
1740
+ )
1741
+
1725
1742
  semaphore = asyncio.Semaphore(async_config.max_concurrent)
1726
1743
  original_create_task = asyncio.create_task
1727
1744
 
@@ -1735,43 +1752,225 @@ def a_execute_agentic_test_cases_from_loop(
1735
1752
 
1736
1753
  async def execute_callback_with_semaphore(coroutine: Awaitable):
1737
1754
  async with semaphore:
1738
- return await coroutine
1755
+ return await asyncio.wait_for(
1756
+ coroutine, timeout=settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
1757
+ )
1739
1758
 
1740
1759
  def evaluate_test_cases(
1741
1760
  progress: Optional[Progress] = None,
1742
1761
  pbar_id: Optional[int] = None,
1743
1762
  pbar_callback_id: Optional[int] = None,
1744
1763
  ):
1764
+ # Tasks we scheduled during this iterator run on this event loop.
1765
+ # by gathering these tasks we can avoid re-awaiting coroutines which
1766
+ # can cause cross loop mixups that trigger "future belongs to a different loop" errors
1767
+ created_tasks: list[asyncio.Task] = []
1768
+ task_meta: dict[asyncio.Task, dict] = {}
1769
+ current_golden_ctx = {"index": -1, "name": None, "input": None}
1770
+
1745
1771
  def create_callback_task(coro, **kwargs):
1746
- task = loop.create_task(execute_callback_with_semaphore(coro))
1772
+ # build a descriptive task name for tracking
1773
+ coro_desc = repr(coro)
1774
+ task_name = f"callback[{current_golden_ctx['index']}]:{coro_desc.split()[1] if ' ' in coro_desc else coro_desc}"
1775
+
1776
+ # Wrap the user coroutine in our semaphore runner and bind it to THIS loop.
1777
+ # Keep the resulting Task so we can gather tasks (not raw coroutines) later,
1778
+ # without touching tasks from other loops or already awaited coroutines.
1779
+ task = loop.create_task(
1780
+ execute_callback_with_semaphore(coro), name=task_name
1781
+ )
1782
+
1783
+ # record metadata for debugging
1784
+ MAX_META_INPUT_LENGTH = 120
1785
+ started = time.perf_counter()
1786
+ short_input = current_golden_ctx["input"]
1787
+ if (
1788
+ isinstance(short_input, str)
1789
+ and len(short_input) > MAX_META_INPUT_LENGTH
1790
+ ):
1791
+ short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
1792
+ task_meta[task] = {
1793
+ "golden_index": current_golden_ctx["index"],
1794
+ "golden_name": current_golden_ctx["name"],
1795
+ "input": short_input,
1796
+ "coro": coro_desc,
1797
+ "started": started,
1798
+ }
1747
1799
 
1748
1800
  def on_task_done(t: asyncio.Task):
1801
+ if settings.DEEPEVAL_DEBUG_ASYNC:
1802
+ # Using info level here to make it easy to spot these logs.
1803
+ # We are gated by DEEPEVAL_DEBUG_ASYNC
1804
+ meta = task_meta.get(t, {})
1805
+ duration = time.perf_counter() - meta.get(
1806
+ "started", started
1807
+ )
1808
+
1809
+ if t.cancelled():
1810
+ logger.info(
1811
+ "[deepeval] task CANCELLED %s after %.2fs meta=%r",
1812
+ t.get_name(),
1813
+ duration,
1814
+ meta,
1815
+ )
1816
+ else:
1817
+ exc = t.exception()
1818
+ if exc is not None:
1819
+ logger.error(
1820
+ "[deepeval] task ERROR %s after %.2fs meta=%r",
1821
+ t.get_name(),
1822
+ duration,
1823
+ meta,
1824
+ exc_info=(type(exc), exc, exc.__traceback__),
1825
+ )
1826
+ else:
1827
+ logger.info(
1828
+ "[deepeval] task OK %s after %.2fs meta={'golden_index': %r}",
1829
+ t.get_name(),
1830
+ duration,
1831
+ meta.get("golden_index"),
1832
+ )
1833
+
1749
1834
  update_pbar(progress, pbar_callback_id)
1750
1835
  update_pbar(progress, pbar_id)
1751
1836
 
1752
1837
  task.add_done_callback(on_task_done)
1838
+ created_tasks.append(task)
1753
1839
  return task
1754
1840
 
1755
1841
  asyncio.create_task = create_callback_task
1842
+ # DEBUG
1843
+ # Snapshot tasks that already exist on this loop so we can detect strays
1844
+ baseline_tasks = loop.run_until_complete(_snapshot_tasks())
1756
1845
 
1757
1846
  try:
1758
- for golden in goldens:
1847
+ for index, golden in enumerate(goldens):
1848
+ current_golden_ctx.update(
1849
+ {
1850
+ "index": index,
1851
+ "name": getattr(golden, "name", None),
1852
+ "input": getattr(golden, "input", None),
1853
+ }
1854
+ )
1855
+ prev_task_length = len(created_tasks)
1759
1856
  yield golden
1760
- if global_evaluation_tasks.num_tasks() == 0:
1857
+ # if this golden created no tasks, bump bars now
1858
+ if len(created_tasks) == prev_task_length:
1761
1859
  update_pbar(progress, pbar_callback_id)
1762
1860
  update_pbar(progress, pbar_id)
1763
1861
  finally:
1764
1862
  asyncio.create_task = original_create_task
1765
1863
 
1766
- if global_evaluation_tasks.num_tasks() > 0:
1767
- loop.run_until_complete(
1768
- asyncio.gather(
1769
- *global_evaluation_tasks.get_tasks(),
1864
+ if created_tasks:
1865
+ # Only await tasks we created on this loop in this run.
1866
+ # This will prevent re-awaiting and avoids cross loop "future belongs to a different loop" errors
1867
+ try:
1868
+ loop.run_until_complete(
1869
+ asyncio.wait_for(
1870
+ asyncio.gather(*created_tasks, return_exceptions=True),
1871
+ timeout=GATHER_TIMEOUT_SECONDS,
1872
+ )
1770
1873
  )
1771
- )
1874
+ except asyncio.TimeoutError:
1875
+ import traceback
1876
+
1877
+ pending = [t for t in created_tasks if not t.done()]
1878
+
1879
+ # Log the elapsed time for each task that was pending
1880
+ for t in pending:
1881
+ meta = task_meta.get(t, {})
1882
+ start_time = meta.get("started", time.perf_counter())
1883
+ elapsed_time = time.perf_counter() - start_time
1884
+
1885
+ # Determine if it was a per task or gather timeout based on task's elapsed time
1886
+ if (
1887
+ elapsed_time
1888
+ >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
1889
+ ):
1890
+ timeout_type = "per-task"
1891
+ else:
1892
+ timeout_type = "gather"
1893
+
1894
+ logger.warning(
1895
+ f"[deepeval] gather TIMEOUT after {GATHER_TIMEOUT_SECONDS}s; "
1896
+ f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
1897
+ f"To give tasks more time, consider increasing "
1898
+ f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
1899
+ f"DEEPEVAL_TASK_GATHER_BUFFER_SECONDS to allow more time for gathering results."
1900
+ )
1901
+
1902
+ # Log pending tasks and their stack traces
1903
+ logger.info(
1904
+ " - PENDING %s elapsed_time=%.2fs meta=%s",
1905
+ t.get_name(),
1906
+ elapsed_time,
1907
+ meta,
1908
+ )
1909
+ if loop.get_debug() and settings.DEEPEVAL_DEBUG_ASYNC:
1910
+ frames = t.get_stack(limit=6)
1911
+ if frames:
1912
+ logger.info(" stack:")
1913
+ for fr in frames:
1914
+ for line in traceback.format_stack(fr):
1915
+ logger.info(" " + line.rstrip())
1916
+
1917
+ # Cancel and drain the tasks
1918
+ for t in pending:
1919
+ t.cancel()
1920
+ loop.run_until_complete(
1921
+ asyncio.gather(*created_tasks, return_exceptions=True)
1922
+ )
1923
+ finally:
1924
+
1925
+ # if it is already closed, we are done
1926
+ if loop.is_closed():
1927
+ return
1928
+
1929
+ try:
1930
+ # Find tasks that were created during this run but we didn’t track
1931
+ current_tasks = loop.run_until_complete(_snapshot_tasks())
1932
+ except RuntimeError:
1933
+ # this might happen if the loop is already closing
1934
+ # nothing we can do
1935
+ return
1936
+
1937
+ leftovers = [
1938
+ t
1939
+ for t in current_tasks
1940
+ if t not in baseline_tasks
1941
+ and t not in created_tasks
1942
+ and not t.done()
1943
+ ]
1944
+
1945
+ if not leftovers:
1946
+ return
1947
+
1948
+ if settings.DEEPEVAL_DEBUG_ASYNC:
1949
+ logger.warning(
1950
+ "[deepeval] %d stray task(s) not tracked; cancelling…",
1951
+ len(leftovers),
1952
+ )
1953
+ for t in leftovers:
1954
+ meta = task_meta.get(t, {})
1955
+ name = t.get_name()
1956
+ logger.warning(" - STRAY %s meta=%s", name, meta)
1957
+
1958
+ for t in leftovers:
1959
+ t.cancel()
1960
+
1961
+ # Drain strays so they don’t leak into the next iteration
1962
+ try:
1963
+ loop.run_until_complete(
1964
+ asyncio.gather(*leftovers, return_exceptions=True)
1965
+ )
1966
+ except RuntimeError:
1967
+ # If the loop is closing here, just continue
1968
+ if settings.DEEPEVAL_DEBUG_ASYNC:
1969
+ logger.warning(
1970
+ "[deepeval] failed to drain stray tasks because loop is closing"
1971
+ )
1772
1972
 
1773
1973
  # Evaluate traces
1774
- asyncio.create_task = loop.create_task
1775
1974
  if trace_manager.traces_to_evaluate:
1776
1975
  loop.run_until_complete(
1777
1976
  _a_evaluate_traces(
@@ -1863,7 +2062,7 @@ def a_execute_agentic_test_cases_from_loop(
1863
2062
  with progress:
1864
2063
  pbar_id = add_pbar(
1865
2064
  progress,
1866
- f"Running Component-Level Evals (async)",
2065
+ "Running Component-Level Evals (async)",
1867
2066
  total=len(goldens) * 2,
1868
2067
  )
1869
2068
  pbar_callback_id = add_pbar(