deepeval 3.6.4__tar.gz → 3.6.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (474)
  1. {deepeval-3.6.4 → deepeval-3.6.5}/PKG-INFO +1 -1
  2. deepeval-3.6.5/deepeval/_version.py +1 -0
  3. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/config/settings.py +13 -0
  4. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/dataset.py +8 -2
  5. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/evaluate.py +8 -2
  6. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/execute.py +6 -11
  7. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/types.py +4 -1
  8. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/utils.py +46 -29
  9. deepeval-3.6.5/deepeval/integrations/crewai/__init__.py +3 -0
  10. deepeval-3.6.5/deepeval/integrations/crewai/handler.py +196 -0
  11. deepeval-3.6.5/deepeval/integrations/crewai/wrapper.py +87 -0
  12. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
  13. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/faithfulness/faithfulness.py +8 -0
  14. deepeval-3.6.5/deepeval/synthesizer/chunking/__init__.py +0 -0
  15. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_run/__init__.py +2 -1
  16. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_run/api.py +1 -0
  17. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_run/test_run.py +85 -9
  18. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/__init__.py +2 -0
  19. deepeval-3.6.5/deepeval/tracing/otel/test_exporter.py +35 -0
  20. deepeval-3.6.5/deepeval/tracing/trace_context.py +14 -0
  21. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/tracing.py +7 -6
  22. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/utils.py +2 -86
  23. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/utils.py +149 -1
  24. {deepeval-3.6.4 → deepeval-3.6.5}/pyproject.toml +1 -1
  25. deepeval-3.6.4/deepeval/_version.py +0 -1
  26. deepeval-3.6.4/deepeval/integrations/crewai/__init__.py +0 -4
  27. deepeval-3.6.4/deepeval/integrations/crewai/agent.py +0 -98
  28. deepeval-3.6.4/deepeval/integrations/crewai/handler.py +0 -124
  29. deepeval-3.6.4/deepeval/integrations/crewai/patch.py +0 -41
  30. {deepeval-3.6.4 → deepeval-3.6.5}/LICENSE.md +0 -0
  31. {deepeval-3.6.4 → deepeval-3.6.5}/README.md +0 -0
  32. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/__init__.py +0 -0
  33. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/annotation/__init__.py +0 -0
  34. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/annotation/annotation.py +0 -0
  35. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/annotation/api.py +0 -0
  36. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/__init__.py +0 -0
  37. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/arc/__init__.py +0 -0
  38. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/arc/arc.py +0 -0
  39. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/arc/mode.py +0 -0
  40. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/arc/template.py +0 -0
  41. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/base_benchmark.py +0 -0
  42. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bbq/__init__.py +0 -0
  43. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bbq/bbq.py +0 -0
  44. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bbq/task.py +0 -0
  45. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bbq/template.py +0 -0
  46. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  47. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
  48. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  49. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  50. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  51. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  52. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  53. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  54. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  55. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  56. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  57. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  58. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  59. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  60. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  61. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  62. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  63. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  64. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  65. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  66. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  67. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  68. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  69. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  70. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  71. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  72. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  73. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  74. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  75. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  76. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  77. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  78. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  79. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  80. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  81. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  82. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  83. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  84. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  85. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  86. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  87. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  88. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  89. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  90. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  91. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  92. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  93. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  94. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  95. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  96. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  97. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  98. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  99. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  100. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  101. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  102. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  103. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  104. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  105. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  106. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  107. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
  108. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/bool_q/template.py +0 -0
  109. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/drop/__init__.py +0 -0
  110. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/drop/drop.py +0 -0
  111. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/drop/task.py +0 -0
  112. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/drop/template.py +0 -0
  113. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  114. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
  115. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  116. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  117. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  118. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
  119. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/gsm8k/template.py +0 -0
  120. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  121. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
  122. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/hellaswag/task.py +0 -0
  123. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/hellaswag/template.py +0 -0
  124. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  125. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
  126. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/human_eval/task.py +0 -0
  127. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/human_eval/template.py +0 -0
  128. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  129. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
  130. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/ifeval/template.py +0 -0
  131. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/lambada/__init__.py +0 -0
  132. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/lambada/lambada.py +0 -0
  133. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/lambada/template.py +0 -0
  134. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  135. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
  136. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/logi_qa/task.py +0 -0
  137. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/logi_qa/template.py +0 -0
  138. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  139. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
  140. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/math_qa/task.py +0 -0
  141. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/math_qa/template.py +0 -0
  142. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  143. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
  144. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/mmlu/task.py +0 -0
  145. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/mmlu/template.py +0 -0
  146. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/modes/__init__.py +0 -0
  147. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/results.py +0 -0
  148. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/schema.py +0 -0
  149. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/squad/__init__.py +0 -0
  150. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/squad/squad.py +0 -0
  151. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/squad/task.py +0 -0
  152. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/squad/template.py +0 -0
  153. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/tasks/__init__.py +0 -0
  154. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  155. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  156. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  157. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  158. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
  159. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/utils.py +0 -0
  160. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  161. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/winogrande/template.py +0 -0
  162. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
  163. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/__init__.py +0 -0
  164. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/dotenv_handler.py +0 -0
  165. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/main.py +0 -0
  166. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/server.py +0 -0
  167. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/test.py +0 -0
  168. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/types.py +0 -0
  169. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/cli/utils.py +0 -0
  170. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/confident/__init__.py +0 -0
  171. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/confident/api.py +0 -0
  172. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/confident/types.py +0 -0
  173. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/config/__init__.py +0 -0
  174. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/config/settings_manager.py +0 -0
  175. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/config/utils.py +0 -0
  176. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/constants.py +0 -0
  177. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/contextvars.py +0 -0
  178. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/__init__.py +0 -0
  179. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/api.py +0 -0
  180. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/golden.py +0 -0
  181. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/test_run_tracer.py +0 -0
  182. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/types.py +0 -0
  183. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/dataset/utils.py +0 -0
  184. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/errors.py +0 -0
  185. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/__init__.py +0 -0
  186. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/api.py +0 -0
  187. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/compare.py +0 -0
  188. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/evaluate/configs.py +0 -0
  189. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/__init__.py +0 -0
  190. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/hugging_face/__init__.py +0 -0
  191. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/hugging_face/callback.py +0 -0
  192. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  193. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  194. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/hugging_face/utils.py +0 -0
  195. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/langchain/__init__.py +0 -0
  196. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/langchain/callback.py +0 -0
  197. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/langchain/patch.py +0 -0
  198. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/langchain/utils.py +0 -0
  199. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/llama_index/__init__.py +0 -0
  200. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/llama_index/agent/patched.py +0 -0
  201. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/llama_index/handler.py +0 -0
  202. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/llama_index/utils.py +0 -0
  203. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
  204. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/pydantic_ai/agent.py +0 -0
  205. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/integrations/pydantic_ai/otel.py +0 -0
  206. /deepeval-3.6.4/deepeval/metrics/argument_correctness/__init__.py → /deepeval-3.6.5/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  207. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/key_handler.py +0 -0
  208. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/__init__.py +0 -0
  209. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  210. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
  211. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  212. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/answer_relevancy/template.py +0 -0
  213. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  214. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
  215. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  216. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/arena_g_eval/template.py +0 -0
  217. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/arena_g_eval/utils.py +0 -0
  218. {deepeval-3.6.4/deepeval/metrics/conversation_completeness → deepeval-3.6.5/deepeval/metrics/argument_correctness}/__init__.py +0 -0
  219. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
  220. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/argument_correctness/schema.py +0 -0
  221. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/argument_correctness/template.py +0 -0
  222. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/base_metric.py +0 -0
  223. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/bias/__init__.py +0 -0
  224. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/bias/bias.py +0 -0
  225. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/bias/schema.py +0 -0
  226. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/bias/template.py +0 -0
  227. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  228. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
  229. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_precision/schema.py +0 -0
  230. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_precision/template.py +0 -0
  231. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  232. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
  233. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_recall/schema.py +0 -0
  234. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_recall/template.py +0 -0
  235. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  236. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
  237. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  238. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/contextual_relevancy/template.py +0 -0
  239. {deepeval-3.6.4/deepeval/metrics/conversational_g_eval → deepeval-3.6.5/deepeval/metrics/conversation_completeness}/__init__.py +0 -0
  240. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
  241. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  242. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversation_completeness/template.py +0 -0
  243. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_dag/__init__.py +0 -0
  244. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
  245. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_dag/nodes.py +0 -0
  246. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_dag/templates.py +0 -0
  247. {deepeval-3.6.4/deepeval/metrics/json_correctness → deepeval-3.6.5/deepeval/metrics/conversational_g_eval}/__init__.py +0 -0
  248. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
  249. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  250. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/conversational_g_eval/template.py +0 -0
  251. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/__init__.py +0 -0
  252. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/dag.py +0 -0
  253. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/graph.py +0 -0
  254. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/nodes.py +0 -0
  255. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/schema.py +0 -0
  256. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/templates.py +0 -0
  257. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/dag/utils.py +0 -0
  258. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/faithfulness/__init__.py +0 -0
  259. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/faithfulness/schema.py +0 -0
  260. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/faithfulness/template.py +0 -0
  261. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/g_eval/__init__.py +0 -0
  262. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/g_eval/g_eval.py +0 -0
  263. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/g_eval/schema.py +0 -0
  264. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/g_eval/template.py +0 -0
  265. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/g_eval/utils.py +0 -0
  266. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/hallucination/__init__.py +0 -0
  267. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/hallucination/hallucination.py +0 -0
  268. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/hallucination/schema.py +0 -0
  269. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/hallucination/template.py +0 -0
  270. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/indicator.py +0 -0
  271. {deepeval-3.6.4/deepeval/metrics/knowledge_retention → deepeval-3.6.5/deepeval/metrics/json_correctness}/__init__.py +0 -0
  272. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
  273. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/json_correctness/schema.py +0 -0
  274. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/json_correctness/template.py +0 -0
  275. {deepeval-3.6.4/deepeval/metrics/mcp → deepeval-3.6.5/deepeval/metrics/knowledge_retention}/__init__.py +0 -0
  276. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
  277. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/knowledge_retention/schema.py +0 -0
  278. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/knowledge_retention/template.py +0 -0
  279. {deepeval-3.6.4/deepeval/metrics/mcp_use_metric → deepeval-3.6.5/deepeval/metrics/mcp}/__init__.py +0 -0
  280. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
  281. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
  282. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp/schema.py +0 -0
  283. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp/template.py +0 -0
  284. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_coherence → deepeval-3.6.5/deepeval/metrics/mcp_use_metric}/__init__.py +0 -0
  285. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
  286. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  287. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/mcp_use_metric/template.py +0 -0
  288. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/misuse/__init__.py +0 -0
  289. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/misuse/misuse.py +0 -0
  290. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/misuse/schema.py +0 -0
  291. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/misuse/template.py +0 -0
  292. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
  293. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_editing → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/image_coherence}/__init__.py +0 -0
  294. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
  295. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  296. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  297. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_helpfulness → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/image_editing}/__init__.py +0 -0
  298. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
  299. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  300. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  301. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_reference → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/image_helpfulness}/__init__.py +0 -0
  302. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
  303. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  304. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  305. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/image_reference}/__init__.py +0 -0
  306. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
  307. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  308. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  309. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy}/__init__.py +0 -0
  310. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
  311. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
  312. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
  313. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision}/__init__.py +0 -0
  314. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
  315. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
  316. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
  317. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall}/__init__.py +0 -0
  318. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
  319. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
  320. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
  321. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy}/__init__.py +0 -0
  322. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
  323. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
  324. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
  325. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_g_eval → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_faithfulness}/__init__.py +0 -0
  326. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
  327. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
  328. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
  329. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_g_eval}/__init__.py +0 -0
  330. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
  331. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
  332. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
  333. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
  334. {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/text_to_image → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness}/__init__.py +0 -0
  335. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
  336. {deepeval-3.6.4/deepeval/metrics/prompt_alignment → deepeval-3.6.5/deepeval/metrics/multimodal_metrics/text_to_image}/__init__.py +0 -0
  337. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  338. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  339. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
  340. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/non_advice/__init__.py +0 -0
  341. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/non_advice/non_advice.py +0 -0
  342. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/non_advice/schema.py +0 -0
  343. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/non_advice/template.py +0 -0
  344. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  345. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
  346. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/pii_leakage/schema.py +0 -0
  347. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/pii_leakage/template.py +0 -0
  348. {deepeval-3.6.4/deepeval/metrics/role_adherence → deepeval-3.6.5/deepeval/metrics/prompt_alignment}/__init__.py +0 -0
  349. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
  350. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  351. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/prompt_alignment/template.py +0 -0
  352. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/ragas.py +0 -0
  353. {deepeval-3.6.4/deepeval/metrics/task_completion → deepeval-3.6.5/deepeval/metrics/role_adherence}/__init__.py +0 -0
  354. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
  355. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_adherence/schema.py +0 -0
  356. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_adherence/template.py +0 -0
  357. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_violation/__init__.py +0 -0
  358. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_violation/role_violation.py +0 -0
  359. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_violation/schema.py +0 -0
  360. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/role_violation/template.py +0 -0
  361. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/summarization/__init__.py +0 -0
  362. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/summarization/schema.py +0 -0
  363. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/summarization/summarization.py +0 -0
  364. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/summarization/template.py +0 -0
  365. {deepeval-3.6.4/deepeval/metrics/tool_correctness → deepeval-3.6.5/deepeval/metrics/task_completion}/__init__.py +0 -0
  366. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/task_completion/schema.py +0 -0
  367. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/task_completion/task_completion.py +0 -0
  368. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/task_completion/template.py +0 -0
  369. {deepeval-3.6.4/deepeval/metrics/turn_relevancy → deepeval-3.6.5/deepeval/metrics/tool_correctness}/__init__.py +0 -0
  370. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
  371. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/toxicity/__init__.py +0 -0
  372. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/toxicity/schema.py +0 -0
  373. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/toxicity/template.py +0 -0
  374. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/toxicity/toxicity.py +0 -0
  375. {deepeval-3.6.4/deepeval/plugins → deepeval-3.6.5/deepeval/metrics/turn_relevancy}/__init__.py +0 -0
  376. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  377. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/turn_relevancy/template.py +0 -0
  378. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
  379. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/metrics/utils.py +0 -0
  380. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/__init__.py +0 -0
  381. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/_summac_model.py +0 -0
  382. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/answer_relevancy_model.py +0 -0
  383. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/base_model.py +0 -0
  384. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/detoxify_model.py +0 -0
  385. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/embedding_models/__init__.py +0 -0
  386. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
  387. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
  388. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
  389. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
  390. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/hallucination_model.py +0 -0
  391. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/__init__.py +0 -0
  392. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
  393. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/anthropic_model.py +0 -0
  394. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/azure_model.py +0 -0
  395. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/deepseek_model.py +0 -0
  396. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/gemini_model.py +0 -0
  397. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/grok_model.py +0 -0
  398. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/kimi_model.py +0 -0
  399. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/litellm_model.py +0 -0
  400. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/local_model.py +0 -0
  401. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/ollama_model.py +0 -0
  402. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/openai_model.py +0 -0
  403. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/llms/utils.py +0 -0
  404. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/mlllms/__init__.py +0 -0
  405. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/mlllms/gemini_model.py +0 -0
  406. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/mlllms/ollama_model.py +0 -0
  407. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/mlllms/openai_model.py +0 -0
  408. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/retry_policy.py +0 -0
  409. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/summac_model.py +0 -0
  410. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/unbias_model.py +0 -0
  411. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/models/utils.py +0 -0
  412. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai/__init__.py +0 -0
  413. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai/extractors.py +0 -0
  414. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai/patch.py +0 -0
  415. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai/utils.py +0 -0
  416. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai_agents/__init__.py +0 -0
  417. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai_agents/agent.py +0 -0
  418. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai_agents/callback_handler.py +0 -0
  419. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai_agents/extractors.py +0 -0
  420. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai_agents/patch.py +0 -0
  421. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/openai_agents/runner.py +0 -0
  422. {deepeval-3.6.4/deepeval/synthesizer/chunking → deepeval-3.6.5/deepeval/plugins}/__init__.py +0 -0
  423. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/plugins/plugin.py +0 -0
  424. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/progress_context.py +0 -0
  425. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/prompt/__init__.py +0 -0
  426. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/prompt/api.py +0 -0
  427. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/prompt/prompt.py +0 -0
  428. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/prompt/utils.py +0 -0
  429. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/py.typed +0 -0
  430. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/red_teaming/README.md +0 -0
  431. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/scorer/__init__.py +0 -0
  432. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/scorer/scorer.py +0 -0
  433. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/simulator/__init__.py +0 -0
  434. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/simulator/conversation_simulator.py +0 -0
  435. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/simulator/schema.py +0 -0
  436. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/simulator/template.py +0 -0
  437. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/singleton.py +0 -0
  438. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/__init__.py +0 -0
  439. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/base_synthesizer.py +0 -0
  440. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/chunking/context_generator.py +0 -0
  441. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
  442. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/config.py +0 -0
  443. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/schema.py +0 -0
  444. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/synthesizer.py +0 -0
  445. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/templates/__init__.py +0 -0
  446. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/templates/template.py +0 -0
  447. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/templates/template_extraction.py +0 -0
  448. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/templates/template_prompt.py +0 -0
  449. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/types.py +0 -0
  450. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/synthesizer/utils.py +0 -0
  451. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/telemetry.py +0 -0
  452. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/__init__.py +0 -0
  453. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/arena_test_case.py +0 -0
  454. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/conversational_test_case.py +0 -0
  455. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/llm_test_case.py +0 -0
  456. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/mcp.py +0 -0
  457. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/mllm_test_case.py +0 -0
  458. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_case/utils.py +0 -0
  459. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_run/cache.py +0 -0
  460. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_run/hooks.py +0 -0
  461. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/test_run/hyperparameters.py +0 -0
  462. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/api.py +0 -0
  463. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/context.py +0 -0
  464. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/offline_evals/__init__.py +0 -0
  465. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/offline_evals/api.py +0 -0
  466. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/offline_evals/span.py +0 -0
  467. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/offline_evals/thread.py +0 -0
  468. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/offline_evals/trace.py +0 -0
  469. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/otel/__init__.py +0 -0
  470. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/otel/exporter.py +0 -0
  471. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/otel/utils.py +0 -0
  472. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/patchers.py +0 -0
  473. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/perf_epoch_bridge.py +0 -0
  474. {deepeval-3.6.4 → deepeval-3.6.5}/deepeval/tracing/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.6.4
3
+ Version: 3.6.5
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -0,0 +1 @@
1
+ __version__: str = "3.6.5"
@@ -180,6 +180,19 @@ class Settings(BaseSettings):
180
180
  # into this directory. The directory will be created on demand.
181
181
  DEEPEVAL_RESULTS_FOLDER: Optional[Path] = None
182
182
 
183
+ # Display / Truncation
184
+ DEEPEVAL_MAXLEN_TINY: Optional[int] = 40
185
+ DEEPEVAL_MAXLEN_SHORT: Optional[int] = 60
186
+ DEEPEVAL_MAXLEN_MEDIUM: Optional[int] = 120
187
+ DEEPEVAL_MAXLEN_LONG: Optional[int] = 240
188
+
189
+ # If set, this overrides the default max_len used by deepeval/utils shorten
190
+ # falls back to DEEPEVAL_MAXLEN_LONG when None.
191
+ DEEPEVAL_SHORTEN_DEFAULT_MAXLEN: Optional[int] = None
192
+
193
+ # Optional global suffix (keeps your "..." default).
194
+ DEEPEVAL_SHORTEN_SUFFIX: Optional[str] = "..."
195
+
183
196
  #
184
197
  # GPU and perf toggles
185
198
  #
@@ -1266,11 +1266,17 @@ class EvaluationDataset:
1266
1266
  detach(ctx_token)
1267
1267
 
1268
1268
  else:
1269
- confident_link = global_test_run_manager.wrap_up_test_run(
1269
+ res = global_test_run_manager.wrap_up_test_run(
1270
1270
  run_duration, display_table=False
1271
1271
  )
1272
+ if isinstance(res, tuple):
1273
+ confident_link, test_run_id = res
1274
+ else:
1275
+ confident_link = test_run_id = None
1272
1276
  return EvaluationResult(
1273
- test_results=test_results, confident_link=confident_link
1277
+ test_results=test_results,
1278
+ confident_link=confident_link,
1279
+ test_run_id=test_run_id,
1274
1280
  )
1275
1281
 
1276
1282
  def evaluate(self, task: Task):
@@ -268,11 +268,17 @@ def evaluate(
268
268
  test_run = global_test_run_manager.get_test_run()
269
269
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
270
270
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
271
- confident_link = global_test_run_manager.wrap_up_test_run(
271
+ res = global_test_run_manager.wrap_up_test_run(
272
272
  run_duration, display_table=False
273
273
  )
274
+ if isinstance(res, tuple):
275
+ confident_link, test_run_id = res
276
+ else:
277
+ confident_link = test_run_id = None
274
278
  return EvaluationResult(
275
- test_results=test_results, confident_link=confident_link
279
+ test_results=test_results,
280
+ confident_link=confident_link,
281
+ test_run_id=test_run_id,
276
282
  )
277
283
  elif metric_collection:
278
284
  api = Api()
@@ -45,9 +45,7 @@ from deepeval.dataset import Golden
45
45
  from deepeval.contextvars import set_current_golden, reset_current_golden
46
46
  from deepeval.errors import MissingTestCaseParamsError
47
47
  from deepeval.metrics.utils import copy_metrics
48
- from deepeval.utils import (
49
- get_or_create_event_loop,
50
- )
48
+ from deepeval.utils import get_or_create_event_loop, shorten, len_medium
51
49
  from deepeval.telemetry import capture_evaluation_run
52
50
  from deepeval.metrics import (
53
51
  BaseMetric,
@@ -1802,14 +1800,11 @@ def a_execute_agentic_test_cases_from_loop(
1802
1800
  )
1803
1801
 
1804
1802
  # record metadata for debugging
1805
- MAX_META_INPUT_LENGTH = 120
1806
1803
  started = time.perf_counter()
1807
- short_input = current_golden_ctx["input"]
1808
- if (
1809
- isinstance(short_input, str)
1810
- and len(short_input) > MAX_META_INPUT_LENGTH
1811
- ):
1812
- short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
1804
+ short_input = current_golden_ctx.get("input")
1805
+ if isinstance(short_input, str):
1806
+ short_input = shorten(short_input, len_medium())
1807
+
1813
1808
  task_meta[task] = {
1814
1809
  "golden_index": current_golden_ctx["index"],
1815
1810
  "golden_name": current_golden_ctx["name"],
@@ -1972,7 +1967,7 @@ def a_execute_agentic_test_cases_from_loop(
1972
1967
 
1973
1968
  if settings.DEEPEVAL_DEBUG_ASYNC:
1974
1969
  logger.warning(
1975
- "[deepeval] %d stray task(s) not tracked; cancelling",
1970
+ "[deepeval] %d stray task(s) not tracked; cancelling...",
1976
1971
  len(leftovers),
1977
1972
  )
1978
1973
  for t in leftovers:
@@ -1,7 +1,8 @@
1
1
  from typing import Optional, List, Union, Dict
2
2
  from dataclasses import dataclass
3
3
  from pydantic import BaseModel
4
- from deepeval.test_run import MetricData
4
+
5
+ from deepeval.test_run.api import MetricData, TurnApi
5
6
  from deepeval.test_case import MLLMImage
6
7
 
7
8
 
@@ -19,9 +20,11 @@ class TestResult:
19
20
  expected_output: Optional[str] = None
20
21
  context: Optional[List[str]] = None
21
22
  retrieval_context: Optional[List[str]] = None
23
+ turns: Optional[List[TurnApi]] = None
22
24
  additional_metadata: Optional[Dict] = None
23
25
 
24
26
 
25
27
  class EvaluationResult(BaseModel):
26
28
  test_results: List[TestResult]
27
29
  confident_link: Optional[str]
30
+ test_run_id: Optional[str]
@@ -1,9 +1,10 @@
1
1
  import ast
2
2
  import inspect
3
- from typing import Optional, List, Callable, Union, Dict
4
- import os, time
5
-
3
+ from typing import Optional, List, Callable, Union
4
+ import os
5
+ import time
6
6
 
7
+ from deepeval.utils import format_turn
7
8
  from deepeval.test_case.conversational_test_case import Turn
8
9
  from deepeval.test_run.api import TurnApi
9
10
  from deepeval.test_run.test_run import TestRunResultDisplay
@@ -34,6 +35,29 @@ from deepeval.tracing.utils import (
34
35
  )
35
36
 
36
37
 
38
+ def _is_metric_successful(metric_data: MetricData) -> bool:
39
+ """
40
+ Robustly determine success for a metric row.
41
+
42
+ Rationale:
43
+ - If the metric recorded an error, treat as failure.
44
+ - Be defensive: custom rows may not be MetricData at runtime.
45
+ """
46
+ if getattr(metric_data, "error", None):
47
+ return False
48
+
49
+ s = getattr(metric_data, "success", None)
50
+ if isinstance(s, bool):
51
+ return s
52
+ if s is None:
53
+ return False
54
+ if isinstance(s, (int, float)):
55
+ return bool(s)
56
+ if isinstance(s, str):
57
+ return s.strip().lower() in {"true", "t", "1", "yes", "y"}
58
+ return False
59
+
60
+
37
61
  def create_metric_data(metric: BaseMetric) -> MetricData:
38
62
  if metric.error is not None:
39
63
  return MetricData(
@@ -75,6 +99,7 @@ def create_test_result(
75
99
  metrics_data=api_test_case.metrics_data,
76
100
  conversational=True,
77
101
  additional_metadata=api_test_case.additional_metadata,
102
+ turns=api_test_case.turns,
78
103
  )
79
104
  else:
80
105
  multimodal = (
@@ -112,6 +137,7 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
112
137
  return TurnApi(
113
138
  role=turn.role,
114
139
  content=turn.content,
140
+ user_id=turn.user_id,
115
141
  retrievalContext=turn.retrieval_context,
116
142
  toolsCalled=turn.tools_called,
117
143
  additionalMetadata=turn.additional_metadata,
@@ -372,17 +398,7 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
372
398
  print("Metrics Summary\n")
373
399
 
374
400
  for metric_data in test_result.metrics_data:
375
- successful = True
376
- if metric_data.error is not None:
377
- successful = False
378
- else:
379
- # This try block is for user defined custom metrics,
380
- # which might not handle the score == undefined case elegantly
381
- try:
382
- if not metric_data.success:
383
- successful = False
384
- except:
385
- successful = False
401
+ successful = _is_metric_successful(metric_data)
386
402
 
387
403
  if not successful:
388
404
  print(
@@ -401,9 +417,14 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
401
417
 
402
418
  elif test_result.conversational:
403
419
  print("For conversational test case:\n")
404
- print(
405
- f" - Unable to print conversational test case. Run 'deepeval login' to view conversational evaluations in full."
406
- )
420
+ if test_result.turns:
421
+ print(" Turns:")
422
+ turns = sorted(test_result.turns, key=lambda t: t.order)
423
+ for t in turns:
424
+ print(format_turn(t))
425
+ else:
426
+ print(" - No turns recorded in this test case.")
427
+
407
428
  else:
408
429
  print("For test case:\n")
409
430
  print(f" - input: {test_result.input}")
@@ -470,15 +491,7 @@ def write_test_result_to_file(
470
491
  file.write("Metrics Summary\n\n")
471
492
 
472
493
  for metric_data in test_result.metrics_data:
473
- successful = True
474
- if metric_data.error is not None:
475
- successful = False
476
- else:
477
- try:
478
- if not metric_data.success:
479
- successful = False
480
- except:
481
- successful = False
494
+ successful = _is_metric_successful(metric_data)
482
495
 
483
496
  if not successful:
484
497
  file.write(
@@ -500,9 +513,13 @@ def write_test_result_to_file(
500
513
  file.write(f" - actual output: {test_result.actual_output}\n")
501
514
  elif test_result.conversational:
502
515
  file.write("For conversational test case:\n\n")
503
- file.write(
504
- " - Unable to print conversational test case. Run 'deepeval login' to view conversational evaluations in full.\n"
505
- )
516
+ if test_result.turns:
517
+ file.write(" Turns:\n")
518
+ turns = sorted(test_result.turns, key=lambda t: t.order)
519
+ for t in turns:
520
+ file.write(format_turn(t) + "\n")
521
+ else:
522
+ file.write(" - No turns recorded in this test case.\n")
506
523
  else:
507
524
  file.write("For test case:\n\n")
508
525
  file.write(f" - input: {test_result.input}\n")
@@ -0,0 +1,3 @@
1
+ from .handler import instrument_crewai
2
+
3
+ __all__ = ["instrument_crewai"]
@@ -0,0 +1,196 @@
1
+ import logging
2
+ import deepeval
3
+
4
+ from typing import Optional
5
+ from deepeval.telemetry import capture_tracing_integration
6
+ from deepeval.tracing.context import current_span_context, current_trace_context
7
+ from deepeval.tracing.tracing import Observer
8
+ from deepeval.tracing.types import LlmSpan
9
+ from deepeval.config.settings import get_settings
10
+
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ try:
16
+ from crewai.utilities.events.base_event_listener import BaseEventListener
17
+ from crewai.events import (
18
+ CrewKickoffStartedEvent,
19
+ CrewKickoffCompletedEvent,
20
+ LLMCallStartedEvent,
21
+ LLMCallCompletedEvent,
22
+ AgentExecutionStartedEvent,
23
+ AgentExecutionCompletedEvent,
24
+ ToolUsageStartedEvent,
25
+ ToolUsageFinishedEvent,
26
+ )
27
+
28
+ crewai_installed = True
29
+ except ImportError as e:
30
+ if get_settings().DEEPEVAL_VERBOSE_MODE:
31
+ if isinstance(e, ModuleNotFoundError):
32
+ logger.warning(
33
+ "Optional crewai dependency not installed: %s",
34
+ e.name,
35
+ stacklevel=2,
36
+ )
37
+ else:
38
+ logger.warning(
39
+ "Optional crewai import failed: %s",
40
+ e,
41
+ stacklevel=2,
42
+ )
43
+
44
+ crewai_installed = False
45
+
46
+ IS_WRAPPED_ALL = False
47
+
48
+
49
+ def is_crewai_installed():
50
+ if not crewai_installed:
51
+ raise ImportError(
52
+ "CrewAI is not installed. Please install it with `pip install crewai`."
53
+ )
54
+
55
+
56
+ class CrewAIEventsListener(BaseEventListener):
57
+ def __init__(self):
58
+ is_crewai_installed()
59
+ super().__init__()
60
+ self.span_observers: dict[str, Observer] = {}
61
+
62
+ @staticmethod
63
+ def get_tool_execution_id(source, event) -> str:
64
+ source_id = id(source)
65
+ task_id = getattr(event, "task_id", "unknown")
66
+ agent_id = getattr(event, "agent_id", "unknown")
67
+ tool_name = getattr(event, "tool_name", "unknown")
68
+ execution_id = f"tool_{source_id}_{task_id}_{agent_id}_{tool_name}"
69
+
70
+ return execution_id
71
+
72
+ def setup_listeners(self, crewai_event_bus):
73
+ @crewai_event_bus.on(CrewKickoffStartedEvent)
74
+ def on_crew_started(source, event: CrewKickoffStartedEvent):
75
+ # Assuming that this event is called in the crew.kickoff method
76
+ current_span = current_span_context.get()
77
+
78
+ # set the input
79
+ if current_span:
80
+ current_span.input = event.inputs
81
+
82
+ # set trace input
83
+ current_trace = current_trace_context.get()
84
+ if current_trace:
85
+ current_trace.input = event.inputs
86
+
87
+ @crewai_event_bus.on(CrewKickoffCompletedEvent)
88
+ def on_crew_completed(source, event: CrewKickoffCompletedEvent):
89
+ # Assuming that this event is called in the crew.kickoff method
90
+ current_span = current_span_context.get()
91
+
92
+ # set the output
93
+ if current_span:
94
+ current_span.output = str(event.output)
95
+
96
+ # set trace output
97
+ current_trace = current_trace_context.get()
98
+ if current_trace:
99
+ current_trace.output = str(event.output)
100
+
101
+ @crewai_event_bus.on(LLMCallStartedEvent)
102
+ def on_llm_started(source, event: LLMCallStartedEvent):
103
+ # Assuming that this event is called in the llm.call method
104
+ current_span = current_span_context.get()
105
+
106
+ # set the input
107
+ if current_span:
108
+ current_span.input = event.messages
109
+
110
+ # set the model
111
+ if isinstance(current_span, LlmSpan):
112
+ current_span.model = event.model
113
+
114
+ @crewai_event_bus.on(LLMCallCompletedEvent)
115
+ def on_llm_completed(source, event: LLMCallCompletedEvent):
116
+ # Assuming that this event is called in the llm.call method
117
+ current_span = current_span_context.get()
118
+
119
+ # set the output
120
+ if current_span:
121
+ current_span.output = event.response
122
+
123
+ @crewai_event_bus.on(AgentExecutionStartedEvent)
124
+ def on_agent_started(source, event: AgentExecutionStartedEvent):
125
+ # Assuming that this event is called in the agent.execute_task method
126
+ current_span = current_span_context.get()
127
+
128
+ # set the input
129
+ if current_span:
130
+ current_span.input = event.task_prompt
131
+
132
+ @crewai_event_bus.on(AgentExecutionCompletedEvent)
133
+ def on_agent_completed(source, event: AgentExecutionCompletedEvent):
134
+ # Assuming that this event is called in the agent.execute_task method
135
+ current_span = current_span_context.get()
136
+
137
+ # set the output
138
+ if current_span:
139
+ current_span.output = event.output
140
+
141
+ @crewai_event_bus.on(ToolUsageStartedEvent)
142
+ def on_tool_started(source, event: ToolUsageStartedEvent):
143
+ observer = Observer(
144
+ span_type="tool",
145
+ func_name=event.tool_name,
146
+ function_kwargs=event.tool_args,
147
+ )
148
+ self.span_observers[self.get_tool_execution_id(source, event)] = (
149
+ observer
150
+ )
151
+ observer.__enter__()
152
+
153
+ @crewai_event_bus.on(ToolUsageFinishedEvent)
154
+ def on_tool_completed(source, event: ToolUsageFinishedEvent):
155
+ observer = self.span_observers.pop(
156
+ self.get_tool_execution_id(source, event)
157
+ )
158
+ if observer:
159
+ current_span = current_span_context.get()
160
+ if current_span:
161
+ current_span.output = event.output
162
+ observer.__exit__(None, None, None)
163
+
164
+
165
+ def instrument_crewai(api_key: Optional[str] = None):
166
+ is_crewai_installed()
167
+ with capture_tracing_integration("crewai"):
168
+ if api_key:
169
+ deepeval.login(api_key)
170
+
171
+ wrap_all()
172
+
173
+ CrewAIEventsListener()
174
+
175
+
176
+ def wrap_all():
177
+ global IS_WRAPPED_ALL
178
+
179
+ if not IS_WRAPPED_ALL:
180
+ from deepeval.integrations.crewai.wrapper import (
181
+ wrap_crew_kickoff,
182
+ wrap_crew_kickoff_for_each,
183
+ wrap_crew_kickoff_async,
184
+ wrap_crew_kickoff_for_each_async,
185
+ wrap_llm_call,
186
+ wrap_agent_execute_task,
187
+ )
188
+
189
+ wrap_crew_kickoff()
190
+ wrap_crew_kickoff_for_each()
191
+ wrap_crew_kickoff_async()
192
+ wrap_crew_kickoff_for_each_async()
193
+ wrap_llm_call()
194
+ wrap_agent_execute_task()
195
+
196
+ IS_WRAPPED_ALL = True
@@ -0,0 +1,87 @@
1
+ from crewai.llm import LLM
2
+ from crewai.crew import Crew
3
+ from crewai.agent import Agent
4
+ from functools import wraps
5
+ from deepeval.tracing.tracing import Observer
6
+
7
+
8
+ def wrap_crew_kickoff():
9
+ original_kickoff = Crew.kickoff
10
+
11
+ @wraps(original_kickoff)
12
+ def wrapper(self, *args, **kwargs):
13
+ with Observer(span_type="crew", func_name="kickoff"):
14
+ result = original_kickoff(self, *args, **kwargs)
15
+
16
+ return result
17
+
18
+ Crew.kickoff = wrapper
19
+
20
+
21
+ def wrap_crew_kickoff_for_each():
22
+ original_kickoff_for_each = Crew.kickoff_for_each
23
+
24
+ @wraps(original_kickoff_for_each)
25
+ def wrapper(self, *args, **kwargs):
26
+ with Observer(span_type="crew", func_name="kickoff_for_each"):
27
+ result = original_kickoff_for_each(self, *args, **kwargs)
28
+
29
+ return result
30
+
31
+ Crew.kickoff_for_each = wrapper
32
+
33
+
34
+ def wrap_crew_kickoff_async():
35
+ original_kickoff_async = Crew.kickoff_async
36
+
37
+ @wraps(original_kickoff_async)
38
+ async def wrapper(self, *args, **kwargs):
39
+ with Observer(span_type="crew", func_name="kickoff_async"):
40
+ result = await original_kickoff_async(self, *args, **kwargs)
41
+
42
+ return result
43
+
44
+ Crew.kickoff_async = wrapper
45
+
46
+
47
+ def wrap_crew_kickoff_for_each_async():
48
+ original_kickoff_for_each_async = Crew.kickoff_for_each_async
49
+
50
+ @wraps(original_kickoff_for_each_async)
51
+ async def wrapper(self, *args, **kwargs):
52
+ with Observer(span_type="crew", func_name="kickoff_for_each_async"):
53
+ result = await original_kickoff_for_each_async(
54
+ self, *args, **kwargs
55
+ )
56
+
57
+ return result
58
+
59
+ Crew.kickoff_for_each_async = wrapper
60
+
61
+
62
+ def wrap_llm_call():
63
+ original_llm_call = LLM.call
64
+
65
+ @wraps(original_llm_call)
66
+ def wrapper(self, *args, **kwargs):
67
+ with Observer(
68
+ span_type="llm",
69
+ func_name="call",
70
+ observe_kwargs={"model": "temp_model"},
71
+ ):
72
+ result = original_llm_call(self, *args, **kwargs)
73
+ return result
74
+
75
+ LLM.call = wrapper
76
+
77
+
78
+ def wrap_agent_execute_task():
79
+ original_execute_task = Agent.execute_task
80
+
81
+ @wraps(original_execute_task)
82
+ def wrapper(self, *args, **kwargs):
83
+ with Observer(span_type="agent", func_name="execute_task"):
84
+ result = original_execute_task(self, *args, **kwargs)
85
+ return result
86
+
87
+ Agent.execute_task = wrapper
@@ -1,7 +1,19 @@
1
1
  import json
2
+ import logging
2
3
  import os
3
4
  from typing import Literal, Optional, List
4
5
 
6
+ from deepeval.config.settings import get_settings
7
+ from deepeval.confident.api import get_confident_api_key
8
+ from deepeval.prompt import Prompt
9
+ from deepeval.tracing.context import current_trace_context
10
+ from deepeval.tracing.types import Trace
11
+ from deepeval.tracing.otel.utils import to_hex_string
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
5
17
  try:
6
18
  from pydantic_ai.models.instrumented import InstrumentationSettings
7
19
  from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
@@ -11,7 +23,20 @@ try:
11
23
  )
12
24
 
13
25
  dependency_installed = True
14
- except:
26
+ except ImportError as e:
27
+ if get_settings().DEEPEVAL_VERBOSE_MODE:
28
+ if isinstance(e, ModuleNotFoundError):
29
+ logger.warning(
30
+ "Optional tracing dependency not installed: %s",
31
+ e.name,
32
+ stacklevel=2,
33
+ )
34
+ else:
35
+ logger.warning(
36
+ "Optional tracing import failed: %s",
37
+ e,
38
+ stacklevel=2,
39
+ )
15
40
  dependency_installed = False
16
41
 
17
42
 
@@ -25,6 +50,10 @@ def is_dependency_installed():
25
50
 
26
51
  from deepeval.confident.api import get_confident_api_key
27
52
  from deepeval.prompt import Prompt
53
+ from deepeval.tracing.otel.test_exporter import test_exporter
54
+ from deepeval.tracing.context import current_trace_context
55
+ from deepeval.tracing.types import Trace
56
+ from deepeval.tracing.otel.utils import to_hex_string
28
57
 
29
58
  # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
30
59
  OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
@@ -37,6 +66,12 @@ class SpanInterceptor(SpanProcessor):
37
66
 
38
67
  def on_start(self, span, parent_context):
39
68
 
69
+ # set trace uuid
70
+ _current_trace_context = current_trace_context.get()
71
+ if _current_trace_context and isinstance(_current_trace_context, Trace):
72
+ _otel_trace_id = span.get_span_context().trace_id
73
+ _current_trace_context.uuid = to_hex_string(_otel_trace_id, 32)
74
+
40
75
  # set trace attributes
41
76
  if self.settings.thread_id:
42
77
  span.set_attribute(
@@ -148,8 +183,9 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
148
183
  confident_prompt: Optional[Prompt] = None,
149
184
  llm_metric_collection: Optional[str] = None,
150
185
  agent_metric_collection: Optional[str] = None,
151
- tool_metric_collection_map: dict = {},
186
+ tool_metric_collection_map: Optional[dict] = None,
152
187
  trace_metric_collection: Optional[str] = None,
188
+ is_test_mode: Optional[bool] = False,
153
189
  ):
154
190
  is_dependency_installed()
155
191
 
@@ -162,7 +198,7 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
162
198
  ]:
163
199
  self.environment = _environment
164
200
 
165
- self.tool_metric_collection_map = tool_metric_collection_map
201
+ self.tool_metric_collection_map = tool_metric_collection_map or {}
166
202
  self.name = name
167
203
  self.thread_id = thread_id
168
204
  self.user_id = user_id
@@ -185,12 +221,15 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
185
221
  span_interceptor = SpanInterceptor(self)
186
222
  trace_provider.add_span_processor(span_interceptor)
187
223
 
188
- trace_provider.add_span_processor(
189
- BatchSpanProcessor(
190
- OTLPSpanExporter(
191
- endpoint=OTLP_ENDPOINT,
192
- headers={"x-confident-api-key": api_key},
224
+ if is_test_mode:
225
+ trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
226
+ else:
227
+ trace_provider.add_span_processor(
228
+ BatchSpanProcessor(
229
+ OTLPSpanExporter(
230
+ endpoint=OTLP_ENDPOINT,
231
+ headers={"x-confident-api-key": api_key},
232
+ )
193
233
  )
194
234
  )
195
- )
196
235
  super().__init__(tracer_provider=trace_provider)
@@ -41,6 +41,7 @@ class FaithfulnessMetric(BaseMetric):
41
41
  strict_mode: bool = False,
42
42
  verbose_mode: bool = False,
43
43
  truths_extraction_limit: Optional[int] = None,
44
+ penalize_ambiguous_claims: bool = False,
44
45
  evaluation_template: Type[FaithfulnessTemplate] = FaithfulnessTemplate,
45
46
  ):
46
47
  self.threshold = 1 if strict_mode else threshold
@@ -51,6 +52,7 @@ class FaithfulnessMetric(BaseMetric):
51
52
  self.strict_mode = strict_mode
52
53
  self.verbose_mode = verbose_mode
53
54
  self.evaluation_template = evaluation_template
55
+ self.penalize_ambiguous_claims = penalize_ambiguous_claims
54
56
 
55
57
  self.truths_extraction_limit = truths_extraction_limit
56
58
  if self.truths_extraction_limit is not None:
@@ -329,6 +331,12 @@ class FaithfulnessMetric(BaseMetric):
329
331
  if verdict.verdict.strip().lower() != "no":
330
332
  faithfulness_count += 1
331
333
 
334
+ if (
335
+ self.penalize_ambiguous_claims
336
+ and verdict.verdict.strip().lower() == "idk"
337
+ ):
338
+ faithfulness_count -= 1
339
+
332
340
  score = faithfulness_count / number_of_verdicts
333
341
  return 0 if self.strict_mode and score < self.threshold else score
334
342