deepeval 3.4.9__tar.gz → 3.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (468)
  1. {deepeval-3.4.9 → deepeval-3.5.1}/PKG-INFO +14 -13
  2. {deepeval-3.4.9 → deepeval-3.5.1}/README.md +13 -12
  3. deepeval-3.5.1/deepeval/_version.py +1 -0
  4. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/drop/drop.py +2 -3
  5. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/hellaswag/hellaswag.py +2 -2
  6. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/logi_qa/logi_qa.py +2 -2
  7. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/math_qa/math_qa.py +2 -2
  8. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/mmlu/mmlu.py +2 -2
  9. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/truthful_qa/truthful_qa.py +2 -2
  10. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/confident/api.py +3 -0
  11. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/langchain/callback.py +21 -0
  12. deepeval-3.5.1/deepeval/integrations/pydantic_ai/__init__.py +3 -0
  13. deepeval-3.4.9/deepeval/integrations/pydantic_ai/setup.py → deepeval-3.5.1/deepeval/integrations/pydantic_ai/otel.py +0 -8
  14. deepeval-3.5.1/deepeval/integrations/pydantic_ai/patcher.py +376 -0
  15. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/__init__.py +1 -1
  16. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/answer_relevancy/template.py +13 -38
  17. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/faithfulness/template.py +17 -27
  18. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/grok_model.py +1 -1
  19. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/kimi_model.py +1 -1
  20. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/prompt/api.py +22 -4
  21. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/prompt/prompt.py +131 -17
  22. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/synthesizer.py +17 -9
  23. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/api.py +3 -0
  24. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/context.py +3 -1
  25. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/perf_epoch_bridge.py +4 -4
  26. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/tracing.py +12 -2
  27. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/types.py +3 -0
  28. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/utils.py +6 -2
  29. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/utils.py +2 -2
  30. {deepeval-3.4.9 → deepeval-3.5.1}/pyproject.toml +3 -1
  31. deepeval-3.4.9/deepeval/_version.py +0 -1
  32. deepeval-3.4.9/deepeval/integrations/pydantic_ai/__init__.py +0 -5
  33. deepeval-3.4.9/deepeval/integrations/pydantic_ai/agent.py +0 -364
  34. {deepeval-3.4.9 → deepeval-3.5.1}/LICENSE.md +0 -0
  35. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/__init__.py +0 -0
  36. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/annotation/__init__.py +0 -0
  37. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/annotation/annotation.py +0 -0
  38. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/annotation/api.py +0 -0
  39. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/__init__.py +0 -0
  40. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/arc/__init__.py +0 -0
  41. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/arc/arc.py +0 -0
  42. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/arc/mode.py +0 -0
  43. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/arc/template.py +0 -0
  44. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/base_benchmark.py +0 -0
  45. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bbq/__init__.py +0 -0
  46. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bbq/bbq.py +0 -0
  47. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bbq/task.py +0 -0
  48. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bbq/template.py +0 -0
  49. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  50. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
  51. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  52. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  53. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  54. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  55. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  56. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  57. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  58. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  59. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  60. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  61. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  62. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  63. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  64. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  65. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  66. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  67. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  68. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  69. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  70. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  71. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  72. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  73. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  74. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  75. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  76. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  77. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  78. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  79. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  80. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  81. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  82. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  83. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  84. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  85. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  86. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  87. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  88. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  89. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  90. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  91. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  92. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  93. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  94. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  95. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  96. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  97. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  98. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  99. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  100. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  101. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  102. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  103. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  104. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  105. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  106. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  107. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  108. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  109. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  110. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
  111. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/bool_q/template.py +0 -0
  112. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/drop/__init__.py +0 -0
  113. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/drop/task.py +0 -0
  114. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/drop/template.py +0 -0
  115. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  116. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
  117. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  118. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  119. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  120. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
  121. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/gsm8k/template.py +0 -0
  122. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  123. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/hellaswag/task.py +0 -0
  124. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/hellaswag/template.py +0 -0
  125. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  126. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
  127. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/human_eval/task.py +0 -0
  128. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/human_eval/template.py +0 -0
  129. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  130. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
  131. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/ifeval/template.py +0 -0
  132. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/lambada/__init__.py +0 -0
  133. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/lambada/lambada.py +0 -0
  134. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/lambada/template.py +0 -0
  135. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  136. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/logi_qa/task.py +0 -0
  137. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/logi_qa/template.py +0 -0
  138. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  139. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/math_qa/task.py +0 -0
  140. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/math_qa/template.py +0 -0
  141. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  142. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/mmlu/task.py +0 -0
  143. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/mmlu/template.py +0 -0
  144. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/modes/__init__.py +0 -0
  145. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/results.py +0 -0
  146. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/schema.py +0 -0
  147. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/squad/__init__.py +0 -0
  148. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/squad/squad.py +0 -0
  149. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/squad/task.py +0 -0
  150. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/squad/template.py +0 -0
  151. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/tasks/__init__.py +0 -0
  152. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  153. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  154. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  155. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  156. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/utils.py +0 -0
  157. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  158. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/winogrande/template.py +0 -0
  159. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
  160. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/__init__.py +0 -0
  161. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/dotenv_handler.py +0 -0
  162. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/main.py +0 -0
  163. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/server.py +0 -0
  164. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/test.py +0 -0
  165. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/types.py +0 -0
  166. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/cli/utils.py +0 -0
  167. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/confident/__init__.py +0 -0
  168. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/confident/types.py +0 -0
  169. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/config/__init__.py +0 -0
  170. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/config/settings.py +0 -0
  171. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/config/settings_manager.py +0 -0
  172. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/config/utils.py +0 -0
  173. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/constants.py +0 -0
  174. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/__init__.py +0 -0
  175. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/api.py +0 -0
  176. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/dataset.py +0 -0
  177. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/golden.py +0 -0
  178. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/test_run_tracer.py +0 -0
  179. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/types.py +0 -0
  180. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/dataset/utils.py +0 -0
  181. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/errors.py +0 -0
  182. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/__init__.py +0 -0
  183. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/api.py +0 -0
  184. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/compare.py +0 -0
  185. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/configs.py +0 -0
  186. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/evaluate.py +0 -0
  187. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/execute.py +0 -0
  188. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/types.py +0 -0
  189. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/evaluate/utils.py +0 -0
  190. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/__init__.py +0 -0
  191. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/crewai/__init__.py +0 -0
  192. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/crewai/agent.py +0 -0
  193. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/crewai/handler.py +0 -0
  194. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/crewai/patch.py +0 -0
  195. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/hugging_face/__init__.py +0 -0
  196. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/hugging_face/callback.py +0 -0
  197. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  198. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  199. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/hugging_face/utils.py +0 -0
  200. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/langchain/__init__.py +0 -0
  201. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/langchain/patch.py +0 -0
  202. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/langchain/utils.py +0 -0
  203. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/llama_index/__init__.py +0 -0
  204. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/llama_index/agent/patched.py +0 -0
  205. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/llama_index/handler.py +0 -0
  206. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/integrations/llama_index/utils.py +0 -0
  207. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/key_handler.py +0 -0
  208. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  209. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
  210. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  211. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  212. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
  213. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  214. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/arena_g_eval/template.py +0 -0
  215. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/arena_g_eval/utils.py +0 -0
  216. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/argument_correctness/__init__.py +0 -0
  217. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
  218. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/argument_correctness/schema.py +0 -0
  219. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/argument_correctness/template.py +0 -0
  220. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/base_metric.py +0 -0
  221. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/bias/__init__.py +0 -0
  222. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/bias/bias.py +0 -0
  223. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/bias/schema.py +0 -0
  224. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/bias/template.py +0 -0
  225. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  226. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
  227. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_precision/schema.py +0 -0
  228. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_precision/template.py +0 -0
  229. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  230. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
  231. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_recall/schema.py +0 -0
  232. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_recall/template.py +0 -0
  233. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  234. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
  235. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  236. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/contextual_relevancy/template.py +0 -0
  237. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
  238. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
  239. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  240. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversation_completeness/template.py +0 -0
  241. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_dag/__init__.py +0 -0
  242. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
  243. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_dag/nodes.py +0 -0
  244. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_dag/templates.py +0 -0
  245. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
  246. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
  247. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  248. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/conversational_g_eval/template.py +0 -0
  249. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/__init__.py +0 -0
  250. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/dag.py +0 -0
  251. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/graph.py +0 -0
  252. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/nodes.py +0 -0
  253. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/schema.py +0 -0
  254. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/templates.py +0 -0
  255. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/dag/utils.py +0 -0
  256. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/faithfulness/__init__.py +0 -0
  257. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
  258. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/faithfulness/schema.py +0 -0
  259. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/g_eval/__init__.py +0 -0
  260. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/g_eval/g_eval.py +0 -0
  261. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/g_eval/schema.py +0 -0
  262. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/g_eval/template.py +0 -0
  263. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/g_eval/utils.py +0 -0
  264. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/hallucination/__init__.py +0 -0
  265. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/hallucination/hallucination.py +0 -0
  266. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/hallucination/schema.py +0 -0
  267. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/hallucination/template.py +0 -0
  268. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/indicator.py +0 -0
  269. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/json_correctness/__init__.py +0 -0
  270. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
  271. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/json_correctness/schema.py +0 -0
  272. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/json_correctness/template.py +0 -0
  273. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
  274. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
  275. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/knowledge_retention/schema.py +0 -0
  276. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/knowledge_retention/template.py +0 -0
  277. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp/__init__.py +0 -0
  278. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
  279. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
  280. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp/schema.py +0 -0
  281. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp/template.py +0 -0
  282. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
  283. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
  284. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  285. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/mcp_use_metric/template.py +0 -0
  286. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/misuse/__init__.py +0 -0
  287. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/misuse/misuse.py +0 -0
  288. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/misuse/schema.py +0 -0
  289. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/misuse/template.py +0 -0
  290. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
  291. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
  292. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
  293. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  294. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  295. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
  296. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
  297. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  298. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  299. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
  300. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
  301. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  302. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  303. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
  304. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
  305. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  306. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  307. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
  308. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
  309. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
  310. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
  311. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
  312. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
  313. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
  314. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
  315. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
  316. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
  317. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
  318. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
  319. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
  320. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
  321. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
  322. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
  323. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  324. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
  325. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
  326. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
  327. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  328. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
  329. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
  330. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
  331. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
  332. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  333. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
  334. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
  335. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  336. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  337. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
  338. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/non_advice/__init__.py +0 -0
  339. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/non_advice/non_advice.py +0 -0
  340. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/non_advice/schema.py +0 -0
  341. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/non_advice/template.py +0 -0
  342. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  343. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
  344. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/pii_leakage/schema.py +0 -0
  345. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/pii_leakage/template.py +0 -0
  346. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
  347. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
  348. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  349. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/prompt_alignment/template.py +0 -0
  350. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/ragas.py +0 -0
  351. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_adherence/__init__.py +0 -0
  352. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
  353. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_adherence/schema.py +0 -0
  354. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_adherence/template.py +0 -0
  355. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_violation/__init__.py +0 -0
  356. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_violation/role_violation.py +0 -0
  357. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_violation/schema.py +0 -0
  358. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/role_violation/template.py +0 -0
  359. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/summarization/__init__.py +0 -0
  360. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/summarization/schema.py +0 -0
  361. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/summarization/summarization.py +0 -0
  362. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/summarization/template.py +0 -0
  363. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/task_completion/__init__.py +0 -0
  364. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/task_completion/schema.py +0 -0
  365. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/task_completion/task_completion.py +0 -0
  366. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/task_completion/template.py +0 -0
  367. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/tool_correctness/__init__.py +0 -0
  368. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
  369. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/toxicity/__init__.py +0 -0
  370. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/toxicity/schema.py +0 -0
  371. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/toxicity/template.py +0 -0
  372. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/toxicity/toxicity.py +0 -0
  373. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
  374. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  375. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/turn_relevancy/template.py +0 -0
  376. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
  377. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/metrics/utils.py +0 -0
  378. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/__init__.py +0 -0
  379. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/_summac_model.py +0 -0
  380. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/answer_relevancy_model.py +0 -0
  381. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/base_model.py +0 -0
  382. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/detoxify_model.py +0 -0
  383. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/embedding_models/__init__.py +0 -0
  384. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
  385. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
  386. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
  387. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
  388. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/hallucination_model.py +0 -0
  389. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/__init__.py +0 -0
  390. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
  391. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/anthropic_model.py +0 -0
  392. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/azure_model.py +0 -0
  393. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/deepseek_model.py +0 -0
  394. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/gemini_model.py +0 -0
  395. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/litellm_model.py +0 -0
  396. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/local_model.py +0 -0
  397. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/ollama_model.py +0 -0
  398. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/openai_model.py +0 -0
  399. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/llms/utils.py +0 -0
  400. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/mlllms/__init__.py +0 -0
  401. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/mlllms/gemini_model.py +0 -0
  402. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/mlllms/ollama_model.py +0 -0
  403. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/mlllms/openai_model.py +0 -0
  404. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/retry_policy.py +0 -0
  405. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/summac_model.py +0 -0
  406. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/unbias_model.py +0 -0
  407. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/models/utils.py +0 -0
  408. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai/__init__.py +0 -0
  409. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai/extractors.py +0 -0
  410. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai/patch.py +0 -0
  411. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai/utils.py +0 -0
  412. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai_agents/__init__.py +0 -0
  413. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai_agents/agent.py +0 -0
  414. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai_agents/callback_handler.py +0 -0
  415. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai_agents/extractors.py +0 -0
  416. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai_agents/patch.py +0 -0
  417. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/openai_agents/runner.py +0 -0
  418. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/plugins/__init__.py +0 -0
  419. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/plugins/plugin.py +0 -0
  420. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/progress_context.py +0 -0
  421. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/prompt/__init__.py +0 -0
  422. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/prompt/utils.py +0 -0
  423. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/py.typed +0 -0
  424. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/red_teaming/README.md +0 -0
  425. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/scorer/__init__.py +0 -0
  426. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/scorer/scorer.py +0 -0
  427. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/simulator/__init__.py +0 -0
  428. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/simulator/conversation_simulator.py +0 -0
  429. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/simulator/schema.py +0 -0
  430. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/simulator/template.py +0 -0
  431. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/singleton.py +0 -0
  432. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/__init__.py +0 -0
  433. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/base_synthesizer.py +0 -0
  434. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/chunking/__init__.py +0 -0
  435. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/chunking/context_generator.py +0 -0
  436. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
  437. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/config.py +0 -0
  438. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/schema.py +0 -0
  439. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/templates/__init__.py +0 -0
  440. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/templates/template.py +0 -0
  441. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/templates/template_extraction.py +0 -0
  442. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/templates/template_prompt.py +0 -0
  443. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/types.py +0 -0
  444. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/synthesizer/utils.py +0 -0
  445. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/telemetry.py +0 -0
  446. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/__init__.py +0 -0
  447. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/arena_test_case.py +0 -0
  448. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/conversational_test_case.py +0 -0
  449. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/llm_test_case.py +0 -0
  450. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/mcp.py +0 -0
  451. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/mllm_test_case.py +0 -0
  452. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_case/utils.py +0 -0
  453. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_run/__init__.py +0 -0
  454. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_run/api.py +0 -0
  455. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_run/cache.py +0 -0
  456. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_run/hooks.py +0 -0
  457. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_run/hyperparameters.py +0 -0
  458. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/test_run/test_run.py +0 -0
  459. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/__init__.py +0 -0
  460. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/offline_evals/__init__.py +0 -0
  461. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/offline_evals/api.py +0 -0
  462. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/offline_evals/span.py +0 -0
  463. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/offline_evals/thread.py +0 -0
  464. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/offline_evals/trace.py +0 -0
  465. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/otel/__init__.py +0 -0
  466. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/otel/exporter.py +0 -0
  467. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/otel/utils.py +0 -0
  468. {deepeval-3.4.9 → deepeval-3.5.1}/deepeval/tracing/patchers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.4.9
3
+ Version: 3.5.1
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -189,16 +189,6 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
189
189
  ```
190
190
  pip install -U deepeval
191
191
  ```
192
- ### Environment variables (.env / .env.local)
193
-
194
- DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
195
- **Precedence:** process env -> `.env.local` -> `.env`.
196
- Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
197
-
198
- ```bash
199
- cp .env.example .env.local
200
- # then edit .env.local (ignored by git)
201
- ```
202
192
 
203
193
  ## Create an account (highly recommended)
204
194
 
@@ -391,9 +381,20 @@ evaluate(dataset, [answer_relevancy_metric])
391
381
  dataset.evaluate([answer_relevancy_metric])
392
382
  ```
393
383
 
394
- # LLM Evaluation With Confident AI
384
+ ## A Note on Env Variables (.env / .env.local)
385
+
386
+ DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
387
+ **Precedence:** process env -> `.env.local` -> `.env`.
388
+ Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
389
+
390
+ ```bash
391
+ cp .env.example .env.local
392
+ # then edit .env.local (ignored by git)
393
+ ```
394
+
395
+ # DeepEval With Confident AI
395
396
 
396
- The correct LLM evaluation lifecycle is only achievable with [the DeepEval platform](https://confident-ai.com?utm_source=Github). It allows you to:
397
+ DeepEval's cloud platform, [Confident AI](https://confident-ai.com?utm_source=Github), allows you to:
397
398
 
398
399
  1. Curate/annotate evaluation datasets on the cloud
399
400
  2. Benchmark your LLM app using a dataset, and compare with previous iterations to determine which models/prompts work best
@@ -140,16 +140,6 @@ Let's pretend your LLM application is a RAG based customer support chatbot; here
140
140
  ```
141
141
  pip install -U deepeval
142
142
  ```
143
- ### Environment variables (.env / .env.local)
144
-
145
- DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
146
- **Precedence:** process env -> `.env.local` -> `.env`.
147
- Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
148
-
149
- ```bash
150
- cp .env.example .env.local
151
- # then edit .env.local (ignored by git)
152
- ```
153
143
 
154
144
  ## Create an account (highly recommended)
155
145
 
@@ -342,9 +332,20 @@ evaluate(dataset, [answer_relevancy_metric])
342
332
  dataset.evaluate([answer_relevancy_metric])
343
333
  ```
344
334
 
345
- # LLM Evaluation With Confident AI
335
+ ## A Note on Env Variables (.env / .env.local)
336
+
337
+ DeepEval auto-loads `.env.local` then `.env` from the current working directory **at import time**.
338
+ **Precedence:** process env -> `.env.local` -> `.env`.
339
+ Opt out with `DEEPEVAL_DISABLE_DOTENV=1`.
340
+
341
+ ```bash
342
+ cp .env.example .env.local
343
+ # then edit .env.local (ignored by git)
344
+ ```
345
+
346
+ # DeepEval With Confident AI
346
347
 
347
- The correct LLM evaluation lifecycle is only achievable with [the DeepEval platform](https://confident-ai.com?utm_source=Github). It allows you to:
348
+ DeepEval's cloud platform, [Confident AI](https://confident-ai.com?utm_source=Github), allows you to:
348
349
 
349
350
  1. Curate/annotate evaluation datasets on the cloud
350
351
  2. Benchmark your LLM app using a dataset, and compare with previous iterations to determine which models/prompts work best
@@ -0,0 +1 @@
1
+ __version__: str = "3.5.1"
@@ -1,6 +1,5 @@
1
- from typing import List, Optional, Dict
1
+ from typing import List, Optional, Dict, Union
2
2
  from tqdm import tqdm
3
- from typing import Union
4
3
 
5
4
  from deepeval.dataset import Golden
6
5
  from deepeval.benchmarks.base_benchmark import (
@@ -50,7 +49,7 @@ class DROP(DeepEvalBaseBenchmark):
50
49
  self,
51
50
  model: DeepEvalBaseLLM,
52
51
  *args,
53
- batch_size: int | None = None,
52
+ batch_size: Union[int, None] = None,
54
53
  **kwargs,
55
54
  ) -> DeepEvalBaseBenchmarkResult:
56
55
  import pandas as pd
@@ -1,4 +1,4 @@
1
- from typing import List, Dict, Optional
1
+ from typing import List, Dict, Optional, Union
2
2
  from tqdm import tqdm
3
3
 
4
4
  from deepeval.dataset import Golden
@@ -51,7 +51,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
51
51
  self,
52
52
  model: DeepEvalBaseLLM,
53
53
  *args,
54
- batch_size: int | None = None,
54
+ batch_size: Union[int, None] = None,
55
55
  **kwargs,
56
56
  ) -> DeepEvalBaseBenchmarkResult:
57
57
  import pandas as pd
@@ -1,4 +1,4 @@
1
- from typing import List, Optional, Dict
1
+ from typing import List, Optional, Dict, Union
2
2
  from tqdm import tqdm
3
3
  import requests
4
4
  import json
@@ -52,7 +52,7 @@ class LogiQA(DeepEvalBaseBenchmark):
52
52
  self,
53
53
  model: DeepEvalBaseLLM,
54
54
  *args,
55
- batch_size: int | None = None,
55
+ batch_size: Union[int, None] = None,
56
56
  **kwargs,
57
57
  ) -> DeepEvalBaseBenchmarkResult:
58
58
  import pandas as pd
@@ -1,4 +1,4 @@
1
- from typing import List, Optional, Dict
1
+ from typing import List, Optional, Dict, Union
2
2
  from tqdm import tqdm
3
3
 
4
4
  from deepeval.dataset import Golden
@@ -50,7 +50,7 @@ class MathQA(DeepEvalBaseBenchmark):
50
50
  self,
51
51
  model: DeepEvalBaseLLM,
52
52
  *args,
53
- batch_size: int | None = None,
53
+ batch_size: Union[int, None] = None,
54
54
  **kwargs,
55
55
  ) -> DeepEvalBaseBenchmarkResult:
56
56
  import pandas as pd
@@ -1,4 +1,4 @@
1
- from typing import List, Optional, Dict
1
+ from typing import List, Optional, Dict, Union
2
2
  from tqdm import tqdm
3
3
 
4
4
  from deepeval.dataset import Golden
@@ -49,7 +49,7 @@ class MMLU(DeepEvalBaseBenchmark):
49
49
  self,
50
50
  model: DeepEvalBaseLLM,
51
51
  *args,
52
- batch_size: int | None = None,
52
+ batch_size: Union[int, None] = None,
53
53
  **kwargs,
54
54
  ) -> DeepEvalBaseBenchmarkResult:
55
55
  import pandas as pd
@@ -1,4 +1,4 @@
1
- from typing import List, Dict, Optional
1
+ from typing import List, Dict, Optional, Union
2
2
  from tqdm import tqdm
3
3
 
4
4
  from deepeval.dataset import Golden
@@ -59,7 +59,7 @@ class TruthfulQA(DeepEvalBaseBenchmark):
59
59
  self,
60
60
  model: DeepEvalBaseLLM,
61
61
  *args,
62
- batch_size: int | None = None,
62
+ batch_size: Union[int, None] = None,
63
63
  **kwargs,
64
64
  ) -> DeepEvalBaseBenchmarkResult:
65
65
  import pandas as pd
@@ -10,6 +10,7 @@ from tenacity import (
10
10
  retry_if_exception_type,
11
11
  RetryCallState,
12
12
  )
13
+ from pydantic import SecretStr
13
14
 
14
15
  import deepeval
15
16
  from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues
@@ -88,7 +89,9 @@ class Endpoints(Enum):
88
89
  TEST_RUN_ENDPOINT = "/v1/test-run"
89
90
  TRACES_ENDPOINT = "/v1/traces"
90
91
  ANNOTATIONS_ENDPOINT = "/v1/annotations"
92
+ PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
91
93
  PROMPTS_ENDPOINT = "/v1/prompts"
94
+ PROMPTS_VERSIONS_ENDPOINT = "/v1/prompts/:alias/versions"
92
95
  SIMULATE_ENDPOINT = "/v1/simulate"
93
96
  EVALUATE_ENDPOINT = "/v1/evaluate"
94
97
 
@@ -9,6 +9,7 @@ from deepeval.tracing.types import (
9
9
  from deepeval.metrics import BaseMetric, TaskCompletionMetric
10
10
  from deepeval.test_case import LLMTestCase
11
11
  from deepeval.test_run import global_test_run_manager
12
+ import uuid
12
13
 
13
14
  try:
14
15
  from langchain_core.callbacks.base import BaseCallbackHandler
@@ -81,6 +82,26 @@ class CallbackHandler(BaseCallbackHandler):
81
82
  )
82
83
  super().__init__()
83
84
 
85
def on_llm_new_token(
    self,
    token: str,
    *,
    chunk,
    run_id: UUID,
    parent_run_id: Optional[UUID] = None,
    tags: Optional[list[str]] = None,
    **kwargs: Any,
):
    """Record a streamed token on the LLM span that belongs to ``run_id``.

    The token is stored in the span's ``token_intervals`` mapping, keyed by
    the ``perf_counter()`` reading taken when it is recorded. Nothing
    happens when no span is registered for ``run_id``.
    """
    span: Optional[LlmSpan] = trace_manager.get_span_by_uuid(str(run_id))
    if span is None:
        return

    if span.token_intervals is not None:
        span.token_intervals[perf_counter()] = token
        return

    # First token seen for this span: create the mapping.
    span.token_intervals = {perf_counter(): token}
104
+
84
105
  def check_active_trace_id(self):
85
106
  if self.active_trace_id is None:
86
107
  self.active_trace_id = trace_manager.start_new_trace().uuid
@@ -0,0 +1,3 @@
1
+ from .patcher import instrument as instrument_pydantic_ai
2
+
3
+ __all__ = ["instrument_pydantic_ai"]
@@ -31,14 +31,6 @@ def instrument_pydantic_ai(api_key: Optional[str] = None):
31
31
  with capture_tracing_integration("pydantic_ai"):
32
32
  is_opentelemetry_available()
33
33
 
34
- if api_key:
35
- deepeval.login(api_key)
36
-
37
- api_key = get_confident_api_key()
38
-
39
- if not api_key:
40
- raise ValueError("No api key provided.")
41
-
42
34
  # create a new tracer provider
43
35
  tracer_provider = TracerProvider()
44
36
  tracer_provider.add_span_processor(
@@ -0,0 +1,376 @@
1
+ import functools
2
+ import deepeval
3
+ from deepeval.tracing.types import LlmOutput, LlmToolCall
4
+ from pydantic_ai.agent import AgentRunResult
5
+ from deepeval.tracing.context import current_trace_context
6
+ from deepeval.tracing.types import AgentSpan, LlmSpan
7
+ from deepeval.tracing.tracing import Observer
8
+ from typing import List, Callable, Optional, Any
9
+ from deepeval.test_case.llm_test_case import ToolCall
10
+ from deepeval.metrics.base_metric import BaseMetric
11
+ from deepeval.confident.api import get_confident_api_key
12
+ from deepeval.integrations.pydantic_ai.otel import instrument_pydantic_ai
13
+ from deepeval.telemetry import capture_tracing_integration
14
+ from deepeval.prompt import Prompt
15
+
16
# pydantic_ai is imported defensively; ``pydantic_ai_installed`` records
# whether the import succeeded.
try:
    from pydantic_ai.agent import Agent
    from pydantic_ai.models import Model
    from pydantic_ai.messages import (
        ModelResponse,
        ModelRequest,
        ModelResponsePart,
        TextPart,
        ToolCallPart,
        SystemPromptPart,
        ToolReturnPart,
        UserPromptPart,
    )

    pydantic_ai_installed = True
except ImportError:
    # The flag must be False here: it was previously set to True in both
    # branches, so it could never report a missing dependency.
    pydantic_ai_installed = False
33
+
34
+
35
def _patch_agent_tool_decorator():
    """Replace ``Agent.tool`` with a wrapper that traces registered tools.

    The wrapper accepts two extra keyword arguments, ``metrics`` and
    ``metric_collection``. They are forwarded to ``_create_patched_tool``
    and are not passed on to the original ``Agent.tool``. Both decorator
    forms are supported: ``@agent.tool`` and ``@agent.tool(...)``.
    """
    original_tool = Agent.tool

    @functools.wraps(original_tool)
    def wrapper(
        *args,
        metrics: Optional[List[BaseMetric]] = None,
        metric_collection: Optional[str] = None,
        **kwargs
    ):
        # ``wrapper`` lives on the class, so a bound call ``agent.tool(func)``
        # arrives as ``(agent, func)``. Search every positional argument for
        # the tool function instead of assuming it is ``args[0]``.
        # NOTE(review): assumes an Agent instance is never itself the tool.
        func_index = next(
            (
                index
                for index, arg in enumerate(args)
                if callable(arg) and not isinstance(arg, Agent)
            ),
            None,
        )

        # Case 1: Direct decoration - @agent.tool
        if func_index is not None:
            patched_func = _create_patched_tool(
                args[func_index], metrics, metric_collection
            )
            new_args = (
                args[:func_index] + (patched_func,) + args[func_index + 1 :]
            )
            return original_tool(*new_args, **kwargs)

        # Case 2: Decoration with arguments -
        # @agent.tool(metrics=..., metric_collection=...)
        def decorator(func):
            # The original ``tool`` call returns its own decorator; apply it
            # to the traced version of ``func``.
            patched_func = _create_patched_tool(
                func, metrics, metric_collection
            )
            return original_tool(*args, **kwargs)(patched_func)

        return decorator

    Agent.tool = wrapper
65
+
66
+
67
def _create_patched_tool(
    func: Callable,
    metrics: Optional[List[BaseMetric]] = None,
    metric_collection: Optional[str] = None,
):
    """Wrap ``func`` so that every call runs inside a ``"tool"`` Observer.

    Returns an ``async`` wrapper when ``func`` is a coroutine function and a
    plain wrapper otherwise. In both cases the call's result is stored on
    the observer and then returned to the caller.
    """
    import asyncio

    def _open_span(call_args, call_kwargs):
        # Positional arguments are recorded under the "args" key, next to
        # the keyword arguments.
        return Observer(
            span_type="tool",
            func_name=func.__name__,
            metrics=metrics,
            metric_collection=metric_collection,
            function_kwargs={"args": call_args, **call_kwargs},
        )

    if not asyncio.iscoroutinefunction(func):

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs):
            with _open_span(args, kwargs) as span:
                outcome = func(*args, **kwargs)
                span.result = outcome

            return outcome

        return sync_wrapper

    @functools.wraps(func)
    async def async_wrapper(*args, **kwargs):
        with _open_span(args, kwargs) as span:
            outcome = await func(*args, **kwargs)
            span.result = outcome

        return outcome

    return async_wrapper
112
+
113
+
114
def _patch_agent_init():
    """Wrap ``Agent.__init__`` so new agents are instrumented on creation.

    The wrapper accepts five extra keyword arguments (``llm_metric_collection``,
    ``llm_metrics``, ``llm_prompt``, ``agent_metric_collection``,
    ``agent_metrics``) that are consumed here and not forwarded to the
    original constructor. After the original ``__init__`` runs, the agent's
    model is patched and ``Agent.run`` is patched.

    Calling this function more than once is a no-op, so ``__init__`` is
    never wrapped twice.
    """
    if getattr(Agent.__init__, "_deepeval_patched", False):
        return

    original_init = Agent.__init__

    @functools.wraps(original_init)
    def wrapper(
        self,
        *args,
        llm_metric_collection: Optional[str] = None,
        llm_metrics: Optional[List[BaseMetric]] = None,
        llm_prompt: Optional[Prompt] = None,
        agent_metric_collection: Optional[str] = None,
        agent_metrics: Optional[List[BaseMetric]] = None,
        **kwargs
    ):
        result = original_init(self, *args, **kwargs)

        # NOTE(review): presumably ``_model`` can be None when the model is
        # supplied later at run time -- confirm against pydantic_ai.
        if self._model is not None:
            _patch_llm_model(
                self._model, llm_metric_collection, llm_metrics, llm_prompt
            )  # runtime patch of the model

        # Keep the agent-span settings on the instance so the shared
        # ``Agent.run`` wrapper can apply them per agent.
        try:
            self._deepeval_agent_span_config = (
                agent_metric_collection,
                agent_metrics,
            )
        except AttributeError:
            # Instance does not accept new attributes; the run wrapper falls
            # back to the settings captured when it was installed.
            pass

        _patch_agent_run(agent_metric_collection, agent_metrics)
        return result

    wrapper._deepeval_patched = True
    Agent.__init__ = wrapper


def _patch_agent_run(
    agent_metric_collection: Optional[str] = None,
    agent_metrics: Optional[List[BaseMetric]] = None,
):
    """Wrap ``Agent.run`` so each run executes inside an ``"agent"`` Observer.

    The wrapper accepts extra ``trace_*`` keyword arguments that are passed
    to ``_update_trace_context`` and not forwarded to the original ``run``.

    ``Agent.run`` is a class attribute, so it is wrapped only once. Earlier,
    every ``Agent()`` construction added another wrapper layer, which nested
    one agent span per constructed agent around every run. Per-agent metric
    settings are read from the instance when available; the arguments given
    here serve as the fallback.
    """
    if getattr(Agent.run, "_deepeval_patched", False):
        return

    original_run = Agent.run

    @functools.wraps(original_run)
    async def wrapper(
        *args,
        trace_metric_collection: Optional[str] = None,
        trace_metrics: Optional[List[BaseMetric]] = None,
        trace_name: Optional[str] = None,
        trace_tags: Optional[List[str]] = None,
        trace_metadata: Optional[dict] = None,
        trace_thread_id: Optional[str] = None,
        trace_user_id: Optional[str] = None,
        **kwargs
    ):
        agent = args[0] if args else None
        span_metric_collection, span_metrics = getattr(
            agent,
            "_deepeval_agent_span_config",
            (agent_metric_collection, agent_metrics),
        )

        # The prompt may be positional or passed by keyword.
        # NOTE(review): assumes the keyword is ``user_prompt`` -- confirm
        # against the installed pydantic_ai version.
        run_input = args[1] if len(args) > 1 else kwargs.get("user_prompt")

        with Observer(
            span_type="agent",
            func_name="Agent",
            function_kwargs={"input": run_input},
            metrics=span_metrics,
            metric_collection=span_metric_collection,
        ) as observer:
            result = await original_run(*args, **kwargs)
            observer.update_span_properties = (
                lambda agent_span: set_agent_span_attributes(agent_span, result)
            )
            observer.result = result.output

        _update_trace_context(
            trace_name=trace_name,
            trace_tags=trace_tags,
            trace_metadata=trace_metadata,
            trace_thread_id=trace_thread_id,
            trace_user_id=trace_user_id,
            trace_metric_collection=trace_metric_collection,
            trace_metrics=trace_metrics,
            trace_input=run_input,
            trace_output=result.output,
        )

        return result

    wrapper._deepeval_patched = True
    Agent.run = wrapper
184
+
185
+
186
def _update_trace_context(
    trace_name: Optional[str] = None,
    trace_tags: Optional[List[str]] = None,
    trace_metadata: Optional[dict] = None,
    trace_thread_id: Optional[str] = None,
    trace_user_id: Optional[str] = None,
    trace_metric_collection: Optional[str] = None,
    trace_metrics: Optional[List[BaseMetric]] = None,
    trace_input: Optional[Any] = None,
    trace_output: Optional[Any] = None,
):
    """Copy the supplied values onto the trace in ``current_trace_context``.

    Only arguments that are not ``None`` are written, so values already
    present on the trace are not erased by omitted arguments. Nothing
    happens when there is no current trace.
    """
    current_trace = current_trace_context.get()
    if current_trace is None:
        return

    updates = {
        "name": trace_name,
        "tags": trace_tags,
        "metadata": trace_metadata,
        "thread_id": trace_thread_id,
        "user_id": trace_user_id,
        "metric_collection": trace_metric_collection,
        "metrics": trace_metrics,
        "input": trace_input,
        "output": trace_output,
    }
    for attribute, value in updates.items():
        if value is not None:
            setattr(current_trace, attribute, value)
208
+
209
+
210
def _patch_llm_model(
    model: Model,
    llm_metric_collection: Optional[str] = None,
    llm_metrics: Optional[List[BaseMetric]] = None,
    llm_prompt: Optional[Prompt] = None,
):
    """Replace ``model.request`` with a wrapper that records an ``"llm"`` span.

    The wrapper awaits the original ``request``, then registers a callback
    on the observer that fills in the span via ``set_llm_span_attributes``.
    The patch is applied to this ``model`` instance only.
    """
    original_func = model.request
    try:
        model_name = model.model_name
    except Exception:
        # Reading the attribute may fail; record a placeholder name instead.
        model_name = "unknown"

    @functools.wraps(original_func)
    async def wrapper(*args, **kwargs):
        with Observer(
            span_type="llm",
            func_name="LLM",
            observe_kwargs={"model": model_name},
            metrics=llm_metrics,
            metric_collection=llm_metric_collection,
        ) as observer:
            result = await original_func(*args, **kwargs)

            # Messages may be passed by keyword or as the first positional
            # argument. The span is built from the resolved value, so a
            # keyword-only call no longer raises IndexError on ``args[0]``.
            request = kwargs.get("messages") or (args[0] if args else [])

            observer.update_span_properties = (
                lambda llm_span: set_llm_span_attributes(
                    llm_span, request, result, llm_prompt
                )
            )
            observer.result = result
        return result

    model.request = wrapper
244
+
245
+
246
def instrument(otel: Optional[bool] = False, api_key: Optional[str] = None):
    """Enable DeepEval tracing for pydantic_ai.

    Args:
        otel: When truthy, delegate to ``instrument_pydantic_ai``; otherwise
            patch ``Agent.__init__`` and ``Agent.tool`` in place.
        api_key: Optional key; when given, ``deepeval.login`` is called
            with it first.

    Raises:
        ValueError: If ``get_confident_api_key()`` returns nothing.
    """
    if api_key:
        deepeval.login(api_key)

    resolved_key = get_confident_api_key()
    if not resolved_key:
        raise ValueError("No api key provided.")

    if otel:
        instrument_pydantic_ai(resolved_key)
        return

    with capture_tracing_integration("pydantic_ai"):
        _patch_agent_init()
        _patch_agent_tool_decorator()
262
+
263
+
264
def set_llm_span_attributes(
    llm_span: LlmSpan,
    requests: List[ModelRequest],
    result: ModelResponse,
    llm_prompt: Optional[Prompt] = None,
):
    """Fill ``llm_span`` from the request messages and the model response.

    Sets ``prompt``, ``input`` (one dict per recognised request part),
    ``output`` (an ``LlmOutput`` holding the joined text and the tool calls)
    and ``tools_called`` on the span.
    """
    llm_span.prompt = llm_prompt

    # Flatten every recognised request part into a role/content record;
    # other part types are ignored.
    messages = []
    for request in requests:
        for part in request.parts:
            if isinstance(part, SystemPromptPart):
                entry = {"role": "System", "content": part.content}
            elif isinstance(part, UserPromptPart):
                entry = {"role": "User", "content": part.content}
            elif isinstance(part, ToolCallPart):
                entry = {
                    "role": "Tool Call",
                    "name": part.tool_name,
                    "content": part.args_as_json_str(),
                }
            elif isinstance(part, ToolReturnPart):
                entry = {
                    "role": "Tool Return",
                    "name": part.tool_name,
                    "content": part.model_response_str(),
                }
            else:
                continue
            messages.append(entry)
    llm_span.input = messages

    # Split the response into text chunks and requested tool calls.
    text_chunks = []
    requested_tools = []
    for part in result.parts:
        if isinstance(part, TextPart):
            text_chunks.append(part.content + "\n")
        elif isinstance(part, ToolCallPart):
            requested_tools.append(
                LlmToolCall(name=part.tool_name, args=part.args_as_dict())
            )

    llm_span.output = LlmOutput(
        role="Assistant",
        content="".join(text_chunks),
        tool_calls=requested_tools,
    )
    llm_span.tools_called = _extract_tools_called_from_llm_response(
        result.parts
    )
312
+
313
+
314
def set_agent_span_attributes(agent_span: AgentSpan, result: AgentRunResult):
    """Store the tool calls found in ``result`` on ``agent_span``."""
    tools_used = _extract_tools_called(result)
    agent_span.tools_called = tools_used
316
+
317
+
318
+ # llm tools called
319
def _extract_tools_called_from_llm_response(
    result: List[ModelResponsePart],
) -> List[ToolCall]:
    """Build a ``ToolCall`` for every part whose ``part_kind`` is ``"tool-call"``.

    ``input_parameters`` comes from ``part.args_as_dict()`` when the part has
    that method, and is ``None`` otherwise. Other parts are skipped.
    """
    return [
        ToolCall(
            name=part.tool_name,
            input_parameters=(
                part.args_as_dict() if hasattr(part, "args_as_dict") else None
            ),
        )
        for part in result
        if getattr(part, "part_kind", None) == "tool-call"
    ]
341
+
342
+
343
# TODO: llm tools called (response is present next message)
def _extract_tools_called(result: AgentRunResult) -> List[ToolCall]:
    """Collect the tool calls from every response message of an agent run.

    Reads ``result._state.message_history`` and, for each message whose
    ``kind`` is ``"response"`` and which has ``parts``, delegates to
    ``_extract_tools_called_from_llm_response`` so both extractors share one
    implementation.
    """
    tool_calls = []

    for message in result._state.message_history:
        is_response = getattr(message, "kind", None) == "response"
        if is_response and hasattr(message, "parts"):
            tool_calls.extend(
                _extract_tools_called_from_llm_response(message.parts)
            )

    return tool_calls
@@ -69,7 +69,7 @@ __all__ = [
69
69
  "ConversationalGEval",
70
70
  "DAGMetric",
71
71
  "DeepAcyclicGraph",
72
- "ConversationalDAGMetric"
72
+ "ConversationalDAGMetric",
73
73
  # RAG metrics
74
74
  "AnswerRelevancyMetric",
75
75
  "FaithfulnessMetric",