deepeval 3.8.0__tar.gz → 3.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527)
  1. {deepeval-3.8.0 → deepeval-3.8.2}/PKG-INFO +1 -1
  2. deepeval-3.8.2/deepeval/_version.py +1 -0
  3. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/annotation/annotation.py +2 -2
  4. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/confident/api.py +31 -3
  5. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/config/settings.py +3 -0
  6. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/dataset.py +6 -4
  7. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/langchain/callback.py +307 -15
  8. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/langchain/utils.py +75 -24
  9. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/pydantic_ai/instrumentator.py +43 -11
  10. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/pydantic_ai/otel.py +9 -0
  11. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_recall/contextual_recall.py +25 -6
  12. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_recall/schema.py +6 -0
  13. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +10 -1
  14. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +10 -1
  15. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +10 -1
  16. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/utils.py +12 -1
  17. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/amazon_bedrock_model.py +51 -6
  18. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/azure_model.py +33 -7
  19. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/gemini_model.py +6 -1
  20. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/prompt/prompt.py +7 -5
  21. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/simulator/conversation_simulator.py +4 -2
  22. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/telemetry.py +12 -91
  23. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/llm_test_case.py +1 -0
  24. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/tracing.py +6 -5
  25. {deepeval-3.8.0 → deepeval-3.8.2}/pyproject.toml +9 -4
  26. deepeval-3.8.0/deepeval/_version.py +0 -1
  27. {deepeval-3.8.0 → deepeval-3.8.2}/LICENSE.md +0 -0
  28. {deepeval-3.8.0 → deepeval-3.8.2}/README.md +0 -0
  29. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/__init__.py +0 -0
  30. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/annotation/__init__.py +0 -0
  31. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/annotation/api.py +0 -0
  32. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/anthropic/__init__.py +0 -0
  33. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/anthropic/extractors.py +0 -0
  34. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/anthropic/patch.py +0 -0
  35. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/anthropic/utils.py +0 -0
  36. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/__init__.py +0 -0
  37. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/arc/__init__.py +0 -0
  38. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/arc/arc.py +0 -0
  39. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/arc/mode.py +0 -0
  40. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/arc/template.py +0 -0
  41. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/base_benchmark.py +0 -0
  42. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bbq/__init__.py +0 -0
  43. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bbq/bbq.py +0 -0
  44. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bbq/task.py +0 -0
  45. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bbq/template.py +0 -0
  46. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
  47. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
  48. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
  49. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
  50. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
  51. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
  52. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
  53. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
  54. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
  55. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
  56. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
  57. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
  58. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  59. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
  60. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
  61. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
  62. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
  63. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
  64. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
  65. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  66. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
  67. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
  68. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
  69. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
  70. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
  71. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  72. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  73. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  74. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
  75. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
  76. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
  77. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
  78. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
  79. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
  80. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
  81. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
  82. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
  83. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
  84. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
  85. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
  86. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
  87. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
  88. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
  89. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
  90. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
  91. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
  92. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
  93. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
  94. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
  95. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
  96. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
  97. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
  98. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
  99. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  100. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  101. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  102. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
  103. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
  104. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
  105. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
  106. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bool_q/__init__.py +0 -0
  107. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
  108. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/bool_q/template.py +0 -0
  109. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/drop/__init__.py +0 -0
  110. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/drop/drop.py +0 -0
  111. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/drop/task.py +0 -0
  112. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/drop/template.py +0 -0
  113. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
  114. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
  115. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
  116. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
  117. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
  118. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
  119. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/gsm8k/template.py +0 -0
  120. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
  121. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
  122. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/hellaswag/task.py +0 -0
  123. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/hellaswag/template.py +0 -0
  124. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/human_eval/__init__.py +0 -0
  125. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
  126. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/human_eval/task.py +0 -0
  127. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/human_eval/template.py +0 -0
  128. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/ifeval/__init__.py +0 -0
  129. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
  130. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/ifeval/template.py +0 -0
  131. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/lambada/__init__.py +0 -0
  132. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/lambada/lambada.py +0 -0
  133. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/lambada/template.py +0 -0
  134. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
  135. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
  136. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/logi_qa/task.py +0 -0
  137. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/logi_qa/template.py +0 -0
  138. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/math_qa/__init__.py +0 -0
  139. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
  140. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/math_qa/task.py +0 -0
  141. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/math_qa/template.py +0 -0
  142. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/mmlu/__init__.py +0 -0
  143. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
  144. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/mmlu/task.py +0 -0
  145. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/mmlu/template.py +0 -0
  146. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/modes/__init__.py +0 -0
  147. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/results.py +0 -0
  148. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/schema.py +0 -0
  149. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/squad/__init__.py +0 -0
  150. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/squad/squad.py +0 -0
  151. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/squad/task.py +0 -0
  152. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/squad/template.py +0 -0
  153. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/tasks/__init__.py +0 -0
  154. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
  155. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
  156. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/truthful_qa/task.py +0 -0
  157. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/truthful_qa/template.py +0 -0
  158. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
  159. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/utils.py +0 -0
  160. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/winogrande/__init__.py +0 -0
  161. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/winogrande/template.py +0 -0
  162. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
  163. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/__init__.py +0 -0
  164. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/dotenv_handler.py +0 -0
  165. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/main.py +0 -0
  166. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/server.py +0 -0
  167. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/test.py +0 -0
  168. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/types.py +0 -0
  169. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/cli/utils.py +0 -0
  170. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/confident/__init__.py +0 -0
  171. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/confident/types.py +0 -0
  172. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/config/__init__.py +0 -0
  173. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/config/dotenv_handler.py +0 -0
  174. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/config/logging.py +0 -0
  175. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/config/settings_manager.py +0 -0
  176. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/config/utils.py +0 -0
  177. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/constants.py +0 -0
  178. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/contextvars.py +0 -0
  179. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/__init__.py +0 -0
  180. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/api.py +0 -0
  181. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/golden.py +0 -0
  182. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/test_run_tracer.py +0 -0
  183. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/types.py +0 -0
  184. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/dataset/utils.py +0 -0
  185. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/errors.py +0 -0
  186. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/__init__.py +0 -0
  187. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/api.py +0 -0
  188. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/compare.py +0 -0
  189. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/configs.py +0 -0
  190. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/evaluate.py +0 -0
  191. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/execute.py +0 -0
  192. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/types.py +0 -0
  193. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/evaluate/utils.py +0 -0
  194. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/__init__.py +0 -0
  195. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/crewai/__init__.py +0 -0
  196. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/crewai/handler.py +0 -0
  197. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/crewai/subs.py +0 -0
  198. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/crewai/tool.py +0 -0
  199. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/crewai/wrapper.py +0 -0
  200. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/hugging_face/__init__.py +0 -0
  201. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/hugging_face/callback.py +0 -0
  202. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
  203. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
  204. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/hugging_face/utils.py +0 -0
  205. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/langchain/__init__.py +0 -0
  206. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/langchain/patch.py +0 -0
  207. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/llama_index/__init__.py +0 -0
  208. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/llama_index/handler.py +0 -0
  209. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/llama_index/utils.py +0 -0
  210. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
  211. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/pydantic_ai/agent.py +0 -0
  212. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  213. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/key_handler.py +0 -0
  214. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/__init__.py +0 -0
  215. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
  216. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
  217. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/answer_relevancy/schema.py +0 -0
  218. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/answer_relevancy/template.py +0 -0
  219. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/api.py +0 -0
  220. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
  221. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
  222. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/arena_g_eval/schema.py +0 -0
  223. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/arena_g_eval/template.py +0 -0
  224. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/arena_g_eval/utils.py +0 -0
  225. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/argument_correctness/__init__.py +0 -0
  226. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
  227. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/argument_correctness/schema.py +0 -0
  228. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/argument_correctness/template.py +0 -0
  229. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/base_metric.py +0 -0
  230. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/bias/__init__.py +0 -0
  231. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/bias/bias.py +0 -0
  232. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/bias/schema.py +0 -0
  233. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/bias/template.py +0 -0
  234. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_precision/__init__.py +0 -0
  235. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
  236. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_precision/schema.py +0 -0
  237. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_precision/template.py +0 -0
  238. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_recall/__init__.py +0 -0
  239. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_recall/template.py +0 -0
  240. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
  241. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
  242. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
  243. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/contextual_relevancy/template.py +0 -0
  244. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
  245. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
  246. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversation_completeness/schema.py +0 -0
  247. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversation_completeness/template.py +0 -0
  248. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_dag/__init__.py +0 -0
  249. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
  250. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_dag/nodes.py +0 -0
  251. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_dag/templates.py +0 -0
  252. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
  253. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
  254. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
  255. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/conversational_g_eval/template.py +0 -0
  256. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/__init__.py +0 -0
  257. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/dag.py +0 -0
  258. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/graph.py +0 -0
  259. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/nodes.py +0 -0
  260. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/schema.py +0 -0
  261. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/templates.py +0 -0
  262. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/dag/utils.py +0 -0
  263. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/exact_match/__init__.py +0 -0
  264. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/exact_match/exact_match.py +0 -0
  265. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/faithfulness/__init__.py +0 -0
  266. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
  267. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/faithfulness/schema.py +0 -0
  268. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/faithfulness/template.py +0 -0
  269. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/g_eval/__init__.py +0 -0
  270. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/g_eval/g_eval.py +0 -0
  271. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/g_eval/schema.py +0 -0
  272. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/g_eval/template.py +0 -0
  273. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/g_eval/utils.py +0 -0
  274. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/goal_accuracy/__init__.py +0 -0
  275. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/goal_accuracy/goal_accuracy.py +0 -0
  276. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/goal_accuracy/schema.py +0 -0
  277. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/goal_accuracy/template.py +0 -0
  278. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/hallucination/__init__.py +0 -0
  279. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/hallucination/hallucination.py +0 -0
  280. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/hallucination/schema.py +0 -0
  281. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/hallucination/template.py +0 -0
  282. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/indicator.py +0 -0
  283. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/json_correctness/__init__.py +0 -0
  284. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
  285. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/json_correctness/schema.py +0 -0
  286. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/json_correctness/template.py +0 -0
  287. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
  288. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
  289. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/knowledge_retention/schema.py +0 -0
  290. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/knowledge_retention/template.py +0 -0
  291. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp/__init__.py +0 -0
  292. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
  293. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
  294. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp/schema.py +0 -0
  295. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp/template.py +0 -0
  296. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
  297. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
  298. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
  299. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/mcp_use_metric/template.py +0 -0
  300. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/misuse/__init__.py +0 -0
  301. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/misuse/misuse.py +0 -0
  302. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/misuse/schema.py +0 -0
  303. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/misuse/template.py +0 -0
  304. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
  305. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
  306. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
  307. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
  308. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
  309. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
  310. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
  311. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
  312. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
  313. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
  314. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
  315. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
  316. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
  317. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
  318. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
  319. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
  320. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
  321. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
  322. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/non_advice/__init__.py +0 -0
  323. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/non_advice/non_advice.py +0 -0
  324. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/non_advice/schema.py +0 -0
  325. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/non_advice/template.py +0 -0
  326. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/pattern_match/__init__.py +0 -0
  327. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/pattern_match/pattern_match.py +0 -0
  328. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/pii_leakage/__init__.py +0 -0
  329. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
  330. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/pii_leakage/schema.py +0 -0
  331. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/pii_leakage/template.py +0 -0
  332. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_adherence/__init__.py +0 -0
  333. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_adherence/plan_adherence.py +0 -0
  334. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_adherence/schema.py +0 -0
  335. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_adherence/template.py +0 -0
  336. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_quality/__init__.py +0 -0
  337. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_quality/plan_quality.py +0 -0
  338. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_quality/schema.py +0 -0
  339. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/plan_quality/template.py +0 -0
  340. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
  341. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
  342. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/prompt_alignment/schema.py +0 -0
  343. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/prompt_alignment/template.py +0 -0
  344. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/ragas.py +0 -0
  345. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_adherence/__init__.py +0 -0
  346. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
  347. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_adherence/schema.py +0 -0
  348. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_adherence/template.py +0 -0
  349. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_violation/__init__.py +0 -0
  350. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_violation/role_violation.py +0 -0
  351. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_violation/schema.py +0 -0
  352. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/role_violation/template.py +0 -0
  353. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/step_efficiency/__init__.py +0 -0
  354. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/step_efficiency/schema.py +0 -0
  355. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/step_efficiency/step_efficiency.py +0 -0
  356. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/step_efficiency/template.py +0 -0
  357. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/summarization/__init__.py +0 -0
  358. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/summarization/schema.py +0 -0
  359. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/summarization/summarization.py +0 -0
  360. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/summarization/template.py +0 -0
  361. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/task_completion/__init__.py +0 -0
  362. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/task_completion/schema.py +0 -0
  363. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/task_completion/task_completion.py +0 -0
  364. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/task_completion/template.py +0 -0
  365. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_correctness/__init__.py +0 -0
  366. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_correctness/schema.py +0 -0
  367. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_correctness/template.py +0 -0
  368. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
  369. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_use/__init__.py +0 -0
  370. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_use/schema.py +0 -0
  371. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_use/template.py +0 -0
  372. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/tool_use/tool_use.py +0 -0
  373. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/topic_adherence/__init__.py +0 -0
  374. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/topic_adherence/schema.py +0 -0
  375. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/topic_adherence/template.py +0 -0
  376. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/topic_adherence/topic_adherence.py +0 -0
  377. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/toxicity/__init__.py +0 -0
  378. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/toxicity/schema.py +0 -0
  379. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/toxicity/template.py +0 -0
  380. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/toxicity/toxicity.py +0 -0
  381. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_precision/__init__.py +0 -0
  382. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_precision/schema.py +0 -0
  383. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_precision/template.py +0 -0
  384. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +0 -0
  385. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_recall/__init__.py +0 -0
  386. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_recall/schema.py +0 -0
  387. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_recall/template.py +0 -0
  388. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +0 -0
  389. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_relevancy/__init__.py +0 -0
  390. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_relevancy/schema.py +0 -0
  391. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_relevancy/template.py +0 -0
  392. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +0 -0
  393. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_faithfulness/__init__.py +0 -0
  394. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_faithfulness/schema.py +0 -0
  395. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_faithfulness/template.py +0 -0
  396. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_faithfulness/turn_faithfulness.py +0 -0
  397. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
  398. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_relevancy/schema.py +0 -0
  399. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_relevancy/template.py +0 -0
  400. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
  401. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/model_integrations/__init__.py +0 -0
  402. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/model_integrations/types.py +0 -0
  403. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/model_integrations/utils.py +0 -0
  404. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/__init__.py +0 -0
  405. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/_summac_model.py +0 -0
  406. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/answer_relevancy_model.py +0 -0
  407. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/base_model.py +0 -0
  408. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/detoxify_model.py +0 -0
  409. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/embedding_models/__init__.py +0 -0
  410. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
  411. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
  412. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
  413. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
  414. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/hallucination_model.py +0 -0
  415. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/__init__.py +0 -0
  416. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/anthropic_model.py +0 -0
  417. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/constants.py +0 -0
  418. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/deepseek_model.py +0 -0
  419. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/grok_model.py +0 -0
  420. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/kimi_model.py +0 -0
  421. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/litellm_model.py +0 -0
  422. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/local_model.py +0 -0
  423. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/ollama_model.py +0 -0
  424. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/openai_model.py +0 -0
  425. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/openrouter_model.py +0 -0
  426. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/portkey_model.py +0 -0
  427. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/llms/utils.py +0 -0
  428. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/retry_policy.py +0 -0
  429. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/summac_model.py +0 -0
  430. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/unbias_model.py +0 -0
  431. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/models/utils.py +0 -0
  432. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai/__init__.py +0 -0
  433. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai/extractors.py +0 -0
  434. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai/patch.py +0 -0
  435. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai/utils.py +0 -0
  436. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai_agents/__init__.py +0 -0
  437. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai_agents/agent.py +0 -0
  438. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai_agents/callback_handler.py +0 -0
  439. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai_agents/extractors.py +0 -0
  440. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai_agents/patch.py +0 -0
  441. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/openai_agents/runner.py +0 -0
  442. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/__init__.py +0 -0
  443. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/__init__.py +0 -0
  444. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/base.py +0 -0
  445. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/configs.py +0 -0
  446. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/copro/__init__.py +0 -0
  447. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/copro/copro.py +0 -0
  448. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/gepa/__init__.py +0 -0
  449. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/gepa/gepa.py +0 -0
  450. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/miprov2/__init__.py +0 -0
  451. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/miprov2/bootstrapper.py +0 -0
  452. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/miprov2/miprov2.py +0 -0
  453. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/miprov2/proposer.py +0 -0
  454. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/simba/__init__.py +0 -0
  455. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/simba/simba.py +0 -0
  456. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/algorithms/simba/types.py +0 -0
  457. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/configs.py +0 -0
  458. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/policies.py +0 -0
  459. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/prompt_optimizer.py +0 -0
  460. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/rewriter/__init__.py +0 -0
  461. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/rewriter/rewriter.py +0 -0
  462. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/rewriter/utils.py +0 -0
  463. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/scorer/__init__.py +0 -0
  464. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/scorer/base.py +0 -0
  465. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/scorer/scorer.py +0 -0
  466. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/scorer/utils.py +0 -0
  467. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/types.py +0 -0
  468. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/optimizer/utils.py +0 -0
  469. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/plugins/__init__.py +0 -0
  470. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/plugins/plugin.py +0 -0
  471. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/progress_context.py +0 -0
  472. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/prompt/__init__.py +0 -0
  473. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/prompt/api.py +0 -0
  474. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/prompt/utils.py +0 -0
  475. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/py.typed +0 -0
  476. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/red_teaming/README.md +0 -0
  477. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/scorer/__init__.py +0 -0
  478. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/scorer/scorer.py +0 -0
  479. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/simulator/__init__.py +0 -0
  480. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/simulator/schema.py +0 -0
  481. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/simulator/template.py +0 -0
  482. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/singleton.py +0 -0
  483. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/__init__.py +0 -0
  484. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/base_synthesizer.py +0 -0
  485. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/chunking/__init__.py +0 -0
  486. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/chunking/context_generator.py +0 -0
  487. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
  488. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/config.py +0 -0
  489. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/schema.py +0 -0
  490. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/synthesizer.py +0 -0
  491. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/templates/__init__.py +0 -0
  492. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/templates/template.py +0 -0
  493. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/templates/template_extraction.py +0 -0
  494. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/templates/template_prompt.py +0 -0
  495. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/types.py +0 -0
  496. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/synthesizer/utils.py +0 -0
  497. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/__init__.py +0 -0
  498. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/api.py +0 -0
  499. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/arena_test_case.py +0 -0
  500. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/conversational_test_case.py +0 -0
  501. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/mcp.py +0 -0
  502. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_case/utils.py +0 -0
  503. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_run/__init__.py +0 -0
  504. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_run/api.py +0 -0
  505. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_run/cache.py +0 -0
  506. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_run/hooks.py +0 -0
  507. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_run/hyperparameters.py +0 -0
  508. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/test_run/test_run.py +0 -0
  509. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/__init__.py +0 -0
  510. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/api.py +0 -0
  511. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/context.py +0 -0
  512. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/offline_evals/__init__.py +0 -0
  513. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/offline_evals/api.py +0 -0
  514. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/offline_evals/span.py +0 -0
  515. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/offline_evals/thread.py +0 -0
  516. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/offline_evals/trace.py +0 -0
  517. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/otel/__init__.py +0 -0
  518. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/otel/exporter.py +0 -0
  519. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/otel/test_exporter.py +0 -0
  520. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/otel/utils.py +0 -0
  521. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/patchers.py +0 -0
  522. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/perf_epoch_bridge.py +0 -0
  523. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/trace_context.py +0 -0
  524. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/trace_test_manager.py +0 -0
  525. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/types.py +0 -0
  526. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/tracing/utils.py +0 -0
  527. {deepeval-3.8.0 → deepeval-3.8.2}/deepeval/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.8.0
3
+ Version: 3.8.2
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0
@@ -0,0 +1 @@
1
+ __version__: str = "3.8.2"
@@ -14,7 +14,7 @@ def send_annotation(
14
14
  explanation: Optional[str] = None,
15
15
  user_id: Optional[str] = None,
16
16
  type: Optional[AnnotationType] = AnnotationType.THUMBS_RATING,
17
- ) -> str:
17
+ ) -> None:
18
18
  api_annotation = APIAnnotation(
19
19
  rating=rating,
20
20
  traceUuid=trace_uuid,
@@ -50,7 +50,7 @@ async def a_send_annotation(
50
50
  explanation: Optional[str] = None,
51
51
  type: Optional[AnnotationType] = AnnotationType.THUMBS_RATING,
52
52
  user_id: Optional[str] = None,
53
- ) -> str:
53
+ ) -> None:
54
54
  api_annotation = APIAnnotation(
55
55
  rating=rating,
56
56
  traceUuid=trace_uuid,
@@ -26,16 +26,44 @@ API_BASE_URL_EU = "https://eu.api.confident-ai.com"
26
26
  retryable_exceptions = requests.exceptions.SSLError
27
27
 
28
28
 
29
+ def _infer_region_from_api_key(api_key: Optional[str]) -> Optional[str]:
30
+ """
31
+ Infer region from Confident API key prefix.
32
+
33
+ Supported:
34
+ - confident_eu_... => "EU"
35
+ - confident_us_... => "US"
36
+
37
+ Returns None if prefix is not recognized or api_key is falsy.
38
+ """
39
+ if not api_key:
40
+ return None
41
+ key = api_key.strip().lower()
42
+ if key.startswith("confident_eu_"):
43
+ return "EU"
44
+ if key.startswith("confident_us_"):
45
+ return "US"
46
+ return None
47
+
48
+
29
49
  def get_base_api_url():
30
50
  s = get_settings()
31
51
  if s.CONFIDENT_BASE_URL:
32
52
  base_url = s.CONFIDENT_BASE_URL.rstrip("/")
33
53
  return base_url
54
+ # If the user has explicitly set a region, respect it.
34
55
  region = KEY_FILE_HANDLER.fetch_data(KeyValues.CONFIDENT_REGION)
35
- if region == "EU":
56
+ if region:
57
+ return API_BASE_URL_EU if region == "EU" else API_BASE_URL
58
+
59
+ # Otherwise, infer region from the API key prefix.
60
+ api_key = get_confident_api_key()
61
+ inferred = _infer_region_from_api_key(api_key)
62
+ if inferred == "EU":
36
63
  return API_BASE_URL_EU
37
- else:
38
- return API_BASE_URL
64
+
65
+ # Default to US (backwards compatible)
66
+ return API_BASE_URL
39
67
 
40
68
 
41
69
  def get_confident_api_key() -> Optional[str]:
@@ -447,6 +447,9 @@ class Settings(BaseSettings):
447
447
  AZURE_OPENAI_API_KEY: Optional[SecretStr] = Field(
448
448
  None, description="Azure OpenAI API key."
449
449
  )
450
+ AZURE_OPENAI_AD_TOKEN: Optional[SecretStr] = Field(
451
+ None, description="Azure OpenAI Ad Token."
452
+ )
450
453
  AZURE_OPENAI_ENDPOINT: Optional[AnyUrl] = Field(
451
454
  None, description="Azure OpenAI endpoint URL."
452
455
  )
@@ -84,9 +84,11 @@ class EvaluationDataset:
84
84
  def __init__(
85
85
  self,
86
86
  goldens: Union[List[Golden], List[ConversationalGolden]] = [],
87
+ confident_api_key: Optional[str] = None,
87
88
  ):
88
89
  self._alias = None
89
90
  self._id = None
91
+ self.confident_api_key = confident_api_key
90
92
  if len(goldens) > 0:
91
93
  self._multi_turn = (
92
94
  True if isinstance(goldens[0], ConversationalGolden) else False
@@ -722,7 +724,7 @@ class EvaluationDataset:
722
724
  "Unable to push empty dataset to Confident AI, there must be at least one golden in dataset."
723
725
  )
724
726
 
725
- api = Api()
727
+ api = Api(api_key=self.confident_api_key)
726
728
  api_dataset = APIDataset(
727
729
  goldens=self.goldens if not self._multi_turn else None,
728
730
  conversationalGoldens=(self.goldens if self._multi_turn else None),
@@ -755,7 +757,7 @@ class EvaluationDataset:
755
757
  auto_convert_goldens_to_test_cases: bool = False,
756
758
  public: bool = False,
757
759
  ):
758
- api = Api()
760
+ api = Api(api_key=self.confident_api_key)
759
761
  with capture_pull_dataset():
760
762
  with Progress(
761
763
  SpinnerColumn(style="rgb(106,0,255)"),
@@ -839,7 +841,7 @@ class EvaluationDataset:
839
841
  raise ValueError(
840
842
  f"Can't queue empty list of goldens to dataset with alias: {alias} on Confident AI."
841
843
  )
842
- api = Api()
844
+ api = Api(api_key=self.confident_api_key)
843
845
 
844
846
  multi_turn = isinstance(goldens[0], ConversationalGolden)
845
847
 
@@ -871,7 +873,7 @@ class EvaluationDataset:
871
873
  self,
872
874
  alias: str,
873
875
  ):
874
- api = Api()
876
+ api = Api(api_key=self.confident_api_key)
875
877
  api.send_request(
876
878
  method=HttpMethods.DELETE,
877
879
  endpoint=Endpoints.DATASET_ALIAS_ENDPOINT,
@@ -1,3 +1,7 @@
1
+ import logging
2
+ import os
3
+ import threading
4
+
1
5
  from typing import Any, Optional, List, Dict
2
6
  from uuid import UUID
3
7
  from time import perf_counter
@@ -20,6 +24,19 @@ from deepeval.tracing.types import (
20
24
  )
21
25
  from deepeval.telemetry import capture_tracing_integration
22
26
 
27
+ # Debug logging for LangChain callbacks (enable with DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS=1)
28
+ _DEBUG_CALLBACKS = os.environ.get(
29
+ "DEEPEVAL_DEBUG_LANGCHAIN_CALLBACKS", ""
30
+ ).lower() in ("1", "true", "yes")
31
+
32
+ _logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _debug_log(msg: str):
36
+ if _DEBUG_CALLBACKS:
37
+ _logger.debug(f"[LangChain Callback] {msg}")
38
+
39
+
23
40
  try:
24
41
  from langchain_core.callbacks.base import BaseCallbackHandler
25
42
  from langchain_core.outputs import LLMResult
@@ -29,6 +46,7 @@ try:
29
46
  # contains langchain imports
30
47
  from deepeval.integrations.langchain.utils import (
31
48
  parse_prompts_to_messages,
49
+ convert_chat_messages_to_input,
32
50
  extract_name,
33
51
  safe_extract_model_name,
34
52
  safe_extract_token_usage,
@@ -50,6 +68,12 @@ def is_langchain_installed():
50
68
 
51
69
 
52
70
  class CallbackHandler(BaseCallbackHandler):
71
+ # When users create multiple CallbackHandler instances for the same logical
72
+ # conversation (same thread_id), we want spans to land on the same trace.
73
+ # Otherwise, each handler lazily creates its own trace, and multi-turn flows
74
+ # become multiple single-turn traces.
75
+ _thread_id_to_trace_uuid: Dict[str, str] = {}
76
+ _thread_id_lock = threading.Lock()
53
77
 
54
78
  def __init__(
55
79
  self,
@@ -74,13 +98,20 @@ class CallbackHandler(BaseCallbackHandler):
74
98
  self._parent_span = None
75
99
 
76
100
  # Stash trace metadata to apply once we know which trace we are using.
77
- self._trace_init_fields: Dict[str, Any] = {
101
+ # _trace_init_fields is cleared after first apply to prevent re-applying
102
+ # on every callback within the same trace. _original_init_fields is kept
103
+ # permanently so we can re-apply when a new trace is created (e.g., in
104
+ # multi-turn scenarios where the previous trace was ended).
105
+ self._original_init_fields: Dict[str, Any] = {
78
106
  "name": name,
79
107
  "tags": tags,
80
108
  "metadata": metadata,
81
109
  "thread_id": thread_id,
82
110
  "user_id": user_id,
83
111
  }
112
+ self._trace_init_fields: Dict[str, Any] = dict(
113
+ self._original_init_fields
114
+ )
84
115
 
85
116
  # Map LangChain run_id -> our span uuid for parent span restoration
86
117
  self._run_id_to_span_uuid: Dict[str, str] = {}
@@ -96,6 +127,34 @@ class CallbackHandler(BaseCallbackHandler):
96
127
  This is done lazily during actual callback execution to avoid context
97
128
  corruption when the handler is constructed outside the async task/context.
98
129
  """
130
+ # If the user provided a thread_id, attempt to reuse an existing trace for it.
131
+ # This makes multi-turn tests that use multiple CallbackHandler instances behave
132
+ # as expected: one trace containing multiple turns/spans.
133
+ thread_id = None
134
+ fields = self._trace_init_fields or {}
135
+ if fields.get("thread_id"):
136
+ thread_id = fields["thread_id"]
137
+ # In case _trace_init_fields has already been cleared, fall back to trace metadata.
138
+ if thread_id is None and self._trace is not None:
139
+ thread_id = self._trace.thread_id
140
+
141
+ if thread_id:
142
+ with self._thread_id_lock:
143
+ existing_uuid = self._thread_id_to_trace_uuid.get(thread_id)
144
+ if existing_uuid:
145
+ existing_trace = trace_manager.get_trace_by_uuid(existing_uuid)
146
+ if (
147
+ existing_trace
148
+ and existing_trace.uuid in trace_manager.active_traces
149
+ ):
150
+ current_trace_context.set(existing_trace)
151
+ self._trace = existing_trace
152
+ self.trace_uuid = existing_trace.uuid
153
+ # Lazily capture the observe parent span if present.
154
+ if self._parent_span is None:
155
+ self._parent_span = current_span_context.get()
156
+ return existing_trace
157
+
99
158
  # Prefer current context trace if it is active.
100
159
  ctx_trace = current_trace_context.get()
101
160
  if ctx_trace and ctx_trace.uuid in trace_manager.active_traces:
@@ -107,6 +166,10 @@ class CallbackHandler(BaseCallbackHandler):
107
166
  current_trace_context.set(trace)
108
167
  else:
109
168
  # Otherwise, create a fresh trace now (in the right context).
169
+ # Restore _trace_init_fields from the original init fields so that
170
+ # the new trace gets the same name/tags/metadata as intended.
171
+ if not self._trace_init_fields and self._original_init_fields:
172
+ self._trace_init_fields = dict(self._original_init_fields)
110
173
  trace = trace_manager.start_new_trace()
111
174
  current_trace_context.set(trace)
112
175
  self._trace = trace
@@ -114,8 +177,18 @@ class CallbackHandler(BaseCallbackHandler):
114
177
  # Keep a copy for quick access.
115
178
  self.trace_uuid = trace.uuid
116
179
 
180
+ # Register this trace as the canonical trace for this thread_id (if provided).
181
+ # This allows other CallbackHandler instances created for the same thread_id
182
+ # to reuse the same trace instead of creating parallel traces.
183
+ fields = self._trace_init_fields or {}
184
+ tid = fields.get("thread_id") or trace.thread_id
185
+ if tid:
186
+ with self._thread_id_lock:
187
+ # Only set if absent to preserve the "first trace wins" behavior.
188
+ self._thread_id_to_trace_uuid.setdefault(tid, trace.uuid)
189
+
117
190
  # Apply stashed metadata once.
118
- fields = getattr(self, "_trace_init_fields", None) or {}
191
+ fields = self._trace_init_fields or {}
119
192
  if fields:
120
193
  if fields.get("name") is not None:
121
194
  trace.name = fields["name"]
@@ -202,6 +275,9 @@ class CallbackHandler(BaseCallbackHandler):
202
275
  metadata: Optional[dict[str, Any]] = None,
203
276
  **kwargs: Any,
204
277
  ) -> Any:
278
+ _debug_log(
279
+ f"on_chain_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
280
+ )
205
281
  # Create spans for all chains to establish proper parent-child hierarchy
206
282
  # This is important for LangGraph where there are nested chains
207
283
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
@@ -232,6 +308,9 @@ class CallbackHandler(BaseCallbackHandler):
232
308
  parent_run_id: Optional[UUID] = None,
233
309
  **kwargs: Any,
234
310
  ) -> Any:
311
+ _debug_log(
312
+ f"on_chain_end: run_id={run_id}, parent_run_id={parent_run_id}"
313
+ )
235
314
  uuid_str = str(run_id)
236
315
  base_span = trace_manager.get_span_by_uuid(uuid_str)
237
316
  if base_span:
@@ -246,6 +325,59 @@ class CallbackHandler(BaseCallbackHandler):
246
325
  trace.output = output
247
326
  exit_current_context(uuid_str=uuid_str)
248
327
 
328
+ def on_chat_model_start(
329
+ self,
330
+ serialized: dict[str, Any],
331
+ messages: list[list[Any]], # list[list[BaseMessage]]
332
+ *,
333
+ run_id: UUID,
334
+ parent_run_id: Optional[UUID] = None,
335
+ tags: Optional[list[str]] = None,
336
+ metadata: Optional[dict[str, Any]] = None,
337
+ **kwargs: Any,
338
+ ) -> Any:
339
+ """
340
+ Handle chat model start callback. In LangChain v1, chat models emit
341
+ on_chat_model_start instead of on_llm_start. The on_llm_end callback
342
+ is still used for both.
343
+ """
344
+ _debug_log(
345
+ f"on_chat_model_start: run_id={run_id}, parent_run_id={parent_run_id}, messages_len={len(messages)}"
346
+ )
347
+
348
+ # Guard against double-counting if both on_llm_start and on_chat_model_start fire
349
+ uuid_str = str(run_id)
350
+ existing_span = trace_manager.get_span_by_uuid(uuid_str)
351
+ if existing_span is not None:
352
+ _debug_log(
353
+ f"on_chat_model_start: span already exists for run_id={run_id}, skipping"
354
+ )
355
+ return
356
+
357
+ with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
358
+ # Convert messages to our internal format using the shared helper
359
+ input_messages = convert_chat_messages_to_input(messages, **kwargs)
360
+
361
+ # Safe extraction of model name (handle None metadata)
362
+ md = metadata or {}
363
+ model = safe_extract_model_name(md, **kwargs)
364
+
365
+ llm_span: LlmSpan = enter_current_context(
366
+ uuid_str=uuid_str,
367
+ span_type="llm",
368
+ func_name=extract_name(serialized, **kwargs),
369
+ )
370
+ # Register this run_id -> span mapping for child callbacks
371
+ self._run_id_to_span_uuid[str(run_id)] = uuid_str
372
+
373
+ llm_span.input = input_messages
374
+ llm_span.model = model
375
+
376
+ # Extract metrics and prompt from metadata if provided, but don't mutate original
377
+ llm_span.metrics = md.get("metrics")
378
+ llm_span.metric_collection = md.get("metric_collection")
379
+ llm_span.prompt = md.get("prompt")
380
+
249
381
  def on_llm_start(
250
382
  self,
251
383
  serialized: dict[str, Any],
@@ -257,10 +389,25 @@ class CallbackHandler(BaseCallbackHandler):
257
389
  metadata: Optional[dict[str, Any]] = None,
258
390
  **kwargs: Any,
259
391
  ) -> Any:
392
+ _debug_log(
393
+ f"on_llm_start: run_id={run_id}, parent_run_id={parent_run_id}, prompts_len={len(prompts)}"
394
+ )
395
+
396
+ # Guard against double-counting if both on_llm_start and on_chat_model_start fire
397
+ uuid_str = str(run_id)
398
+ existing_span = trace_manager.get_span_by_uuid(uuid_str)
399
+ if existing_span is not None:
400
+ _debug_log(
401
+ f"on_llm_start: span already exists for run_id={run_id}, skipping"
402
+ )
403
+ return
404
+
260
405
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
261
- uuid_str = str(run_id)
262
406
  input_messages = parse_prompts_to_messages(prompts, **kwargs)
263
- model = safe_extract_model_name(metadata, **kwargs)
407
+
408
+ # Safe extraction of model name (handle None metadata)
409
+ md = metadata or {}
410
+ model = safe_extract_model_name(md, **kwargs)
264
411
 
265
412
  llm_span: LlmSpan = enter_current_context(
266
413
  uuid_str=uuid_str,
@@ -272,12 +419,11 @@ class CallbackHandler(BaseCallbackHandler):
272
419
 
273
420
  llm_span.input = input_messages
274
421
  llm_span.model = model
275
- metrics = metadata.pop("metrics", None)
276
- metric_collection = metadata.pop("metric_collection", None)
277
- prompt = metadata.pop("prompt", None)
278
- llm_span.metrics = metrics
279
- llm_span.metric_collection = metric_collection
280
- llm_span.prompt = prompt
422
+
423
+ # Extract metrics and prompt from metadata if provided, but don't mutate original
424
+ llm_span.metrics = md.get("metrics")
425
+ llm_span.metric_collection = md.get("metric_collection")
426
+ llm_span.prompt = md.get("prompt")
281
427
 
282
428
  def on_llm_end(
283
429
  self,
@@ -287,9 +433,20 @@ class CallbackHandler(BaseCallbackHandler):
287
433
  parent_run_id: Optional[UUID] = None,
288
434
  **kwargs: Any, # un-logged kwargs
289
435
  ) -> Any:
436
+ _debug_log(
437
+ f"on_llm_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
438
+ )
290
439
  uuid_str = str(run_id)
291
440
  llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
292
441
  if llm_span is None:
442
+ _debug_log(f"on_llm_end: NO SPAN FOUND for run_id={run_id}")
443
+ return
444
+
445
+ # Guard against double-finalization (if both on_llm_end and on_chat_model_end fire)
446
+ if llm_span.end_time is not None:
447
+ _debug_log(
448
+ f"on_llm_end: span already finalized for run_id={run_id}, skipping"
449
+ )
293
450
  return
294
451
 
295
452
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
@@ -336,7 +493,6 @@ class CallbackHandler(BaseCallbackHandler):
336
493
  )
337
494
 
338
495
  llm_span.model = model if model else llm_span.model
339
- llm_span.input = llm_span.input
340
496
  llm_span.output = output
341
497
  llm_span.input_token_count = (
342
498
  total_input_tokens if total_input_tokens > 0 else None
@@ -347,6 +503,121 @@ class CallbackHandler(BaseCallbackHandler):
347
503
 
348
504
  exit_current_context(uuid_str=uuid_str)
349
505
 
506
+ def on_chat_model_end(
507
+ self,
508
+ response: Any,
509
+ *,
510
+ run_id: UUID,
511
+ parent_run_id: Optional[UUID] = None,
512
+ **kwargs: Any,
513
+ ) -> Any:
514
+ """
515
+ Handle chat model end callback. This may be called instead of or
516
+ in addition to on_llm_end depending on the LangChain version.
517
+ """
518
+ _debug_log(
519
+ f"on_chat_model_end: run_id={run_id}, parent_run_id={parent_run_id}, response_type={type(response).__name__}"
520
+ )
521
+ uuid_str = str(run_id)
522
+ llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
523
+ if llm_span is None:
524
+ _debug_log(f"on_chat_model_end: NO SPAN FOUND for run_id={run_id}")
525
+ return
526
+
527
+ # Guard against double-finalization, which could happen if both on_llm_end and on_chat_model_end fire
528
+ if llm_span.end_time is not None:
529
+ _debug_log(
530
+ f"on_chat_model_end: span already finalized for run_id={run_id}, skipping"
531
+ )
532
+ return
533
+
534
+ with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
535
+ output = ""
536
+ total_input_tokens = 0
537
+ total_output_tokens = 0
538
+ model = None
539
+
540
+ # Handle LLMResult (same as on_llm_end)
541
+ if isinstance(response, LLMResult):
542
+ for generation in response.generations:
543
+ for gen in generation:
544
+ if isinstance(gen, ChatGeneration):
545
+ if gen.message.response_metadata and isinstance(
546
+ gen.message.response_metadata, dict
547
+ ):
548
+ model = gen.message.response_metadata.get(
549
+ "model_name"
550
+ )
551
+ input_tokens, output_tokens = (
552
+ safe_extract_token_usage(
553
+ gen.message.response_metadata
554
+ )
555
+ )
556
+ total_input_tokens += input_tokens
557
+ total_output_tokens += output_tokens
558
+
559
+ if isinstance(gen.message, AIMessage):
560
+ ai_message = gen.message
561
+ tool_calls = []
562
+ for tool_call in ai_message.tool_calls:
563
+ tool_calls.append(
564
+ LlmToolCall(
565
+ name=tool_call["name"],
566
+ args=tool_call["args"],
567
+ id=tool_call["id"],
568
+ )
569
+ )
570
+ output = LlmOutput(
571
+ role="AI",
572
+ content=ai_message.content,
573
+ tool_calls=tool_calls,
574
+ )
575
+
576
+ llm_span.model = model if model else llm_span.model
577
+ llm_span.output = output
578
+ llm_span.input_token_count = (
579
+ total_input_tokens if total_input_tokens > 0 else None
580
+ )
581
+ llm_span.output_token_count = (
582
+ total_output_tokens if total_output_tokens > 0 else None
583
+ )
584
+
585
+ exit_current_context(uuid_str=uuid_str)
586
+
587
+ def on_chat_model_error(
588
+ self,
589
+ error: BaseException,
590
+ *,
591
+ run_id: UUID,
592
+ parent_run_id: Optional[UUID] = None,
593
+ **kwargs: Any,
594
+ ) -> Any:
595
+ """
596
+ Handle chat model error callback.
597
+ """
598
+ _debug_log(
599
+ f"on_chat_model_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
600
+ )
601
+ uuid_str = str(run_id)
602
+ llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
603
+ if llm_span is None:
604
+ _debug_log(
605
+ f"on_chat_model_error: NO SPAN FOUND for run_id={run_id}"
606
+ )
607
+ return
608
+
609
+ # Guard against double-finalization
610
+ if llm_span.end_time is not None:
611
+ _debug_log(
612
+ f"on_chat_model_error: span already finalized for run_id={run_id}, skipping"
613
+ )
614
+ return
615
+
616
+ with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
617
+ llm_span.status = TraceSpanStatus.ERRORED
618
+ llm_span.error = str(error)
619
+ exit_current_context(uuid_str=uuid_str)
620
+
350
621
  def on_llm_error(
351
622
  self,
352
623
  error: BaseException,
@@ -355,10 +626,22 @@ class CallbackHandler(BaseCallbackHandler):
355
626
  parent_run_id: Optional[UUID] = None,
356
627
  **kwargs: Any,
357
628
  ) -> Any:
629
+ _debug_log(
630
+ f"on_llm_error: run_id={run_id}, parent_run_id={parent_run_id}, error={error}"
631
+ )
358
632
  uuid_str = str(run_id)
359
633
  llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
360
634
  if llm_span is None:
635
+ _debug_log(f"on_llm_error: NO SPAN FOUND for run_id={run_id}")
361
636
  return
637
+
638
+ # Guard against double-finalization
639
+ if llm_span.end_time is not None:
640
+ _debug_log(
641
+ f"on_llm_error: span already finalized for run_id={run_id}, skipping"
642
+ )
643
+ return
644
+
362
645
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
363
646
  llm_span.status = TraceSpanStatus.ERRORED
364
647
  llm_span.error = str(error)
@@ -396,6 +679,9 @@ class CallbackHandler(BaseCallbackHandler):
396
679
  inputs: Optional[dict[str, Any]] = None,
397
680
  **kwargs: Any,
398
681
  ) -> Any:
682
+ _debug_log(
683
+ f"on_tool_start: run_id={run_id}, parent_run_id={parent_run_id}, name={extract_name(serialized, **kwargs)}"
684
+ )
399
685
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
400
686
  uuid_str = str(run_id)
401
687
 
@@ -418,6 +704,9 @@ class CallbackHandler(BaseCallbackHandler):
418
704
  parent_run_id: Optional[UUID] = None,
419
705
  **kwargs: Any, # un-logged kwargs
420
706
  ) -> Any:
707
+ _debug_log(
708
+ f"on_tool_end: run_id={run_id}, parent_run_id={parent_run_id}"
709
+ )
421
710
  uuid_str = str(run_id)
422
711
  tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
423
712
  if tool_span is None:
@@ -485,20 +774,23 @@ class CallbackHandler(BaseCallbackHandler):
485
774
  ) -> Any:
486
775
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
487
776
  uuid_str = str(run_id)
777
+ # Safe access to metadata (handle None)
778
+ md = metadata or {}
488
779
  retriever_span = enter_current_context(
489
780
  uuid_str=uuid_str,
490
781
  span_type="retriever",
491
782
  func_name=extract_name(serialized, **kwargs),
492
783
  observe_kwargs={
493
- "embedder": metadata.get(
494
- "ls_embedding_provider", "unknown"
495
- ),
784
+ "embedder": md.get("ls_embedding_provider", "unknown"),
496
785
  },
497
786
  )
498
787
  # Register this run_id -> span mapping for child callbacks
499
788
  self._run_id_to_span_uuid[str(run_id)] = uuid_str
500
789
  retriever_span.input = query
501
790
 
791
+ # Extract metric_collection from metadata if provided
792
+ retriever_span.metric_collection = md.get("metric_collection")
793
+
502
794
  def on_retriever_end(
503
795
  self,
504
796
  output: Any,
@@ -539,4 +831,4 @@ class CallbackHandler(BaseCallbackHandler):
539
831
  with self._ctx(run_id=run_id, parent_run_id=parent_run_id):
540
832
  retriever_span.status = TraceSpanStatus.ERRORED
541
833
  retriever_span.error = str(error)
542
- exit_current_context(uuid_str=uuid_str)
834
+ exit_current_context(uuid_str=uuid_str)