deepeval 3.6.5__tar.gz → 3.6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepeval-3.6.5 → deepeval-3.6.6}/PKG-INFO +1 -1
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/__init__.py +42 -10
- deepeval-3.6.6/deepeval/_version.py +1 -0
- deepeval-3.6.6/deepeval/config/logging.py +33 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/settings.py +154 -12
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/execute.py +22 -19
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/g_eval/g_eval.py +26 -15
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/retry_policy.py +202 -11
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/otel/exporter.py +0 -6
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/otel/utils.py +57 -7
- deepeval-3.6.6/deepeval/tracing/trace_test_manager.py +19 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/tracing.py +1 -1
- {deepeval-3.6.5 → deepeval-3.6.6}/pyproject.toml +1 -1
- deepeval-3.6.5/deepeval/_version.py +0 -1
- {deepeval-3.6.5 → deepeval-3.6.6}/LICENSE.md +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/README.md +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/annotation/annotation.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/annotation/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/arc/arc.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/base_benchmark.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bbq/bbq.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/drop/drop.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/lambada/lambada.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/results.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/squad/squad.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/dotenv_handler.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/main.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/server.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/test.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/types.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/cli/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/confident/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/confident/types.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/settings_manager.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/constants.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/contextvars.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/dataset.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/golden.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/test_run_tracer.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/types.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/dataset/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/errors.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/compare.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/configs.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/evaluate.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/types.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/evaluate/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/crewai/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/crewai/handler.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/crewai/wrapper.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/langchain/callback.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/langchain/patch.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/langchain/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/llama_index/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/llama_index/agent/patched.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/llama_index/handler.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/llama_index/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/agent.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/instrumentator.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/otel.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/key_handler.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/base_metric.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/bias/bias.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/bias/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/nodes.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/templates.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/dag.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/nodes.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/templates.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/dag/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/faithfulness/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/faithfulness/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/g_eval/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/g_eval/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/g_eval/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/hallucination/hallucination.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/hallucination/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/indicator.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/json_correctness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/json_correctness/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/misuse/misuse.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/misuse/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/non_advice/non_advice.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/non_advice/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/ragas.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_adherence/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_adherence/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_violation/role_violation.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/role_violation/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/summarization/summarization.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/summarization/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/task_completion/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/task_completion/task_completion.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/tool_correctness/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/toxicity/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/toxicity/toxicity.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/metrics/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/base_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/embedding_models/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/anthropic_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/azure_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/deepseek_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/gemini_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/grok_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/kimi_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/litellm_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/local_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/ollama_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/openai_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/llms/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/mlllms/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/mlllms/gemini_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/mlllms/ollama_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/mlllms/openai_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/models/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai/extractors.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai/patch.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai_agents/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai_agents/agent.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai_agents/callback_handler.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai_agents/extractors.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai_agents/patch.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/openai_agents/runner.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/plugins/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/progress_context.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/prompt/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/prompt/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/prompt/prompt.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/prompt/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/py.typed +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/simulator/conversation_simulator.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/simulator/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/singleton.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/chunking/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/chunking/context_generator.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/config.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/schema.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/synthesizer.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/templates/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/templates/template.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/templates/template_extraction.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/templates/template_prompt.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/telemetry.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/arena_test_case.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/conversational_test_case.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/llm_test_case.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/mllm_test_case.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_case/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_run/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_run/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_run/cache.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_run/hyperparameters.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/test_run/test_run.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/context.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/offline_evals/api.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/offline_evals/thread.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/otel/test_exporter.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/patchers.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/perf_epoch_bridge.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/trace_context.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/types.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/tracing/utils.py +0 -0
- {deepeval-3.6.5 → deepeval-3.6.6}/deepeval/utils.py +0 -0
{deepeval-3.6.5 → deepeval-3.6.6}/deepeval/__init__.py

```diff
@@ -1,24 +1,56 @@
+from __future__ import annotations
+
+import logging
 import os
-import warnings
 import re
+import warnings
 
-# load environment variables before other imports
+# IMPORTANT: load environment variables before other imports
 from deepeval.config.settings import autoload_dotenv, get_settings
 
+logging.getLogger("deepeval").addHandler(logging.NullHandler())
 autoload_dotenv()
 
-
-
-
-
-
-
+
+def _expose_public_api() -> None:
+    # All other imports must happen after env is loaded
+    # Do not do this at module level or ruff will complain with E402
+    global __version__, evaluate, assert_test, compare
+    global on_test_run_end, log_hyperparameters, login, telemetry
+
+    from ._version import __version__ as _version
+    from deepeval.evaluate import (
+        evaluate as _evaluate,
+        assert_test as _assert_test,
+    )
+    from deepeval.evaluate.compare import compare as _compare
+    from deepeval.test_run import (
+        on_test_run_end as _on_end,
+        log_hyperparameters as _log_hparams,
+    )
+    from deepeval.utils import login as _login
+    import deepeval.telemetry as _telemetry
+
+    __version__ = _version
+    evaluate = _evaluate
+    assert_test = _assert_test
+    compare = _compare
+    on_test_run_end = _on_end
+    log_hyperparameters = _log_hparams
+    login = _login
+    telemetry = _telemetry
+
+
+_expose_public_api()
 
 
 settings = get_settings()
+
 if not settings.DEEPEVAL_GRPC_LOGGING:
-    os.
-
+    if os.getenv("GRPC_VERBOSITY") is None:
+        os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
+    if os.getenv("GRPC_TRACE") is None:
+        os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""
 
 
 __all__ = [
```
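The gRPC environment handling above is now non-destructive: deepeval only fills in `GRPC_VERBOSITY`/`GRPC_TRACE` when they are absent from the environment. A minimal sketch of the resulting precedence, assuming `DEEPEVAL_GRPC_LOGGING` is left at its disabled default so the guarded branch runs (not part of the diff):

```python
# Sketch only. Assumes DEEPEVAL_GRPC_LOGGING is disabled so the guards above execute.
import os

os.environ["GRPC_VERBOSITY"] = "DEBUG"  # value exported by the user beforehand

import deepeval  # noqa: E402,F401

# The `is None` guard leaves the pre-existing value untouched; a missing variable
# would instead be filled from settings (GRPC_VERBOSITY -> "ERROR", GRPC_TRACE -> "").
assert os.environ["GRPC_VERBOSITY"] == "DEBUG"
```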
deepeval-3.6.6/deepeval/_version.py (new file)

```diff
@@ -0,0 +1 @@
+__version__: str = "3.6.6"
```
deepeval-3.6.6/deepeval/config/logging.py (new file)

```diff
@@ -0,0 +1,33 @@
+"""
+Minimal logging configuration helpers for DeepEval.
+
+This module centralizes how the library-level logger ("deepeval") is configured. We
+intentionally keep configuration lightweight so application code retains control
+over handlers and formatters.
+"""
+
+import logging
+from deepeval.config.settings import get_settings
+
+
+def apply_deepeval_log_level() -> None:
+    """
+    Apply DeepEval's current log level to the package logger.
+
+    This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
+    and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
+    unset (None), INFO is used as a default. The logger's `propagate` flag is set
+    to True so records bubble up to the application's handlers. DeepEval does not
+    install its own handlers here (a NullHandler is attached in `__init__.py`).
+
+    The function is idempotent and safe to call multiple times. It is invoked
+    automatically when settings are first constructed and whenever `LOG_LEVEL`
+    is changed via `settings.edit`.
+    """
+    settings = get_settings()
+    log_level = settings.LOG_LEVEL
+    logging.getLogger("deepeval").setLevel(
+        log_level if log_level is not None else logging.INFO
+    )
+    # ensure we bubble up to app handlers
+    logging.getLogger("deepeval").propagate = True
```
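Because the package only attaches a `NullHandler` and relies on propagation, an application that wants to see deepeval's log records configures its own handlers; a minimal sketch (the handler and format choices are illustrative, not mandated by the package):

```python
import logging

# The application owns handlers/formatters. The "deepeval" logger propagates,
# so its records flow into whatever is configured on the root logger here.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s %(message)s",
)

# Optionally adjust just the deepeval logger (DEBUG is purely an example);
# apply_deepeval_log_level() does the equivalent from the LOG_LEVEL setting.
logging.getLogger("deepeval").setLevel(logging.DEBUG)
```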
{deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/settings.py

```diff
@@ -10,12 +10,20 @@ Central config for DeepEval.
 """
 
 import logging
+import math
 import os
 import re
 
 from dotenv import dotenv_values
 from pathlib import Path
-from pydantic import
+from pydantic import (
+    AnyUrl,
+    computed_field,
+    confloat,
+    conint,
+    field_validator,
+    SecretStr,
+)
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from typing import Any, Dict, List, Optional, NamedTuple
 
@@ -155,7 +163,7 @@ class Settings(BaseSettings):
     #
 
     APP_ENV: str = "dev"
-    LOG_LEVEL:
+    LOG_LEVEL: Optional[int] = None
     PYTHONPATH: str = "."
     CONFIDENT_REGION: Optional[str] = None
     CONFIDENT_OPEN_BROWSER: Optional[bool] = True
@@ -287,9 +295,33 @@ class Settings(BaseSettings):
     #
     # Retry Policy
     #
-
-
+    # Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
+    # Key concepts:
+    # - attempts count includes the first call. e.g. 1 = no retries, 2 = one retry.
+    # - backoff sleeps follow exponential growth with a cap, plus jitter. Expected jitter
+    #   contribution is ~ JITTER/2 per sleep.
+    # - logging levels are looked up dynamically each attempt, so if you change LOG_LEVEL at runtime,
+    #   the retry loggers will honor it without restart.
+    DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
+        None  # ["*"] to delegate all retries to SDKs
+    )
+    DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
+        None  # default is LOG_LEVEL if set, else INFO
+    )
     DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None  # default -> ERROR
+    DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
+        2  # attempts = first try + retries
+    )
+    DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
+        1.0  # first sleep before retry, if any
+    )
+    DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
+        2.0  # exponential growth factor for sleeps
+    )
+    DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0  # uniform jitter
+    DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
+        5.0  # cap for each backoff sleep
+    )
 
     #
     # Telemetry and Debug
```
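With the defaults above (2 attempts, 1.0 s initial sleep, exponential base 2.0, 5.0 s cap, jitter 2.0) there is exactly one retry and one backoff sleep. A small sketch that replays the documented expected-value schedule (the real sleep is jittered; this is not part of the diff):

```python
# Expected backoff for the defaults shown above: capped exponential + ~jitter/2 per sleep.
attempts, initial, base, cap, jitter = 2, 1.0, 2.0, 5.0, 2.0

sleeps = attempts - 1          # one sleep between the two attempts
expected, cur = 0.0, initial
for _ in range(sleeps):
    expected += min(cap, cur)  # capped exponential term: 1.0
    cur *= base
expected += sleeps * jitter / 2  # expected jitter contribution: 1.0

print(expected)  # 2.0 seconds of expected sleep before the single retry
```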
{deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/settings.py (continued)

```diff
@@ -316,19 +348,87 @@ class Settings(BaseSettings):
     #
     MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
     MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
+    # DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
+    # This timeout interacts with retry policy and the task level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
+    # If you leave this at 0/None, the computed outer budget defaults to 180s.
+    DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
+        None  # per-attempt timeout. Set 0/None to disable
+    )
 
     #
     # Async Task Configuration
     #
-
-
-    DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-
-
+    DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
+    DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
+    # DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
+    # It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
+    # - OVERRIDE = None or 0 -> auto compute as:
+    #     attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
+    #   (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
+    # - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely too small
+    #   to permit the configured attempts/backoff.
+    #
+    # Tip:
+    #   Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
+    #   Leave the outer budget on auto unless you have very strict SLAs.
+    DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None
 
     # Buffer time for gathering results from all tasks, added to the longest task duration
     # Increase if many tasks are running concurrently
-    DEEPEVAL_TASK_GATHER_BUFFER_SECONDS:
+    DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
+
+    ###################
+    # Computed Fields #
+    ###################
+
+    def _calc_auto_outer_timeout(self) -> int:
+        """Compute outer budget from per-attempt timeout + retries/backoff.
+        Never reference the computed property itself here.
+        """
+        attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
+        timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
+        if timeout_seconds <= 0:
+            # No per-attempt timeout set -> default outer budget
+            return 180
+
+        sleeps = max(0, attempts - 1)
+        cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
+        cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
+        base = float(self.DEEPEVAL_RETRY_EXP_BASE)
+        jitter = float(self.DEEPEVAL_RETRY_JITTER)
+
+        backoff = 0.0
+        for _ in range(sleeps):
+            backoff += min(cap, cur)
+            cur *= base
+        backoff += sleeps * (jitter / 2.0)  # expected jitter
+
+        safety_overhead = 1.0
+        return int(
+            math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
+        )
+
+    @computed_field
+    @property
+    def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
+        """If OVERRIDE is set (nonzero), return it; else return the derived budget."""
+        outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
+        if outer not in (None, 0):
+            # Warn if user-provided outer is likely to truncate retries
+            if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
+                min_needed = self._calc_auto_outer_timeout()
+                if int(outer) < min_needed:
+                    if self.DEEPEVAL_VERBOSE_MODE:
+                        logger.warning(
+                            "Metric timeout (outer=%ss) is less than attempts × per-attempt "
+                            "timeout + backoff (≈%ss). Retries may be cut short.",
+                            int(outer),
+                            min_needed,
+                        )
+            return int(outer)
+
+        # Auto mode
+        return self._calc_auto_outer_timeout()
 
     ##############
     # Validators #
```
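A worked example of the auto mode, using a hypothetical `DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS` of 30 with the default retry settings above (not part of the diff; it simply replays the `_calc_auto_outer_timeout` arithmetic):

```python
import math

# Hypothetical inputs: per-attempt timeout 30 s, default retry settings.
attempts, per_attempt = 2, 30.0
initial, base, cap, jitter = 1.0, 2.0, 5.0, 2.0

sleeps = max(0, attempts - 1)
backoff, cur = 0.0, initial
for _ in range(sleeps):
    backoff += min(cap, cur)        # capped exponential sleep: 1.0
    cur *= base
backoff += sleeps * (jitter / 2.0)  # expected jitter: +1.0

outer = math.ceil(attempts * per_attempt + backoff + 1.0)  # +1.0 s safety
print(outer)  # 63 -> DEEPEVAL_PER_TASK_TIMEOUT_SECONDS resolves to 63 in auto mode
```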
{deepeval-3.6.5 → deepeval-3.6.6}/deepeval/config/settings.py (continued)

```diff
@@ -474,7 +574,9 @@ class Settings(BaseSettings):
             if s in SUPPORTED_PROVIDER_SLUGS:
                 normalized.append(s)
             else:
-                if
+                if parse_bool(
+                    os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
+                ):
                     logger.warning("Unknown provider slug %r dropped", item)
 
         if star:
@@ -487,6 +589,7 @@ class Settings(BaseSettings):
     @field_validator(
         "DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
         "DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
+        "LOG_LEVEL",
         mode="before",
     )
     @classmethod
@@ -524,6 +627,10 @@ class Settings(BaseSettings):
     # Persistence support #
     #######################
     class _SettingsEditCtx:
+        COMPUTED_FIELDS: frozenset[str] = frozenset(
+            {"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
+        )
+
         def __init__(
             self,
             settings: "Settings",
@@ -559,8 +666,11 @@ class Settings(BaseSettings):
             # lazy import legacy JSON store deps
            from deepeval.key_handler import KEY_FILE_HANDLER
 
+            model_fields = type(self._s).model_fields
+            # Exclude computed fields from persistence
+
             # compute diff of changed fields
-            after = {k: getattr(self._s, k) for k in
+            after = {k: getattr(self._s, k) for k in model_fields}
 
             before_norm = {
                 k: _normalize_for_env(v) for k, v in self._before.items()
@@ -570,12 +680,21 @@ class Settings(BaseSettings):
             changed_keys = {
                 k for k in after_norm if after_norm[k] != before_norm.get(k)
             }
+            changed_keys -= self.COMPUTED_FIELDS
+
             if not changed_keys:
                 self.result = PersistResult(False, None, {})
                 return False
 
             updates = {k: after[k] for k in changed_keys}
 
+            if "LOG_LEVEL" in updates:
+                from deepeval.config.logging import (
+                    apply_deepeval_log_level,
+                )
+
+                apply_deepeval_log_level()
+
             #
             # .deepeval JSON support
             #
@@ -681,4 +800,27 @@ def get_settings() -> Settings:
     global _settings_singleton
     if _settings_singleton is None:
         _settings_singleton = Settings()
+        from deepeval.config.logging import apply_deepeval_log_level
+
+        apply_deepeval_log_level()
     return _settings_singleton
+
+
+def reset_settings(*, reload_dotenv: bool = False) -> Settings:
+    """
+    Drop the cached Settings singleton and rebuild it from the current process
+    environment.
+
+    Args:
+        reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
+            which merges .env values into os.environ (never overwriting
+            existing process env vars).
+
+    Returns:
+        The fresh Settings instance.
+    """
+    global _settings_singleton
+    if reload_dotenv:
+        autoload_dotenv()
+    _settings_singleton = None
+    return get_settings()
```
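A brief usage sketch of the new `reset_settings` helper, e.g. in a test that changes an environment variable after the singleton has already been built (it assumes the field is populated from the same-named environment variable, as is typical for pydantic `BaseSettings`; not part of the diff):

```python
import os
from deepeval.config.settings import get_settings, reset_settings

_ = get_settings()                               # singleton built with the old env
os.environ["DEEPEVAL_RETRY_MAX_ATTEMPTS"] = "3"  # change the environment afterwards

fresh = reset_settings()  # drop the cache and rebuild from os.environ;
                          # reload_dotenv=True would also re-merge .env values
assert fresh.DEEPEVAL_RETRY_MAX_ATTEMPTS == 3
```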
@@ -91,7 +91,6 @@ from deepeval.config.settings import get_settings


 logger = logging.getLogger(__name__)
-settings = get_settings()


 async def _snapshot_tasks():
@@ -100,6 +99,18 @@ async def _snapshot_tasks():
     return {t for t in asyncio.all_tasks() if t is not cur}


+def _per_task_timeout() -> float:
+    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+
+
+def _gather_timeout() -> float:
+    s = get_settings()
+    return (
+        s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
+        + s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
+    )
+
+
 ###########################################
 ### E2E Evals #############################
 ###########################################
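The two hunks above drop the module-level settings = get_settings() snapshot and add _per_task_timeout() / _gather_timeout() helpers that re-read the singleton on every call, so values changed after import (for example via reset_settings() from the settings hunks earlier) are picked up. A small sketch of the difference; the names below only mirror the helpers added in the diff:

from deepeval.config.settings import get_settings, reset_settings

frozen = get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS  # captured once, never refreshed


def per_task_timeout() -> float:
    # Re-reads the (possibly rebuilt) singleton on every call.
    return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS


reset_settings()            # rebuild Settings from the current environment
fresh = per_task_timeout()  # reflects the rebuilt singleton; `frozen` may be stale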
@@ -838,7 +849,7 @@ def execute_agentic_test_cases(
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=
+                        timeout=_per_task_timeout(),
                     )
                 )
             else:
@@ -1196,7 +1207,7 @@ async def _a_execute_agentic_test_case(
         if asyncio.iscoroutinefunction(observed_callback):
             await asyncio.wait_for(
                 observed_callback(golden.input),
-                timeout=
+                timeout=_per_task_timeout(),
             )
         else:
             observed_callback(golden.input)
@@ -1753,11 +1764,6 @@ def a_execute_agentic_test_cases_from_loop(
     _is_assert_test: bool = False,
 ) -> Iterator[TestResult]:

-    GATHER_TIMEOUT_SECONDS = (
-        settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-        + settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
-    )
-
     semaphore = asyncio.Semaphore(async_config.max_concurrent)
     original_create_task = asyncio.create_task

@@ -1772,7 +1778,7 @@ def a_execute_agentic_test_cases_from_loop(
     async def execute_callback_with_semaphore(coroutine: Awaitable):
         async with semaphore:
             return await asyncio.wait_for(
-                coroutine, timeout=
+                coroutine, timeout=_per_task_timeout()
             )

     def evaluate_test_cases(
@@ -1814,7 +1820,7 @@ def a_execute_agentic_test_cases_from_loop(
         }

     def on_task_done(t: asyncio.Task):
-        if
+        if get_settings().DEEPEVAL_DEBUG_ASYNC:
             # Using info level here to make it easy to spot these logs.
             # We are gated by DEEPEVAL_DEBUG_ASYNC
             meta = task_meta.get(t, {})
@@ -1888,7 +1894,7 @@ def a_execute_agentic_test_cases_from_loop(
             loop.run_until_complete(
                 asyncio.wait_for(
                     asyncio.gather(*created_tasks, return_exceptions=True),
-                    timeout=
+                    timeout=_gather_timeout(),
                 )
             )
         except asyncio.TimeoutError:
@@ -1903,16 +1909,13 @@ def a_execute_agentic_test_cases_from_loop(
             elapsed_time = time.perf_counter() - start_time

             # Determine if it was a per task or gather timeout based on task's elapsed time
-            if (
-                elapsed_time
-                >= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
-            ):
+            if elapsed_time >= _per_task_timeout():
                 timeout_type = "per-task"
             else:
                 timeout_type = "gather"

             logger.warning(
-                f"[deepeval] gather TIMEOUT after {
+                f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
                 f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
                 f"To give tasks more time, consider increasing "
                 f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
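The hunks above route every per-callback asyncio.wait_for through _per_task_timeout() and the surrounding asyncio.gather through _gather_timeout(), i.e. the per-task budget plus DEEPEVAL_TASK_GATHER_BUFFER_SECONDS. A self-contained sketch of that two-level layering, with made-up coroutines and numbers:

import asyncio


async def _work(i: int) -> int:
    await asyncio.sleep(0.1 * i)  # stand-in for an agent callback
    return i


async def run_all(per_task: float = 5.0, buffer: float = 1.0):
    # Inner wait_for: each task gets its own deadline (the per-task timeout).
    tasks = [
        asyncio.create_task(asyncio.wait_for(_work(i), timeout=per_task))
        for i in range(3)
    ]
    # Outer wait_for: the whole gather gets per-task + buffer (the gather timeout),
    # which is how the code above decides whether a TimeoutError was per-task or gather.
    return await asyncio.wait_for(
        asyncio.gather(*tasks, return_exceptions=True),
        timeout=per_task + buffer,
    )


if __name__ == "__main__":
    print(asyncio.run(run_all()))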
@@ -1926,7 +1929,7 @@ def a_execute_agentic_test_cases_from_loop(
                     elapsed_time,
                     meta,
                 )
-                if loop.get_debug() and
+                if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
                     frames = t.get_stack(limit=6)
                     if frames:
                         logger.info(" stack:")
@@ -1965,7 +1968,7 @@ def a_execute_agentic_test_cases_from_loop(
         if not leftovers:
             return

-        if
+        if get_settings().DEEPEVAL_DEBUG_ASYNC:
             logger.warning(
                 "[deepeval] %d stray task(s) not tracked; cancelling...",
                 len(leftovers),
@@ -1985,7 +1988,7 @@ def a_execute_agentic_test_cases_from_loop(
                 )
             except RuntimeError:
                 # If the loop is closing here, just continue
-                if
+                if get_settings().DEEPEVAL_DEBUG_ASYNC:
                     logger.warning(
                         "[deepeval] failed to drain stray tasks because loop is closing"
                     )
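The remaining execute.py hunks gate the extra async diagnostics on get_settings().DEEPEVAL_DEBUG_ASYNC read at check time. A sketch of how the diagnostics could be switched on, assuming the field is populated from an environment variable of the same name (the diff only shows it being read):

import asyncio
import logging
import os

# Assumption: DEEPEVAL_DEBUG_ASYNC is backed by this env var (pydantic BaseSettings style).
os.environ["DEEPEVAL_DEBUG_ASYNC"] = "1"

logging.basicConfig(level=logging.INFO)  # the gated messages log at info/warning level

loop = asyncio.new_event_loop()
loop.set_debug(True)  # required for the `loop.get_debug() and ...` branch to dump task stacks

The hunks below are from deepeval/metrics/g_eval/g_eval.py.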
@@ -1,5 +1,7 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

+import asyncio
+
 from typing import Optional, List, Tuple, Union, Type
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
@@ -16,7 +18,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.g_eval
+from deepeval.metrics.g_eval import schema as gschema
 from deepeval.metrics.g_eval.utils import (
     Rubric,
     construct_g_eval_params_string,
@@ -29,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
     number_evaluation_steps,
     get_score_range,
 )
+from deepeval.config.settings import get_settings


 class GEval(BaseMetric):
@@ -81,12 +84,16 @@ class GEval(BaseMetric):
     ):
         if self.async_mode:
             loop = get_or_create_event_loop()
+            coro = self.a_measure(
+                test_case,
+                _show_indicator=False,
+                _in_component=_in_component,
+                _additional_context=_additional_context,
+            )
             loop.run_until_complete(
-
-
-
-                    _in_component=_in_component,
-                    _additional_context=_additional_context,
+                asyncio.wait_for(
+                    coro,
+                    timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
+                )
                 )
             )
         else:
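In the measure() hunk above, the synchronous path now builds the a_measure() coroutine first and bounds it with the shared per-task timeout. A stand-alone sketch of that flow; it uses a plain asyncio.new_event_loop() instead of deepeval's internal get_or_create_event_loop(), and metric / test_case are placeholders for a configured GEval instance and its test case:

import asyncio

from deepeval.config.settings import get_settings


def run_measure_sync(metric, test_case):
    # Build the coroutine, then cap its runtime with DEEPEVAL_PER_TASK_TIMEOUT_SECONDS.
    loop = asyncio.new_event_loop()
    try:
        coro = metric.a_measure(test_case, _show_indicator=False)
        return loop.run_until_complete(
            asyncio.wait_for(
                coro,
                timeout=get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS,
            )
        )
    finally:
        loop.close()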
@@ -177,7 +184,9 @@ class GEval(BaseMetric):
             return data["steps"]
         else:
             try:
-                res: Steps = await self.model.a_generate(
+                res: gschema.Steps = await self.model.a_generate(
+                    prompt, schema=gschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = await self.model.a_generate(prompt)
@@ -201,7 +210,9 @@ class GEval(BaseMetric):
             return data["steps"]
         else:
             try:
-                res: Steps = self.model.generate(
+                res: gschema.Steps = self.model.generate(
+                    prompt, schema=gschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = self.model.generate(prompt)
@@ -264,7 +275,7 @@ class GEval(BaseMetric):
                         score, res
                     )
                     return weighted_summed_score, reason
-                except:
+                except (KeyError, AttributeError, TypeError, ValueError):
                     return score, reason
             except (
                 AttributeError
@@ -276,8 +287,8 @@ class GEval(BaseMetric):
             return data["score"], data["reason"]
         else:
             try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
+                res: gschema.ReasonScore = await self.model.a_generate(
+                    prompt, schema=gschema.ReasonScore
                 )
                 return res.score, res.reason
             except TypeError:
@@ -338,7 +349,7 @@ class GEval(BaseMetric):
                         score, res
                     )
                     return weighted_summed_score, reason
-                except:
+                except (KeyError, AttributeError, TypeError, ValueError):
                     return score, reason
             except AttributeError:
                 # This catches the case where a_generate_raw_response doesn't exist.
@@ -349,8 +360,8 @@ class GEval(BaseMetric):
             return data["score"], data["reason"]
         else:
             try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
+                res: gschema.ReasonScore = self.model.generate(
+                    prompt, schema=gschema.ReasonScore
                 )
                 return res.score, res.reason
             except TypeError:
@@ -364,7 +375,7 @@ class GEval(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
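The final hunks replace bare except: clauses with explicit exception tuples. A bare except also swallows KeyboardInterrupt and SystemExit, so a stuck evaluation could not be interrupted cleanly; the narrowed clauses keep only the expected data-shape failures. A small generic illustration (the function and payload are hypothetical, not part of deepeval):

def parse_score(payload: dict) -> float:
    # Only malformed data falls back to a default; Ctrl-C (KeyboardInterrupt) and
    # interpreter shutdown (SystemExit) still propagate, unlike with a bare `except:`.
    try:
        return float(payload["score"])
    except (KeyError, TypeError, ValueError):
        return 0.0


print(parse_score({"score": "0.8"}))  # 0.8
print(parse_score({}))                # 0.0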