deepeval 3.6.4__tar.gz → 3.6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepeval-3.6.4 → deepeval-3.6.6}/PKG-INFO +1 -1
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/__init__.py +42 -10
- deepeval-3.6.6/deepeval/_version.py +1 -0
- deepeval-3.6.6/deepeval/config/logging.py +33 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/config/settings.py +167 -12
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/dataset.py +8 -2
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/evaluate.py +8 -2
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/execute.py +28 -30
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/types.py +4 -1
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/utils.py +46 -29
- deepeval-3.6.6/deepeval/integrations/crewai/__init__.py +3 -0
- deepeval-3.6.6/deepeval/integrations/crewai/handler.py +196 -0
- deepeval-3.6.6/deepeval/integrations/crewai/wrapper.py +87 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/faithfulness/faithfulness.py +8 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/g_eval/g_eval.py +26 -15
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/retry_policy.py +202 -11
- deepeval-3.6.6/deepeval/synthesizer/chunking/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_run/__init__.py +2 -1
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_run/api.py +1 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_run/test_run.py +85 -9
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/__init__.py +2 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/otel/exporter.py +0 -6
- deepeval-3.6.6/deepeval/tracing/otel/test_exporter.py +35 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/otel/utils.py +57 -7
- deepeval-3.6.6/deepeval/tracing/trace_context.py +14 -0
- deepeval-3.6.6/deepeval/tracing/trace_test_manager.py +19 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/tracing.py +7 -6
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/utils.py +2 -86
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/utils.py +149 -1
- {deepeval-3.6.4 → deepeval-3.6.6}/pyproject.toml +1 -1
- deepeval-3.6.4/deepeval/_version.py +0 -1
- deepeval-3.6.4/deepeval/integrations/crewai/__init__.py +0 -4
- deepeval-3.6.4/deepeval/integrations/crewai/agent.py +0 -98
- deepeval-3.6.4/deepeval/integrations/crewai/handler.py +0 -124
- deepeval-3.6.4/deepeval/integrations/crewai/patch.py +0 -41
- {deepeval-3.6.4 → deepeval-3.6.6}/LICENSE.md +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/README.md +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/annotation/annotation.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/annotation/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/arc/arc.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/base_benchmark.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bbq/bbq.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/drop/drop.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/lambada/lambada.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/mmlu/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/results.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/squad/squad.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/dotenv_handler.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/main.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/server.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/test.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/types.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/cli/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/confident/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/confident/types.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/config/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/config/settings_manager.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/config/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/constants.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/contextvars.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/golden.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/test_run_tracer.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/types.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/dataset/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/errors.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/compare.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/evaluate/configs.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/langchain/callback.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/langchain/patch.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/langchain/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/llama_index/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/llama_index/agent/patched.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/llama_index/handler.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/llama_index/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/agent.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/integrations/pydantic_ai/otel.py +0 -0
- /deepeval-3.6.4/deepeval/metrics/argument_correctness/__init__.py → /deepeval-3.6.6/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/key_handler.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/answer_relevancy/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/arena_g_eval/utils.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/conversation_completeness → deepeval-3.6.6/deepeval/metrics/argument_correctness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/argument_correctness/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/base_metric.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/bias/bias.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/bias/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_precision/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_recall/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/contextual_relevancy/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/conversational_g_eval → deepeval-3.6.6/deepeval/metrics/conversation_completeness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversation_completeness/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/nodes.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_dag/templates.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/json_correctness → deepeval-3.6.6/deepeval/metrics/conversational_g_eval}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/conversational_g_eval/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/dag.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/nodes.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/templates.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/dag/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/faithfulness/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/faithfulness/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/g_eval/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/g_eval/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/g_eval/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/hallucination/hallucination.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/hallucination/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/indicator.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/knowledge_retention → deepeval-3.6.6/deepeval/metrics/json_correctness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/json_correctness/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/mcp → deepeval-3.6.6/deepeval/metrics/knowledge_retention}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/knowledge_retention/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/mcp_use_metric → deepeval-3.6.6/deepeval/metrics/mcp}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_coherence → deepeval-3.6.6/deepeval/metrics/mcp_use_metric}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/mcp_use_metric/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/misuse/misuse.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/misuse/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_editing → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/image_coherence}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_helpfulness → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/image_editing}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/image_reference → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/image_helpfulness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/image_reference}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_g_eval → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_faithfulness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_g_eval}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/multimodal_metrics/text_to_image → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/prompt_alignment → deepeval-3.6.6/deepeval/metrics/multimodal_metrics/text_to_image}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/non_advice/non_advice.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/non_advice/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/pii_leakage/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/role_adherence → deepeval-3.6.6/deepeval/metrics/prompt_alignment}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/prompt_alignment/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/ragas.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/task_completion → deepeval-3.6.6/deepeval/metrics/role_adherence}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_adherence/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_violation/role_violation.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/role_violation/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/summarization/summarization.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/summarization/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/tool_correctness → deepeval-3.6.6/deepeval/metrics/task_completion}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/task_completion/task_completion.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.6.4/deepeval/metrics/turn_relevancy → deepeval-3.6.6/deepeval/metrics/tool_correctness}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/toxicity/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/toxicity/toxicity.py +0 -0
- {deepeval-3.6.4/deepeval/plugins → deepeval-3.6.6/deepeval/metrics/turn_relevancy}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/metrics/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/base_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/embedding_models/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/amazon_bedrock_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/anthropic_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/azure_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/deepseek_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/gemini_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/grok_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/kimi_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/litellm_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/local_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/ollama_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/openai_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/llms/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/mlllms/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/mlllms/gemini_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/mlllms/ollama_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/mlllms/openai_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/models/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai/extractors.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai/patch.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai_agents/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai_agents/agent.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai_agents/callback_handler.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai_agents/extractors.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai_agents/patch.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/openai_agents/runner.py +0 -0
- {deepeval-3.6.4/deepeval/synthesizer/chunking → deepeval-3.6.6/deepeval/plugins}/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/progress_context.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/prompt/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/prompt/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/prompt/prompt.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/prompt/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/py.typed +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/simulator/conversation_simulator.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/simulator/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/singleton.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/chunking/context_generator.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/config.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/schema.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/synthesizer.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/templates/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/templates/template.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/templates/template_extraction.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/templates/template_prompt.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/telemetry.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/arena_test_case.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/conversational_test_case.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/llm_test_case.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/mllm_test_case.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_case/utils.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_run/cache.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/test_run/hyperparameters.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/context.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/offline_evals/api.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/offline_evals/thread.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/patchers.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/perf_epoch_bridge.py +0 -0
- {deepeval-3.6.4 → deepeval-3.6.6}/deepeval/tracing/types.py +0 -0
|
@@ -1,24 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
1
4
|
import os
|
|
2
|
-
import warnings
|
|
3
5
|
import re
|
|
6
|
+
import warnings
|
|
4
7
|
|
|
5
|
-
# load environment variables before other imports
|
|
8
|
+
# IMPORTANT: load environment variables before other imports
|
|
6
9
|
from deepeval.config.settings import autoload_dotenv, get_settings
|
|
7
10
|
|
|
11
|
+
logging.getLogger("deepeval").addHandler(logging.NullHandler())
|
|
8
12
|
autoload_dotenv()
|
|
9
13
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
|
|
15
|
+
def _expose_public_api() -> None:
|
|
16
|
+
# All other imports must happen after env is loaded
|
|
17
|
+
# Do not do this at module level or ruff will complain with E402
|
|
18
|
+
global __version__, evaluate, assert_test, compare
|
|
19
|
+
global on_test_run_end, log_hyperparameters, login, telemetry
|
|
20
|
+
|
|
21
|
+
from ._version import __version__ as _version
|
|
22
|
+
from deepeval.evaluate import (
|
|
23
|
+
evaluate as _evaluate,
|
|
24
|
+
assert_test as _assert_test,
|
|
25
|
+
)
|
|
26
|
+
from deepeval.evaluate.compare import compare as _compare
|
|
27
|
+
from deepeval.test_run import (
|
|
28
|
+
on_test_run_end as _on_end,
|
|
29
|
+
log_hyperparameters as _log_hparams,
|
|
30
|
+
)
|
|
31
|
+
from deepeval.utils import login as _login
|
|
32
|
+
import deepeval.telemetry as _telemetry
|
|
33
|
+
|
|
34
|
+
__version__ = _version
|
|
35
|
+
evaluate = _evaluate
|
|
36
|
+
assert_test = _assert_test
|
|
37
|
+
compare = _compare
|
|
38
|
+
on_test_run_end = _on_end
|
|
39
|
+
log_hyperparameters = _log_hparams
|
|
40
|
+
login = _login
|
|
41
|
+
telemetry = _telemetry
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
_expose_public_api()
|
|
16
45
|
|
|
17
46
|
|
|
18
47
|
settings = get_settings()
|
|
48
|
+
|
|
19
49
|
if not settings.DEEPEVAL_GRPC_LOGGING:
|
|
20
|
-
os.
|
|
21
|
-
|
|
50
|
+
if os.getenv("GRPC_VERBOSITY") is None:
|
|
51
|
+
os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
|
|
52
|
+
if os.getenv("GRPC_TRACE") is None:
|
|
53
|
+
os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""
|
|
22
54
|
|
|
23
55
|
|
|
24
56
|
__all__ = [
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__: str = "3.6.6"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal logging configuration helpers for DeepEval.
|
|
3
|
+
|
|
4
|
+
This module centralizes how the library-level logger ("deepeval") is configured. We
|
|
5
|
+
intentionally keep configuration lightweight so application code retains control
|
|
6
|
+
over handlers and formatters.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from deepeval.config.settings import get_settings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def apply_deepeval_log_level() -> None:
|
|
14
|
+
"""
|
|
15
|
+
Apply DeepEval's current log level to the package logger.
|
|
16
|
+
|
|
17
|
+
This function reads `LOG_LEVEL` from `deepeval.config.settings.get_settings()`
|
|
18
|
+
and sets the level of the `"deepeval"` logger accordingly. If `LOG_LEVEL` is
|
|
19
|
+
unset (None), INFO is used as a default. The logger's `propagate` flag is set
|
|
20
|
+
to True so records bubble up to the application's handlers. DeepEval does not
|
|
21
|
+
install its own handlers here (a NullHandler is attached in `__init__.py`).
|
|
22
|
+
|
|
23
|
+
The function is idempotent and safe to call multiple times. It is invoked
|
|
24
|
+
automatically when settings are first constructed and whenever `LOG_LEVEL`
|
|
25
|
+
is changed via `settings.edit`.
|
|
26
|
+
"""
|
|
27
|
+
settings = get_settings()
|
|
28
|
+
log_level = settings.LOG_LEVEL
|
|
29
|
+
logging.getLogger("deepeval").setLevel(
|
|
30
|
+
log_level if log_level is not None else logging.INFO
|
|
31
|
+
)
|
|
32
|
+
# ensure we bubble up to app handlers
|
|
33
|
+
logging.getLogger("deepeval").propagate = True
|
|
@@ -10,12 +10,20 @@ Central config for DeepEval.
|
|
|
10
10
|
"""
|
|
11
11
|
|
|
12
12
|
import logging
|
|
13
|
+
import math
|
|
13
14
|
import os
|
|
14
15
|
import re
|
|
15
16
|
|
|
16
17
|
from dotenv import dotenv_values
|
|
17
18
|
from pathlib import Path
|
|
18
|
-
from pydantic import
|
|
19
|
+
from pydantic import (
|
|
20
|
+
AnyUrl,
|
|
21
|
+
computed_field,
|
|
22
|
+
confloat,
|
|
23
|
+
conint,
|
|
24
|
+
field_validator,
|
|
25
|
+
SecretStr,
|
|
26
|
+
)
|
|
19
27
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
20
28
|
from typing import Any, Dict, List, Optional, NamedTuple
|
|
21
29
|
|
|
@@ -155,7 +163,7 @@ class Settings(BaseSettings):
|
|
|
155
163
|
#
|
|
156
164
|
|
|
157
165
|
APP_ENV: str = "dev"
|
|
158
|
-
LOG_LEVEL:
|
|
166
|
+
LOG_LEVEL: Optional[int] = None
|
|
159
167
|
PYTHONPATH: str = "."
|
|
160
168
|
CONFIDENT_REGION: Optional[str] = None
|
|
161
169
|
CONFIDENT_OPEN_BROWSER: Optional[bool] = True
|
|
@@ -180,6 +188,19 @@ class Settings(BaseSettings):
|
|
|
180
188
|
# into this directory. The directory will be created on demand.
|
|
181
189
|
DEEPEVAL_RESULTS_FOLDER: Optional[Path] = None
|
|
182
190
|
|
|
191
|
+
# Display / Truncation
|
|
192
|
+
DEEPEVAL_MAXLEN_TINY: Optional[int] = 40
|
|
193
|
+
DEEPEVAL_MAXLEN_SHORT: Optional[int] = 60
|
|
194
|
+
DEEPEVAL_MAXLEN_MEDIUM: Optional[int] = 120
|
|
195
|
+
DEEPEVAL_MAXLEN_LONG: Optional[int] = 240
|
|
196
|
+
|
|
197
|
+
# If set, this overrides the default max_len used by deepeval/utils shorten
|
|
198
|
+
# falls back to DEEPEVAL_MAXLEN_LONG when None.
|
|
199
|
+
DEEPEVAL_SHORTEN_DEFAULT_MAXLEN: Optional[int] = None
|
|
200
|
+
|
|
201
|
+
# Optional global suffix (keeps your "..." default).
|
|
202
|
+
DEEPEVAL_SHORTEN_SUFFIX: Optional[str] = "..."
|
|
203
|
+
|
|
183
204
|
#
|
|
184
205
|
# GPU and perf toggles
|
|
185
206
|
#
|
|
@@ -274,9 +295,33 @@ class Settings(BaseSettings):
|
|
|
274
295
|
#
|
|
275
296
|
# Retry Policy
|
|
276
297
|
#
|
|
277
|
-
|
|
278
|
-
|
|
298
|
+
# Controls how Tenacity retries provider calls when the SDK isn't doing its own retries.
|
|
299
|
+
# Key concepts:
|
|
300
|
+
# - attempts count includes the first call. e.g. 1 = no retries, 2 = one retry.
|
|
301
|
+
# - backoff sleeps follow exponential growth with a cap, plus jitter. Expected jitter
|
|
302
|
+
# contribution is ~ JITTER/2 per sleep.
|
|
303
|
+
# - logging levels are looked up dynamically each attempt, so if you change LOG_LEVEL at runtime,
|
|
304
|
+
# the retry loggers will honor it without restart.
|
|
305
|
+
DEEPEVAL_SDK_RETRY_PROVIDERS: Optional[List[str]] = (
|
|
306
|
+
None # ["*"] to delegate all retries to SDKs
|
|
307
|
+
)
|
|
308
|
+
DEEPEVAL_RETRY_BEFORE_LOG_LEVEL: Optional[int] = (
|
|
309
|
+
None # default is LOG_LEVEL if set, else INFO
|
|
310
|
+
)
|
|
279
311
|
DEEPEVAL_RETRY_AFTER_LOG_LEVEL: Optional[int] = None # default -> ERROR
|
|
312
|
+
DEEPEVAL_RETRY_MAX_ATTEMPTS: conint(ge=1) = (
|
|
313
|
+
2 # attempts = first try + retries
|
|
314
|
+
)
|
|
315
|
+
DEEPEVAL_RETRY_INITIAL_SECONDS: confloat(ge=0) = (
|
|
316
|
+
1.0 # first sleep before retry, if any
|
|
317
|
+
)
|
|
318
|
+
DEEPEVAL_RETRY_EXP_BASE: confloat(ge=1) = (
|
|
319
|
+
2.0 # exponential growth factor for sleeps
|
|
320
|
+
)
|
|
321
|
+
DEEPEVAL_RETRY_JITTER: confloat(ge=0) = 2.0 # uniform jitter
|
|
322
|
+
DEEPEVAL_RETRY_CAP_SECONDS: confloat(ge=0) = (
|
|
323
|
+
5.0 # cap for each backoff sleep
|
|
324
|
+
)
|
|
280
325
|
|
|
281
326
|
#
|
|
282
327
|
# Telemetry and Debug
|
|
@@ -303,19 +348,87 @@ class Settings(BaseSettings):
|
|
|
303
348
|
#
|
|
304
349
|
MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
|
|
305
350
|
MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
|
|
351
|
+
# DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: per-attempt timeout for provider calls enforced by our retry decorator.
|
|
352
|
+
# This timeout interacts with retry policy and the task level budget (DEEPEVAL_PER_TASK_TIMEOUT_SECONDS) below.
|
|
353
|
+
# If you leave this at 0/None, the computed outer budget defaults to 180s.
|
|
354
|
+
DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS: Optional[confloat(ge=0)] = (
|
|
355
|
+
None # per-attempt timeout. Set 0/None to disable
|
|
356
|
+
)
|
|
306
357
|
|
|
307
358
|
#
|
|
308
359
|
# Async Task Configuration
|
|
309
360
|
#
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
313
|
-
|
|
314
|
-
|
|
361
|
+
DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
|
|
362
|
+
DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
|
|
363
|
+
# DEEPEVAL_PER_TASK_TIMEOUT_SECONDS is the outer time budget for one metric/task.
|
|
364
|
+
# It is computed from per-attempt timeout + retries/backoff unless you explicitly override it.
|
|
365
|
+
# - OVERRIDE = None or 0 -> auto compute as:
|
|
366
|
+
# attempts * per_attempt_timeout + sum(backoff_sleeps) + ~jitter/2 per sleep + 1s safety
|
|
367
|
+
# (If per_attempt_timeout is 0/None, the auto outer budget defaults to 180s.)
|
|
368
|
+
# - OVERRIDE > 0 -> use that exact value. A warning is logged if it is likely too small
|
|
369
|
+
# to permit the configured attempts/backoff.
|
|
370
|
+
#
|
|
371
|
+
# Tip:
|
|
372
|
+
# Most users only need to set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS and DEEPEVAL_RETRY_MAX_ATTEMPTS.
|
|
373
|
+
# Leave the outer budget on auto unless you have very strict SLAs.
|
|
374
|
+
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[conint(ge=0)] = None
|
|
315
375
|
|
|
316
376
|
# Buffer time for gathering results from all tasks, added to the longest task duration
|
|
317
377
|
# Increase if many tasks are running concurrently
|
|
318
|
-
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS:
|
|
378
|
+
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = 60
|
|
379
|
+
|
|
380
|
+
###################
|
|
381
|
+
# Computed Fields #
|
|
382
|
+
###################
|
|
383
|
+
|
|
384
|
+
def _calc_auto_outer_timeout(self) -> int:
|
|
385
|
+
"""Compute outer budget from per-attempt timeout + retries/backoff.
|
|
386
|
+
Never reference the computed property itself here.
|
|
387
|
+
"""
|
|
388
|
+
attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
|
|
389
|
+
timeout_seconds = float(self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
|
|
390
|
+
if timeout_seconds <= 0:
|
|
391
|
+
# No per-attempt timeout set -> default outer budget
|
|
392
|
+
return 180
|
|
393
|
+
|
|
394
|
+
sleeps = max(0, attempts - 1)
|
|
395
|
+
cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
|
|
396
|
+
cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
|
|
397
|
+
base = float(self.DEEPEVAL_RETRY_EXP_BASE)
|
|
398
|
+
jitter = float(self.DEEPEVAL_RETRY_JITTER)
|
|
399
|
+
|
|
400
|
+
backoff = 0.0
|
|
401
|
+
for _ in range(sleeps):
|
|
402
|
+
backoff += min(cap, cur)
|
|
403
|
+
cur *= base
|
|
404
|
+
backoff += sleeps * (jitter / 2.0) # expected jitter
|
|
405
|
+
|
|
406
|
+
safety_overhead = 1.0
|
|
407
|
+
return int(
|
|
408
|
+
math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
@computed_field
|
|
412
|
+
@property
|
|
413
|
+
def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> int:
|
|
414
|
+
"""If OVERRIDE is set (nonzero), return it; else return the derived budget."""
|
|
415
|
+
outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
416
|
+
if outer not in (None, 0):
|
|
417
|
+
# Warn if user-provided outer is likely to truncate retries
|
|
418
|
+
if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
|
|
419
|
+
min_needed = self._calc_auto_outer_timeout()
|
|
420
|
+
if int(outer) < min_needed:
|
|
421
|
+
if self.DEEPEVAL_VERBOSE_MODE:
|
|
422
|
+
logger.warning(
|
|
423
|
+
"Metric timeout (outer=%ss) is less than attempts × per-attempt "
|
|
424
|
+
"timeout + backoff (≈%ss). Retries may be cut short.",
|
|
425
|
+
int(outer),
|
|
426
|
+
min_needed,
|
|
427
|
+
)
|
|
428
|
+
return int(outer)
|
|
429
|
+
|
|
430
|
+
# Auto mode
|
|
431
|
+
return self._calc_auto_outer_timeout()
|
|
319
432
|
|
|
320
433
|
##############
|
|
321
434
|
# Validators #
|
|
@@ -461,7 +574,9 @@ class Settings(BaseSettings):
|
|
|
461
574
|
if s in SUPPORTED_PROVIDER_SLUGS:
|
|
462
575
|
normalized.append(s)
|
|
463
576
|
else:
|
|
464
|
-
if
|
|
577
|
+
if parse_bool(
|
|
578
|
+
os.getenv("DEEPEVAL_VERBOSE_MODE"), default=False
|
|
579
|
+
):
|
|
465
580
|
logger.warning("Unknown provider slug %r dropped", item)
|
|
466
581
|
|
|
467
582
|
if star:
|
|
@@ -474,6 +589,7 @@ class Settings(BaseSettings):
|
|
|
474
589
|
@field_validator(
|
|
475
590
|
"DEEPEVAL_RETRY_BEFORE_LOG_LEVEL",
|
|
476
591
|
"DEEPEVAL_RETRY_AFTER_LOG_LEVEL",
|
|
592
|
+
"LOG_LEVEL",
|
|
477
593
|
mode="before",
|
|
478
594
|
)
|
|
479
595
|
@classmethod
|
|
@@ -511,6 +627,10 @@ class Settings(BaseSettings):
|
|
|
511
627
|
# Persistence support #
|
|
512
628
|
#######################
|
|
513
629
|
class _SettingsEditCtx:
|
|
630
|
+
COMPUTED_FIELDS: frozenset[str] = frozenset(
|
|
631
|
+
{"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS"}
|
|
632
|
+
)
|
|
633
|
+
|
|
514
634
|
def __init__(
|
|
515
635
|
self,
|
|
516
636
|
settings: "Settings",
|
|
@@ -546,8 +666,11 @@ class Settings(BaseSettings):
|
|
|
546
666
|
# lazy import legacy JSON store deps
|
|
547
667
|
from deepeval.key_handler import KEY_FILE_HANDLER
|
|
548
668
|
|
|
669
|
+
model_fields = type(self._s).model_fields
|
|
670
|
+
# Exclude computed fields from persistence
|
|
671
|
+
|
|
549
672
|
# compute diff of changed fields
|
|
550
|
-
after = {k: getattr(self._s, k) for k in
|
|
673
|
+
after = {k: getattr(self._s, k) for k in model_fields}
|
|
551
674
|
|
|
552
675
|
before_norm = {
|
|
553
676
|
k: _normalize_for_env(v) for k, v in self._before.items()
|
|
@@ -557,12 +680,21 @@ class Settings(BaseSettings):
|
|
|
557
680
|
changed_keys = {
|
|
558
681
|
k for k in after_norm if after_norm[k] != before_norm.get(k)
|
|
559
682
|
}
|
|
683
|
+
changed_keys -= self.COMPUTED_FIELDS
|
|
684
|
+
|
|
560
685
|
if not changed_keys:
|
|
561
686
|
self.result = PersistResult(False, None, {})
|
|
562
687
|
return False
|
|
563
688
|
|
|
564
689
|
updates = {k: after[k] for k in changed_keys}
|
|
565
690
|
|
|
691
|
+
if "LOG_LEVEL" in updates:
|
|
692
|
+
from deepeval.config.logging import (
|
|
693
|
+
apply_deepeval_log_level,
|
|
694
|
+
)
|
|
695
|
+
|
|
696
|
+
apply_deepeval_log_level()
|
|
697
|
+
|
|
566
698
|
#
|
|
567
699
|
# .deepeval JSON support
|
|
568
700
|
#
|
|
@@ -668,4 +800,27 @@ def get_settings() -> Settings:
|
|
|
668
800
|
global _settings_singleton
|
|
669
801
|
if _settings_singleton is None:
|
|
670
802
|
_settings_singleton = Settings()
|
|
803
|
+
from deepeval.config.logging import apply_deepeval_log_level
|
|
804
|
+
|
|
805
|
+
apply_deepeval_log_level()
|
|
671
806
|
return _settings_singleton
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def reset_settings(*, reload_dotenv: bool = False) -> Settings:
|
|
810
|
+
"""
|
|
811
|
+
Drop the cached Settings singleton and rebuild it from the current process
|
|
812
|
+
environment.
|
|
813
|
+
|
|
814
|
+
Args:
|
|
815
|
+
reload_dotenv: When True, call `autoload_dotenv()` before re-instantiating,
|
|
816
|
+
which merges .env values into os.environ (never overwriting
|
|
817
|
+
existing process env vars).
|
|
818
|
+
|
|
819
|
+
Returns:
|
|
820
|
+
The fresh Settings instance.
|
|
821
|
+
"""
|
|
822
|
+
global _settings_singleton
|
|
823
|
+
if reload_dotenv:
|
|
824
|
+
autoload_dotenv()
|
|
825
|
+
_settings_singleton = None
|
|
826
|
+
return get_settings()
|
|
@@ -1266,11 +1266,17 @@ class EvaluationDataset:
|
|
|
1266
1266
|
detach(ctx_token)
|
|
1267
1267
|
|
|
1268
1268
|
else:
|
|
1269
|
-
|
|
1269
|
+
res = global_test_run_manager.wrap_up_test_run(
|
|
1270
1270
|
run_duration, display_table=False
|
|
1271
1271
|
)
|
|
1272
|
+
if isinstance(res, tuple):
|
|
1273
|
+
confident_link, test_run_id = res
|
|
1274
|
+
else:
|
|
1275
|
+
confident_link = test_run_id = None
|
|
1272
1276
|
return EvaluationResult(
|
|
1273
|
-
test_results=test_results,
|
|
1277
|
+
test_results=test_results,
|
|
1278
|
+
confident_link=confident_link,
|
|
1279
|
+
test_run_id=test_run_id,
|
|
1274
1280
|
)
|
|
1275
1281
|
|
|
1276
1282
|
def evaluate(self, task: Task):
|
|
@@ -268,11 +268,17 @@ def evaluate(
|
|
|
268
268
|
test_run = global_test_run_manager.get_test_run()
|
|
269
269
|
test_run.hyperparameters = process_hyperparameters(hyperparameters)
|
|
270
270
|
global_test_run_manager.save_test_run(TEMP_FILE_PATH)
|
|
271
|
-
|
|
271
|
+
res = global_test_run_manager.wrap_up_test_run(
|
|
272
272
|
run_duration, display_table=False
|
|
273
273
|
)
|
|
274
|
+
if isinstance(res, tuple):
|
|
275
|
+
confident_link, test_run_id = res
|
|
276
|
+
else:
|
|
277
|
+
confident_link = test_run_id = None
|
|
274
278
|
return EvaluationResult(
|
|
275
|
-
test_results=test_results,
|
|
279
|
+
test_results=test_results,
|
|
280
|
+
confident_link=confident_link,
|
|
281
|
+
test_run_id=test_run_id,
|
|
276
282
|
)
|
|
277
283
|
elif metric_collection:
|
|
278
284
|
api = Api()
|
|
@@ -45,9 +45,7 @@ from deepeval.dataset import Golden
|
|
|
45
45
|
from deepeval.contextvars import set_current_golden, reset_current_golden
|
|
46
46
|
from deepeval.errors import MissingTestCaseParamsError
|
|
47
47
|
from deepeval.metrics.utils import copy_metrics
|
|
48
|
-
from deepeval.utils import
|
|
49
|
-
get_or_create_event_loop,
|
|
50
|
-
)
|
|
48
|
+
from deepeval.utils import get_or_create_event_loop, shorten, len_medium
|
|
51
49
|
from deepeval.telemetry import capture_evaluation_run
|
|
52
50
|
from deepeval.metrics import (
|
|
53
51
|
BaseMetric,
|
|
@@ -93,7 +91,6 @@ from deepeval.config.settings import get_settings
|
|
|
93
91
|
|
|
94
92
|
|
|
95
93
|
logger = logging.getLogger(__name__)
|
|
96
|
-
settings = get_settings()
|
|
97
94
|
|
|
98
95
|
|
|
99
96
|
async def _snapshot_tasks():
|
|
@@ -102,6 +99,18 @@ async def _snapshot_tasks():
|
|
|
102
99
|
return {t for t in asyncio.all_tasks() if t is not cur}
|
|
103
100
|
|
|
104
101
|
|
|
102
|
+
def _per_task_timeout() -> float:
|
|
103
|
+
return get_settings().DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _gather_timeout() -> float:
|
|
107
|
+
s = get_settings()
|
|
108
|
+
return (
|
|
109
|
+
s.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
110
|
+
+ s.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
|
|
105
114
|
###########################################
|
|
106
115
|
### E2E Evals #############################
|
|
107
116
|
###########################################
|
|
@@ -840,7 +849,7 @@ def execute_agentic_test_cases(
|
|
|
840
849
|
loop.run_until_complete(
|
|
841
850
|
asyncio.wait_for(
|
|
842
851
|
coro,
|
|
843
|
-
timeout=
|
|
852
|
+
timeout=_per_task_timeout(),
|
|
844
853
|
)
|
|
845
854
|
)
|
|
846
855
|
else:
|
|
@@ -1198,7 +1207,7 @@ async def _a_execute_agentic_test_case(
|
|
|
1198
1207
|
if asyncio.iscoroutinefunction(observed_callback):
|
|
1199
1208
|
await asyncio.wait_for(
|
|
1200
1209
|
observed_callback(golden.input),
|
|
1201
|
-
timeout=
|
|
1210
|
+
timeout=_per_task_timeout(),
|
|
1202
1211
|
)
|
|
1203
1212
|
else:
|
|
1204
1213
|
observed_callback(golden.input)
|
|
@@ -1755,11 +1764,6 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1755
1764
|
_is_assert_test: bool = False,
|
|
1756
1765
|
) -> Iterator[TestResult]:
|
|
1757
1766
|
|
|
1758
|
-
GATHER_TIMEOUT_SECONDS = (
|
|
1759
|
-
settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
1760
|
-
+ settings.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS
|
|
1761
|
-
)
|
|
1762
|
-
|
|
1763
1767
|
semaphore = asyncio.Semaphore(async_config.max_concurrent)
|
|
1764
1768
|
original_create_task = asyncio.create_task
|
|
1765
1769
|
|
|
@@ -1774,7 +1778,7 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1774
1778
|
async def execute_callback_with_semaphore(coroutine: Awaitable):
|
|
1775
1779
|
async with semaphore:
|
|
1776
1780
|
return await asyncio.wait_for(
|
|
1777
|
-
coroutine, timeout=
|
|
1781
|
+
coroutine, timeout=_per_task_timeout()
|
|
1778
1782
|
)
|
|
1779
1783
|
|
|
1780
1784
|
def evaluate_test_cases(
|
|
@@ -1802,14 +1806,11 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1802
1806
|
)
|
|
1803
1807
|
|
|
1804
1808
|
# record metadata for debugging
|
|
1805
|
-
MAX_META_INPUT_LENGTH = 120
|
|
1806
1809
|
started = time.perf_counter()
|
|
1807
|
-
short_input = current_golden_ctx
|
|
1808
|
-
if (
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
):
|
|
1812
|
-
short_input = short_input[:MAX_META_INPUT_LENGTH] + "…"
|
|
1810
|
+
short_input = current_golden_ctx.get("input")
|
|
1811
|
+
if isinstance(short_input, str):
|
|
1812
|
+
short_input = shorten(short_input, len_medium())
|
|
1813
|
+
|
|
1813
1814
|
task_meta[task] = {
|
|
1814
1815
|
"golden_index": current_golden_ctx["index"],
|
|
1815
1816
|
"golden_name": current_golden_ctx["name"],
|
|
@@ -1819,7 +1820,7 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1819
1820
|
}
|
|
1820
1821
|
|
|
1821
1822
|
def on_task_done(t: asyncio.Task):
|
|
1822
|
-
if
|
|
1823
|
+
if get_settings().DEEPEVAL_DEBUG_ASYNC:
|
|
1823
1824
|
# Using info level here to make it easy to spot these logs.
|
|
1824
1825
|
# We are gated by DEEPEVAL_DEBUG_ASYNC
|
|
1825
1826
|
meta = task_meta.get(t, {})
|
|
@@ -1893,7 +1894,7 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1893
1894
|
loop.run_until_complete(
|
|
1894
1895
|
asyncio.wait_for(
|
|
1895
1896
|
asyncio.gather(*created_tasks, return_exceptions=True),
|
|
1896
|
-
timeout=
|
|
1897
|
+
timeout=_gather_timeout(),
|
|
1897
1898
|
)
|
|
1898
1899
|
)
|
|
1899
1900
|
except asyncio.TimeoutError:
|
|
@@ -1908,16 +1909,13 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1908
1909
|
elapsed_time = time.perf_counter() - start_time
|
|
1909
1910
|
|
|
1910
1911
|
# Determine if it was a per task or gather timeout based on task's elapsed time
|
|
1911
|
-
if (
|
|
1912
|
-
elapsed_time
|
|
1913
|
-
>= settings.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS
|
|
1914
|
-
):
|
|
1912
|
+
if elapsed_time >= _per_task_timeout():
|
|
1915
1913
|
timeout_type = "per-task"
|
|
1916
1914
|
else:
|
|
1917
1915
|
timeout_type = "gather"
|
|
1918
1916
|
|
|
1919
1917
|
logger.warning(
|
|
1920
|
-
f"[deepeval] gather TIMEOUT after {
|
|
1918
|
+
f"[deepeval] gather TIMEOUT after {_gather_timeout()}s; "
|
|
1921
1919
|
f"pending={len(pending)} tasks. Timeout type: {timeout_type}. "
|
|
1922
1920
|
f"To give tasks more time, consider increasing "
|
|
1923
1921
|
f"DEEPEVAL_PER_TASK_TIMEOUT_SECONDS for longer task completion time or "
|
|
@@ -1931,7 +1929,7 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1931
1929
|
elapsed_time,
|
|
1932
1930
|
meta,
|
|
1933
1931
|
)
|
|
1934
|
-
if loop.get_debug() and
|
|
1932
|
+
if loop.get_debug() and get_settings().DEEPEVAL_DEBUG_ASYNC:
|
|
1935
1933
|
frames = t.get_stack(limit=6)
|
|
1936
1934
|
if frames:
|
|
1937
1935
|
logger.info(" stack:")
|
|
@@ -1970,9 +1968,9 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1970
1968
|
if not leftovers:
|
|
1971
1969
|
return
|
|
1972
1970
|
|
|
1973
|
-
if
|
|
1971
|
+
if get_settings().DEEPEVAL_DEBUG_ASYNC:
|
|
1974
1972
|
logger.warning(
|
|
1975
|
-
"[deepeval] %d stray task(s) not tracked; cancelling
|
|
1973
|
+
"[deepeval] %d stray task(s) not tracked; cancelling...",
|
|
1976
1974
|
len(leftovers),
|
|
1977
1975
|
)
|
|
1978
1976
|
for t in leftovers:
|
|
@@ -1990,7 +1988,7 @@ def a_execute_agentic_test_cases_from_loop(
|
|
|
1990
1988
|
)
|
|
1991
1989
|
except RuntimeError:
|
|
1992
1990
|
# If the loop is closing here, just continue
|
|
1993
|
-
if
|
|
1991
|
+
if get_settings().DEEPEVAL_DEBUG_ASYNC:
|
|
1994
1992
|
logger.warning(
|
|
1995
1993
|
"[deepeval] failed to drain stray tasks because loop is closing"
|
|
1996
1994
|
)
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
from typing import Optional, List, Union, Dict
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from pydantic import BaseModel
|
|
4
|
-
|
|
4
|
+
|
|
5
|
+
from deepeval.test_run.api import MetricData, TurnApi
|
|
5
6
|
from deepeval.test_case import MLLMImage
|
|
6
7
|
|
|
7
8
|
|
|
@@ -19,9 +20,11 @@ class TestResult:
|
|
|
19
20
|
expected_output: Optional[str] = None
|
|
20
21
|
context: Optional[List[str]] = None
|
|
21
22
|
retrieval_context: Optional[List[str]] = None
|
|
23
|
+
turns: Optional[List[TurnApi]] = None
|
|
22
24
|
additional_metadata: Optional[Dict] = None
|
|
23
25
|
|
|
24
26
|
|
|
25
27
|
class EvaluationResult(BaseModel):
|
|
26
28
|
test_results: List[TestResult]
|
|
27
29
|
confident_link: Optional[str]
|
|
30
|
+
test_run_id: Optional[str]
|