deepeval 3.6.8__tar.gz → 3.6.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepeval-3.6.8 → deepeval-3.6.9}/PKG-INFO +1 -1
- deepeval-3.6.9/deepeval/_version.py +1 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/config/settings.py +104 -36
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/config/utils.py +5 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/dataset.py +162 -30
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/utils.py +41 -13
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/execute.py +1099 -633
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/indicator.py +21 -1
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/amazon_bedrock_model.py +20 -17
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/openai_model.py +10 -1
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/retry_policy.py +103 -20
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/simulator/conversation_simulator.py +25 -18
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/chunking/context_generator.py +9 -1
- {deepeval-3.6.8 → deepeval-3.6.9}/pyproject.toml +1 -1
- deepeval-3.6.8/deepeval/_version.py +0 -1
- {deepeval-3.6.8 → deepeval-3.6.9}/LICENSE.md +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/README.md +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/annotation/annotation.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/annotation/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/arc/arc.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/base_benchmark.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bbq/bbq.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/drop/drop.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/lambada/lambada.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/mmlu/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/results.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/squad/squad.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/dotenv_handler.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/main.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/server.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/test.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/cli/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/confident/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/confident/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/config/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/config/logging.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/config/settings_manager.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/constants.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/contextvars.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/golden.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/test_run_tracer.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/dataset/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/errors.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/compare.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/configs.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/evaluate.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/evaluate/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/crewai/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/crewai/handler.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/crewai/subs.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/crewai/tool.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/crewai/wrapper.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/langchain/callback.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/langchain/patch.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/langchain/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/llama_index/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/llama_index/handler.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/llama_index/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/pydantic_ai/agent.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/pydantic_ai/instrumentator.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/pydantic_ai/otel.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/key_handler.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/answer_relevancy/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/arena_g_eval/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/arena_g_eval/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/argument_correctness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/argument_correctness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/base_metric.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/bias/bias.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/bias/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_precision/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_recall/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/contextual_relevancy/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversation_completeness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_dag/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_dag/nodes.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_dag/templates.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/conversational_g_eval/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/dag.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/nodes.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/templates.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/dag/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/faithfulness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/faithfulness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/g_eval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/g_eval/g_eval.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/g_eval/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/g_eval/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/goal_accuracy/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/goal_accuracy/goal_accuracy.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/goal_accuracy/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/goal_accuracy/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/hallucination/hallucination.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/hallucination/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/json_correctness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/json_correctness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/knowledge_retention/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/knowledge_retention/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/mcp_use_metric/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/misuse/misuse.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/misuse/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/non_advice/non_advice.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/non_advice/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/pii_leakage/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_adherence/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_adherence/plan_adherence.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_adherence/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_adherence/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_quality/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_quality/plan_quality.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_quality/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/plan_quality/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/prompt_alignment/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/ragas.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_adherence/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_adherence/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_violation/role_violation.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/role_violation/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/step_efficiency/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/step_efficiency/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/step_efficiency/step_efficiency.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/step_efficiency/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/summarization/summarization.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/summarization/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/task_completion/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/task_completion/task_completion.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_correctness/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_correctness/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_correctness/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_use/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_use/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_use/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/tool_use/tool_use.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/topic_adherence/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/topic_adherence/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/topic_adherence/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/topic_adherence/topic_adherence.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/toxicity/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/toxicity/toxicity.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/turn_relevancy/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/metrics/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/base_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/embedding_models/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/anthropic_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/azure_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/deepseek_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/gemini_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/grok_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/kimi_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/litellm_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/local_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/ollama_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/llms/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/mlllms/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/mlllms/gemini_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/mlllms/ollama_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/mlllms/openai_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/models/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai/extractors.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai/patch.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai_agents/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai_agents/agent.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai_agents/callback_handler.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai_agents/extractors.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai_agents/patch.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/openai_agents/runner.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/plugins/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/progress_context.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/prompt/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/prompt/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/prompt/prompt.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/prompt/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/py.typed +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/simulator/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/singleton.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/chunking/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/config.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/schema.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/synthesizer.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/templates/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/templates/template.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/templates/template_extraction.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/templates/template_prompt.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/telemetry.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/arena_test_case.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/conversational_test_case.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/llm_test_case.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/mllm_test_case.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_case/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_run/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_run/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_run/cache.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_run/hyperparameters.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/test_run/test_run.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/context.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/offline_evals/api.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/offline_evals/thread.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/otel/exporter.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/otel/test_exporter.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/otel/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/patchers.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/perf_epoch_bridge.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/trace_context.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/trace_test_manager.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/tracing.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/types.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/tracing/utils.py +0 -0
- {deepeval-3.6.8 → deepeval-3.6.9}/deepeval/utils.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__: str = "3.6.9"
|
|
@@ -30,6 +30,7 @@ from typing import Any, Dict, List, Optional, NamedTuple
|
|
|
30
30
|
from deepeval.config.utils import (
|
|
31
31
|
parse_bool,
|
|
32
32
|
coerce_to_list,
|
|
33
|
+
constrain_between,
|
|
33
34
|
dedupe_preserve_order,
|
|
34
35
|
)
|
|
35
36
|
from deepeval.constants import SUPPORTED_PROVIDER_SLUGS, slugify
|
|
@@ -336,6 +337,7 @@ class Settings(BaseSettings):
|
|
|
336
337
|
IGNORE_DEEPEVAL_ERRORS: Optional[bool] = None
|
|
337
338
|
SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
|
|
338
339
|
DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
|
|
340
|
+
DEEPEVAL_LOG_STACK_TRACES: Optional[bool] = None
|
|
339
341
|
ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
|
|
340
342
|
|
|
341
343
|
CONFIDENT_TRACE_FLUSH: Optional[bool] = None
|
|
@@ -355,11 +357,19 @@ class Settings(BaseSettings):
|
|
|
355
357
|
#
|
|
356
358
|
MEDIA_IMAGE_CONNECT_TIMEOUT_SECONDS: float = 3.05
|
|
357
359
|
MEDIA_IMAGE_READ_TIMEOUT_SECONDS: float = 10.0
|
|
358
|
-
#
|
|
359
|
-
#
|
|
360
|
-
#
|
|
361
|
-
|
|
362
|
-
|
|
360
|
+
# DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE
|
|
361
|
+
# Per-attempt timeout (seconds) for provider calls used by the retry policy.
|
|
362
|
+
# This is an OVERRIDE setting. The effective value you should rely on at runtime is
|
|
363
|
+
# the computed property: DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS.
|
|
364
|
+
#
|
|
365
|
+
# If this is None or 0 the DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS is computed from either:
|
|
366
|
+
# - DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: slice the outer budget
|
|
367
|
+
# across attempts after subtracting expected backoff and a small safety buffer
|
|
368
|
+
# - the default outer budget (180s) if no outer override is set.
|
|
369
|
+
#
|
|
370
|
+
# Tip: Set this OR the outer override, but generally not both
|
|
371
|
+
DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE: Optional[confloat(gt=0)] = (
|
|
372
|
+
None
|
|
363
373
|
)
|
|
364
374
|
|
|
365
375
|
#
|
|
@@ -373,76 +383,115 @@ class Settings(BaseSettings):
|
|
|
373
383
|
#
|
|
374
384
|
DEEPEVAL_TIMEOUT_THREAD_LIMIT: conint(ge=1) = 128
|
|
375
385
|
DEEPEVAL_TIMEOUT_SEMAPHORE_WARN_AFTER_SECONDS: confloat(ge=0) = 5.0
|
|
376
|
-
#
|
|
377
|
-
#
|
|
378
|
-
#
|
|
379
|
-
# attempts * per_attempt_timeout +
|
|
380
|
-
#
|
|
381
|
-
#
|
|
382
|
-
#
|
|
386
|
+
# DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
387
|
+
# Outer time budget (seconds) for a single metric/test-case, including retries and backoff.
|
|
388
|
+
# This is an OVERRIDE setting. If None or 0 the DEEPEVAL_PER_TASK_TIMEOUT_SECONDS field is computed:
|
|
389
|
+
# attempts * per_attempt_timeout + expected_backoff + 1s safety
|
|
390
|
+
# (When neither override is set 180s is used.)
|
|
391
|
+
#
|
|
392
|
+
# If > 0, we use the value exactly and log a warning if it is likely too small
|
|
393
|
+
# to accommodate the configured attempts/backoff.
|
|
383
394
|
#
|
|
384
|
-
#
|
|
385
|
-
#
|
|
386
|
-
#
|
|
387
|
-
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[
|
|
395
|
+
# usage:
|
|
396
|
+
# - set DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE along with DEEPEVAL_RETRY_MAX_ATTEMPTS, or
|
|
397
|
+
# - set DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE alone.
|
|
398
|
+
DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE: Optional[confloat(ge=0)] = None
|
|
388
399
|
|
|
389
400
|
# Buffer time for gathering results from all tasks, added to the longest task duration
|
|
390
401
|
# Increase if many tasks are running concurrently
|
|
391
|
-
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) =
|
|
402
|
+
# DEEPEVAL_TASK_GATHER_BUFFER_SECONDS: confloat(ge=0) = (
|
|
403
|
+
# 30 # 15s seemed like not enough. we may make this computed later.
|
|
404
|
+
# )
|
|
405
|
+
DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE: Optional[confloat(ge=0)] = (
|
|
406
|
+
None
|
|
407
|
+
)
|
|
392
408
|
|
|
393
409
|
###################
|
|
394
410
|
# Computed Fields #
|
|
395
411
|
###################
|
|
396
412
|
|
|
397
|
-
def _calc_auto_outer_timeout(self) ->
|
|
413
|
+
def _calc_auto_outer_timeout(self) -> float:
|
|
398
414
|
"""Compute outer budget from per-attempt timeout + retries/backoff.
|
|
399
415
|
Never reference the computed property itself here.
|
|
400
416
|
"""
|
|
401
417
|
attempts = self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1
|
|
402
|
-
timeout_seconds = float(
|
|
418
|
+
timeout_seconds = float(
|
|
419
|
+
self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE or 0
|
|
420
|
+
)
|
|
403
421
|
if timeout_seconds <= 0:
|
|
404
422
|
# No per-attempt timeout set -> default outer budget
|
|
405
423
|
return 180
|
|
406
424
|
|
|
407
|
-
|
|
408
|
-
cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
|
|
409
|
-
cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
|
|
410
|
-
base = float(self.DEEPEVAL_RETRY_EXP_BASE)
|
|
411
|
-
jitter = float(self.DEEPEVAL_RETRY_JITTER)
|
|
412
|
-
|
|
413
|
-
backoff = 0.0
|
|
414
|
-
for _ in range(sleeps):
|
|
415
|
-
backoff += min(cap, cur)
|
|
416
|
-
cur *= base
|
|
417
|
-
backoff += sleeps * (jitter / 2.0) # expected jitter
|
|
418
|
-
|
|
425
|
+
backoff = self._expected_backoff(attempts)
|
|
419
426
|
safety_overhead = 1.0
|
|
420
|
-
return
|
|
427
|
+
return float(
|
|
421
428
|
math.ceil(attempts * timeout_seconds + backoff + safety_overhead)
|
|
422
429
|
)
|
|
423
430
|
|
|
424
431
|
@computed_field
|
|
425
432
|
@property
|
|
426
|
-
def
|
|
433
|
+
def DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS(self) -> float:
|
|
434
|
+
over = self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE
|
|
435
|
+
if over is not None and float(over) > 0:
|
|
436
|
+
return float(over)
|
|
437
|
+
|
|
438
|
+
attempts = int(self.DEEPEVAL_RETRY_MAX_ATTEMPTS or 1)
|
|
439
|
+
outer_over = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
440
|
+
|
|
441
|
+
# If the user set an outer override, slice it up
|
|
442
|
+
if outer_over and float(outer_over) > 0 and attempts > 0:
|
|
443
|
+
backoff = self._expected_backoff(attempts)
|
|
444
|
+
safety = 1.0
|
|
445
|
+
usable = max(0.0, float(outer_over) - backoff - safety)
|
|
446
|
+
return 0.0 if usable <= 0 else (usable / attempts)
|
|
447
|
+
|
|
448
|
+
# NEW: when neither override is set, derive from the default outer (180s)
|
|
449
|
+
default_outer = 180.0
|
|
450
|
+
backoff = self._expected_backoff(attempts)
|
|
451
|
+
safety = 1.0
|
|
452
|
+
usable = max(0.0, default_outer - backoff - safety)
|
|
453
|
+
# Keep per-attempt sensible (cap to at least 1s)
|
|
454
|
+
return 0.0 if usable <= 0 else max(1.0, usable / attempts)
|
|
455
|
+
|
|
456
|
+
@computed_field
|
|
457
|
+
@property
|
|
458
|
+
def DEEPEVAL_PER_TASK_TIMEOUT_SECONDS(self) -> float:
|
|
427
459
|
"""If OVERRIDE is set (nonzero), return it; else return the derived budget."""
|
|
428
460
|
outer = self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE
|
|
429
461
|
if outer not in (None, 0):
|
|
430
462
|
# Warn if user-provided outer is likely to truncate retries
|
|
431
463
|
if (self.DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0) > 0:
|
|
432
464
|
min_needed = self._calc_auto_outer_timeout()
|
|
433
|
-
if
|
|
465
|
+
if float(outer) < min_needed:
|
|
434
466
|
if self.DEEPEVAL_VERBOSE_MODE:
|
|
435
467
|
logger.warning(
|
|
436
468
|
"Metric timeout (outer=%ss) is less than attempts × per-attempt "
|
|
437
469
|
"timeout + backoff (≈%ss). Retries may be cut short.",
|
|
438
|
-
|
|
470
|
+
float(outer),
|
|
439
471
|
min_needed,
|
|
440
472
|
)
|
|
441
|
-
return
|
|
473
|
+
return float(outer)
|
|
442
474
|
|
|
443
475
|
# Auto mode
|
|
444
476
|
return self._calc_auto_outer_timeout()
|
|
445
477
|
|
|
478
|
+
@computed_field
|
|
479
|
+
@property
|
|
480
|
+
def DEEPEVAL_TASK_GATHER_BUFFER_SECONDS(self) -> float:
|
|
481
|
+
"""
|
|
482
|
+
Buffer time we add to the longest task’s duration to allow gather/drain
|
|
483
|
+
to complete. If an override is provided, use it; otherwise derive a
|
|
484
|
+
sensible default from the task-level budget:
|
|
485
|
+
buffer = constrain_between(0.15 * DEEPEVAL_PER_TASK_TIMEOUT_SECONDS, 10, 60)
|
|
486
|
+
"""
|
|
487
|
+
over = self.DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE
|
|
488
|
+
if over is not None and float(over) >= 0:
|
|
489
|
+
return float(over)
|
|
490
|
+
|
|
491
|
+
outer = float(self.DEEPEVAL_PER_TASK_TIMEOUT_SECONDS or 0.0)
|
|
492
|
+
base = 0.15 * outer
|
|
493
|
+
return constrain_between(base, 10.0, 60.0)
|
|
494
|
+
|
|
446
495
|
##############
|
|
447
496
|
# Validators #
|
|
448
497
|
##############
|
|
@@ -810,6 +859,25 @@ class Settings(BaseSettings):
|
|
|
810
859
|
ctx.switch_model_provider(target)
|
|
811
860
|
return ctx.result
|
|
812
861
|
|
|
862
|
+
def _expected_backoff(self, attempts: int) -> float:
|
|
863
|
+
"""Sum of expected sleeps for (attempts-1) retries, including jitter expectation."""
|
|
864
|
+
sleeps = max(0, attempts - 1)
|
|
865
|
+
cur = float(self.DEEPEVAL_RETRY_INITIAL_SECONDS)
|
|
866
|
+
cap = float(self.DEEPEVAL_RETRY_CAP_SECONDS)
|
|
867
|
+
base = float(self.DEEPEVAL_RETRY_EXP_BASE)
|
|
868
|
+
jitter = float(self.DEEPEVAL_RETRY_JITTER)
|
|
869
|
+
|
|
870
|
+
backoff = 0.0
|
|
871
|
+
for _ in range(sleeps):
|
|
872
|
+
backoff += min(cap, cur)
|
|
873
|
+
cur *= base
|
|
874
|
+
backoff += sleeps * (jitter / 2.0) # expected jitter
|
|
875
|
+
return backoff
|
|
876
|
+
|
|
877
|
+
def _constrain_between(self, value: float, lo: float, hi: float) -> float:
|
|
878
|
+
"""Return value constrained to the inclusive range [lo, hi]."""
|
|
879
|
+
return min(max(value, lo), hi)
|
|
880
|
+
|
|
813
881
|
|
|
814
882
|
_settings_singleton: Optional[Settings] = None
|
|
815
883
|
|
|
@@ -137,3 +137,8 @@ def dedupe_preserve_order(items: Iterable[str]) -> List[str]:
|
|
|
137
137
|
seen.add(x)
|
|
138
138
|
out.append(x)
|
|
139
139
|
return out
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def constrain_between(value: float, lo: float, hi: float) -> float:
|
|
143
|
+
"""Return value constrained to the inclusive range [lo, hi]."""
|
|
144
|
+
return min(max(value, lo), hi)
|
|
@@ -951,6 +951,8 @@ class EvaluationDataset:
|
|
|
951
951
|
context=golden.context,
|
|
952
952
|
name=golden.name,
|
|
953
953
|
comments=golden.comments,
|
|
954
|
+
additional_metadata=golden.additional_metadata,
|
|
955
|
+
custom_column_key_values=golden.custom_column_key_values,
|
|
954
956
|
)
|
|
955
957
|
for golden in self.goldens
|
|
956
958
|
]
|
|
@@ -965,6 +967,10 @@ class EvaluationDataset:
|
|
|
965
967
|
name=golden.name,
|
|
966
968
|
comments=golden.comments,
|
|
967
969
|
source_file=golden.source_file,
|
|
970
|
+
tools_called=golden.tools_called,
|
|
971
|
+
expected_tools=golden.expected_tools,
|
|
972
|
+
additional_metadata=golden.additional_metadata,
|
|
973
|
+
custom_column_key_values=golden.custom_column_key_values,
|
|
968
974
|
)
|
|
969
975
|
for golden in self.goldens
|
|
970
976
|
]
|
|
@@ -995,36 +1001,68 @@ class EvaluationDataset:
|
|
|
995
1001
|
if file_type == "json":
|
|
996
1002
|
with open(full_file_path, "w", encoding="utf-8") as file:
|
|
997
1003
|
if self._multi_turn:
|
|
998
|
-
json_data = [
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1004
|
+
json_data = []
|
|
1005
|
+
for golden in goldens:
|
|
1006
|
+
# Serialize turns as structured list of dicts
|
|
1007
|
+
turns_list = (
|
|
1008
|
+
json.loads(format_turns(golden.turns))
|
|
1009
|
+
if golden.turns
|
|
1010
|
+
else None
|
|
1011
|
+
)
|
|
1012
|
+
json_data.append(
|
|
1013
|
+
{
|
|
1014
|
+
"scenario": golden.scenario,
|
|
1015
|
+
"turns": turns_list,
|
|
1016
|
+
"expected_outcome": golden.expected_outcome,
|
|
1017
|
+
"user_description": golden.user_description,
|
|
1018
|
+
"context": golden.context,
|
|
1019
|
+
"name": golden.name,
|
|
1020
|
+
"comments": golden.comments,
|
|
1021
|
+
"additional_metadata": golden.additional_metadata,
|
|
1022
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1023
|
+
}
|
|
1024
|
+
)
|
|
1014
1025
|
else:
|
|
1015
|
-
json_data = [
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1026
|
+
json_data = []
|
|
1027
|
+
for golden in goldens:
|
|
1028
|
+
# Convert ToolCall lists to list[dict]
|
|
1029
|
+
def _dump_tools(tools):
|
|
1030
|
+
if not tools:
|
|
1031
|
+
return None
|
|
1032
|
+
dumped = []
|
|
1033
|
+
for t in tools:
|
|
1034
|
+
if hasattr(t, "model_dump"):
|
|
1035
|
+
dumped.append(
|
|
1036
|
+
t.model_dump(
|
|
1037
|
+
by_alias=True, exclude_none=True
|
|
1038
|
+
)
|
|
1039
|
+
)
|
|
1040
|
+
elif hasattr(t, "dict"):
|
|
1041
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1042
|
+
else:
|
|
1043
|
+
dumped.append(t)
|
|
1044
|
+
return dumped if len(dumped) > 0 else None
|
|
1045
|
+
|
|
1046
|
+
json_data.append(
|
|
1047
|
+
{
|
|
1048
|
+
"input": golden.input,
|
|
1049
|
+
"actual_output": golden.actual_output,
|
|
1050
|
+
"expected_output": golden.expected_output,
|
|
1051
|
+
"retrieval_context": golden.retrieval_context,
|
|
1052
|
+
"context": golden.context,
|
|
1053
|
+
"name": golden.name,
|
|
1054
|
+
"comments": golden.comments,
|
|
1055
|
+
"source_file": golden.source_file,
|
|
1056
|
+
"tools_called": _dump_tools(
|
|
1057
|
+
golden.tools_called
|
|
1058
|
+
),
|
|
1059
|
+
"expected_tools": _dump_tools(
|
|
1060
|
+
golden.expected_tools
|
|
1061
|
+
),
|
|
1062
|
+
"additional_metadata": golden.additional_metadata,
|
|
1063
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1064
|
+
}
|
|
1065
|
+
)
|
|
1028
1066
|
json.dump(json_data, file, indent=4, ensure_ascii=False)
|
|
1029
1067
|
elif file_type == "csv":
|
|
1030
1068
|
with open(
|
|
@@ -1041,6 +1079,8 @@ class EvaluationDataset:
|
|
|
1041
1079
|
"context",
|
|
1042
1080
|
"name",
|
|
1043
1081
|
"comments",
|
|
1082
|
+
"additional_metadata",
|
|
1083
|
+
"custom_column_key_values",
|
|
1044
1084
|
]
|
|
1045
1085
|
)
|
|
1046
1086
|
for golden in goldens:
|
|
@@ -1054,6 +1094,21 @@ class EvaluationDataset:
|
|
|
1054
1094
|
if golden.turns is not None
|
|
1055
1095
|
else None
|
|
1056
1096
|
)
|
|
1097
|
+
additional_metadata = (
|
|
1098
|
+
json.dumps(
|
|
1099
|
+
golden.additional_metadata, ensure_ascii=False
|
|
1100
|
+
)
|
|
1101
|
+
if golden.additional_metadata is not None
|
|
1102
|
+
else None
|
|
1103
|
+
)
|
|
1104
|
+
custom_cols = (
|
|
1105
|
+
json.dumps(
|
|
1106
|
+
golden.custom_column_key_values,
|
|
1107
|
+
ensure_ascii=False,
|
|
1108
|
+
)
|
|
1109
|
+
if golden.custom_column_key_values
|
|
1110
|
+
else None
|
|
1111
|
+
)
|
|
1057
1112
|
writer.writerow(
|
|
1058
1113
|
[
|
|
1059
1114
|
golden.scenario,
|
|
@@ -1063,6 +1118,8 @@ class EvaluationDataset:
|
|
|
1063
1118
|
context,
|
|
1064
1119
|
golden.name,
|
|
1065
1120
|
golden.comments,
|
|
1121
|
+
additional_metadata,
|
|
1122
|
+
custom_cols,
|
|
1066
1123
|
]
|
|
1067
1124
|
)
|
|
1068
1125
|
else:
|
|
@@ -1076,6 +1133,10 @@ class EvaluationDataset:
|
|
|
1076
1133
|
"name",
|
|
1077
1134
|
"comments",
|
|
1078
1135
|
"source_file",
|
|
1136
|
+
"tools_called",
|
|
1137
|
+
"expected_tools",
|
|
1138
|
+
"additional_metadata",
|
|
1139
|
+
"custom_column_key_values",
|
|
1079
1140
|
]
|
|
1080
1141
|
)
|
|
1081
1142
|
for golden in goldens:
|
|
@@ -1089,6 +1150,42 @@ class EvaluationDataset:
|
|
|
1089
1150
|
if golden.context is not None
|
|
1090
1151
|
else None
|
|
1091
1152
|
)
|
|
1153
|
+
|
|
1154
|
+
# Dump tools as JSON strings for CSV
|
|
1155
|
+
def _dump_tools_csv(tools):
|
|
1156
|
+
if not tools:
|
|
1157
|
+
return None
|
|
1158
|
+
dumped = []
|
|
1159
|
+
for t in tools:
|
|
1160
|
+
if hasattr(t, "model_dump"):
|
|
1161
|
+
dumped.append(
|
|
1162
|
+
t.model_dump(
|
|
1163
|
+
by_alias=True, exclude_none=True
|
|
1164
|
+
)
|
|
1165
|
+
)
|
|
1166
|
+
elif hasattr(t, "dict"):
|
|
1167
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1168
|
+
else:
|
|
1169
|
+
dumped.append(t)
|
|
1170
|
+
return json.dumps(dumped, ensure_ascii=False)
|
|
1171
|
+
|
|
1172
|
+
tools_called = _dump_tools_csv(golden.tools_called)
|
|
1173
|
+
expected_tools = _dump_tools_csv(golden.expected_tools)
|
|
1174
|
+
additional_metadata = (
|
|
1175
|
+
json.dumps(
|
|
1176
|
+
golden.additional_metadata, ensure_ascii=False
|
|
1177
|
+
)
|
|
1178
|
+
if golden.additional_metadata is not None
|
|
1179
|
+
else None
|
|
1180
|
+
)
|
|
1181
|
+
custom_cols = (
|
|
1182
|
+
json.dumps(
|
|
1183
|
+
golden.custom_column_key_values,
|
|
1184
|
+
ensure_ascii=False,
|
|
1185
|
+
)
|
|
1186
|
+
if golden.custom_column_key_values
|
|
1187
|
+
else None
|
|
1188
|
+
)
|
|
1092
1189
|
writer.writerow(
|
|
1093
1190
|
[
|
|
1094
1191
|
golden.input,
|
|
@@ -1099,6 +1196,10 @@ class EvaluationDataset:
|
|
|
1099
1196
|
golden.name,
|
|
1100
1197
|
golden.comments,
|
|
1101
1198
|
golden.source_file,
|
|
1199
|
+
tools_called,
|
|
1200
|
+
expected_tools,
|
|
1201
|
+
additional_metadata,
|
|
1202
|
+
custom_cols,
|
|
1102
1203
|
]
|
|
1103
1204
|
)
|
|
1104
1205
|
elif file_type == "jsonl":
|
|
@@ -1106,7 +1207,9 @@ class EvaluationDataset:
|
|
|
1106
1207
|
for golden in goldens:
|
|
1107
1208
|
if self._multi_turn:
|
|
1108
1209
|
turns = (
|
|
1109
|
-
format_turns(golden.turns)
|
|
1210
|
+
json.loads(format_turns(golden.turns))
|
|
1211
|
+
if golden.turns
|
|
1212
|
+
else None
|
|
1110
1213
|
)
|
|
1111
1214
|
record = {
|
|
1112
1215
|
"scenario": golden.scenario,
|
|
@@ -1114,6 +1217,10 @@ class EvaluationDataset:
|
|
|
1114
1217
|
"expected_outcome": golden.expected_outcome,
|
|
1115
1218
|
"user_description": golden.user_description,
|
|
1116
1219
|
"context": golden.context,
|
|
1220
|
+
"name": golden.name,
|
|
1221
|
+
"comments": golden.comments,
|
|
1222
|
+
"additional_metadata": golden.additional_metadata,
|
|
1223
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1117
1224
|
}
|
|
1118
1225
|
else:
|
|
1119
1226
|
retrieval_context = (
|
|
@@ -1126,12 +1233,37 @@ class EvaluationDataset:
|
|
|
1126
1233
|
if golden.context is not None
|
|
1127
1234
|
else None
|
|
1128
1235
|
)
|
|
1236
|
+
|
|
1237
|
+
# Convert ToolCall lists to list[dict]
|
|
1238
|
+
def _dump_tools(tools):
|
|
1239
|
+
if not tools:
|
|
1240
|
+
return None
|
|
1241
|
+
dumped = []
|
|
1242
|
+
for t in tools:
|
|
1243
|
+
if hasattr(t, "model_dump"):
|
|
1244
|
+
dumped.append(
|
|
1245
|
+
t.model_dump(
|
|
1246
|
+
by_alias=True, exclude_none=True
|
|
1247
|
+
)
|
|
1248
|
+
)
|
|
1249
|
+
elif hasattr(t, "dict"):
|
|
1250
|
+
dumped.append(t.dict(exclude_none=True))
|
|
1251
|
+
else:
|
|
1252
|
+
dumped.append(t)
|
|
1253
|
+
return dumped if len(dumped) > 0 else None
|
|
1254
|
+
|
|
1129
1255
|
record = {
|
|
1130
1256
|
"input": golden.input,
|
|
1131
1257
|
"actual_output": golden.actual_output,
|
|
1132
1258
|
"expected_output": golden.expected_output,
|
|
1133
1259
|
"retrieval_context": retrieval_context,
|
|
1134
1260
|
"context": context,
|
|
1261
|
+
"tools_called": _dump_tools(golden.tools_called),
|
|
1262
|
+
"expected_tools": _dump_tools(
|
|
1263
|
+
golden.expected_tools
|
|
1264
|
+
),
|
|
1265
|
+
"additional_metadata": golden.additional_metadata,
|
|
1266
|
+
"custom_column_key_values": golden.custom_column_key_values,
|
|
1135
1267
|
}
|
|
1136
1268
|
|
|
1137
1269
|
file.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
@@ -111,12 +111,36 @@ def trimAndLoadJson(input_string: str) -> Any:
|
|
|
111
111
|
def format_turns(turns: List[Turn]) -> str:
|
|
112
112
|
res = []
|
|
113
113
|
for turn in turns:
|
|
114
|
+
# Safely convert nested Pydantic models (ToolCall/MCP calls) to dicts
|
|
115
|
+
def _dump_list(models):
|
|
116
|
+
if not models:
|
|
117
|
+
return None
|
|
118
|
+
dumped = []
|
|
119
|
+
for m in models:
|
|
120
|
+
if hasattr(m, "model_dump"):
|
|
121
|
+
dumped.append(
|
|
122
|
+
m.model_dump(by_alias=True, exclude_none=True)
|
|
123
|
+
)
|
|
124
|
+
elif hasattr(m, "dict"):
|
|
125
|
+
dumped.append(m.dict(exclude_none=True))
|
|
126
|
+
else:
|
|
127
|
+
dumped.append(m)
|
|
128
|
+
return dumped if len(dumped) > 0 else None
|
|
129
|
+
|
|
114
130
|
cur_turn = {
|
|
115
131
|
"role": turn.role,
|
|
116
132
|
"content": turn.content,
|
|
133
|
+
"user_id": turn.user_id if turn.user_id is not None else None,
|
|
117
134
|
"retrieval_context": (
|
|
118
135
|
turn.retrieval_context if turn.retrieval_context else None
|
|
119
136
|
),
|
|
137
|
+
"tools_called": _dump_list(turn.tools_called),
|
|
138
|
+
"mcp_tools_called": _dump_list(turn.mcp_tools_called),
|
|
139
|
+
"mcp_resources_called": _dump_list(turn.mcp_resources_called),
|
|
140
|
+
"mcp_prompts_called": _dump_list(turn.mcp_prompts_called),
|
|
141
|
+
"additional_metadata": (
|
|
142
|
+
turn.additional_metadata if turn.additional_metadata else None
|
|
143
|
+
),
|
|
120
144
|
}
|
|
121
145
|
res.append(cur_turn)
|
|
122
146
|
try:
|
|
@@ -125,11 +149,17 @@ def format_turns(turns: List[Turn]) -> str:
|
|
|
125
149
|
raise ValueError(f"Error serializing turns: {e}")
|
|
126
150
|
|
|
127
151
|
|
|
128
|
-
def parse_turns(turns_str:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
152
|
+
def parse_turns(turns_str: Any) -> List[Turn]:
|
|
153
|
+
# Accept either a JSON string or a Python list
|
|
154
|
+
if isinstance(turns_str, str):
|
|
155
|
+
try:
|
|
156
|
+
parsed = json.loads(turns_str)
|
|
157
|
+
except json.JSONDecodeError as e:
|
|
158
|
+
raise ValueError(f"Invalid JSON: {e}")
|
|
159
|
+
elif isinstance(turns_str, list):
|
|
160
|
+
parsed = turns_str
|
|
161
|
+
else:
|
|
162
|
+
raise TypeError("Expected a JSON string or a list of turns.")
|
|
133
163
|
|
|
134
164
|
if not isinstance(parsed, list):
|
|
135
165
|
raise TypeError("Expected a list of turns.")
|
|
@@ -145,15 +175,13 @@ def parse_turns(turns_str: str) -> List[Turn]:
|
|
|
145
175
|
if "content" not in turn or not isinstance(turn["content"], str):
|
|
146
176
|
raise ValueError(f"Turn at index {i} is missing a valid 'content'.")
|
|
147
177
|
|
|
148
|
-
|
|
178
|
+
try:
|
|
179
|
+
# Pydantic v2
|
|
180
|
+
res.append(Turn.model_validate(turn))
|
|
181
|
+
except AttributeError:
|
|
182
|
+
# Pydantic v1 fallback
|
|
183
|
+
res.append(Turn.parse_obj(turn))
|
|
149
184
|
|
|
150
|
-
res.append(
|
|
151
|
-
Turn(
|
|
152
|
-
role=turn["role"],
|
|
153
|
-
content=turn["content"],
|
|
154
|
-
retrieval_context=retrieval_context,
|
|
155
|
-
)
|
|
156
|
-
)
|
|
157
185
|
return res
|
|
158
186
|
|
|
159
187
|
|