deepeval 3.4.1__tar.gz → 3.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepeval-3.4.1 → deepeval-3.4.3}/PKG-INFO +1 -1
- deepeval-3.4.3/deepeval/_version.py +1 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/annotation.py +4 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/api.py +1 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/arc.py +11 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/base_benchmark.py +8 -1
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/bbq.py +11 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +9 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/bool_q.py +11 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/drop.py +9 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +11 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/gsm8k.py +11 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/hellaswag.py +9 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/human_eval.py +9 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/ifeval/ifeval.py +27 -12
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/lambada/lambada.py +11 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/logi_qa.py +8 -3
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/math_qa.py +13 -5
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/mmlu.py +15 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/template.py +3 -3
- deepeval-3.4.3/deepeval/benchmarks/results.py +2 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/squad.py +11 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/truthful_qa.py +8 -3
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/winogrande/winogrande.py +11 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/dataset.py +254 -112
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/utils.py +70 -1
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/execute.py +57 -52
- deepeval-3.4.3/deepeval/integrations/pydantic_ai/agent.py +274 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/pydantic_ai/setup.py +0 -5
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/template.py +2 -1
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/nodes.py +22 -10
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/utils.py +3 -2
- deepeval-3.4.3/deepeval/metrics/g_eval/__init__.py +5 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/g_eval.py +25 -15
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/amazon_bedrock_model.py +5 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/anthropic_model.py +4 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/deepseek_model.py +6 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/gemini_model.py +7 -1
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/grok_model.py +4 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/kimi_model.py +6 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/litellm_model.py +33 -5
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/local_model.py +4 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/ollama_model.py +10 -2
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/synthesizer.py +4 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/telemetry.py +10 -1
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/api.py +1 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/thread.py +8 -2
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/otel/exporter.py +32 -42
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/otel/utils.py +18 -1
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/tracing.py +2 -6
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/utils.py +7 -3
- {deepeval-3.4.1 → deepeval-3.4.3}/pyproject.toml +2 -1
- deepeval-3.4.1/deepeval/_version.py +0 -1
- deepeval-3.4.1/deepeval/integrations/pydantic_ai/agent.py +0 -34
- deepeval-3.4.1/deepeval/integrations/pydantic_ai/patch.py +0 -161
- deepeval-3.4.1/deepeval/metrics/g_eval/__init__.py +0 -4
- {deepeval-3.4.1 → deepeval-3.4.3}/LICENSE.md +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/README.md +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/main.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/server.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/test.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/types.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/cli/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/confident/api.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/confident/types.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/constants.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/api.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/golden.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/dataset/types.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/errors.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/compare.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/configs.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/evaluate.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/types.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/evaluate/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/agent.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/handler.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/crewai/patch.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/langchain/callback.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/langchain/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/agent/patched.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/handler.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/llama_index/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/key_handler.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/answer_relevancy.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/answer_relevancy/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/arena_g_eval/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/argument_correctness/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/base_metric.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/bias.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/bias/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/contextual_precision.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_precision/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/contextual_recall.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_recall/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversation_completeness/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/conversational_g_eval/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/dag.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/dag/templates.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/faithfulness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/faithfulness/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/g_eval/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/hallucination.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/hallucination/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/indicator.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/json_correctness/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/knowledge_retention/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/mcp_use_metric/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/misuse.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/misuse/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/non_advice.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/non_advice/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/pii_leakage/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/prompt_alignment/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/ragas.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_adherence/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/role_violation.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/role_violation/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/summarization.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/summarization/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/task_completion.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/tool_correctness/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/tool_correctness/tool_correctness.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/toxicity/toxicity.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/metrics/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/base_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/azure_embedding_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/local_embedding_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/ollama_embedding_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/embedding_models/openai_embedding_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/azure_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/openai_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/llms/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/gemini_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/ollama_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/mlllms/openai_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/models/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/extractors.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/patch.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai_agents/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai_agents/callback_handler.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/openai_agents/extractors.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/plugins/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/progress_context.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/api.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/prompt.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/prompt/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/py.typed +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/conversation_simulator.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/simulator/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/singleton.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/chunking/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/chunking/context_generator.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/config.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/schema.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/template.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/template_extraction.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/templates/template_prompt.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/arena_test_case.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/conversational_test_case.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/llm_test_case.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/mllm_test_case.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_case/utils.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/api.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/cache.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/hyperparameters.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/test_run/test_run.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/api.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/context.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/patchers.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/perf_epoch_bridge.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/types.py +0 -0
- {deepeval-3.4.1 → deepeval-3.4.3}/deepeval/tracing/utils.py +0 -0
deepeval/_version.py (new file)
@@ -0,0 +1 @@
+__version__: str = "3.4.3"
deepeval/annotation/annotation.py
@@ -12,6 +12,7 @@ def send_annotation(
     expected_output: Optional[str] = None,
     expected_outcome: Optional[str] = None,
     explanation: Optional[str] = None,
+    user_id: Optional[str] = None,
     type: Optional[AnnotationType] = AnnotationType.THUMBS_RATING,
 ) -> str:
     api_annotation = APIAnnotation(
@@ -23,6 +24,7 @@ def send_annotation(
         expectedOutcome=expected_outcome,
         explanation=explanation,
         type=type,
+        userId=user_id,
     )
     api = Api()
     try:
@@ -47,6 +49,7 @@ async def a_send_annotation(
     expected_outcome: Optional[str] = None,
     explanation: Optional[str] = None,
     type: Optional[AnnotationType] = AnnotationType.THUMBS_RATING,
+    user_id: Optional[str] = None,
 ) -> str:
     api_annotation = APIAnnotation(
         rating=rating,
@@ -57,6 +60,7 @@ async def a_send_annotation(
         expectedOutcome=expected_outcome,
         explanation=explanation,
         type=type,
+        userId=user_id,
     )
     api = Api()
     try:
deepeval/annotation/api.py
@@ -17,6 +17,7 @@ class APIAnnotation(BaseModel):
     expected_outcome: Optional[str] = Field(None, alias="expectedOutcome")
     explanation: Optional[str] = Field(None)
     type: Optional[AnnotationType] = Field(None, alias="type")
+    user_id: Optional[str] = Field(None, alias="userId")

     @model_validator(mode="before")
     def validate_input(cls, data):
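Taken together, the three annotation hunks above add an optional user_id to both send_annotation and a_send_annotation and carry it to the annotation API as the userId field of APIAnnotation. A minimal usage sketch follows; it shows only keyword arguments visible in this diff, the import path is assumed from the file layout, and any further required arguments of send_annotation are simply not visible in this excerpt.

# Hypothetical sketch only: demonstrates the user_id keyword added in 3.4.3.
# Only parameters visible in the hunks above are shown; send_annotation may
# require additional arguments that this diff does not reveal.
from deepeval.annotation import send_annotation  # assumed re-export path

send_annotation(
    rating=5,                      # parameter seen in the a_send_annotation hunk
    explanation="Helpful answer",  # parameter seen in both hunks
    user_id="user-1234",           # new in 3.4.3, serialized to the API as userId
)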
deepeval/benchmarks/arc/arc.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.arc.mode import ARCMode
 from deepeval.benchmarks.arc.template import ARCTemplate
@@ -48,7 +51,9 @@ class ARC(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("ARC", self.n_problems):
@@ -90,7 +95,9 @@ class ARC(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -129,9 +136,7 @@ class ARC(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(mode)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "ai2_arc", mode.value, trust_remote_code=True
-                )
+                dataset = load_dataset("ai2_arc", mode.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
deepeval/benchmarks/base_benchmark.py
@@ -1,10 +1,15 @@
 from deepeval.models.base_model import DeepEvalBaseLLM
 from abc import ABC, abstractmethod
 from typing import List, TypeVar, Generic, List, Optional
+from pydantic import BaseModel

 from deepeval.dataset import Golden


+class DeepEvalBaseBenchmarkResult(BaseModel):
+    overall_accuracy: float
+
+
 T = TypeVar("T")


@@ -21,5 +26,7 @@ class DeepEvalBaseBenchmark(ABC, Generic[T]):
         raise NotImplementedError

     @abstractmethod
-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         raise NotImplementedError
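This base-class change is the pattern applied to every benchmark in this release: evaluate() now returns a DeepEvalBaseBenchmarkResult instead of None, so callers no longer need to read overall_score off the benchmark instance. A short sketch of the new call pattern follows, using ARC from the hunk above; the constructor keyword is an assumption (the diff only shows self.n_problems being read), while the return-value handling reflects what the diff adds.

# Sketch only, not verified against 3.4.3: illustrates the new return value of
# evaluate(). The ARC import path mirrors the package layout listed above, and
# n_problems as a constructor keyword is an assumption.
from deepeval.benchmarks.arc.arc import ARC
from deepeval.models.base_model import DeepEvalBaseLLM


def arc_accuracy(model: DeepEvalBaseLLM) -> float:
    benchmark = ARC(n_problems=50)        # assumed constructor keyword
    result = benchmark.evaluate(model)    # DeepEvalBaseBenchmarkResult in 3.4.3
    return result.overall_accuracy        # evaluate() previously returned None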
deepeval/benchmarks/bbq/bbq.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.bbq.task import BBQTask
 from deepeval.benchmarks.bbq.template import BBQTemplate
@@ -39,7 +42,9 @@ class BBQ(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("BBQ", len(self.tasks)):
@@ -115,7 +120,9 @@ class BBQ(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -164,9 +171,7 @@ class BBQ(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(task)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "heegyu/bbq", task.value, trust_remote_code=True
-                )
+                dataset = load_dataset("heegyu/bbq", task.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
deepeval/benchmarks/big_bench_hard/big_bench_hard.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.big_bench_hard.task import BigBenchHardTask
 from deepeval.benchmarks.big_bench_hard.template import BigBenchHardTemplate
@@ -81,7 +84,7 @@ class BigBenchHard(DeepEvalBaseBenchmark):
         *args,
         batch_size: Optional[int] = None,
         **kwargs,
-    ) ->
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("Big Bench Hard", len(self.tasks)):
@@ -189,7 +192,9 @@ class BigBenchHard(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(
         self, model: DeepEvalBaseLLM, task: BigBenchHardTask, golden: Golden
@@ -279,9 +284,7 @@ class BigBenchHard(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(task)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "lukaemon/bbh", task.value, trust_remote_code=True
-                )
+                dataset = load_dataset("lukaemon/bbh", task.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
deepeval/benchmarks/bool_q/bool_q.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.bool_q.template import BoolQTemplate
 from deepeval.benchmarks.schema import AffirmationSchema
@@ -37,7 +40,9 @@ class BoolQ(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("BoolQ", self.n_problems):
@@ -77,7 +82,9 @@ class BoolQ(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -113,7 +120,7 @@ class BoolQ(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("boolq", "default"
+            dataset = load_dataset("boolq", "default")
         self.dataset = dataset

         # Construct test set
deepeval/benchmarks/drop/drop.py
@@ -3,7 +3,10 @@ from tqdm import tqdm
 from typing import Union

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.drop.task import DROPTask
 from deepeval.benchmarks.drop.template import DROPTemplate
@@ -49,7 +52,7 @@ class DROP(DeepEvalBaseBenchmark):
         *args,
         batch_size: int | None = None,
         **kwargs,
-    ) ->
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("DROP", len(self.tasks)):
@@ -155,7 +158,9 @@ class DROP(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -263,7 +268,7 @@ class DROP(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("ucinlp/drop"
+            dataset = load_dataset("ucinlp/drop")
         self.dataset = dataset

         # construct example dataset
deepeval/benchmarks/equity_med_qa/equity_med_qa.py
@@ -4,7 +4,10 @@ from tqdm import tqdm
 from deepeval.dataset import Golden
 from deepeval.test_case import LLMTestCase
 from deepeval.metrics import BiasMetric
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.equity_med_qa.task import EquityMedQATask
 from deepeval.benchmarks.equity_med_qa.template import EquityMedQATemplate
@@ -34,7 +37,9 @@ class EquityMedQA(DeepEvalBaseBenchmark):
             initialize_model(model)
         )

-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("EquityMedQA", len(self.tasks)):
@@ -97,7 +102,9 @@ class EquityMedQA(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         prediction = model.generate(golden.input)
@@ -143,9 +150,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
         dataset_attr = dataset_mapping.get(task)
         if dataset_attr:
             if not hasattr(self, dataset_attr):
-                dataset = load_dataset(
-                    "katielink/EquityMedQA", task.value, trust_remote_code=True
-                )
+                dataset = load_dataset("katielink/EquityMedQA", task.value)
                 setattr(self, dataset_attr, dataset)
             else:
                 dataset = getattr(self, dataset_attr)
deepeval/benchmarks/gsm8k/gsm8k.py
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict, Union
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.gsm8k.template import GSM8KTemplate
 from deepeval.benchmarks.schema import NumberSchema
@@ -39,7 +42,9 @@ class GSM8K(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("GSM8K", len(self.tasks)):
@@ -82,7 +87,9 @@ class GSM8K(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -150,7 +157,7 @@ class GSM8K(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("gsm8k", "main"
+            dataset = load_dataset("gsm8k", "main")
         self.dataset = dataset

         # Construct example dataset for n_shot inference
deepeval/benchmarks/hellaswag/hellaswag.py
@@ -2,7 +2,10 @@ from typing import List, Dict, Optional
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.hellaswag.task import HellaSwagTask
 from deepeval.benchmarks.hellaswag.template import HellaSwagTemplate
@@ -50,7 +53,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
         *args,
         batch_size: int | None = None,
         **kwargs,
-    ) ->
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("HellaSwag", len(self.tasks)):
@@ -160,7 +163,9 @@ class HellaSwag(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(
         self, model: DeepEvalBaseLLM, task: HellaSwagTask, golden: Golden
@@ -253,7 +258,7 @@ class HellaSwag(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("Rowan/hellaswag"
+            dataset = load_dataset("Rowan/hellaswag")
         self.dataset = dataset

         # If dataset has not been previously loaded, construct
@@ -1,7 +1,10 @@
 from typing import List, Optional, Dict

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.human_eval.task import HumanEvalTask
 from deepeval.benchmarks.human_eval.template import HumanEvalTemplate
@@ -93,7 +96,7 @@ class HumanEval(DeepEvalBaseBenchmark):

     def evaluate(
         self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
-    ) ->
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("HumanEval", len(self.tasks)):
@@ -157,7 +160,9 @@ class HumanEval(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(
         self,
@@ -201,7 +206,7 @@ class HumanEval(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("openai_humaneval"
+            dataset = load_dataset("openai_humaneval")
         self.dataset = dataset

         # Filter tasks
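`HumanEval.evaluate` keeps its keyword-only `k` argument for pass@k scoring and now also declares the shared result type. An illustrative call under the same `my_model` assumption:

from deepeval.benchmarks import HumanEval

benchmark = HumanEval()  # defaults; tasks are filtered internally as shown above
result = benchmark.evaluate(model=my_model, k=1)  # k as in pass@k
print(result.overall_accuracy)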
@@ -1,15 +1,29 @@
+from pydantic.config import ConfigDict
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from typing import List, Optional, Dict, Any, Tuple
 from tqdm import tqdm
 import re
 import json

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.schema import StringSchema
 from deepeval.telemetry import capture_benchmark_run


+class IFEvalResult(DeepEvalBaseBenchmarkResult):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    instruction_breakdown: dict[str, Any]
+    predictions: "pd.DataFrame"
+
+
 class IFEvalInstructionVerifier:
     """
     Verifies instruction compliance for IFEval benchmark.
@@ -394,16 +408,17 @@ class IFEval(DeepEvalBaseBenchmark):
         **kwargs,
     ):
         from deepeval.scorer import Scorer
+        import pandas as pd

         super().__init__(**kwargs)
         self.scorer = Scorer()
         self.n_problems = n_problems
         self.verbose_mode = verbose_mode
-        self.predictions = None
-        self.overall_score = None
+        self.predictions: Optional[pd.DataFrame] = None
+        self.overall_score: Optional[float] = None
         self.instruction_breakdown = None

-    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) ->
+    def evaluate(self, model: DeepEvalBaseLLM, *args, **kwargs) -> IFEvalResult:
         import pandas as pd

         with capture_benchmark_run("IFEval", self.n_problems or "all"):
@@ -459,8 +474,7 @@ class IFEval(DeepEvalBaseBenchmark):
                 print(
                     f"Instruction '{instruction_id}' Accuracy: {accuracy:.4f}"
                 )
-
-            self.predictions = pd.DataFrame(
+            predictions: pd.DataFrame = pd.DataFrame(
                 predictions_row,
                 columns=[
                     "Input",
@@ -468,14 +482,15 @@ class IFEval(DeepEvalBaseBenchmark):
                     "All_Instructions_Correct",
                 ],
             )
+            self.predictions = predictions
             self.overall_score = overall_accuracy
             self.instruction_breakdown = instruction_accuracies

-            return
-
-
-
-
+            return IFEvalResult(
+                overall_accuracy=overall_accuracy,
+                instruction_breakdown=instruction_accuracies,
+                predictions=predictions,
+            )

     def predict(
         self, model: DeepEvalBaseLLM, golden: Golden
@@ -531,7 +546,7 @@ class IFEval(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset("google/IFEval"
+            dataset = load_dataset("google/IFEval")
         self.dataset = dataset

         goldens: List[Golden] = []
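IFEval is the one benchmark in this release that returns more than the shared base result: `IFEvalResult` also exposes the per-instruction breakdown and the predictions DataFrame that were previously reachable only through instance attributes. A hedged consumption sketch (same `my_model` assumption; `n_problems` comes from the `__init__` shown above):

from deepeval.benchmarks.ifeval.ifeval import IFEval

benchmark = IFEval(n_problems=20)
result = benchmark.evaluate(model=my_model)

print(result.overall_accuracy)       # inherited from DeepEvalBaseBenchmarkResult
print(result.instruction_breakdown)  # dict of per-instruction accuracies
print(result.predictions.head())     # pandas DataFrame of per-prompt rows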
@@ -2,7 +2,10 @@ from typing import List, Optional, Dict
 from tqdm import tqdm

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.lambada.template import LAMBADATemplate
 from deepeval.benchmarks.schema import StringSchema
@@ -37,7 +40,9 @@ class LAMBADA(DeepEvalBaseBenchmark):
         else:
             self.confinement_instructions = confinement_instructions

-    def evaluate(
+    def evaluate(
+        self, model: DeepEvalBaseLLM, *args, **kwargs
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("LAMBADA", self.n_problems):
@@ -77,7 +82,9 @@ class LAMBADA(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template
@@ -113,9 +120,7 @@ class LAMBADA(DeepEvalBaseBenchmark):
         if self.dataset:
             dataset = self.dataset
         else:
-            dataset = load_dataset(
-                "EleutherAI/lambada_openai", "default", trust_remote_code=True
-            )
+            dataset = load_dataset("EleutherAI/lambada_openai", "default")
         self.dataset = dataset

         # Construct test set
@@ -4,7 +4,10 @@ import requests
 import json

 from deepeval.dataset import Golden
-from deepeval.benchmarks.base_benchmark import
+from deepeval.benchmarks.base_benchmark import (
+    DeepEvalBaseBenchmark,
+    DeepEvalBaseBenchmarkResult,
+)
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.benchmarks.logi_qa.task import LogiQATask
 from deepeval.benchmarks.logi_qa.template import LogiQATemplate
@@ -51,7 +54,7 @@ class LogiQA(DeepEvalBaseBenchmark):
         *args,
         batch_size: int | None = None,
         **kwargs,
-    ) ->
+    ) -> DeepEvalBaseBenchmarkResult:
         import pandas as pd

         with capture_benchmark_run("LogiQA", len(self.tasks)):
@@ -157,7 +160,9 @@ class LogiQA(DeepEvalBaseBenchmark):
             )
             self.overall_score = overall_accuracy

-            return
+            return DeepEvalBaseBenchmarkResult(
+                overall_accuracy=overall_accuracy
+            )

     def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
         # Define prompt template