deepeval 3.7.4.tar.gz → 3.7.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deepeval-3.7.4 → deepeval-3.7.5}/PKG-INFO +1 -4
- deepeval-3.7.5/deepeval/_version.py +1 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/golden.py +54 -2
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/evaluate.py +16 -8
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/execute.py +70 -26
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/utils.py +26 -22
- deepeval-3.7.5/deepeval/integrations/pydantic_ai/agent.py +38 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/__init__.py +14 -12
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval-3.7.5/deepeval/metrics/answer_relevancy/template.py +206 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/base_metric.py +2 -5
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval-3.7.5/deepeval/metrics/contextual_precision/template.py +133 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval-3.7.5/deepeval/metrics/contextual_recall/template.py +126 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval-3.7.5/deepeval/metrics/contextual_relevancy/template.py +106 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/templates.py +2 -2
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/faithfulness/faithfulness.py +70 -27
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval-3.7.5/deepeval/metrics/faithfulness/template.py +225 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/g_eval/utils.py +2 -2
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/indicator.py +4 -4
- deepeval-3.7.5/deepeval/metrics/multimodal_metrics/__init__.py +6 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval-3.7.5/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +133 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/ragas.py +3 -3
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval-3.7.5/deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval-3.7.5/deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval-3.7.5/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval-3.7.5/deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval-3.7.5/deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval-3.7.5/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.7.5/deepeval/metrics/turn_contextual_relevancy}/schema.py +7 -1
- deepeval-3.7.5/deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval-3.7.5/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.7.5/deepeval/metrics/turn_faithfulness}/schema.py +11 -3
- deepeval-3.7.5/deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval-3.7.5/deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/utils.py +39 -58
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/__init__.py +0 -12
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/base_model.py +16 -38
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/embedding_models/__init__.py +7 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/embedding_models/local_embedding_model.py +18 -14
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/amazon_bedrock_model.py +1 -2
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/anthropic_model.py +44 -23
- {deepeval-3.7.4/deepeval/models/mlllms → deepeval-3.7.5/deepeval/models/llms}/azure_model.py +111 -70
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/deepseek_model.py +18 -13
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/gemini_model.py +129 -43
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/grok_model.py +18 -13
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/kimi_model.py +18 -13
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/litellm_model.py +42 -22
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/local_model.py +12 -7
- {deepeval-3.7.4/deepeval/models/mlllms → deepeval-3.7.5/deepeval/models/llms}/ollama_model.py +85 -44
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/openai_model.py +137 -41
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/portkey_model.py +24 -7
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/utils.py +5 -3
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/retry_policy.py +17 -14
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/utils.py +46 -1
- deepeval-3.7.5/deepeval/optimizer/__init__.py +5 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/base.py +29 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval-3.7.4/deepeval/optimization/copro/loop.py → deepeval-3.7.5/deepeval/optimizer/algorithms/copro/copro.py +112 -113
- deepeval-3.7.5/deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval-3.7.4/deepeval/optimization/gepa/loop.py → deepeval-3.7.5/deepeval/optimizer/algorithms/gepa/gepa.py +175 -115
- deepeval-3.7.5/deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval-3.7.5/deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval-3.7.4/deepeval/optimization/simba/loop.py → deepeval-3.7.5/deepeval/optimizer/algorithms/simba/simba.py +128 -112
- {deepeval-3.7.4/deepeval/optimization → deepeval-3.7.5/deepeval/optimizer}/configs.py +5 -8
- deepeval-3.7.4/deepeval/optimization/policies/selection.py → deepeval-3.7.5/deepeval/optimizer/policies.py +63 -2
- deepeval-3.7.5/deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval-3.7.5/deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval-3.7.5/deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval-3.7.5/deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval-3.7.5/deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval-3.7.5/deepeval/optimizer/scorer/base.py +86 -0
- deepeval-3.7.5/deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval-3.7.5/deepeval/optimizer/scorer/utils.py +30 -0
- deepeval-3.7.5/deepeval/optimizer/types.py +148 -0
- {deepeval-3.7.4/deepeval/optimization → deepeval-3.7.5/deepeval/optimizer}/utils.py +47 -165
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/prompt/prompt.py +5 -9
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_case/__init__.py +1 -3
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_case/api.py +12 -10
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_case/conversational_test_case.py +19 -1
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_case/llm_test_case.py +152 -1
- deepeval-3.7.5/deepeval/test_case/utils.py +20 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_run/api.py +15 -14
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_run/test_run.py +3 -3
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/patchers.py +9 -4
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/tracing.py +2 -2
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/utils.py +65 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/pyproject.toml +4 -4
- deepeval-3.7.4/deepeval/_version.py +0 -1
- deepeval-3.7.4/deepeval/integrations/pydantic_ai/agent.py +0 -21
- deepeval-3.7.4/deepeval/metrics/answer_relevancy/template.py +0 -110
- deepeval-3.7.4/deepeval/metrics/contextual_precision/template.py +0 -84
- deepeval-3.7.4/deepeval/metrics/contextual_recall/template.py +0 -75
- deepeval-3.7.4/deepeval/metrics/contextual_relevancy/template.py +0 -77
- deepeval-3.7.4/deepeval/metrics/faithfulness/template.py +0 -140
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/__init__.py +0 -24
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval-3.7.4/deepeval/models/llms/azure_model.py +0 -299
- deepeval-3.7.4/deepeval/models/llms/ollama_model.py +0 -114
- deepeval-3.7.4/deepeval/models/mlllms/__init__.py +0 -4
- deepeval-3.7.4/deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval-3.7.4/deepeval/models/mlllms/openai_model.py +0 -309
- deepeval-3.7.4/deepeval/optimization/__init__.py +0 -13
- deepeval-3.7.4/deepeval/optimization/adapters/__init__.py +0 -2
- deepeval-3.7.4/deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval-3.7.4/deepeval/optimization/aggregates.py +0 -14
- deepeval-3.7.4/deepeval/optimization/copro/configs.py +0 -31
- deepeval-3.7.4/deepeval/optimization/gepa/__init__.py +0 -7
- deepeval-3.7.4/deepeval/optimization/gepa/configs.py +0 -115
- deepeval-3.7.4/deepeval/optimization/miprov2/configs.py +0 -134
- deepeval-3.7.4/deepeval/optimization/miprov2/loop.py +0 -785
- deepeval-3.7.4/deepeval/optimization/mutations/__init__.py +0 -0
- deepeval-3.7.4/deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval-3.7.4/deepeval/optimization/policies/__init__.py +0 -16
- deepeval-3.7.4/deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval-3.7.4/deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval-3.7.4/deepeval/optimization/simba/__init__.py +0 -0
- deepeval-3.7.4/deepeval/optimization/simba/configs.py +0 -33
- deepeval-3.7.4/deepeval/optimization/types.py +0 -361
- deepeval-3.7.4/deepeval/plugins/__init__.py +0 -0
- deepeval-3.7.4/deepeval/synthesizer/chunking/__init__.py +0 -0
- deepeval-3.7.4/deepeval/test_case/mllm_test_case.py +0 -170
- deepeval-3.7.4/deepeval/test_case/utils.py +0 -24
- {deepeval-3.7.4 → deepeval-3.7.5}/LICENSE.md +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/README.md +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/annotation/annotation.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/annotation/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/anthropic/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/anthropic/extractors.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/anthropic/patch.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/anthropic/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/arc/arc.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/base_benchmark.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bbq/bbq.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/drop/drop.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/human_eval/human_eval.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/ifeval/ifeval.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/lambada/lambada.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/mmlu/mmlu.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/mmlu/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/results.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/squad/squad.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/dotenv_handler.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/main.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/server.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/test.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/cli/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/confident/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/confident/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/config/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/config/logging.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/config/settings.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/config/settings_manager.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/config/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/constants.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/contextvars.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/dataset.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/test_run_tracer.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/errors.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/compare.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/configs.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/crewai/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/crewai/handler.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/crewai/subs.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/crewai/tool.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/crewai/wrapper.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/langchain/callback.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/langchain/patch.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/langchain/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/llama_index/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/llama_index/handler.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/llama_index/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/pydantic_ai/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/pydantic_ai/otel.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/key_handler.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/arena_g_eval/arena_g_eval.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/arena_g_eval/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/arena_g_eval/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/argument_correctness/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/argument_correctness/argument_correctness.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/argument_correctness/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/bias/bias.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/bias/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversation_completeness/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversation_completeness/conversation_completeness.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversation_completeness/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_dag/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_dag/conversational_dag.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_dag/nodes.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_dag/templates.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_g_eval/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/conversational_g_eval/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/dag.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/nodes.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/dag/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/exact_match/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/exact_match/exact_match.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/g_eval/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/g_eval/g_eval.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/g_eval/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/goal_accuracy/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/goal_accuracy/goal_accuracy.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/goal_accuracy/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/goal_accuracy/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/hallucination/hallucination.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/hallucination/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/json_correctness/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/json_correctness/json_correctness.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/json_correctness/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/knowledge_retention/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/knowledge_retention/knowledge_retention.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/knowledge_retention/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/knowledge_retention/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp/mcp_task_completion.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp_use_metric/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/mcp_use_metric/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/misuse/misuse.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/misuse/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_coherence/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_editing/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_reference/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy → deepeval-3.7.5/deepeval/metrics/multimodal_metrics/multimodal_g_eval}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision → deepeval-3.7.5/deepeval/metrics/multimodal_metrics/text_to_image}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/non_advice/non_advice.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/non_advice/template.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall → deepeval-3.7.5/deepeval/metrics/pattern_match}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/pattern_match/pattern_match.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/pii_leakage/pii_leakage.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/pii_leakage/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_adherence/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_adherence/plan_adherence.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_adherence/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_adherence/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_quality/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_quality/plan_quality.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_quality/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/plan_quality/template.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.7.5/deepeval/metrics/prompt_alignment}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/prompt_alignment/prompt_alignment.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/prompt_alignment/template.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.7.5/deepeval/metrics/role_adherence}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_adherence/role_adherence.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_adherence/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_violation/role_violation.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/role_violation/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/step_efficiency/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/step_efficiency/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/step_efficiency/step_efficiency.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/step_efficiency/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/summarization/summarization.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/summarization/template.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_g_eval → deepeval-3.7.5/deepeval/metrics/task_completion}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/task_completion/task_completion.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness → deepeval-3.7.5/deepeval/metrics/tool_correctness}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_correctness/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_correctness/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_use/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_use/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_use/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/tool_use/tool_use.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/topic_adherence/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/topic_adherence/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/topic_adherence/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/topic_adherence/topic_adherence.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/toxicity/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/toxicity/toxicity.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/multimodal_metrics/text_to_image → deepeval-3.7.5/deepeval/metrics/turn_contextual_precision}/__init__.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/pattern_match → deepeval-3.7.5/deepeval/metrics/turn_contextual_recall}/__init__.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/prompt_alignment → deepeval-3.7.5/deepeval/metrics/turn_contextual_relevancy}/__init__.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/role_adherence → deepeval-3.7.5/deepeval/metrics/turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/task_completion → deepeval-3.7.5/deepeval/metrics/turn_relevancy}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/turn_relevancy/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/metrics/turn_relevancy/turn_relevancy.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/tool_correctness → deepeval-3.7.5/deepeval/model_integrations}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/model_integrations/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/model_integrations/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/llms/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai/extractors.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai/patch.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai_agents/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai_agents/agent.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai_agents/callback_handler.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai_agents/extractors.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai_agents/patch.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/openai_agents/runner.py +0 -0
- {deepeval-3.7.4/deepeval/optimization → deepeval-3.7.5/deepeval/optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4/deepeval/metrics/turn_relevancy → deepeval-3.7.5/deepeval/plugins}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/progress_context.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/prompt/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/prompt/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/prompt/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/py.typed +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/simulator/conversation_simulator.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/simulator/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/singleton.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.7.4/deepeval/model_integrations → deepeval-3.7.5/deepeval/synthesizer/chunking}/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/chunking/context_generator.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/chunking/doc_chunker.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/config.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/schema.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/synthesizer.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/templates/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/templates/template.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/templates/template_extraction.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/templates/template_prompt.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/telemetry.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_case/arena_test_case.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_run/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_run/cache.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/test_run/hyperparameters.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/context.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/offline_evals/api.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/offline_evals/thread.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/otel/exporter.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/otel/test_exporter.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/otel/utils.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/perf_epoch_bridge.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/trace_context.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/trace_test_manager.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/types.py +0 -0
- {deepeval-3.7.4 → deepeval-3.7.5}/deepeval/tracing/utils.py +0 -0
{deepeval-3.7.4 → deepeval-3.7.5}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.4
+Version: 3.7.5
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -13,13 +13,10 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: aiohttp
-Requires-Dist: anthropic
 Requires-Dist: click (>=8.0.0,<8.3.0)
-Requires-Dist: google-genai (>=1.9.0,<2.0.0)
 Requires-Dist: grpcio (>=1.67.1,<2.0.0)
 Requires-Dist: jinja2
 Requires-Dist: nest_asyncio
-Requires-Dist: ollama
 Requires-Dist: openai
 Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
 Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
deepeval-3.7.5/deepeval/_version.py
@@ -0,0 +1 @@
+__version__: str = "3.7.5"
{deepeval-3.7.4 → deepeval-3.7.5}/deepeval/dataset/golden.py
@@ -1,6 +1,6 @@
-from pydantic import BaseModel, Field, PrivateAttr
+from pydantic import BaseModel, Field, PrivateAttr, model_validator
 from typing import Optional, Dict, List
-from deepeval.test_case import ToolCall, Turn
+from deepeval.test_case import ToolCall, Turn, MLLMImage


 class Golden(BaseModel):
@@ -32,10 +32,40 @@ class Golden(BaseModel):
     custom_column_key_values: Optional[Dict[str, str]] = Field(
         default=None, serialization_alias="customColumnKeyValues"
     )
+    multimodal: bool = Field(False, exclude=True)
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)

+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = (
+            any(
+                [
+                    (
+                        re.search(pattern, self.input) is not None
+                        if self.input
+                        else False
+                    ),
+                    (
+                        re.search(pattern, self.actual_output) is not None
+                        if self.actual_output
+                        else False
+                    ),
+                ]
+            )
+            if isinstance(self.input, str)
+            else self.multimodal
+        )
+
+        return self
+

 class ConversationalGolden(BaseModel):
     scenario: str
@@ -55,6 +85,28 @@ class ConversationalGolden(BaseModel):
         default=None, serialization_alias="customColumnKeyValues"
     )
     turns: Optional[List[Turn]] = Field(default=None)
+    multimodal: bool = Field(False, exclude=True)
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
+
+    @model_validator(mode="after")
+    def set_is_multimodal(self):
+        import re
+
+        if self.multimodal is True:
+            return self
+
+        pattern = r"\[DEEPEVAL:IMAGE:(.*?)\]"
+        self.multimodal = (
+            any(
+                [
+                    re.search(pattern, turn.content) is not None
+                    for turn in self.turns
+                ]
+            )
+            if self.turns
+            else self.multimodal
+        )
+
+        return self
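Note: the two validators above auto-flag a golden as multimodal whenever one of its text fields contains a [DEEPEVAL:IMAGE:...] marker. A minimal sketch of that behavior, assuming Golden is imported from deepeval.dataset and constructed with the input/actual_output fields the validator reads (the image URL is made up):

# Illustrative sketch, not from the package: exercises the
# set_is_multimodal validator added above.
from deepeval.dataset import Golden

golden = Golden(
    input="Describe this: [DEEPEVAL:IMAGE:https://example.com/cat.png]",
    actual_output="A cat on a windowsill.",
)
print(golden.multimodal)  # True: the input matches the DEEPEVAL:IMAGE pattern

plain = Golden(input="What is 2 + 2?", actual_output="4")
print(plain.multimodal)  # False: no image marker in any text field

Passing multimodal=True explicitly short-circuits the detection, per the early return in the validator.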
{deepeval-3.7.4 → deepeval-3.7.5}/deepeval/evaluate/evaluate.py
@@ -54,7 +54,6 @@ from deepeval.metrics.indicator import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     global_test_run_manager,
@@ -71,9 +70,7 @@ from deepeval.evaluate.execute import (


 def assert_test(
-    test_case: Optional[
-        Union[LLMTestCase, ConversationalTestCase, MLLMTestCase]
-    ] = None,
+    test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
     metrics: Optional[
         Union[
             List[BaseMetric],
@@ -175,7 +172,7 @@ def assert_test(
         try:
             if not metric_data.success:
                 failed_metrics_data.append(metric_data)
-        except:
+        except Exception:
             failed_metrics_data.append(metric_data)

     failed_metrics_str = ", ".join(
@@ -188,9 +185,7 @@ def assert_test(


 def evaluate(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Optional[
         Union[
             List[BaseMetric],
@@ -272,6 +267,19 @@ def evaluate(
     test_run.hyperparameters = process_hyperparameters(hyperparameters)
     test_run.prompts = process_prompts(hyperparameters)
     global_test_run_manager.save_test_run(TEMP_FILE_PATH)
+
+    # In CLI mode (`deepeval test run`), the CLI owns finalization and will
+    # call `wrap_up_test_run()` once after pytest finishes. Finalizing here
+    # as well would double finalize the run and consequently result in
+    # duplicate uploads / local saves and temp file races, so only
+    # do it when we're NOT in CLI mode.
+    if get_is_running_deepeval():
+        return EvaluationResult(
+            test_results=test_results,
+            confident_link=None,
+            test_run_id=None,
+        )
+
     res = global_test_run_manager.wrap_up_test_run(
         run_duration, display_table=False
     )
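For context on the new early return: when evaluate() is called directly rather than under `deepeval test run`, it still wraps up the run inline and returns the full result as before. A minimal sketch of that direct call path (the metric and test case arguments are illustrative):

# Illustrative sketch: calling evaluate() outside the CLI finalizes the
# run here; under the CLI, the early return above defers finalization to
# the CLI's single wrap_up_test_run() call.
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

result = evaluate(
    test_cases=[LLMTestCase(input="Hi", actual_output="Hello!")],
    metrics=[AnswerRelevancyMetric()],
)
print(result.test_results)  # populated in both CLI and non-CLI mode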
@@ -58,6 +58,13 @@ from deepeval.metrics import (
|
|
|
58
58
|
BaseConversationalMetric,
|
|
59
59
|
BaseMultimodalMetric,
|
|
60
60
|
TaskCompletionMetric,
|
|
61
|
+
# RAG metrics that support both single-turn and multimodal
|
|
62
|
+
ContextualPrecisionMetric,
|
|
63
|
+
ContextualRecallMetric,
|
|
64
|
+
ContextualRelevancyMetric,
|
|
65
|
+
AnswerRelevancyMetric,
|
|
66
|
+
FaithfulnessMetric,
|
|
67
|
+
ToolCorrectnessMetric,
|
|
61
68
|
)
|
|
62
69
|
from deepeval.metrics.indicator import (
|
|
63
70
|
measure_metrics_with_indicator,
|
|
@@ -70,7 +77,6 @@ from deepeval.models.retry_policy import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run import (
@@ -110,6 +116,15 @@ from deepeval.test_run.hyperparameters import (
 
 logger = logging.getLogger(__name__)
 
+MLLM_SUPPORTED_METRICS = [
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+    ContextualRelevancyMetric,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric,
+    ToolCorrectnessMetric,
+]
+
 
 def _skip_metrics_for_error(
     span: Optional[BaseSpan] = None,
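Membership in this registry is later checked with `type(metric) in MLLM_SUPPORTED_METRICS`, i.e. by exact class rather than `isinstance`, so a subclass of one of these metrics would not be auto-routed to the multimodal path. A toy illustration of the difference:

```python
class AnswerRelevancyMetric:  # stand-in for the real metric class
    pass

class MyCustomRelevancy(AnswerRelevancyMetric):  # user subclass
    pass

MLLM_SUPPORTED = [AnswerRelevancyMetric]

base, custom = AnswerRelevancyMetric(), MyCustomRelevancy()
assert type(base) in MLLM_SUPPORTED                 # exact class: routed
assert type(custom) not in MLLM_SUPPORTED           # subclass: not routed
assert isinstance(custom, AnswerRelevancyMetric)    # isinstance would match
```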
@@ -263,9 +278,7 @@ async def _await_with_outer_deadline(obj, *args, timeout: float, **kwargs):
 
 
 def execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
@@ -307,6 +320,8 @@ def execute_test_cases(
             metric.async_mode = False
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
+            if type(metric) in MLLM_SUPPORTED_METRICS:
+                mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
             conversational_metrics.append(metric)
         elif isinstance(metric, BaseMultimodalMetric):
@@ -325,12 +340,12 @@ def execute_test_cases(
     )
     for i, test_case in enumerate(test_cases):
         # skip what we know we won't run
-        if isinstance(test_case, LLMTestCase):
+        if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
             if not llm_metrics:
                 update_pbar(progress, pbar_id)
                 continue
             per_case_total = len(llm_metrics)
-        elif isinstance(test_case, MLLMTestCase):
+        elif isinstance(test_case, LLMTestCase) and test_case.multimodal:
             if not mllm_metrics:
                 update_pbar(progress, pbar_id)
                 continue
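With `MLLMTestCase` removed, one `LLMTestCase` type serves both paths and its `multimodal` flag decides which metric list applies. A condensed sketch of the dispatch (stand-in classes; the real ones live in `deepeval.test_case`):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class LLMTestCase:  # stand-in
    input: str
    multimodal: bool = False

@dataclass
class ConversationalTestCase:  # stand-in
    turns: List[str] = field(default_factory=list)

def metrics_for(test_case, llm_metrics, mllm_metrics, conversational_metrics):
    # Mirrors the branching above: split LLM cases on the multimodal flag.
    if isinstance(test_case, LLMTestCase) and not test_case.multimodal:
        return llm_metrics
    if isinstance(test_case, LLMTestCase) and test_case.multimodal:
        return mllm_metrics
    return conversational_metrics

assert metrics_for(LLMTestCase("q"), ["llm"], ["mllm"], ["conv"]) == ["llm"]
assert metrics_for(LLMTestCase("q", multimodal=True), ["llm"], ["mllm"], ["conv"]) == ["mllm"]
assert metrics_for(ConversationalTestCase(), ["llm"], ["mllm"], ["conv"]) == ["conv"]
```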
@@ -349,10 +364,16 @@ def execute_test_cases(
 
         metrics_for_case = (
             llm_metrics
-            if isinstance(test_case, LLMTestCase)
+            if (
+                isinstance(test_case, LLMTestCase)
+                and not test_case.multimodal
+            )
             else (
                 mllm_metrics
-                if isinstance(test_case, MLLMTestCase)
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and test_case.multimodal
+                )
                 else conversational_metrics
             )
         )
@@ -360,10 +381,16 @@ def execute_test_cases(
             test_case=test_case,
             index=(
                 llm_test_case_count + 1
-                if isinstance(test_case, LLMTestCase)
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                )
                 else (
                     mllm_test_case_count + 1
-                    if isinstance(test_case, MLLMTestCase)
+                    if (
+                        isinstance(test_case, LLMTestCase)
+                        and test_case.multimodal
+                    )
                     else conversational_test_case_count + 1
                 )
             ),
@@ -383,7 +410,10 @@ def execute_test_cases(
         for metric in metrics:
             metric.error = None  # Reset metric error
 
-        if isinstance(test_case, LLMTestCase):
+        if (
+            isinstance(test_case, LLMTestCase)
+            and not test_case.multimodal
+        ):
             llm_test_case_count += 1
             cached_test_case = None
             if cache_config.use_cache:
@@ -436,7 +466,10 @@ def execute_test_cases(
             update_pbar(progress, pbar_test_case_id)
 
         # No caching and not sending test cases to Confident AI for multimodal metrics yet
-        elif isinstance(test_case, MLLMTestCase):
+        elif (
+            isinstance(test_case, LLMTestCase)
+            and test_case.multimodal
+        ):
             mllm_test_case_count += 1
             for metric in mllm_metrics:
                 current_index = index_of[id(metric)]
@@ -560,9 +593,7 @@ def execute_test_cases(
 
 
 async def a_execute_test_cases(
-    test_cases: Union[
-        List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-    ],
+    test_cases: Union[List[LLMTestCase], List[ConversationalTestCase]],
     metrics: Union[
         List[BaseMetric],
         List[BaseConversationalMetric],
@@ -605,6 +636,8 @@ async def a_execute_test_cases(
     for metric in metrics:
         if isinstance(metric, BaseMetric):
             llm_metrics.append(metric)
+            if type(metric) in MLLM_SUPPORTED_METRICS:
+                mllm_metrics.append(metric)
         elif isinstance(metric, BaseMultimodalMetric):
             mllm_metrics.append(metric)
         elif isinstance(metric, BaseConversationalMetric):
@@ -613,7 +646,7 @@ async def a_execute_test_cases(
     llm_test_case_counter = -1
     mllm_test_case_counter = -1
     conversational_test_case_counter = -1
-    test_results: List[Union[TestResult, MLLMTestCase]] = []
+    test_results: List[Union[TestResult, LLMTestCase]] = []
     tasks = []
 
     if display_config.show_indicator and _use_bar_indicator:
@@ -632,7 +665,10 @@ async def a_execute_test_cases(
     with progress:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
-                if isinstance(test_case, LLMTestCase):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ):
                     if len(llm_metrics) == 0:
                         update_pbar(progress, pbar_id)
                         continue
@@ -660,7 +696,10 @@ async def a_execute_test_cases(
                     )
                     tasks.append(asyncio.create_task(task))
 
-                elif isinstance(test_case, MLLMTestCase):
+                elif (
+                    isinstance(test_case, LLMTestCase)
+                    and test_case.multimodal
+                ):
                     mllm_test_case_counter += 1
                     copied_multimodal_metrics: List[
                         BaseMultimodalMetric
@@ -724,7 +763,10 @@ async def a_execute_test_cases(
     else:
         for test_case in test_cases:
             with capture_evaluation_run("test case"):
-                if isinstance(test_case, LLMTestCase):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ):
                     if len(llm_metrics) == 0:
                         continue
                     llm_test_case_counter += 1
@@ -772,7 +814,9 @@ async def a_execute_test_cases(
                     )
                     tasks.append(asyncio.create_task((task)))
 
-                elif isinstance(test_case, MLLMTestCase):
+                elif (
+                    isinstance(test_case, LLMTestCase) and test_case.multimodal
+                ):
                     mllm_test_case_counter += 1
                     copied_multimodal_metrics: List[BaseMultimodalMetric] = (
                         copy_metrics(mllm_metrics)
@@ -815,7 +859,7 @@ async def _a_execute_llm_test_cases(
     metrics: List[BaseMetric],
     test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     test_run: TestRun,
     ignore_errors: bool,
@@ -934,9 +978,9 @@ async def _a_execute_llm_test_cases(
 
 async def _a_execute_mllm_test_cases(
     metrics: List[BaseMultimodalMetric],
-    test_case: MLLMTestCase,
+    test_case: LLMTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1013,7 +1057,7 @@ async def _a_execute_conversational_test_cases(
     ],
     test_case: ConversationalTestCase,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -1776,7 +1820,7 @@ async def a_execute_agentic_test_cases(
 async def _a_execute_agentic_test_case(
     golden: Golden,
     test_run_manager: TestRunManager,
-    test_results: List[Union[TestResult, MLLMTestCase]],
+    test_results: List[Union[TestResult, LLMTestCase]],
     count: int,
     verbose_mode: Optional[bool],
     ignore_errors: bool,
@@ -3205,7 +3249,7 @@ async def _evaluate_test_case_pairs(
 
 def _execute_metric(
     metric: BaseMetric,
-    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    test_case: Union[LLMTestCase, ConversationalTestCase],
     show_metric_indicator: bool,
     in_component: bool,
     error_config: ErrorConfig,
@@ -16,7 +16,6 @@ from deepeval.metrics import (
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
 )
 from deepeval.test_run import (
     LLMApiTestCase,
@@ -129,17 +128,14 @@ def create_test_result(
             turns=api_test_case.turns,
         )
     else:
-        multimodal = (
-            api_test_case.multimodal_input is not None
-            and api_test_case.multimodal_input_actual_output is not None
-        )
+        multimodal = api_test_case.images_mapping
         if multimodal:
             return TestResult(
                 name=name,
                 success=api_test_case.success,
                 metrics_data=api_test_case.metrics_data,
-                input=api_test_case.multimodal_input,
-                actual_output=api_test_case.multimodal_input_actual_output,
+                input=api_test_case.input,
+                actual_output=api_test_case.actual_output,
                 conversational=False,
                 multimodal=True,
                 additional_metadata=api_test_case.additional_metadata,
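`create_test_result` now derives the multimodal flag from a single truthy `images_mapping` field rather than the removed `multimodal_input` pair, and reuses the ordinary `input`/`actual_output` fields. A reduced sketch of the check (`ApiTestCase` abbreviates the real API test case shape):

```python
from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class ApiTestCase:  # abbreviated stand-in
    input: str
    actual_output: str
    images_mapping: Optional[Dict[str, str]] = None  # marker -> image source

def is_multimodal(case: ApiTestCase) -> bool:
    # A non-empty mapping marks the result multimodal; None or {} does not.
    return bool(case.images_mapping)

assert is_multimodal(ApiTestCase("q", "a", {"[DEEPEVAL:IMAGE:1]": "cat.png"}))
assert not is_multimodal(ApiTestCase("q", "a"))
assert not is_multimodal(ApiTestCase("q", "a", {}))
```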
@@ -222,9 +218,9 @@ def validate_assert_test_inputs(
     )
 
     if test_case and metrics:
-        if isinstance(test_case, LLMTestCase) and not all(
-            isinstance(metric, BaseMetric) for metric in metrics
-        ):
+        if (
+            isinstance(test_case, LLMTestCase) and not test_case.multimodal
+        ) and not all(isinstance(metric, BaseMetric) for metric in metrics):
             raise ValueError(
                 "All 'metrics' for an 'LLMTestCase' must be instances of 'BaseMetric' only."
             )
@@ -234,11 +230,17 @@ def validate_assert_test_inputs(
             raise ValueError(
                 "All 'metrics' for an 'ConversationalTestCase' must be instances of 'BaseConversationalMetric' only."
             )
-        if isinstance(test_case, MLLMTestCase) and not all(
-            isinstance(metric, BaseMultimodalMetric) for metric in metrics
+        if (
+            isinstance(test_case, LLMTestCase) and test_case.multimodal
+        ) and not all(
+            (
+                isinstance(metric, BaseMultimodalMetric)
+                or isinstance(metric, BaseMetric)
+            )
+            for metric in metrics
         ):
             raise ValueError(
-                "All 'metrics' for an 'MLLMTestCase' must be instances of 'BaseMultimodalMetric' only."
+                "All 'metrics' for multi-modal LLMTestCase must be instances of 'BaseMultimodalMetric' only."
             )
 
     if not ((golden and observed_callback) or (test_case and metrics)):
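The relaxed predicate lets a multimodal `LLMTestCase` mix plain `BaseMetric` instances (the RAG metrics that now accept images) with dedicated `BaseMultimodalMetric` instances. A reduced sketch of the acceptance check:

```python
class BaseMetric:  # stand-ins for deepeval's metric base classes
    pass

class BaseMultimodalMetric:
    pass

def metrics_valid_for_multimodal(metrics) -> bool:
    # Each metric must be a single-turn metric (now image-capable) or a
    # dedicated multimodal metric; anything else is rejected.
    return all(isinstance(m, (BaseMultimodalMetric, BaseMetric)) for m in metrics)

assert metrics_valid_for_multimodal([BaseMetric(), BaseMultimodalMetric()])
assert not metrics_valid_for_multimodal([BaseMetric(), object()])
```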
@@ -251,9 +253,7 @@ def validate_evaluate_inputs(
     goldens: Optional[List] = None,
     observed_callback: Optional[Callable] = None,
     test_cases: Optional[
-        Union[
-            List[LLMTestCase], List[ConversationalTestCase], List[MLLMTestCase]
-        ]
+        Union[List[LLMTestCase], List[ConversationalTestCase]]
     ] = None,
     metrics: Optional[
         Union[
@@ -292,9 +292,10 @@ def validate_evaluate_inputs(
     if test_cases and metrics:
         for test_case in test_cases:
             for metric in metrics:
-                if isinstance(test_case, LLMTestCase) and not isinstance(
-                    metric, BaseMetric
-                ):
+                if (
+                    isinstance(test_case, LLMTestCase)
+                    and not test_case.multimodal
+                ) and not isinstance(metric, BaseMetric):
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for LLMTestCase."
                     )
@@ -305,11 +306,14 @@ def validate_evaluate_inputs(
                     raise ValueError(
                         f"Metric {metric.__name__} is not a valid metric for ConversationalTestCase."
                     )
-                if isinstance(test_case, MLLMTestCase) and not isinstance(
-                    metric, BaseMultimodalMetric
+                if (
+                    isinstance(test_case, LLMTestCase) and test_case.multimodal
+                ) and not (
+                    isinstance(metric, BaseMultimodalMetric)
+                    or isinstance(metric, BaseMetric)
                 ):
                     raise ValueError(
-                        f"Metric {metric.__name__} is not a valid metric for MLLMTestCase."
+                        f"Metric {metric.__name__} is not a valid metric for multi-modal LLMTestCase."
                     )
 
 
@@ -0,0 +1,38 @@
+import warnings
+from typing import TYPE_CHECKING, Any
+
+try:
+    from pydantic_ai.agent import Agent as _BaseAgent
+
+    is_pydantic_ai_installed = True
+except ImportError:
+    is_pydantic_ai_installed = False
+
+    class _BaseAgent:
+        """Dummy fallback so imports don't crash when pydantic-ai is missing."""
+
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            # No-op: for compatibility
+            pass
+
+
+if TYPE_CHECKING:
+    # For type checkers: use the real Agent if available.
+    from pydantic_ai.agent import Agent  # type: ignore[unused-ignore]
+else:
+    # At runtime we always have some base: real Agent or our dummy.
+    # This is just to avoid blow-ups.
+    Agent = _BaseAgent
+
+
+class DeepEvalPydanticAIAgent(Agent):
+
+    def __init__(self, *args, **kwargs):
+        warnings.warn(
+            "instrument_pydantic_ai is deprecated and will be removed in a future version. "
+            "Please use the new ConfidentInstrumentationSettings instead. Docs: https://www.confident-ai.com/docs/integrations/third-party/pydantic-ai",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        super().__init__(*args, **kwargs)
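The new `agent.py` keeps the old entry point importable but emits a `DeprecationWarning` on construction, otherwise behaving exactly like `pydantic_ai.Agent`. A hypothetical usage snippet (requires `pip install pydantic-ai`; the model id is illustrative):

```python
import warnings

from deepeval.integrations.pydantic_ai.agent import DeepEvalPydanticAIAgent

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    agent = DeepEvalPydanticAIAgent("openai:gpt-4o-mini")  # illustrative model id
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```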
@@ -1,40 +1,58 @@
+from __future__ import annotations
+
 import json
 import logging
 import os
 from time import perf_counter
-from typing import Any, List, Optional
+from typing import Any, List, Optional, TYPE_CHECKING
 
 from deepeval.config.settings import get_settings
 from deepeval.confident.api import get_confident_api_key
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.prompt import Prompt
 from deepeval.tracing.context import current_trace_context
-from deepeval.tracing.types import Trace
-from deepeval.tracing.otel.utils import to_hex_string
-from deepeval.tracing.tracing import trace_manager
-from deepeval.tracing.otel.utils import normalize_pydantic_ai_messages
 from deepeval.tracing.otel.exporter import ConfidentSpanExporter
-
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.otel.utils import (
+    normalize_pydantic_ai_messages,
+    to_hex_string,
+)
+from deepeval.tracing.perf_epoch_bridge import init_clock_bridge
+from deepeval.tracing.tracing import trace_manager
+from deepeval.tracing.types import (
+    AgentSpan,
+    Trace,
+    TraceSpanStatus,
+    ToolCall,
+)
 
 logger = logging.getLogger(__name__)
 
-
 try:
-
-    from opentelemetry.sdk.trace import
+    # Optional dependencies
+    from opentelemetry.sdk.trace import (
+        ReadableSpan as _ReadableSpan,
+        SpanProcessor as _SpanProcessor,
+        TracerProvider,
+    )
     from opentelemetry.sdk.trace.export import BatchSpanProcessor
     from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
         OTLPSpanExporter,
     )
-    from
+    from pydantic_ai.models.instrumented import (
+        InstrumentationSettings as _BaseInstrumentationSettings,
+    )
 
     dependency_installed = True
 except ImportError as e:
+    dependency_installed = False
+
+    # Preserve previous behavior: only log when verbose mode is enabled.
     if get_settings().DEEPEVAL_VERBOSE_MODE:
         if isinstance(e, ModuleNotFoundError):
             logger.warning(
                 "Optional tracing dependency not installed: %s",
-                e,
+                getattr(e, "name", repr(e)),
                 stacklevel=2,
             )
         else:
@@ -43,26 +61,47 @@ except ImportError as e:
                 e,
                 stacklevel=2,
             )
-
+
+    # Dummy fallbacks so imports and class definitions don't crash when
+    # optional deps are missing. Actual use is still guarded by
+    # is_dependency_installed().
+    class _BaseInstrumentationSettings:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+    class _SpanProcessor:
+        def __init__(self, *args: Any, **kwargs: Any) -> None:
+            pass
+
+        def on_start(self, span: Any, parent_context: Any) -> None:
+            pass
+
+        def on_end(self, span: Any) -> None:
+            pass
+
+    class _ReadableSpan:
+        pass
 
 
-def is_dependency_installed():
+def is_dependency_installed() -> bool:
     if not dependency_installed:
         raise ImportError(
-            "Dependencies are not installed. Please install it with
+            "Dependencies are not installed. Please install it with "
+            "`pip install pydantic-ai opentelemetry-sdk "
+            "opentelemetry-exporter-otlp-proto-http`."
         )
     return True
 
 
-
-
-from
-from
-
-
-
-
-
+if TYPE_CHECKING:
+    # For type checkers, use real types
+    from opentelemetry.sdk.trace import ReadableSpan, SpanProcessor
+    from pydantic_ai.models.instrumented import InstrumentationSettings
+else:
+    # At runtime we always have something to subclass / annotate with
+    InstrumentationSettings = _BaseInstrumentationSettings
+    SpanProcessor = _SpanProcessor
+    ReadableSpan = _ReadableSpan
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
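The `TYPE_CHECKING` split gives static checkers the real OpenTelemetry and pydantic-ai types while the runtime falls back to inert dummies when the extras are missing. The same pattern in isolation, using `rich` as an arbitrary optional dependency for illustration:

```python
from typing import TYPE_CHECKING, Any

try:
    from rich.console import Console as _Console  # optional dependency
    dependency_installed = True
except ImportError:
    dependency_installed = False

    class _Console:  # inert fallback so importing this module never fails
        def __init__(self, *args: Any, **kwargs: Any) -> None:
            pass

if TYPE_CHECKING:
    from rich.console import Console  # type checkers see the real type
else:
    Console = _Console  # runtime always has something to instantiate

def require_dependency() -> bool:
    # Call sites that actually need the dependency fail loudly here.
    if not dependency_installed:
        raise ImportError("Install with `pip install rich`.")
    return True
```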