deepeval 3.5.8__tar.gz → 3.8.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {deepeval-3.5.8 → deepeval-3.8.0}/PKG-INFO +12 -14
- {deepeval-3.5.8 → deepeval-3.8.0}/README.md +9 -8
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/__init__.py +42 -14
- deepeval-3.8.0/deepeval/_version.py +1 -0
- deepeval-3.8.0/deepeval/anthropic/__init__.py +19 -0
- deepeval-3.8.0/deepeval/anthropic/extractors.py +94 -0
- deepeval-3.8.0/deepeval/anthropic/patch.py +169 -0
- deepeval-3.8.0/deepeval/anthropic/utils.py +225 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/drop.py +45 -16
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/human_eval.py +2 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/ifeval/ifeval.py +2 -2
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/mmlu.py +6 -4
- deepeval-3.8.0/deepeval/cli/main.py +3109 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/test.py +1 -1
- deepeval-3.8.0/deepeval/cli/utils.py +353 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/confident/api.py +10 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/confident/types.py +4 -2
- deepeval-3.8.0/deepeval/config/dotenv_handler.py +19 -0
- deepeval-3.8.0/deepeval/config/logging.py +33 -0
- deepeval-3.8.0/deepeval/config/settings.py +1589 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/config/settings_manager.py +5 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/config/utils.py +14 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/constants.py +9 -1
- deepeval-3.8.0/deepeval/contextvars.py +25 -0
- deepeval-3.8.0/deepeval/dataset/__init__.py +11 -0
- deepeval-3.8.0/deepeval/dataset/api.py +50 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/dataset.py +207 -54
- deepeval-3.8.0/deepeval/dataset/golden.py +197 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/test_run_tracer.py +4 -6
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/utils.py +44 -14
- deepeval-3.8.0/deepeval/errors.py +24 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/compare.py +219 -4
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/configs.py +1 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/evaluate.py +29 -14
- deepeval-3.8.0/deepeval/evaluate/execute.py +3184 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/types.py +11 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/utils.py +107 -166
- deepeval-3.8.0/deepeval/integrations/crewai/__init__.py +9 -0
- deepeval-3.8.0/deepeval/integrations/crewai/handler.py +232 -0
- deepeval-3.8.0/deepeval/integrations/crewai/subs.py +51 -0
- deepeval-3.8.0/deepeval/integrations/crewai/tool.py +71 -0
- deepeval-3.8.0/deepeval/integrations/crewai/wrapper.py +127 -0
- deepeval-3.8.0/deepeval/integrations/langchain/callback.py +542 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/langchain/utils.py +31 -8
- deepeval-3.8.0/deepeval/integrations/llama_index/__init__.py +6 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/llama_index/handler.py +77 -24
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/llama_index/utils.py +24 -0
- deepeval-3.8.0/deepeval/integrations/pydantic_ai/__init__.py +5 -0
- deepeval-3.8.0/deepeval/integrations/pydantic_ai/agent.py +38 -0
- deepeval-3.8.0/deepeval/integrations/pydantic_ai/instrumentator.py +325 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/pydantic_ai/otel.py +13 -3
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/key_handler.py +133 -52
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/__init__.py +32 -16
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/answer_relevancy.py +128 -117
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/template.py +26 -7
- deepeval-3.8.0/deepeval/metrics/api.py +281 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/arena_g_eval.py +103 -97
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/template.py +17 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/utils.py +5 -5
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/argument_correctness/argument_correctness.py +93 -89
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/argument_correctness/template.py +21 -4
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/base_metric.py +20 -44
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/bias.py +112 -109
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/template.py +17 -5
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_precision/contextual_precision.py +115 -98
- deepeval-3.8.0/deepeval/metrics/contextual_precision/template.py +133 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_recall/contextual_recall.py +105 -86
- deepeval-3.8.0/deepeval/metrics/contextual_recall/template.py +126 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +98 -85
- deepeval-3.8.0/deepeval/metrics/contextual_relevancy/template.py +106 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversation_completeness/conversation_completeness.py +113 -119
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversation_completeness/template.py +25 -5
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/conversational_dag.py +24 -8
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/nodes.py +78 -127
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/templates.py +20 -4
- deepeval-3.8.0/deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/conversational_g_eval.py +157 -132
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/template.py +4 -3
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/dag.py +22 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/nodes.py +75 -130
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/schema.py +1 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/templates.py +19 -5
- deepeval-3.8.0/deepeval/metrics/exact_match/exact_match.py +102 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/faithfulness/faithfulness.py +158 -150
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval-3.8.0/deepeval/metrics/faithfulness/template.py +225 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/g_eval.py +161 -86
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/template.py +18 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/utils.py +73 -7
- deepeval-3.8.0/deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval-3.8.0/deepeval/metrics/goal_accuracy/goal_accuracy.py +364 -0
- deepeval-3.8.0/deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval-3.8.0/deepeval/metrics/goal_accuracy/template.py +253 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/hallucination.py +79 -83
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/template.py +17 -4
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/indicator.py +43 -16
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/json_correctness/json_correctness.py +52 -39
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/json_correctness/template.py +10 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/knowledge_retention/knowledge_retention.py +72 -97
- deepeval-3.8.0/deepeval/metrics/knowledge_retention/schema.py +21 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/knowledge_retention/template.py +12 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/mcp_task_completion.py +90 -43
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +122 -81
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/schema.py +4 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp/template.py +59 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp_use_metric/mcp_use_metric.py +72 -66
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp_use_metric/template.py +12 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/misuse.py +89 -98
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/template.py +17 -2
- deepeval-3.8.0/deepeval/metrics/multimodal_metrics/__init__.py +5 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +62 -53
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +82 -95
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +62 -53
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +62 -53
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/non_advice.py +91 -105
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/template.py +14 -2
- deepeval-3.8.0/deepeval/metrics/pattern_match/pattern_match.py +111 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/pii_leakage.py +87 -107
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/template.py +16 -2
- deepeval-3.8.0/deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval-3.8.0/deepeval/metrics/plan_adherence/plan_adherence.py +266 -0
- deepeval-3.8.0/deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval-3.8.0/deepeval/metrics/plan_adherence/template.py +181 -0
- deepeval-3.8.0/deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval-3.8.0/deepeval/metrics/plan_quality/plan_quality.py +268 -0
- deepeval-3.8.0/deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval-3.8.0/deepeval/metrics/plan_quality/template.py +110 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/prompt_alignment/prompt_alignment.py +103 -82
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/prompt_alignment/template.py +16 -4
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/ragas.py +3 -3
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_adherence/role_adherence.py +60 -71
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_adherence/template.py +14 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/role_violation.py +87 -108
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/template.py +14 -2
- deepeval-3.8.0/deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval-3.8.0/deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval-3.8.0/deepeval/metrics/step_efficiency/step_efficiency.py +224 -0
- deepeval-3.8.0/deepeval/metrics/step_efficiency/template.py +267 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/summarization.py +127 -184
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/template.py +19 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/task_completion/task_completion.py +80 -75
- deepeval-3.8.0/deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval-3.8.0/deepeval/metrics/tool_correctness/template.py +88 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/tool_correctness/tool_correctness.py +240 -27
- deepeval-3.8.0/deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval-3.8.0/deepeval/metrics/tool_use/schema.py +23 -0
- deepeval-3.8.0/deepeval/metrics/tool_use/template.py +234 -0
- deepeval-3.8.0/deepeval/metrics/tool_use/tool_use.py +436 -0
- deepeval-3.8.0/deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval-3.8.0/deepeval/metrics/topic_adherence/schema.py +20 -0
- deepeval-3.8.0/deepeval/metrics/topic_adherence/template.py +182 -0
- deepeval-3.8.0/deepeval/metrics/topic_adherence/topic_adherence.py +342 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/template.py +17 -4
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/toxicity.py +92 -99
- deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/template.py +194 -0
- deepeval-3.8.0/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/template.py +185 -0
- deepeval-3.8.0/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +525 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy}/schema.py +7 -1
- deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/template.py +168 -0
- deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +532 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.8.0/deepeval/metrics/turn_faithfulness}/schema.py +11 -3
- deepeval-3.8.0/deepeval/metrics/turn_faithfulness/template.py +225 -0
- deepeval-3.8.0/deepeval/metrics/turn_faithfulness/turn_faithfulness.py +573 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/turn_relevancy/template.py +16 -2
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/turn_relevancy/turn_relevancy.py +68 -69
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/utils.py +175 -121
- deepeval-3.8.0/deepeval/model_integrations/types.py +20 -0
- deepeval-3.8.0/deepeval/model_integrations/utils.py +116 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/__init__.py +4 -10
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/base_model.py +52 -34
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/embedding_models/__init__.py +7 -0
- deepeval-3.8.0/deepeval/models/embedding_models/azure_embedding_model.py +166 -0
- deepeval-3.8.0/deepeval/models/embedding_models/local_embedding_model.py +132 -0
- deepeval-3.8.0/deepeval/models/embedding_models/ollama_embedding_model.py +113 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/embedding_models/openai_embedding_model.py +61 -34
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/llms/__init__.py +4 -0
- deepeval-3.8.0/deepeval/models/llms/amazon_bedrock_model.py +316 -0
- deepeval-3.8.0/deepeval/models/llms/anthropic_model.py +298 -0
- deepeval-3.8.0/deepeval/models/llms/azure_model.py +458 -0
- deepeval-3.8.0/deepeval/models/llms/constants.py +2055 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/llms/deepseek_model.py +114 -52
- deepeval-3.8.0/deepeval/models/llms/gemini_model.py +430 -0
- deepeval-3.8.0/deepeval/models/llms/grok_model.py +312 -0
- deepeval-3.8.0/deepeval/models/llms/kimi_model.py +294 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/llms/litellm_model.py +190 -56
- deepeval-3.8.0/deepeval/models/llms/local_model.py +242 -0
- deepeval-3.8.0/deepeval/models/llms/ollama_model.py +237 -0
- deepeval-3.8.0/deepeval/models/llms/openai_model.py +488 -0
- deepeval-3.8.0/deepeval/models/llms/openrouter_model.py +398 -0
- deepeval-3.8.0/deepeval/models/llms/portkey_model.py +191 -0
- deepeval-3.8.0/deepeval/models/llms/utils.py +49 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/retry_policy.py +311 -26
- deepeval-3.8.0/deepeval/models/utils.py +173 -0
- deepeval-3.8.0/deepeval/openai/__init__.py +21 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/openai/extractors.py +82 -47
- deepeval-3.8.0/deepeval/openai/patch.py +295 -0
- deepeval-3.8.0/deepeval/openai/utils.py +211 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/openai_agents/__init__.py +4 -3
- deepeval-3.8.0/deepeval/openai_agents/agent.py +36 -0
- deepeval-3.8.0/deepeval/openai_agents/callback_handler.py +151 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/openai_agents/extractors.py +83 -7
- deepeval-3.8.0/deepeval/openai_agents/patch.py +309 -0
- deepeval-3.8.0/deepeval/openai_agents/runner.py +348 -0
- deepeval-3.8.0/deepeval/optimizer/__init__.py +5 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/base.py +29 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/copro/copro.py +836 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/simba/simba.py +999 -0
- deepeval-3.8.0/deepeval/optimizer/algorithms/simba/types.py +15 -0
- deepeval-3.8.0/deepeval/optimizer/configs.py +31 -0
- deepeval-3.8.0/deepeval/optimizer/policies.py +227 -0
- deepeval-3.8.0/deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval-3.8.0/deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval-3.8.0/deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval-3.8.0/deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval-3.8.0/deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval-3.8.0/deepeval/optimizer/scorer/base.py +86 -0
- deepeval-3.8.0/deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval-3.8.0/deepeval/optimizer/scorer/utils.py +30 -0
- deepeval-3.8.0/deepeval/optimizer/types.py +148 -0
- deepeval-3.8.0/deepeval/optimizer/utils.py +480 -0
- deepeval-3.8.0/deepeval/prompt/__init__.py +21 -0
- deepeval-3.8.0/deepeval/prompt/api.py +234 -0
- deepeval-3.8.0/deepeval/prompt/prompt.py +837 -0
- deepeval-3.8.0/deepeval/prompt/utils.py +221 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/conversation_simulator.py +74 -20
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/template.py +17 -2
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/chunking/context_generator.py +217 -152
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/config.py +9 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/schema.py +23 -0
- deepeval-3.8.0/deepeval/synthesizer/synthesizer.py +2751 -0
- deepeval-3.8.0/deepeval/synthesizer/templates/__init__.py +12 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/templates/template.py +554 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/templates/template_extraction.py +32 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/templates/template_prompt.py +262 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/telemetry.py +3 -3
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/__init__.py +3 -4
- deepeval-3.8.0/deepeval/test_case/api.py +112 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/arena_test_case.py +21 -5
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/conversational_test_case.py +68 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/llm_test_case.py +215 -2
- deepeval-3.8.0/deepeval/test_case/utils.py +20 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/__init__.py +3 -1
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/api.py +22 -16
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/cache.py +37 -13
- deepeval-3.8.0/deepeval/test_run/hyperparameters.py +109 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/test_run.py +437 -227
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/__init__.py +3 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/api.py +11 -8
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/context.py +4 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/otel/exporter.py +248 -176
- deepeval-3.8.0/deepeval/tracing/otel/test_exporter.py +35 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/otel/utils.py +258 -23
- deepeval-3.8.0/deepeval/tracing/patchers.py +190 -0
- deepeval-3.8.0/deepeval/tracing/trace_context.py +107 -0
- deepeval-3.8.0/deepeval/tracing/trace_test_manager.py +19 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/tracing.py +129 -23
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/types.py +29 -11
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/utils.py +68 -84
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/utils.py +357 -11
- {deepeval-3.5.8 → deepeval-3.8.0}/pyproject.toml +26 -10
- deepeval-3.5.8/deepeval/_version.py +0 -1
- deepeval-3.5.8/deepeval/cli/main.py +0 -1629
- deepeval-3.5.8/deepeval/cli/utils.py +0 -181
- deepeval-3.5.8/deepeval/config/settings.py +0 -671
- deepeval-3.5.8/deepeval/dataset/__init__.py +0 -5
- deepeval-3.5.8/deepeval/dataset/api.py +0 -28
- deepeval-3.5.8/deepeval/dataset/golden.py +0 -60
- deepeval-3.5.8/deepeval/errors.py +0 -6
- deepeval-3.5.8/deepeval/evaluate/execute.py +0 -2242
- deepeval-3.5.8/deepeval/integrations/crewai/__init__.py +0 -4
- deepeval-3.5.8/deepeval/integrations/crewai/agent.py +0 -98
- deepeval-3.5.8/deepeval/integrations/crewai/handler.py +0 -124
- deepeval-3.5.8/deepeval/integrations/crewai/patch.py +0 -41
- deepeval-3.5.8/deepeval/integrations/langchain/callback.py +0 -345
- deepeval-3.5.8/deepeval/integrations/llama_index/__init__.py +0 -10
- deepeval-3.5.8/deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval-3.5.8/deepeval/integrations/pydantic_ai/__init__.py +0 -5
- deepeval-3.5.8/deepeval/integrations/pydantic_ai/agent.py +0 -339
- deepeval-3.5.8/deepeval/integrations/pydantic_ai/patcher.py +0 -484
- deepeval-3.5.8/deepeval/integrations/pydantic_ai/utils.py +0 -323
- deepeval-3.5.8/deepeval/metrics/contextual_precision/template.py +0 -84
- deepeval-3.5.8/deepeval/metrics/contextual_recall/template.py +0 -75
- deepeval-3.5.8/deepeval/metrics/contextual_relevancy/template.py +0 -77
- deepeval-3.5.8/deepeval/metrics/faithfulness/template.py +0 -140
- deepeval-3.5.8/deepeval/metrics/knowledge_retention/schema.py +0 -15
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/__init__.py +0 -24
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -338
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -288
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -282
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -279
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -353
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -379
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -285
- deepeval-3.5.8/deepeval/models/embedding_models/azure_embedding_model.py +0 -106
- deepeval-3.5.8/deepeval/models/embedding_models/local_embedding_model.py +0 -102
- deepeval-3.5.8/deepeval/models/embedding_models/ollama_embedding_model.py +0 -80
- deepeval-3.5.8/deepeval/models/llms/amazon_bedrock_model.py +0 -186
- deepeval-3.5.8/deepeval/models/llms/anthropic_model.py +0 -170
- deepeval-3.5.8/deepeval/models/llms/azure_model.py +0 -287
- deepeval-3.5.8/deepeval/models/llms/gemini_model.py +0 -232
- deepeval-3.5.8/deepeval/models/llms/grok_model.py +0 -237
- deepeval-3.5.8/deepeval/models/llms/kimi_model.py +0 -236
- deepeval-3.5.8/deepeval/models/llms/local_model.py +0 -130
- deepeval-3.5.8/deepeval/models/llms/ollama_model.py +0 -104
- deepeval-3.5.8/deepeval/models/llms/openai_model.py +0 -518
- deepeval-3.5.8/deepeval/models/llms/utils.py +0 -22
- deepeval-3.5.8/deepeval/models/mlllms/__init__.py +0 -3
- deepeval-3.5.8/deepeval/models/mlllms/gemini_model.py +0 -284
- deepeval-3.5.8/deepeval/models/mlllms/ollama_model.py +0 -144
- deepeval-3.5.8/deepeval/models/mlllms/openai_model.py +0 -258
- deepeval-3.5.8/deepeval/models/utils.py +0 -31
- deepeval-3.5.8/deepeval/openai/__init__.py +0 -37
- deepeval-3.5.8/deepeval/openai/patch.py +0 -204
- deepeval-3.5.8/deepeval/openai/utils.py +0 -86
- deepeval-3.5.8/deepeval/openai_agents/agent.py +0 -194
- deepeval-3.5.8/deepeval/openai_agents/callback_handler.py +0 -134
- deepeval-3.5.8/deepeval/openai_agents/patch.py +0 -115
- deepeval-3.5.8/deepeval/openai_agents/runner.py +0 -335
- deepeval-3.5.8/deepeval/prompt/__init__.py +0 -3
- deepeval-3.5.8/deepeval/prompt/api.py +0 -70
- deepeval-3.5.8/deepeval/prompt/prompt.py +0 -434
- deepeval-3.5.8/deepeval/prompt/utils.py +0 -50
- deepeval-3.5.8/deepeval/synthesizer/synthesizer.py +0 -1502
- deepeval-3.5.8/deepeval/synthesizer/templates/__init__.py +0 -3
- deepeval-3.5.8/deepeval/test_case/mllm_test_case.py +0 -147
- deepeval-3.5.8/deepeval/test_case/utils.py +0 -24
- deepeval-3.5.8/deepeval/test_run/hyperparameters.py +0 -66
- deepeval-3.5.8/deepeval/tracing/patchers.py +0 -84
- {deepeval-3.5.8 → deepeval-3.8.0}/LICENSE.md +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/annotation/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/annotation/annotation.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/annotation/api.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/arc.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/mode.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/arc/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/base_benchmark.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/bbq.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bbq/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/big_bench_hard.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/date_understanding.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/navigate.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/object_counting.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/ruin_names.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/snarks.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/cot_prompts/word_sorting.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/boolean_expressions.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/causal_judgement.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/date_understanding.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/disambiguation_qa.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/dyck_languages.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/formal_fallacies.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/geometric_shapes.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/hyperbaton.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_five_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_seven_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/logical_deduction_three_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/movie_recommendation.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/multistep_arithmetic_two.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/navigate.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/object_counting.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/penguins_in_a_table.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/reasoning_about_colored_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/ruin_names.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/salient_translation_error_detection.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/snarks.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/sports_understanding.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/temporal_sequences.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/web_of_lies.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/shot_prompts/word_sorting.txt +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/big_bench_hard/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bool_q/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bool_q/bool_q.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/bool_q/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/drop/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/equity_med_qa/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/gsm8k.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/gsm8k/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/hellaswag.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/hellaswag/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/human_eval/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/ifeval/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/ifeval/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/lambada/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/lambada/lambada.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/lambada/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/logi_qa.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/logi_qa/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/math_qa.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/math_qa/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/mmlu/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/modes/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/results.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/squad.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/squad/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/tasks/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/mode.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/task.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/truthful_qa/truthful_qa.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/utils.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/winogrande/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/winogrande/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/benchmarks/winogrande/winogrande.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/dotenv_handler.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/server.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/cli/types.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/confident/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/config/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/dataset/types.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/evaluate/api.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/callback.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/rich_manager.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/tests/test_callbacks.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/hugging_face/utils.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/langchain/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/integrations/langchain/patch.py +0 -0
- /deepeval-3.5.8/deepeval/metrics/argument_correctness/__init__.py → /deepeval-3.8.0/deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/answer_relevancy/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/arena_g_eval/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/conversation_completeness → deepeval-3.8.0/deepeval/metrics/argument_correctness}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/argument_correctness/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/bias/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_precision/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_precision/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_recall/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_recall/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_relevancy/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/contextual_relevancy/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/conversational_g_eval → deepeval-3.8.0/deepeval/metrics/conversation_completeness}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversation_completeness/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_dag/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/conversational_g_eval/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/graph.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/dag/utils.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/json_correctness → deepeval-3.8.0/deepeval/metrics/exact_match}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/faithfulness/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/g_eval/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/hallucination/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/knowledge_retention → deepeval-3.8.0/deepeval/metrics/json_correctness}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/json_correctness/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/mcp → deepeval-3.8.0/deepeval/metrics/knowledge_retention}/__init__.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/mcp_use_metric → deepeval-3.8.0/deepeval/metrics/mcp}/__init__.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_coherence → deepeval-3.8.0/deepeval/metrics/mcp_use_metric}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/mcp_use_metric/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/misuse/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_editing → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_coherence}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_coherence/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_coherence/template.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_helpfulness → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_editing}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_editing/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_editing/template.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/image_reference → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_helpfulness}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_helpfulness/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/image_reference}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_reference/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/image_reference/template.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_precision → deepeval-3.8.0/deepeval/metrics/multimodal_metrics/text_to_image}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/text_to_image/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/multimodal_metrics/text_to_image/template.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/non_advice/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_recall → deepeval-3.8.0/deepeval/metrics/pattern_match}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/pii_leakage/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy → deepeval-3.8.0/deepeval/metrics/prompt_alignment}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/prompt_alignment/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_faithfulness → deepeval-3.8.0/deepeval/metrics/role_adherence}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_adherence/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/role_violation/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/summarization/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_g_eval → deepeval-3.8.0/deepeval/metrics/task_completion}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/task_completion/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/task_completion/template.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness → deepeval-3.8.0/deepeval/metrics/tool_correctness}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/toxicity/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/multimodal_metrics/text_to_image → deepeval-3.8.0/deepeval/metrics/turn_contextual_precision}/__init__.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/prompt_alignment → deepeval-3.8.0/deepeval/metrics/turn_contextual_recall}/__init__.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/role_adherence → deepeval-3.8.0/deepeval/metrics/turn_contextual_relevancy}/__init__.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/task_completion → deepeval-3.8.0/deepeval/metrics/turn_faithfulness}/__init__.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/tool_correctness → deepeval-3.8.0/deepeval/metrics/turn_relevancy}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/metrics/turn_relevancy/schema.py +0 -0
- {deepeval-3.5.8/deepeval/metrics/turn_relevancy → deepeval-3.8.0/deepeval/model_integrations}/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/_summac_model.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/answer_relevancy_model.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/detoxify_model.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/hallucination_model.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/summac_model.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/models/unbias_model.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/plugins/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/plugins/plugin.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/progress_context.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/py.typed +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/red_teaming/README.md +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/scorer/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/scorer/scorer.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/simulator/schema.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/singleton.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/base_synthesizer.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/chunking/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/types.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/synthesizer/utils.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_case/mcp.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/test_run/hooks.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/api.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/span.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/thread.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/offline_evals/trace.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/otel/__init__.py +0 -0
- {deepeval-3.5.8 → deepeval-3.8.0}/deepeval/tracing/perf_epoch_bridge.py +0 -0
--- deepeval-3.5.8/PKG-INFO
+++ deepeval-3.8.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.5.8
+Version: 3.8.0
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0
@@ -13,26 +13,23 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Requires-Dist: aiohttp
-Requires-Dist: anthropic
 Requires-Dist: click (>=8.0.0,<8.3.0)
-Requires-Dist: google-genai (>=1.9.0,<2.0.0)
 Requires-Dist: grpcio (>=1.67.1,<2.0.0)
 Requires-Dist: jinja2
 Requires-Dist: nest_asyncio
-Requires-Dist: ollama
 Requires-Dist: openai
 Requires-Dist: opentelemetry-api (>=1.24.0,<2.0.0)
 Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.24.0,<2.0.0)
 Requires-Dist: opentelemetry-sdk (>=1.24.0,<2.0.0)
 Requires-Dist: portalocker
-Requires-Dist: posthog (>=
+Requires-Dist: posthog (>=5.4.0,<6.0.0)
 Requires-Dist: pydantic (>=2.11.7,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.10.1,<3.0.0)
 Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio
 Requires-Dist: pytest-repeat
-Requires-Dist: pytest-rerunfailures
+Requires-Dist: pytest-rerunfailures
 Requires-Dist: pytest-xdist
 Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
@@ -103,9 +100,9 @@ Description-Content-Type: text/markdown
 <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
 </p>

-**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval,
+**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which uses LLM-as-a-judge and other NLP models that run **locally on your machine** for evaluation.

-Whether your LLM applications are RAG pipelines, chatbots,
+Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.

 > [!IMPORTANT]
 > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.
@@ -118,10 +115,10 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente

 # 🔥 Metrics and Features

-> 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)
+> 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)

 - Supports both end-to-end and component-level LLM evaluation.
-- Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that
+- Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that run **locally on your machine**:
   - G-Eval
   - DAG ([deep acyclic graph](https://deepeval.com/docs/metrics-dag))
   - **RAG metrics:**
@@ -161,7 +158,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
   - TruthfulQA
   - HumanEval
   - GSM8K
-- [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation lifecycle:
+- [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation & observability lifecycle:
   - Curate/annotate evaluation datasets on the cloud
   - Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
   - Fine-tune metrics for custom results
@@ -170,7 +167,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
   - Repeat until perfection

 > [!NOTE]
-> Confident AI
+> DeepEval is available on Confident AI, an LLM evals platform for AI observability and quality. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)

 <br />

@@ -359,7 +356,7 @@ for golden in dataset.goldens:

 @pytest.mark.parametrize(
     "test_case",
-    dataset,
+    dataset.test_cases,
 )
 def test_customer_chatbot(test_case: LLMTestCase):
     answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
@@ -397,7 +394,7 @@ cp .env.example .env.local

 # DeepEval With Confident AI

-DeepEval
+DeepEval is available on [Confident AI](https://confident-ai.com?utm_source=Github), an evals & observability platform that allows you to:

 1. Curate/annotate evaluation datasets on the cloud
 2. Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
@@ -439,6 +436,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
 ```bash
 cp .env.example .env.local
 # then edit .env.local (ignored by git)
+```

 <br />

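The one user-facing API change visible in the hunks above is in the pytest example: `@pytest.mark.parametrize` now iterates `dataset.test_cases` rather than the `EvaluationDataset` object itself. Below is a minimal sketch of the 3.8.0 pattern; the golden and the hard-coded `actual_output` are illustrative placeholders (the README pulls goldens from Confident AI and generates outputs with your LLM app), and running it assumes a judge model such as OpenAI is configured.

```python
import pytest

from deepeval import assert_test
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Hypothetical golden for illustration; normally pulled from Confident AI.
dataset = EvaluationDataset(goldens=[Golden(input="What if these shoes don't fit?")])

# Convert each golden into a test case by running your LLM app on its input.
for golden in dataset.goldens:
    dataset.add_test_case(
        LLMTestCase(
            input=golden.input,
            actual_output="You have 30 days to get a full refund at no extra cost.",
        )
    )

@pytest.mark.parametrize(
    "test_case",
    dataset.test_cases,  # 3.8.0 pattern: iterate .test_cases, not the dataset object
)
def test_customer_chatbot(test_case: LLMTestCase):
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
    assert_test(test_case, [answer_relevancy_metric])
```

Parametrizing over `.test_cases` makes collection explicit and no longer relies on the dataset object being directly iterable.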
{deepeval-3.5.8 → deepeval-3.8.0}/README.md

@@ -53,9 +53,9 @@
   <a href="https://www.readme-i18n.com/confident-ai/deepeval?lang=zh">中文</a>
 </p>

-**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval,
+**DeepEval** is a simple-to-use, open-source LLM evaluation framework, for evaluating and testing large-language model systems. It is similar to Pytest but specialized for unit testing LLM outputs. DeepEval incorporates the latest research to evaluate LLM outputs based on metrics such as G-Eval, task completion, answer relevancy, hallucination, etc., which uses LLM-as-a-judge and other NLP models that run **locally on your machine** for evaluation.

-Whether your LLM applications are RAG pipelines, chatbots,
+Whether your LLM applications are AI agents, RAG pipelines, or chatbots, implemented via LangChain or OpenAI, DeepEval has you covered. With it, you can easily determine the optimal models, prompts, and architecture to improve your RAG pipeline, agentic workflows, prevent prompt drifting, or even transition from OpenAI to hosting your own Deepseek R1 with confidence.

 > [!IMPORTANT]
 > Need a place for your DeepEval testing data to live 🏡❤️? [Sign up to the DeepEval platform](https://confident-ai.com?utm_source=GitHub) to compare iterations of your LLM app, generate & share testing reports, and more.
@@ -68,10 +68,10 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente

 # 🔥 Metrics and Features

-> 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)
+> 🥳 You can now share DeepEval's test results on the cloud directly on [Confident AI](https://confident-ai.com?utm_source=GitHub)

 - Supports both end-to-end and component-level LLM evaluation.
-- Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that
+- Large variety of ready-to-use LLM evaluation metrics (all with explanations) powered by **ANY** LLM of your choice, statistical methods, or NLP models that run **locally on your machine**:
   - G-Eval
   - DAG ([deep acyclic graph](https://deepeval.com/docs/metrics-dag))
   - **RAG metrics:**
@@ -111,7 +111,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
   - TruthfulQA
   - HumanEval
   - GSM8K
-- [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation lifecycle:
+- [100% integrated with Confident AI](https://confident-ai.com?utm_source=GitHub) for the full evaluation & observability lifecycle:
   - Curate/annotate evaluation datasets on the cloud
   - Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
   - Fine-tune metrics for custom results
@@ -120,7 +120,7 @@ Whether your LLM applications are RAG pipelines, chatbots, AI agents, implemente
   - Repeat until perfection

 > [!NOTE]
-> Confident AI
+> DeepEval is available on Confident AI, an LLM evals platform for AI observability and quality. Create an account [here.](https://app.confident-ai.com?utm_source=GitHub)

 <br />

@@ -309,7 +309,7 @@ for golden in dataset.goldens:

 @pytest.mark.parametrize(
     "test_case",
-    dataset,
+    dataset.test_cases,
 )
 def test_customer_chatbot(test_case: LLMTestCase):
     answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
@@ -347,7 +347,7 @@ cp .env.example .env.local

 # DeepEval With Confident AI

-DeepEval
+DeepEval is available on [Confident AI](https://confident-ai.com?utm_source=Github), an evals & observability platform that allows you to:

 1. Curate/annotate evaluation datasets on the cloud
 2. Benchmark LLM app using dataset, and compare with previous iterations to experiment which models/prompts works best
@@ -389,6 +389,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
 ```bash
 cp .env.example .env.local
 # then edit .env.local (ignored by git)
+```

 <br />

{deepeval-3.5.8 → deepeval-3.8.0}/deepeval/__init__.py

@@ -1,24 +1,56 @@
+from __future__ import annotations
+
+import logging
 import os
-import warnings
 import re
+import warnings

-# load environment variables before other imports
+# IMPORTANT: load environment variables before other imports
 from deepeval.config.settings import autoload_dotenv, get_settings

+logging.getLogger("deepeval").addHandler(logging.NullHandler())
 autoload_dotenv()

-
-
-
-
-
-
+
+def _expose_public_api() -> None:
+    # All other imports must happen after env is loaded
+    # Do not do this at module level or ruff will complain with E402
+    global __version__, evaluate, assert_test, compare
+    global on_test_run_end, log_hyperparameters, login, telemetry
+
+    from ._version import __version__ as _version
+    from deepeval.evaluate import (
+        evaluate as _evaluate,
+        assert_test as _assert_test,
+    )
+    from deepeval.evaluate.compare import compare as _compare
+    from deepeval.test_run import (
+        on_test_run_end as _on_end,
+        log_hyperparameters as _log_hparams,
+    )
+    from deepeval.utils import login as _login
+    import deepeval.telemetry as _telemetry
+
+    __version__ = _version
+    evaluate = _evaluate
+    assert_test = _assert_test
+    compare = _compare
+    on_test_run_end = _on_end
+    log_hyperparameters = _log_hparams
+    login = _login
+    telemetry = _telemetry
+
+
+_expose_public_api()


 settings = get_settings()
+
 if not settings.DEEPEVAL_GRPC_LOGGING:
-    os.
-
+    if os.getenv("GRPC_VERBOSITY") is None:
+        os.environ["GRPC_VERBOSITY"] = settings.GRPC_VERBOSITY or "ERROR"
+    if os.getenv("GRPC_TRACE") is None:
+        os.environ["GRPC_TRACE"] = settings.GRPC_TRACE or ""


 __all__ = [
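Note the `os.getenv(...) is None` guards in the gRPC block above: defaults are only filled in when the variables are unset, so user-supplied values are never clobbered. A quick sketch of overriding them (values here are illustrative; the guard also only runs when `DEEPEVAL_GRPC_LOGGING` is disabled):

```python
import os

# values set before importing deepeval win over the "ERROR"/"" defaults,
# because __init__.py only assigns GRPC_* when os.getenv(...) is None
os.environ["GRPC_VERBOSITY"] = "DEBUG"
os.environ["GRPC_TRACE"] = "http"

import deepeval  # noqa: E402  (importing after env setup is the point here)
```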
@@ -70,9 +102,5 @@ def update_warning_opt_in():
     return os.getenv("DEEPEVAL_UPDATE_WARNING_OPT_IN") == "1"


-def is_read_only_env():
-    return os.getenv("DEEPEVAL_FILE_SYSTEM") == "READ_ONLY"
-
-
 if update_warning_opt_in():
     check_for_update()
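Aside from the removal of `is_read_only_env`, the refactor is meant to preserve the module's public surface. A minimal sanity check against the re-exported names (assuming deepeval 3.8.0 is installed):

```python
import deepeval

# these names are re-bound by _expose_public_api() at import time
print(deepeval.__version__)  # "3.8.0"
assert callable(deepeval.evaluate)
assert callable(deepeval.assert_test)
assert callable(deepeval.compare)
assert callable(deepeval.login)
```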
deepeval-3.8.0/deepeval/_version.py

@@ -0,0 +1 @@
+__version__: str = "3.8.0"
deepeval-3.8.0/deepeval/anthropic/__init__.py

@@ -0,0 +1,19 @@
+try:
+    import anthropic  # noqa: F401
+except ImportError:
+    raise ModuleNotFoundError(
+        "Please install anthropic to use this feature: 'pip install anthropic'"
+    )
+
+try:
+    from anthropic import Anthropic, AsyncAnthropic  # noqa: F401
+except ImportError:
+    Anthropic = None  # type: ignore
+    AsyncAnthropic = None  # type: ignore
+
+if Anthropic or AsyncAnthropic:
+    from deepeval.anthropic.patch import patch_anthropic_classes
+    from deepeval.telemetry import capture_tracing_integration
+
+    with capture_tracing_integration("anthropic"):
+        patch_anthropic_classes()
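Because `patch_anthropic_classes()` runs at import time, enabling tracing is a one-line side-effect import. A minimal sketch (the model name is illustrative, and `ANTHROPIC_API_KEY` must be set in your environment):

```python
import deepeval.anthropic  # side-effect import: patches Messages.create
from anthropic import Anthropic

client = Anthropic()  # reads ANTHROPIC_API_KEY from the environment
response = client.messages.create(
    model="claude-3-5-sonnet-latest",  # illustrative model name
    max_tokens=256,
    messages=[{"role": "user", "content": "Say hello"}],
)
print(response.content[0].text)  # this call is now traced by DeepEval
```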
deepeval-3.8.0/deepeval/anthropic/extractors.py

@@ -0,0 +1,94 @@
+from anthropic.types.message import Message
+from anthropic.types import ToolUseBlock
+from typing import Any, Dict
+
+from deepeval.anthropic.utils import (
+    render_messages_anthropic,
+    stringify_anthropic_content,
+)
+from deepeval.model_integrations.types import InputParameters, OutputParameters
+from deepeval.test_case.llm_test_case import ToolCall
+
+
+def safe_extract_input_parameters(kwargs: Dict[str, Any]) -> InputParameters:
+    # guarding against errors to be compatible with legacy APIs
+    try:
+        return extract_messages_api_input_parameters(kwargs)
+    except:
+        return InputParameters(model="NA")
+
+
+def extract_messages_api_input_parameters(
+    kwargs: Dict[str, Any],
+) -> InputParameters:
+    model = kwargs.get("model")
+    tools = kwargs.get("tools")
+    messages = kwargs.get("messages")
+    tool_descriptions = (
+        {tool["name"]: tool["description"] for tool in tools}
+        if tools is not None
+        else None
+    )
+
+    input_argument = ""
+    user_messages = []
+    for message in messages:
+        role = message["role"]
+        if role == "user":
+            user_messages.append(message["content"])
+    if len(user_messages) > 0:
+        input_argument = user_messages[0]
+
+    return InputParameters(
+        model=model,
+        input=stringify_anthropic_content(input_argument),
+        messages=render_messages_anthropic(messages),
+        tools=tools,
+        tool_descriptions=tool_descriptions,
+    )
+
+
+def safe_extract_output_parameters(
+    message_response: Message,
+    input_parameters: InputParameters,
+) -> OutputParameters:
+    # guarding against errors to be compatible with legacy APIs
+    try:
+        return extract_messages_api_output_parameters(
+            message_response, input_parameters
+        )
+    except:
+        return OutputParameters()
+
+
+def extract_messages_api_output_parameters(
+    message_response: Message,
+    input_parameters: InputParameters,
+) -> OutputParameters:
+    output = str(message_response.content[0].text)
+    prompt_tokens = message_response.usage.input_tokens
+    completion_tokens = message_response.usage.output_tokens
+
+    tools_called = None
+    anthropic_tool_calls = [
+        block
+        for block in message_response.content
+        if isinstance(block, ToolUseBlock)
+    ]
+    if anthropic_tool_calls:
+        tools_called = []
+        tool_descriptions = input_parameters.tool_descriptions or {}
+        for tool_call in anthropic_tool_calls:
+            tools_called.append(
+                ToolCall(
+                    name=tool_call.name,
+                    input_parameters=tool_call.input,
+                    description=tool_descriptions.get(tool_call.name),
+                )
+            )
+    return OutputParameters(
+        output=output,
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        tools_called=tools_called,
+    )
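To make the extraction concrete, here is a hypothetical walkthrough of the input extractor on typical `Messages.create` kwargs (the tool and all values are invented for illustration):

```python
from deepeval.anthropic.extractors import safe_extract_input_parameters

kwargs = {
    "model": "claude-3-5-sonnet-latest",
    "messages": [{"role": "user", "content": "What's the weather in Paris?"}],
    "tools": [
        {
            "name": "get_weather",
            "description": "Look up current weather for a city",
            "input_schema": {"type": "object", "properties": {}},
        }
    ],
}

params = safe_extract_input_parameters(kwargs)
# per the code above, we would expect:
#   params.model == "claude-3-5-sonnet-latest"
#   params.input == "What's the weather in Paris?"  (first user message, stringified)
#   params.tool_descriptions == {"get_weather": "Look up current weather for a city"}
```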
deepeval-3.8.0/deepeval/anthropic/patch.py

@@ -0,0 +1,169 @@
+from typing import Callable
+from functools import wraps
+
+from deepeval.anthropic.extractors import (
+    safe_extract_input_parameters,
+    safe_extract_output_parameters,
+    InputParameters,
+)
+from deepeval.model_integrations.utils import _update_all_attributes
+from deepeval.tracing import observe
+from deepeval.tracing.trace_context import current_llm_context
+
+_ORIGINAL_METHODS = {}
+_ANTHROPIC_PATCHED = False
+
+
+def patch_anthropic_classes():
+    """
+    Monkey patch Anthropic resource classes directly.
+    """
+    global _ANTHROPIC_PATCHED
+
+    # Single guard - if already patched, return immediately
+    if _ANTHROPIC_PATCHED:
+        return
+
+    try:
+        from anthropic.resources.messages import Messages, AsyncMessages
+
+        # Store original methods before patching
+        if hasattr(Messages, "create"):
+            _ORIGINAL_METHODS["Messages.create"] = Messages.create
+            Messages.create = _create_sync_wrapper(Messages.create)
+
+        if hasattr(AsyncMessages, "create"):
+            _ORIGINAL_METHODS["AsyncMessages.create"] = AsyncMessages.create
+            AsyncMessages.create = _create_async_wrapper(AsyncMessages.create)
+
+    except ImportError:
+        pass
+
+    _ANTHROPIC_PATCHED = True
+
+
+def _create_sync_wrapper(original_method):
+    """
+    Create a wrapper for sync methods - called ONCE during patching.
+    """
+
+    @wraps(original_method)
+    def method_wrapper(self, *args, **kwargs):
+        bound_method = original_method.__get__(self, type(self))
+        patched = _patch_sync_anthropic_client_method(
+            original_method=bound_method
+        )
+        return patched(*args, **kwargs)
+
+    return method_wrapper
+
+
+def _create_async_wrapper(original_method):
+    """
+    Create a wrapper for async methods - called ONCE during patching.
+    """
+
+    @wraps(original_method)
+    def method_wrapper(self, *args, **kwargs):
+        bound_method = original_method.__get__(self, type(self))
+        patched = _patch_async_anthropic_client_method(
+            original_method=bound_method
+        )
+        return patched(*args, **kwargs)
+
+    return method_wrapper
+
+
+def _patch_sync_anthropic_client_method(original_method: Callable):
+    @wraps(original_method)
+    def patched_sync_anthropic_method(*args, **kwargs):
+        input_parameters: InputParameters = safe_extract_input_parameters(
+            kwargs
+        )
+        llm_context = current_llm_context.get()
+
+        @observe(
+            type="llm",
+            model=input_parameters.model,
+            metrics=llm_context.metrics,
+            metric_collection=llm_context.metric_collection,
+        )
+        def llm_generation(*args, **kwargs):
+            messages_api_response = original_method(*args, **kwargs)
+            output_parameters = safe_extract_output_parameters(
+                messages_api_response, input_parameters
+            )
+            _update_all_attributes(
+                input_parameters,
+                output_parameters,
+                llm_context.expected_tools,
+                llm_context.expected_output,
+                llm_context.context,
+                llm_context.retrieval_context,
+            )
+            return messages_api_response
+
+        return llm_generation(*args, **kwargs)
+
+    return patched_sync_anthropic_method
+
+
+def _patch_async_anthropic_client_method(original_method: Callable):
+    @wraps(original_method)
+    async def patched_async_anthropic_method(*args, **kwargs):
+        input_parameters: InputParameters = safe_extract_input_parameters(
+            kwargs
+        )
+        llm_context = current_llm_context.get()
+
+        @observe(
+            type="llm",
+            model=input_parameters.model,
+            metrics=llm_context.metrics,
+            metric_collection=llm_context.metric_collection,
+        )
+        async def llm_generation(*args, **kwargs):
+            messages_api_response = await original_method(*args, **kwargs)
+            output_parameters = safe_extract_output_parameters(
+                messages_api_response, input_parameters
+            )
+            _update_all_attributes(
+                input_parameters,
+                output_parameters,
+                llm_context.expected_tools,
+                llm_context.expected_output,
+                llm_context.context,
+                llm_context.retrieval_context,
+            )
+            return messages_api_response
+
+        return await llm_generation(*args, **kwargs)
+
+    return patched_async_anthropic_method
+
+
+def unpatch_anthropic_classes():
+    """
+    Restore Anthropic resource classes to their original state.
+    """
+    global _ANTHROPIC_PATCHED
+
+    # If not patched, nothing to do
+    if not _ANTHROPIC_PATCHED:
+        return
+
+    try:
+        from anthropic.resources.messages import Messages, AsyncMessages
+
+        # Restore original methods for Messages
+        if hasattr(Messages, "create"):
+            Messages.create = _ORIGINAL_METHODS["Messages.create"]
+
+        if hasattr(AsyncMessages, "create"):
+            AsyncMessages.create = _ORIGINAL_METHODS["AsyncMessages.create"]
+
+    except ImportError:
+        pass
+
+    # Reset the patched flag
+    _ANTHROPIC_PATCHED = False
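Since `patch_anthropic_classes` is guarded by `_ANTHROPIC_PATCHED`, repeat calls are no-ops, and `unpatch_anthropic_classes` restores the saved originals. A minimal sketch of the lifecycle:

```python
from deepeval.anthropic.patch import (
    patch_anthropic_classes,
    unpatch_anthropic_classes,
)

patch_anthropic_classes()    # wraps Messages.create / AsyncMessages.create
patch_anthropic_classes()    # no-op: the _ANTHROPIC_PATCHED guard short-circuits

# ... make traced Anthropic calls here ...

unpatch_anthropic_classes()  # restores the methods saved in _ORIGINAL_METHODS
```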