deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -1,62 +1,125 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
from typing import Optional, Tuple, Union, Dict
|
|
4
|
-
from anthropic import Anthropic, AsyncAnthropic
|
|
1
|
+
from typing import Optional, Tuple, Union, Dict, List
|
|
5
2
|
from pydantic import BaseModel, SecretStr
|
|
6
3
|
|
|
4
|
+
from deepeval.errors import DeepEvalError
|
|
7
5
|
from deepeval.models import DeepEvalBaseLLM
|
|
8
6
|
from deepeval.models.llms.utils import trim_and_load_json
|
|
9
7
|
from deepeval.models.retry_policy import (
|
|
10
8
|
create_retry_decorator,
|
|
11
9
|
sdk_retries_for,
|
|
12
10
|
)
|
|
13
|
-
from deepeval.models.utils import
|
|
11
|
+
from deepeval.models.utils import (
|
|
12
|
+
require_costs,
|
|
13
|
+
require_secret_api_key,
|
|
14
|
+
normalize_kwargs_and_extract_aliases,
|
|
15
|
+
)
|
|
16
|
+
from deepeval.test_case import MLLMImage
|
|
17
|
+
from deepeval.utils import check_if_multimodal, convert_to_multi_modal_array
|
|
14
18
|
from deepeval.config.settings import get_settings
|
|
15
19
|
from deepeval.constants import ProviderSlug as PS
|
|
16
|
-
|
|
20
|
+
from deepeval.utils import require_dependency, require_param
|
|
21
|
+
from deepeval.models.llms.constants import ANTHROPIC_MODELS_DATA
|
|
17
22
|
|
|
18
23
|
# consistent retry rules
|
|
19
24
|
retry_anthropic = create_retry_decorator(PS.ANTHROPIC)
|
|
20
25
|
|
|
21
|
-
|
|
22
|
-
"
|
|
23
|
-
"claude-sonnet-4-20250514": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
|
|
24
|
-
"claude-3-7-sonnet-latest": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
|
|
25
|
-
"claude-3-5-haiku-latest": {"input": 0.80 / 1e6, "output": 4.00 / 1e6},
|
|
26
|
-
"claude-3-5-sonnet-latest": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
|
|
27
|
-
"claude-3-opus-latest": {"input": 15.00 / 1e6, "output": 75.00 / 1e6},
|
|
28
|
-
"claude-3-sonnet-20240229": {"input": 3.00 / 1e6, "output": 15.00 / 1e6},
|
|
29
|
-
"claude-3-haiku-20240307": {"input": 0.25 / 1e6, "output": 1.25 / 1e6},
|
|
30
|
-
"claude-instant-1.2": {"input": 0.80 / 1e6, "output": 2.40 / 1e6},
|
|
26
|
+
_ALIAS_MAP = {
|
|
27
|
+
"api_key": ["_anthropic_api_key"],
|
|
31
28
|
}
|
|
32
29
|
|
|
30
|
+
default_model = "claude-3-7-sonnet-latest"
|
|
31
|
+
|
|
33
32
|
|
|
34
33
|
class AnthropicModel(DeepEvalBaseLLM):
|
|
35
34
|
def __init__(
|
|
36
35
|
self,
|
|
37
|
-
model: str =
|
|
38
|
-
|
|
39
|
-
|
|
36
|
+
model: Optional[str] = None,
|
|
37
|
+
api_key: Optional[str] = None,
|
|
38
|
+
temperature: Optional[float] = None,
|
|
39
|
+
cost_per_input_token: Optional[float] = None,
|
|
40
|
+
cost_per_output_token: Optional[float] = None,
|
|
40
41
|
generation_kwargs: Optional[Dict] = None,
|
|
41
42
|
**kwargs,
|
|
42
43
|
):
|
|
43
|
-
|
|
44
|
+
settings = get_settings()
|
|
45
|
+
normalized_kwargs, alias_values = normalize_kwargs_and_extract_aliases(
|
|
46
|
+
"AnthropicModel",
|
|
47
|
+
kwargs,
|
|
48
|
+
_ALIAS_MAP,
|
|
49
|
+
)
|
|
44
50
|
|
|
45
|
-
|
|
51
|
+
# re-map depricated keywords to re-named positional args
|
|
52
|
+
if api_key is None and "api_key" in alias_values:
|
|
53
|
+
api_key = alias_values["api_key"]
|
|
54
|
+
|
|
55
|
+
if api_key is not None:
|
|
46
56
|
# keep it secret, keep it safe from serializings, logging and alike
|
|
47
|
-
self.
|
|
48
|
-
_anthropic_api_key
|
|
49
|
-
)
|
|
57
|
+
self.api_key: Optional[SecretStr] = SecretStr(api_key)
|
|
50
58
|
else:
|
|
51
|
-
self.
|
|
59
|
+
self.api_key = settings.ANTHROPIC_API_KEY
|
|
60
|
+
|
|
61
|
+
model = model or settings.ANTHROPIC_MODEL_NAME or default_model
|
|
62
|
+
|
|
63
|
+
if temperature is not None:
|
|
64
|
+
temperature = float(temperature)
|
|
65
|
+
elif settings.TEMPERATURE is not None:
|
|
66
|
+
temperature = settings.TEMPERATURE
|
|
67
|
+
else:
|
|
68
|
+
temperature = 0.0
|
|
69
|
+
|
|
70
|
+
cost_per_input_token = (
|
|
71
|
+
cost_per_input_token
|
|
72
|
+
if cost_per_input_token is not None
|
|
73
|
+
else settings.ANTHROPIC_COST_PER_INPUT_TOKEN
|
|
74
|
+
)
|
|
75
|
+
cost_per_output_token = (
|
|
76
|
+
cost_per_output_token
|
|
77
|
+
if cost_per_output_token is not None
|
|
78
|
+
else settings.ANTHROPIC_COST_PER_OUTPUT_TOKEN
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# Validation
|
|
82
|
+
model = require_param(
|
|
83
|
+
model,
|
|
84
|
+
provider_label="AnthropicModel",
|
|
85
|
+
env_var_name="ANTHROPIC_MODEL_NAME",
|
|
86
|
+
param_hint="model",
|
|
87
|
+
)
|
|
52
88
|
|
|
53
89
|
if temperature < 0:
|
|
54
|
-
raise
|
|
90
|
+
raise DeepEvalError("Temperature must be >= 0.")
|
|
55
91
|
self.temperature = temperature
|
|
56
92
|
|
|
57
|
-
self.
|
|
58
|
-
|
|
59
|
-
|
|
93
|
+
self.model_data = ANTHROPIC_MODELS_DATA.get(model)
|
|
94
|
+
|
|
95
|
+
cost_per_input_token, cost_per_output_token = require_costs(
|
|
96
|
+
self.model_data,
|
|
97
|
+
model,
|
|
98
|
+
"ANTHROPIC_COST_PER_INPUT_TOKEN",
|
|
99
|
+
"ANTHROPIC_COST_PER_OUTPUT_TOKEN",
|
|
100
|
+
cost_per_input_token,
|
|
101
|
+
cost_per_output_token,
|
|
102
|
+
)
|
|
103
|
+
self.model_data.input_price = cost_per_input_token
|
|
104
|
+
self.model_data.output_price = cost_per_output_token
|
|
105
|
+
|
|
106
|
+
# Keep sanitized kwargs for client call to strip legacy keys
|
|
107
|
+
self.kwargs = normalized_kwargs
|
|
108
|
+
self.kwargs.pop(
|
|
109
|
+
"temperature", None
|
|
110
|
+
) # to avoid duplicate with self.temperature
|
|
111
|
+
max_tokens = self.kwargs.pop("max_tokens", None)
|
|
112
|
+
|
|
113
|
+
self.generation_kwargs = dict(generation_kwargs or {})
|
|
114
|
+
self.generation_kwargs.pop(
|
|
115
|
+
"temperature", None
|
|
116
|
+
) # to avoid duplicate with self.temperature
|
|
117
|
+
default_max_tokens = 1024 if max_tokens is None else max_tokens
|
|
118
|
+
self._max_tokens = int(
|
|
119
|
+
self.generation_kwargs.pop("max_tokens", default_max_tokens)
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
super().__init__(model)
|
|
60
123
|
|
|
61
124
|
###############################################
|
|
62
125
|
# Generate functions
|
|
@@ -65,17 +128,25 @@ class AnthropicModel(DeepEvalBaseLLM):
|
|
|
65
128
|
@retry_anthropic
|
|
66
129
|
def generate(
|
|
67
130
|
self, prompt: str, schema: Optional[BaseModel] = None
|
|
68
|
-
) -> Tuple[Union[str,
|
|
131
|
+
) -> Tuple[Union[str, BaseModel], float]:
|
|
132
|
+
if check_if_multimodal(prompt):
|
|
133
|
+
prompt = convert_to_multi_modal_array(input=prompt)
|
|
134
|
+
content = self.generate_content(prompt)
|
|
135
|
+
else:
|
|
136
|
+
content = [{"type": "text", "text": prompt}]
|
|
137
|
+
|
|
138
|
+
# Get max_tokens from kwargs, default to 1024 if not provided
|
|
139
|
+
max_tokens = self._max_tokens
|
|
69
140
|
chat_model = self.load_model()
|
|
70
141
|
message = chat_model.messages.create(
|
|
71
|
-
max_tokens=
|
|
142
|
+
max_tokens=max_tokens,
|
|
72
143
|
messages=[
|
|
73
144
|
{
|
|
74
145
|
"role": "user",
|
|
75
|
-
"content":
|
|
146
|
+
"content": content,
|
|
76
147
|
}
|
|
77
148
|
],
|
|
78
|
-
model=self.
|
|
149
|
+
model=self.name,
|
|
79
150
|
temperature=self.temperature,
|
|
80
151
|
**self.generation_kwargs,
|
|
81
152
|
)
|
|
@@ -91,17 +162,25 @@ class AnthropicModel(DeepEvalBaseLLM):
|
|
|
91
162
|
@retry_anthropic
|
|
92
163
|
async def a_generate(
|
|
93
164
|
self, prompt: str, schema: Optional[BaseModel] = None
|
|
94
|
-
) -> Tuple[str, float]:
|
|
165
|
+
) -> Tuple[Union[str, BaseModel], float]:
|
|
166
|
+
if check_if_multimodal(prompt):
|
|
167
|
+
prompt = convert_to_multi_modal_array(input=prompt)
|
|
168
|
+
content = self.generate_content(prompt)
|
|
169
|
+
else:
|
|
170
|
+
content = [{"type": "text", "text": prompt}]
|
|
171
|
+
|
|
172
|
+
# Get max_tokens from kwargs, default to 1024 if not provided
|
|
173
|
+
max_tokens = self._max_tokens
|
|
95
174
|
chat_model = self.load_model(async_mode=True)
|
|
96
175
|
message = await chat_model.messages.create(
|
|
97
|
-
max_tokens=
|
|
176
|
+
max_tokens=max_tokens,
|
|
98
177
|
messages=[
|
|
99
178
|
{
|
|
100
179
|
"role": "user",
|
|
101
|
-
"content":
|
|
180
|
+
"content": content,
|
|
102
181
|
}
|
|
103
182
|
],
|
|
104
|
-
model=self.
|
|
183
|
+
model=self.name,
|
|
105
184
|
temperature=self.temperature,
|
|
106
185
|
**self.generation_kwargs,
|
|
107
186
|
)
|
|
@@ -115,43 +194,76 @@ class AnthropicModel(DeepEvalBaseLLM):
|
|
|
115
194
|
|
|
116
195
|
return schema.model_validate(json_output), cost
|
|
117
196
|
|
|
197
|
+
def generate_content(self, multimodal_input: List[Union[str, MLLMImage]]):
|
|
198
|
+
content = []
|
|
199
|
+
for element in multimodal_input:
|
|
200
|
+
if isinstance(element, str):
|
|
201
|
+
content.append({"type": "text", "text": element})
|
|
202
|
+
elif isinstance(element, MLLMImage):
|
|
203
|
+
if element.url and not element.local:
|
|
204
|
+
content.append(
|
|
205
|
+
{
|
|
206
|
+
"type": "image",
|
|
207
|
+
"source": {"type": "url", "url": element.url},
|
|
208
|
+
}
|
|
209
|
+
)
|
|
210
|
+
else:
|
|
211
|
+
element.ensure_images_loaded()
|
|
212
|
+
mime_type = element.mimeType or "image/jpeg"
|
|
213
|
+
content.append(
|
|
214
|
+
{
|
|
215
|
+
"type": "image",
|
|
216
|
+
"source": {
|
|
217
|
+
"type": "base64",
|
|
218
|
+
"media_type": mime_type,
|
|
219
|
+
"data": element.dataBase64,
|
|
220
|
+
},
|
|
221
|
+
}
|
|
222
|
+
)
|
|
223
|
+
return content
|
|
224
|
+
|
|
118
225
|
###############################################
|
|
119
226
|
# Utilities
|
|
120
227
|
###############################################
|
|
121
228
|
|
|
122
229
|
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
if pricing is None:
|
|
126
|
-
# Calculate average cost from all known models
|
|
127
|
-
avg_input_cost = sum(
|
|
128
|
-
p["input"] for p in model_pricing.values()
|
|
129
|
-
) / len(model_pricing)
|
|
130
|
-
avg_output_cost = sum(
|
|
131
|
-
p["output"] for p in model_pricing.values()
|
|
132
|
-
) / len(model_pricing)
|
|
133
|
-
pricing = {"input": avg_input_cost, "output": avg_output_cost}
|
|
134
|
-
|
|
135
|
-
warnings.warn(
|
|
136
|
-
f"[Warning] Pricing not defined for model '{self.model_name}'. "
|
|
137
|
-
"Using average input/output token costs from existing model_pricing."
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
input_cost = input_tokens * pricing["input"]
|
|
141
|
-
output_cost = output_tokens * pricing["output"]
|
|
230
|
+
input_cost = input_tokens * self.model_data.input_price
|
|
231
|
+
output_cost = output_tokens * self.model_data.output_price
|
|
142
232
|
return input_cost + output_cost
|
|
143
233
|
|
|
234
|
+
#########################
|
|
235
|
+
# Capabilities #
|
|
236
|
+
#########################
|
|
237
|
+
|
|
238
|
+
def supports_log_probs(self) -> Union[bool, None]:
|
|
239
|
+
return self.model_data.supports_log_probs
|
|
240
|
+
|
|
241
|
+
def supports_temperature(self) -> Union[bool, None]:
|
|
242
|
+
return self.model_data.supports_temperature
|
|
243
|
+
|
|
244
|
+
def supports_multimodal(self) -> Union[bool, None]:
|
|
245
|
+
return self.model_data.supports_multimodal
|
|
246
|
+
|
|
247
|
+
def supports_structured_outputs(self) -> Union[bool, None]:
|
|
248
|
+
return self.model_data.supports_structured_outputs
|
|
249
|
+
|
|
250
|
+
def supports_json_mode(self) -> Union[bool, None]:
|
|
251
|
+
return self.model_data.supports_json
|
|
252
|
+
|
|
144
253
|
###############################################
|
|
145
254
|
# Model
|
|
146
255
|
###############################################
|
|
147
256
|
|
|
148
257
|
def load_model(self, async_mode: bool = False):
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
258
|
+
module = require_dependency(
|
|
259
|
+
"anthropic",
|
|
260
|
+
provider_label="AnthropicModel",
|
|
261
|
+
install_hint="Install it with `pip install anthropic`.",
|
|
262
|
+
)
|
|
152
263
|
|
|
153
|
-
|
|
154
|
-
|
|
264
|
+
if not async_mode:
|
|
265
|
+
return self._build_client(module.Anthropic)
|
|
266
|
+
return self._build_client(module.AsyncAnthropic)
|
|
155
267
|
|
|
156
268
|
def _client_kwargs(self) -> Dict:
|
|
157
269
|
kwargs = dict(self.kwargs or {})
|
|
@@ -163,10 +275,10 @@ class AnthropicModel(DeepEvalBaseLLM):
|
|
|
163
275
|
|
|
164
276
|
def _build_client(self, cls):
|
|
165
277
|
api_key = require_secret_api_key(
|
|
166
|
-
self.
|
|
278
|
+
self.api_key,
|
|
167
279
|
provider_label="Anthropic",
|
|
168
280
|
env_var_name="ANTHROPIC_API_KEY",
|
|
169
|
-
param_hint="`
|
|
281
|
+
param_hint="`api_key` to AnthropicModel(...)",
|
|
170
282
|
)
|
|
171
283
|
kw = dict(
|
|
172
284
|
api_key=api_key,
|
|
@@ -180,3 +292,6 @@ class AnthropicModel(DeepEvalBaseLLM):
|
|
|
180
292
|
kw.pop("max_retries", None)
|
|
181
293
|
return cls(**kw)
|
|
182
294
|
raise
|
|
295
|
+
|
|
296
|
+
def get_model_name(self):
|
|
297
|
+
return f"{self.name} (Anthropic)"
|