deepeval 3.7.2__py3-none-any.whl → 3.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/human_eval/human_eval.py +2 -1
- deepeval/dataset/dataset.py +35 -11
- deepeval/dataset/utils.py +2 -0
- deepeval/metrics/utils.py +3 -0
- deepeval/models/__init__.py +2 -0
- deepeval/models/mlllms/__init__.py +1 -0
- deepeval/models/mlllms/azure_model.py +334 -0
- deepeval/tracing/context.py +3 -0
- deepeval/tracing/tracing.py +22 -11
- {deepeval-3.7.2.dist-info → deepeval-3.7.3.dist-info}/METADATA +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.3.dist-info}/RECORD +15 -14
- {deepeval-3.7.2.dist-info → deepeval-3.7.3.dist-info}/entry_points.txt +1 -1
- {deepeval-3.7.2.dist-info → deepeval-3.7.3.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.2.dist-info → deepeval-3.7.3.dist-info}/WHEEL +0 -0
deepeval/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__: str = "3.7.
|
|
1
|
+
__version__: str = "3.7.3"
|
|
@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
|
|
|
92
92
|
self.predictions: Optional[pd.DataFrame] = None
|
|
93
93
|
self.task_scores: Optional[pd.DataFrame] = None
|
|
94
94
|
self.overall_score: Optional[float] = None
|
|
95
|
-
self.verbose_mode: bool =
|
|
95
|
+
self.verbose_mode: bool = verbose_mode
|
|
96
96
|
|
|
97
97
|
def evaluate(
|
|
98
98
|
self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
|
|
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
|
|
|
123
123
|
task.value,
|
|
124
124
|
golden.input,
|
|
125
125
|
prediction,
|
|
126
|
+
task_correct,
|
|
126
127
|
golden.expected_output,
|
|
127
128
|
score,
|
|
128
129
|
)
|
deepeval/dataset/dataset.py
CHANGED
|
@@ -189,17 +189,35 @@ class EvaluationDataset:
|
|
|
189
189
|
test_case._dataset_alias = self._alias
|
|
190
190
|
test_case._dataset_id = self._id
|
|
191
191
|
if isinstance(test_case, LLMTestCase):
|
|
192
|
+
if self._conversational_goldens or self._conversational_test_cases:
|
|
193
|
+
raise TypeError(
|
|
194
|
+
"You cannot add 'LLMTestCase' to a multi-turn dataset."
|
|
195
|
+
)
|
|
192
196
|
test_case._dataset_rank = len(self._llm_test_cases)
|
|
193
197
|
self._llm_test_cases.append(test_case)
|
|
194
198
|
elif isinstance(test_case, ConversationalTestCase):
|
|
199
|
+
if self._goldens or self._llm_test_cases:
|
|
200
|
+
raise TypeError(
|
|
201
|
+
"You cannot add 'ConversationalTestCase' to a single-turn dataset."
|
|
202
|
+
)
|
|
203
|
+
self._multi_turn = True
|
|
195
204
|
test_case._dataset_rank = len(self._conversational_test_cases)
|
|
196
205
|
self._conversational_test_cases.append(test_case)
|
|
197
206
|
|
|
198
207
|
def add_golden(self, golden: Union[Golden, ConversationalGolden]):
|
|
199
|
-
if
|
|
200
|
-
self.
|
|
201
|
-
|
|
208
|
+
if isinstance(golden, Golden):
|
|
209
|
+
if self._conversational_goldens or self._conversational_test_cases:
|
|
210
|
+
raise TypeError(
|
|
211
|
+
"You cannot add 'Golden' to a multi-turn dataset."
|
|
212
|
+
)
|
|
202
213
|
self._add_golden(golden)
|
|
214
|
+
else:
|
|
215
|
+
if self._goldens or self._llm_test_cases:
|
|
216
|
+
raise TypeError(
|
|
217
|
+
"You cannot add 'ConversationalGolden' to a single-turn dataset."
|
|
218
|
+
)
|
|
219
|
+
self._multi_turn = True
|
|
220
|
+
self._add_conversational_golden(golden)
|
|
203
221
|
|
|
204
222
|
def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
|
|
205
223
|
if isinstance(golden, Golden):
|
|
@@ -224,16 +242,16 @@ class EvaluationDataset:
|
|
|
224
242
|
file_path: str,
|
|
225
243
|
input_col_name: str,
|
|
226
244
|
actual_output_col_name: str,
|
|
227
|
-
expected_output_col_name: Optional[str] =
|
|
228
|
-
context_col_name: Optional[str] =
|
|
245
|
+
expected_output_col_name: Optional[str] = "expected_output",
|
|
246
|
+
context_col_name: Optional[str] = "context",
|
|
229
247
|
context_col_delimiter: str = ";",
|
|
230
|
-
retrieval_context_col_name: Optional[str] =
|
|
248
|
+
retrieval_context_col_name: Optional[str] = "retrieval_context",
|
|
231
249
|
retrieval_context_col_delimiter: str = ";",
|
|
232
|
-
tools_called_col_name: Optional[str] =
|
|
250
|
+
tools_called_col_name: Optional[str] = "tools_called",
|
|
233
251
|
tools_called_col_delimiter: str = ";",
|
|
234
|
-
expected_tools_col_name: Optional[str] =
|
|
252
|
+
expected_tools_col_name: Optional[str] = "expected_tools",
|
|
235
253
|
expected_tools_col_delimiter: str = ";",
|
|
236
|
-
additional_metadata_col_name: Optional[str] =
|
|
254
|
+
additional_metadata_col_name: Optional[str] = "additional_metadata",
|
|
237
255
|
):
|
|
238
256
|
"""
|
|
239
257
|
Load test cases from a CSV file.
|
|
@@ -379,6 +397,7 @@ class EvaluationDataset:
|
|
|
379
397
|
retrieval_context_key_name: Optional[str] = None,
|
|
380
398
|
tools_called_key_name: Optional[str] = None,
|
|
381
399
|
expected_tools_key_name: Optional[str] = None,
|
|
400
|
+
addtional_metadata_key_name: Optional[str] = None,
|
|
382
401
|
encoding_type: str = "utf-8",
|
|
383
402
|
):
|
|
384
403
|
"""
|
|
@@ -431,6 +450,7 @@ class EvaluationDataset:
|
|
|
431
450
|
tools_called = [ToolCall(**tool) for tool in tools_called_data]
|
|
432
451
|
expected_tools_data = json_obj.get(expected_tools_key_name, [])
|
|
433
452
|
expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
|
|
453
|
+
# additional_metadata = json_obj.get(addtional_metadata_key_name)
|
|
434
454
|
|
|
435
455
|
self.add_test_case(
|
|
436
456
|
LLMTestCase(
|
|
@@ -441,6 +461,7 @@ class EvaluationDataset:
|
|
|
441
461
|
retrieval_context=retrieval_context,
|
|
442
462
|
tools_called=tools_called,
|
|
443
463
|
expected_tools=expected_tools,
|
|
464
|
+
# additional_metadata=additional_metadata,
|
|
444
465
|
)
|
|
445
466
|
)
|
|
446
467
|
|
|
@@ -460,8 +481,8 @@ class EvaluationDataset:
|
|
|
460
481
|
expected_tools_col_delimiter: str = ";",
|
|
461
482
|
comments_key_name: str = "comments",
|
|
462
483
|
name_key_name: str = "name",
|
|
463
|
-
source_file_col_name: Optional[str] =
|
|
464
|
-
additional_metadata_col_name: Optional[str] =
|
|
484
|
+
source_file_col_name: Optional[str] = "source_file",
|
|
485
|
+
additional_metadata_col_name: Optional[str] = "additional_metadata",
|
|
465
486
|
scenario_col_name: Optional[str] = "scenario",
|
|
466
487
|
turns_col_name: Optional[str] = "turns",
|
|
467
488
|
expected_outcome_col_name: Optional[str] = "expected_outcome",
|
|
@@ -587,6 +608,7 @@ class EvaluationDataset:
|
|
|
587
608
|
context=context,
|
|
588
609
|
comments=comments,
|
|
589
610
|
name=name,
|
|
611
|
+
additional_metadata=additional_metadata,
|
|
590
612
|
)
|
|
591
613
|
)
|
|
592
614
|
else:
|
|
@@ -645,6 +667,7 @@ class EvaluationDataset:
|
|
|
645
667
|
comments = json_obj.get(comments_key_name)
|
|
646
668
|
name = json_obj.get(name_key_name)
|
|
647
669
|
parsed_turns = parse_turns(turns) if turns else []
|
|
670
|
+
additional_metadata = json_obj.get(additional_metadata_key_name)
|
|
648
671
|
|
|
649
672
|
self._multi_turn = True
|
|
650
673
|
self.goldens.append(
|
|
@@ -656,6 +679,7 @@ class EvaluationDataset:
|
|
|
656
679
|
context=context,
|
|
657
680
|
comments=comments,
|
|
658
681
|
name=name,
|
|
682
|
+
additional_metadata=additional_metadata,
|
|
659
683
|
)
|
|
660
684
|
)
|
|
661
685
|
else:
|
deepeval/dataset/utils.py
CHANGED
|
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
|
|
|
24
24
|
"retrieval_context": test_case.retrieval_context,
|
|
25
25
|
"tools_called": test_case.tools_called,
|
|
26
26
|
"expected_tools": test_case.expected_tools,
|
|
27
|
+
"additional_metadata": test_case.additional_metadata,
|
|
27
28
|
}
|
|
28
29
|
goldens.append(Golden(**golden))
|
|
29
30
|
return goldens
|
|
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
|
|
|
70
71
|
"expected_outcome": test_case.expected_outcome,
|
|
71
72
|
"user_description": test_case.user_description,
|
|
72
73
|
"context": test_case.context,
|
|
74
|
+
"additional_metadata": test_case.additional_metadata,
|
|
73
75
|
}
|
|
74
76
|
goldens.append(ConversationalGolden(**golden))
|
|
75
77
|
return goldens
|
deepeval/metrics/utils.py
CHANGED
|
@@ -25,6 +25,7 @@ from deepeval.models import (
|
|
|
25
25
|
MultimodalOpenAIModel,
|
|
26
26
|
MultimodalGeminiModel,
|
|
27
27
|
MultimodalOllamaModel,
|
|
28
|
+
MultimodalAzureOpenAIMLLMModel,
|
|
28
29
|
AmazonBedrockModel,
|
|
29
30
|
LiteLLMModel,
|
|
30
31
|
KimiModel,
|
|
@@ -514,6 +515,8 @@ def initialize_multimodal_model(
|
|
|
514
515
|
return MultimodalGeminiModel(), True
|
|
515
516
|
if should_use_ollama_model():
|
|
516
517
|
return MultimodalOllamaModel(), True
|
|
518
|
+
elif should_use_azure_openai():
|
|
519
|
+
return MultimodalAzureOpenAIMLLMModel(model_name=model), True
|
|
517
520
|
elif isinstance(model, str) or model is None:
|
|
518
521
|
return MultimodalOpenAIModel(model=model), True
|
|
519
522
|
raise TypeError(
|
deepeval/models/__init__.py
CHANGED
|
@@ -21,6 +21,7 @@ from deepeval.models.mlllms import (
|
|
|
21
21
|
MultimodalOpenAIModel,
|
|
22
22
|
MultimodalOllamaModel,
|
|
23
23
|
MultimodalGeminiModel,
|
|
24
|
+
MultimodalAzureOpenAIMLLMModel,
|
|
24
25
|
)
|
|
25
26
|
from deepeval.models.embedding_models import (
|
|
26
27
|
OpenAIEmbeddingModel,
|
|
@@ -48,6 +49,7 @@ __all__ = [
|
|
|
48
49
|
"MultimodalOpenAIModel",
|
|
49
50
|
"MultimodalOllamaModel",
|
|
50
51
|
"MultimodalGeminiModel",
|
|
52
|
+
"MultimodalAzureOpenAIMLLMModel",
|
|
51
53
|
"OpenAIEmbeddingModel",
|
|
52
54
|
"AzureOpenAIEmbeddingModel",
|
|
53
55
|
"LocalEmbeddingModel",
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
from openai.types.chat.chat_completion import ChatCompletion
|
|
2
|
+
from openai import AzureOpenAI, AsyncAzureOpenAI
|
|
3
|
+
from typing import Optional, Tuple, Union, Dict, List
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
import base64
|
|
7
|
+
|
|
8
|
+
from deepeval.models import DeepEvalBaseMLLM
|
|
9
|
+
from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
|
|
10
|
+
from deepeval.test_case import MLLMImage
|
|
11
|
+
from deepeval.models.llms.openai_model import (
|
|
12
|
+
structured_outputs_models,
|
|
13
|
+
json_mode_models,
|
|
14
|
+
model_pricing,
|
|
15
|
+
)
|
|
16
|
+
from deepeval.models.retry_policy import (
|
|
17
|
+
create_retry_decorator,
|
|
18
|
+
sdk_retries_for,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from deepeval.models.llms.utils import trim_and_load_json
|
|
22
|
+
from deepeval.models.utils import parse_model_name
|
|
23
|
+
from deepeval.constants import ProviderSlug as PS
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
retry_azure = create_retry_decorator(PS.AZURE)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class MultimodalAzureOpenAIMLLMModel(DeepEvalBaseMLLM):
|
|
30
|
+
def __init__(
|
|
31
|
+
self,
|
|
32
|
+
deployment_name: Optional[str] = None,
|
|
33
|
+
model_name: Optional[str] = None,
|
|
34
|
+
azure_openai_api_key: Optional[str] = None,
|
|
35
|
+
openai_api_version: Optional[str] = None,
|
|
36
|
+
azure_endpoint: Optional[str] = None,
|
|
37
|
+
temperature: float = 0,
|
|
38
|
+
generation_kwargs: Optional[Dict] = None,
|
|
39
|
+
**kwargs,
|
|
40
|
+
):
|
|
41
|
+
# fetch Azure deployment parameters
|
|
42
|
+
model_name = model_name or KEY_FILE_HANDLER.fetch_data(
|
|
43
|
+
ModelKeyValues.AZURE_MODEL_NAME
|
|
44
|
+
)
|
|
45
|
+
self.deployment_name = deployment_name or KEY_FILE_HANDLER.fetch_data(
|
|
46
|
+
ModelKeyValues.AZURE_DEPLOYMENT_NAME
|
|
47
|
+
)
|
|
48
|
+
self.azure_openai_api_key = (
|
|
49
|
+
azure_openai_api_key
|
|
50
|
+
or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.AZURE_OPENAI_API_KEY)
|
|
51
|
+
)
|
|
52
|
+
self.openai_api_version = (
|
|
53
|
+
openai_api_version
|
|
54
|
+
or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_VERSION)
|
|
55
|
+
)
|
|
56
|
+
self.azure_endpoint = azure_endpoint or KEY_FILE_HANDLER.fetch_data(
|
|
57
|
+
ModelKeyValues.AZURE_OPENAI_ENDPOINT
|
|
58
|
+
)
|
|
59
|
+
if temperature < 0:
|
|
60
|
+
raise ValueError("Temperature must be >= 0.")
|
|
61
|
+
self.temperature = temperature
|
|
62
|
+
|
|
63
|
+
# args and kwargs will be passed to the underlying model, in load_model function
|
|
64
|
+
self.kwargs = kwargs
|
|
65
|
+
self.generation_kwargs = generation_kwargs or {}
|
|
66
|
+
super().__init__(parse_model_name(model_name))
|
|
67
|
+
|
|
68
|
+
###############################################
|
|
69
|
+
# Generate functions
|
|
70
|
+
###############################################
|
|
71
|
+
|
|
72
|
+
@retry_azure
|
|
73
|
+
def generate(
|
|
74
|
+
self,
|
|
75
|
+
multimodal_input: List[Union[str, MLLMImage]],
|
|
76
|
+
schema: Optional[BaseModel] = None,
|
|
77
|
+
) -> Tuple[Union[str, BaseModel], float]:
|
|
78
|
+
client = self.load_model(async_mode=False)
|
|
79
|
+
prompt = self.generate_prompt(multimodal_input)
|
|
80
|
+
|
|
81
|
+
if schema:
|
|
82
|
+
if self.model_name in structured_outputs_models:
|
|
83
|
+
messages = [{"role": "user", "content": prompt}]
|
|
84
|
+
completion = client.beta.chat.completions.parse(
|
|
85
|
+
model=self.deployment_name,
|
|
86
|
+
messages=messages,
|
|
87
|
+
response_format=schema,
|
|
88
|
+
temperature=self.temperature,
|
|
89
|
+
)
|
|
90
|
+
structured_output: BaseModel = completion.choices[
|
|
91
|
+
0
|
|
92
|
+
].message.parsed
|
|
93
|
+
cost = self.calculate_cost(
|
|
94
|
+
completion.usage.prompt_tokens,
|
|
95
|
+
completion.usage.completion_tokens,
|
|
96
|
+
)
|
|
97
|
+
return structured_output, cost
|
|
98
|
+
if self.model_name in json_mode_models:
|
|
99
|
+
messages = [{"role": "user", "content": prompt}]
|
|
100
|
+
completion = client.beta.chat.completions.parse(
|
|
101
|
+
model=self.deployment_name,
|
|
102
|
+
messages=messages,
|
|
103
|
+
response_format={"type": "json_object"},
|
|
104
|
+
temperature=self.temperature,
|
|
105
|
+
)
|
|
106
|
+
json_output = trim_and_load_json(
|
|
107
|
+
completion.choices[0].message.content
|
|
108
|
+
)
|
|
109
|
+
cost = self.calculate_cost(
|
|
110
|
+
completion.usage.prompt_tokens,
|
|
111
|
+
completion.usage.completion_tokens,
|
|
112
|
+
)
|
|
113
|
+
return schema.model_validate(json_output), cost
|
|
114
|
+
print("Loading model client:")
|
|
115
|
+
print(client.base_url)
|
|
116
|
+
completion = client.chat.completions.create(
|
|
117
|
+
model=self.deployment_name,
|
|
118
|
+
messages=[{"role": "user", "content": prompt}],
|
|
119
|
+
temperature=self.temperature,
|
|
120
|
+
**self.generation_kwargs,
|
|
121
|
+
)
|
|
122
|
+
output = completion.choices[0].message.content
|
|
123
|
+
cost = self.calculate_cost(
|
|
124
|
+
completion.usage.prompt_tokens, completion.usage.completion_tokens
|
|
125
|
+
)
|
|
126
|
+
if schema:
|
|
127
|
+
json_output = trim_and_load_json(output)
|
|
128
|
+
return schema.model_validate(json_output), cost
|
|
129
|
+
else:
|
|
130
|
+
return output, cost
|
|
131
|
+
|
|
132
|
+
@retry_azure
|
|
133
|
+
async def a_generate(
|
|
134
|
+
self,
|
|
135
|
+
multimodal_input: List[Union[str, MLLMImage]],
|
|
136
|
+
schema: Optional[BaseModel] = None,
|
|
137
|
+
) -> Tuple[Union[str, BaseModel], float]:
|
|
138
|
+
client = self.load_model(async_mode=True)
|
|
139
|
+
prompt = self.generate_prompt(multimodal_input)
|
|
140
|
+
|
|
141
|
+
if schema:
|
|
142
|
+
if self.model_name in structured_outputs_models:
|
|
143
|
+
messages = [{"role": "user", "content": prompt}]
|
|
144
|
+
completion = await client.beta.chat.completions.parse(
|
|
145
|
+
model=self.deployment_name,
|
|
146
|
+
messages=messages,
|
|
147
|
+
response_format=schema,
|
|
148
|
+
temperature=self.temperature,
|
|
149
|
+
)
|
|
150
|
+
structured_output: BaseModel = completion.choices[
|
|
151
|
+
0
|
|
152
|
+
].message.parsed
|
|
153
|
+
cost = self.calculate_cost(
|
|
154
|
+
completion.usage.prompt_tokens,
|
|
155
|
+
completion.usage.completion_tokens,
|
|
156
|
+
)
|
|
157
|
+
return structured_output, cost
|
|
158
|
+
if self.model_name in json_mode_models:
|
|
159
|
+
messages = [{"role": "user", "content": prompt}]
|
|
160
|
+
completion = await client.beta.chat.completions.parse(
|
|
161
|
+
model=self.deployment_name,
|
|
162
|
+
messages=messages,
|
|
163
|
+
response_format={"type": "json_object"},
|
|
164
|
+
temperature=self.temperature,
|
|
165
|
+
**self.generation_kwargs,
|
|
166
|
+
)
|
|
167
|
+
json_output = trim_and_load_json(
|
|
168
|
+
completion.choices[0].message.content
|
|
169
|
+
)
|
|
170
|
+
cost = self.calculate_cost(
|
|
171
|
+
completion.usage.prompt_tokens,
|
|
172
|
+
completion.usage.completion_tokens,
|
|
173
|
+
)
|
|
174
|
+
return schema.model_validate(json_output), cost
|
|
175
|
+
|
|
176
|
+
completion = await client.chat.completions.create(
|
|
177
|
+
model=self.deployment_name,
|
|
178
|
+
messages=[{"role": "user", "content": prompt}],
|
|
179
|
+
temperature=self.temperature,
|
|
180
|
+
**self.generation_kwargs,
|
|
181
|
+
)
|
|
182
|
+
output = completion.choices[0].message.content
|
|
183
|
+
cost = self.calculate_cost(
|
|
184
|
+
completion.usage.prompt_tokens,
|
|
185
|
+
completion.usage.completion_tokens,
|
|
186
|
+
)
|
|
187
|
+
if schema:
|
|
188
|
+
json_output = trim_and_load_json(output)
|
|
189
|
+
return schema.model_validate(json_output), cost
|
|
190
|
+
else:
|
|
191
|
+
return output, cost
|
|
192
|
+
|
|
193
|
+
###############################################
|
|
194
|
+
# Other generate functions
|
|
195
|
+
###############################################
|
|
196
|
+
|
|
197
|
+
@retry_azure
|
|
198
|
+
def generate_raw_response(
|
|
199
|
+
self,
|
|
200
|
+
multimodal_input: List[Union[str, MLLMImage]],
|
|
201
|
+
top_logprobs: int = 5,
|
|
202
|
+
) -> Tuple[ChatCompletion, float]:
|
|
203
|
+
client = self.load_model(async_mode=False)
|
|
204
|
+
prompt = self.generate_prompt(multimodal_input)
|
|
205
|
+
messages = [{"role": "user", "content": prompt}]
|
|
206
|
+
|
|
207
|
+
# Generate completion
|
|
208
|
+
completion = client.chat.completions.create(
|
|
209
|
+
model=self.deployment_name,
|
|
210
|
+
messages=messages,
|
|
211
|
+
temperature=self.temperature,
|
|
212
|
+
logprobs=True,
|
|
213
|
+
top_logprobs=top_logprobs,
|
|
214
|
+
**self.generation_kwargs,
|
|
215
|
+
)
|
|
216
|
+
# Cost calculation
|
|
217
|
+
input_tokens = completion.usage.prompt_tokens
|
|
218
|
+
output_tokens = completion.usage.completion_tokens
|
|
219
|
+
cost = self.calculate_cost(input_tokens, output_tokens)
|
|
220
|
+
|
|
221
|
+
return completion, cost
|
|
222
|
+
|
|
223
|
+
@retry_azure
|
|
224
|
+
async def a_generate_raw_response(
|
|
225
|
+
self,
|
|
226
|
+
multimodal_input: List[Union[str, MLLMImage]],
|
|
227
|
+
top_logprobs: int = 5,
|
|
228
|
+
) -> Tuple[ChatCompletion, float]:
|
|
229
|
+
client = self.load_model(async_mode=True)
|
|
230
|
+
prompt = self.generate_prompt(multimodal_input)
|
|
231
|
+
messages = [{"role": "user", "content": prompt}]
|
|
232
|
+
|
|
233
|
+
# Generate completion
|
|
234
|
+
completion = await client.chat.completions.create(
|
|
235
|
+
model=self.deployment_name,
|
|
236
|
+
messages=messages,
|
|
237
|
+
temperature=self.temperature,
|
|
238
|
+
logprobs=True,
|
|
239
|
+
top_logprobs=top_logprobs,
|
|
240
|
+
**self.generation_kwargs,
|
|
241
|
+
)
|
|
242
|
+
# Cost calculation
|
|
243
|
+
input_tokens = completion.usage.prompt_tokens
|
|
244
|
+
output_tokens = completion.usage.completion_tokens
|
|
245
|
+
cost = self.calculate_cost(input_tokens, output_tokens)
|
|
246
|
+
|
|
247
|
+
return completion, cost
|
|
248
|
+
|
|
249
|
+
###############################################
|
|
250
|
+
# Utilities
|
|
251
|
+
###############################################
|
|
252
|
+
|
|
253
|
+
def generate_prompt(
|
|
254
|
+
self, multimodal_input: List[Union[str, MLLMImage]] = []
|
|
255
|
+
):
|
|
256
|
+
"""Convert multimodal input into the proper message format for Azure OpenAI."""
|
|
257
|
+
prompt = []
|
|
258
|
+
for ele in multimodal_input:
|
|
259
|
+
if isinstance(ele, str):
|
|
260
|
+
prompt.append({"type": "text", "text": ele})
|
|
261
|
+
elif isinstance(ele, MLLMImage):
|
|
262
|
+
if ele.local:
|
|
263
|
+
import PIL.Image
|
|
264
|
+
|
|
265
|
+
image = PIL.Image.open(ele.url)
|
|
266
|
+
visual_dict = {
|
|
267
|
+
"type": "image_url",
|
|
268
|
+
"image_url": {
|
|
269
|
+
"url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
|
|
270
|
+
},
|
|
271
|
+
}
|
|
272
|
+
else:
|
|
273
|
+
visual_dict = {
|
|
274
|
+
"type": "image_url",
|
|
275
|
+
"image_url": {"url": ele.url},
|
|
276
|
+
}
|
|
277
|
+
prompt.append(visual_dict)
|
|
278
|
+
return prompt
|
|
279
|
+
|
|
280
|
+
def encode_pil_image(self, pil_image):
|
|
281
|
+
"""Encode a PIL image to base64 string."""
|
|
282
|
+
image_buffer = BytesIO()
|
|
283
|
+
if pil_image.mode in ("RGBA", "LA", "P"):
|
|
284
|
+
pil_image = pil_image.convert("RGB")
|
|
285
|
+
pil_image.save(image_buffer, format="JPEG")
|
|
286
|
+
image_bytes = image_buffer.getvalue()
|
|
287
|
+
base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
|
|
288
|
+
return base64_encoded_image
|
|
289
|
+
|
|
290
|
+
def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
|
|
291
|
+
pricing = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
|
|
292
|
+
input_cost = input_tokens * pricing["input"]
|
|
293
|
+
output_cost = output_tokens * pricing["output"]
|
|
294
|
+
return input_cost + output_cost
|
|
295
|
+
|
|
296
|
+
###############################################
|
|
297
|
+
# Model
|
|
298
|
+
###############################################
|
|
299
|
+
|
|
300
|
+
def get_model_name(self):
|
|
301
|
+
return f"Azure OpenAI ({self.model_name})"
|
|
302
|
+
|
|
303
|
+
def load_model(self, async_mode: bool = False):
|
|
304
|
+
if not async_mode:
|
|
305
|
+
return self._build_client(AzureOpenAI)
|
|
306
|
+
return self._build_client(AsyncAzureOpenAI)
|
|
307
|
+
|
|
308
|
+
def _client_kwargs(self) -> Dict:
|
|
309
|
+
"""
|
|
310
|
+
If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
|
|
311
|
+
If the user opts into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
|
|
312
|
+
leave their retry settings as is.
|
|
313
|
+
"""
|
|
314
|
+
kwargs = dict(self.kwargs or {})
|
|
315
|
+
if not sdk_retries_for(PS.AZURE):
|
|
316
|
+
kwargs["max_retries"] = 0
|
|
317
|
+
return kwargs
|
|
318
|
+
|
|
319
|
+
def _build_client(self, cls):
|
|
320
|
+
kw = dict(
|
|
321
|
+
api_key=self.azure_openai_api_key,
|
|
322
|
+
api_version=self.openai_api_version,
|
|
323
|
+
azure_endpoint=self.azure_endpoint,
|
|
324
|
+
azure_deployment=self.deployment_name,
|
|
325
|
+
**self._client_kwargs(),
|
|
326
|
+
)
|
|
327
|
+
try:
|
|
328
|
+
return cls(**kw)
|
|
329
|
+
except TypeError as e:
|
|
330
|
+
# older OpenAI SDKs may not accept max_retries, in that case remove and retry once
|
|
331
|
+
if "max_retries" in str(e):
|
|
332
|
+
kw.pop("max_retries", None)
|
|
333
|
+
return cls(**kw)
|
|
334
|
+
raise
|
deepeval/tracing/context.py
CHANGED
|
@@ -73,6 +73,7 @@ def update_current_trace(
|
|
|
73
73
|
tools_called: Optional[List[ToolCall]] = None,
|
|
74
74
|
expected_tools: Optional[List[ToolCall]] = None,
|
|
75
75
|
test_case: Optional[LLMTestCase] = None,
|
|
76
|
+
confident_api_key: Optional[str] = None,
|
|
76
77
|
):
|
|
77
78
|
current_trace = current_trace_context.get()
|
|
78
79
|
if not current_trace:
|
|
@@ -109,6 +110,8 @@ def update_current_trace(
|
|
|
109
110
|
current_trace.tools_called = tools_called
|
|
110
111
|
if expected_tools:
|
|
111
112
|
current_trace.expected_tools = expected_tools
|
|
113
|
+
if confident_api_key:
|
|
114
|
+
current_trace.confident_api_key = confident_api_key
|
|
112
115
|
|
|
113
116
|
|
|
114
117
|
def update_llm_span(
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -441,11 +441,11 @@ class TraceManager:
|
|
|
441
441
|
loop = asyncio.new_event_loop()
|
|
442
442
|
asyncio.set_event_loop(loop)
|
|
443
443
|
|
|
444
|
-
# buffer for
|
|
445
|
-
|
|
444
|
+
# buffer for traces that need to be sent after main exits
|
|
445
|
+
remaining_traces: List[TraceApi] = []
|
|
446
446
|
|
|
447
447
|
async def _a_send_trace(trace_obj):
|
|
448
|
-
nonlocal
|
|
448
|
+
nonlocal remaining_traces
|
|
449
449
|
try:
|
|
450
450
|
# Build API object & payload
|
|
451
451
|
if isinstance(trace_obj, TraceApi):
|
|
@@ -486,7 +486,7 @@ class TraceManager:
|
|
|
486
486
|
)
|
|
487
487
|
elif self._flush_enabled:
|
|
488
488
|
# Main thread gone → to be flushed
|
|
489
|
-
|
|
489
|
+
remaining_traces.append(trace_api)
|
|
490
490
|
|
|
491
491
|
except Exception as e:
|
|
492
492
|
queue_size = self._trace_queue.qsize()
|
|
@@ -544,24 +544,35 @@ class TraceManager:
|
|
|
544
544
|
loop.run_until_complete(
|
|
545
545
|
asyncio.gather(*pending, return_exceptions=True)
|
|
546
546
|
)
|
|
547
|
-
self.flush_traces(
|
|
547
|
+
self.flush_traces(remaining_traces)
|
|
548
548
|
loop.run_until_complete(loop.shutdown_asyncgens())
|
|
549
549
|
loop.close()
|
|
550
550
|
|
|
551
|
-
def flush_traces(
|
|
552
|
-
self, remaining_trace_request_bodies: List[Dict[str, Any]]
|
|
553
|
-
):
|
|
551
|
+
def flush_traces(self, remaining_traces: List[TraceApi]):
|
|
554
552
|
if not tracing_enabled() or not self.tracing_enabled:
|
|
555
553
|
return
|
|
556
554
|
|
|
557
555
|
self._print_trace_status(
|
|
558
556
|
TraceWorkerStatus.WARNING,
|
|
559
|
-
message=f"Flushing {len(
|
|
557
|
+
message=f"Flushing {len(remaining_traces)} remaining trace(s)",
|
|
560
558
|
)
|
|
561
|
-
for
|
|
559
|
+
for trace_api in remaining_traces:
|
|
562
560
|
with capture_send_trace():
|
|
563
561
|
try:
|
|
564
|
-
|
|
562
|
+
try:
|
|
563
|
+
body = trace_api.model_dump(
|
|
564
|
+
by_alias=True,
|
|
565
|
+
exclude_none=True,
|
|
566
|
+
)
|
|
567
|
+
except AttributeError:
|
|
568
|
+
# Pydantic version below 2.0
|
|
569
|
+
body = trace_api.dict(by_alias=True, exclude_none=True)
|
|
570
|
+
|
|
571
|
+
body = make_json_serializable(body)
|
|
572
|
+
if trace_api.confident_api_key:
|
|
573
|
+
api = Api(api_key=trace_api.confident_api_key)
|
|
574
|
+
else:
|
|
575
|
+
api = Api(api_key=self.confident_api_key)
|
|
565
576
|
|
|
566
577
|
_, link = api.send_request(
|
|
567
578
|
method=HttpMethods.POST,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
deepeval/__init__.py,sha256=tle4lT4FONApg3OeztGPEdrpGMEGLWajyGTu7bEd3s0,2976
|
|
2
|
-
deepeval/_version.py,sha256=
|
|
2
|
+
deepeval/_version.py,sha256=neKNug0TPAnGHtzNXdePXNla9tw6mxKTmj9WJu2trY8,27
|
|
3
3
|
deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
|
|
4
4
|
deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
|
|
5
5
|
deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
|
|
@@ -96,7 +96,7 @@ deepeval/benchmarks/hellaswag/hellaswag.py,sha256=_3felzBwQUhhRXk4D9NbcY8dme_qUQ
|
|
|
96
96
|
deepeval/benchmarks/hellaswag/task.py,sha256=LfO8T6bpNiwdM8VdubKrup7qje3-rHgu69iB6Sdsc6I,7323
|
|
97
97
|
deepeval/benchmarks/hellaswag/template.py,sha256=TcCu25hkl89qbRwcEyRVGTGp7DU_5Eph754W2znk5QY,1279
|
|
98
98
|
deepeval/benchmarks/human_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
99
|
-
deepeval/benchmarks/human_eval/human_eval.py,sha256=
|
|
99
|
+
deepeval/benchmarks/human_eval/human_eval.py,sha256=1xMVLQ1rQccY9Ac7BB6_1dFjo1QLK7DEMg0QXY2ybRM,7756
|
|
100
100
|
deepeval/benchmarks/human_eval/task.py,sha256=lEHJpEiRbw5cXUKA_id0J5gQwae1G1T1JCJAeeTpXGg,5412
|
|
101
101
|
deepeval/benchmarks/human_eval/template.py,sha256=rcCHSb0wP_FS9DQPaoBn-iwgicI1OyEdFCkZLQ1vxPk,647
|
|
102
102
|
deepeval/benchmarks/ifeval/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
@@ -153,11 +153,11 @@ deepeval/constants.py,sha256=J5rNXGsMKTFYJ_9Wi49qchZXuUityZjnvuy3I3TO5zk,1667
|
|
|
153
153
|
deepeval/contextvars.py,sha256=oqXtuYiKd4Zvc1rNoR1gcRBxzZYCGTMVn7XostwvkRI,524
|
|
154
154
|
deepeval/dataset/__init__.py,sha256=N2c-rkuxWYiiJSOZArw0H02Cwo7cnfzFuNYJlvsIBEg,249
|
|
155
155
|
deepeval/dataset/api.py,sha256=ZxkEqAF4nZH_Ys_1f5r9N2LFI_vBcAJxt8eJm7Mplpw,831
|
|
156
|
-
deepeval/dataset/dataset.py,sha256=
|
|
156
|
+
deepeval/dataset/dataset.py,sha256=Y9U-hVoa5BbnlzwJiFiDTkDcp9E6VmKOd7NtyLmdpHY,59182
|
|
157
157
|
deepeval/dataset/golden.py,sha256=T-rTk4Hw1tANx_Iimv977F6Y4QK3s5OIB4PecU5FJDM,2338
|
|
158
158
|
deepeval/dataset/test_run_tracer.py,sha256=5CdpDvhzkEEBRyqWi6egocaxiN6IRS3XfbACxEQZQeM,2544
|
|
159
159
|
deepeval/dataset/types.py,sha256=CWeOIBPK2WdmRUqjFa9gfN-w2da0r8Ilzl3ToDpJQoQ,558
|
|
160
|
-
deepeval/dataset/utils.py,sha256=
|
|
160
|
+
deepeval/dataset/utils.py,sha256=nWCNmD1kyLwvlCXlN-7XiqN2W7IUOkDckc1xl32MF-U,8042
|
|
161
161
|
deepeval/errors.py,sha256=FfhtULNIQqHpKVqCr-xlvTtLxkNj40qVU89sXYKuDrA,754
|
|
162
162
|
deepeval/evaluate/__init__.py,sha256=315IaMiYEz7oJhZ4kPTBfeCNd1xF-wWVU6KOQnrKQpE,291
|
|
163
163
|
deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
|
|
@@ -394,11 +394,11 @@ deepeval/metrics/turn_relevancy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
394
394
|
deepeval/metrics/turn_relevancy/schema.py,sha256=om0zFJcM6qu2GWS9aJTP3lUmuEXX8KpoACEvCsJqfq4,234
|
|
395
395
|
deepeval/metrics/turn_relevancy/template.py,sha256=klZ10QI8jo4ekf-KgcWgRxS9E3AK4vgKDNzjwAYGl48,2797
|
|
396
396
|
deepeval/metrics/turn_relevancy/turn_relevancy.py,sha256=cgMt0toBIwzDc8lE8Q3YztzQA_DqR4GfdDrlyX7ya6w,10385
|
|
397
|
-
deepeval/metrics/utils.py,sha256=
|
|
397
|
+
deepeval/metrics/utils.py,sha256=gEEETXaug997fqrVW2Suceuaw1MgZAYMF4wT15Mu2Z8,18920
|
|
398
398
|
deepeval/model_integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
399
399
|
deepeval/model_integrations/types.py,sha256=rbVMhC_2yWwD6JqzkRO9D7aMVC_KtXN686G_S7de7S8,630
|
|
400
400
|
deepeval/model_integrations/utils.py,sha256=Zt9SYPgTxlGsQFZgpZvh_a5fWuL8mmIFVSe6uoQywZ4,3562
|
|
401
|
-
deepeval/models/__init__.py,sha256=
|
|
401
|
+
deepeval/models/__init__.py,sha256=7vANBeNkDUADNzP8cyZUoQVmExEEODlHWGcj_9ik9A8,1269
|
|
402
402
|
deepeval/models/_summac_model.py,sha256=xflanxl_IBuzuainlYCVX7UvjHCnAckKSvNR2NwZI6k,19750
|
|
403
403
|
deepeval/models/answer_relevancy_model.py,sha256=SLOA6uUImNOuxpPGfTg2AH7MIkf9QsotYixvI1jcVC8,2197
|
|
404
404
|
deepeval/models/base_model.py,sha256=owmHhVBppPe5Zt6GK9p87dE31hoqtn5_8F9TWr97aRE,4112
|
|
@@ -422,7 +422,8 @@ deepeval/models/llms/local_model.py,sha256=hEyKVA6pkQm9dICUKsMNgjVI3w6gnyMdmBt_E
|
|
|
422
422
|
deepeval/models/llms/ollama_model.py,sha256=xPO4d4jMY-cQAyHAcMuFvWS8JMWwCUbKP9CMi838Nuc,3307
|
|
423
423
|
deepeval/models/llms/openai_model.py,sha256=1rjwbyt87fK03pw7r5tq3PjUVfl2EWllAssGyy6Dt2A,17494
|
|
424
424
|
deepeval/models/llms/utils.py,sha256=gFM_8eIvdSwN_D4Yqp-j7PkfoiRn_bgu7tlCHol3A6c,1324
|
|
425
|
-
deepeval/models/mlllms/__init__.py,sha256=
|
|
425
|
+
deepeval/models/mlllms/__init__.py,sha256=EgFWQcqVPhIqb90QCtIH2Np0bLSRj_beaymbuRN7Ds8,200
|
|
426
|
+
deepeval/models/mlllms/azure_model.py,sha256=pMpLAKZypVKMOAtyHToX8WgCcbbA8hXa2sd16DhCKEM,12696
|
|
426
427
|
deepeval/models/mlllms/gemini_model.py,sha256=7tHIWD4w_fBz3L7jkKWygn1QpBPk9nl2Kw-yb0Jc3PI,10167
|
|
427
428
|
deepeval/models/mlllms/ollama_model.py,sha256=_YtYtw8oIMVVI-CFsDicsdeEJUPhw_9ArPxB_1olsJA,4798
|
|
428
429
|
deepeval/models/mlllms/openai_model.py,sha256=KgvYgQwWZ1A_Gcl6-4-W7IMqbUF9K8sNY37j5Ag7kQQ,9014
|
|
@@ -487,7 +488,7 @@ deepeval/test_run/hyperparameters.py,sha256=4yJkNgwL2y6eyWDTmUV62f5RUlfOui4R22ws
|
|
|
487
488
|
deepeval/test_run/test_run.py,sha256=mStiBQZkgktwF99FRqkT_-UFrMT0m06X20TEEO6V2Bc,41278
|
|
488
489
|
deepeval/tracing/__init__.py,sha256=aSOk_ZgL-K7CZzcyiaIa5peAiaPViDST5GhpHA3Adc8,614
|
|
489
490
|
deepeval/tracing/api.py,sha256=GbtpUDdGpchl6rPXtZT6IBKjAhwux6qOlKLdP3dRVHU,4996
|
|
490
|
-
deepeval/tracing/context.py,sha256=
|
|
491
|
+
deepeval/tracing/context.py,sha256=v4uzd0N2H8mNntPwZvL49ya2kW9FvjqQqWXJFgE5d0c,5469
|
|
491
492
|
deepeval/tracing/offline_evals/__init__.py,sha256=bEniJAl7PmS9u2ksiOTfHtlCPJ9_CJV5R6umrUOX5MM,102
|
|
492
493
|
deepeval/tracing/offline_evals/api.py,sha256=eBfqh2uWyeRkIeGhjrN1bTQzAEow-XPubs-42WEZ2QQ,510
|
|
493
494
|
deepeval/tracing/offline_evals/span.py,sha256=pXqTVXs-WnjRVpCYYEbNe0zSM6Wz9GsKHsM5ZcWxrmM,1802
|
|
@@ -501,12 +502,12 @@ deepeval/tracing/patchers.py,sha256=c-8Fjc5VIWB5VD9ONKq735ypW6O1pZIFQWsHR3lRh0E,
|
|
|
501
502
|
deepeval/tracing/perf_epoch_bridge.py,sha256=iyAPddB6Op7NpMtPHJ29lDm53Btz9yLaN6xSCfTRQm4,1825
|
|
502
503
|
deepeval/tracing/trace_context.py,sha256=Z0n0Cu1A5g9dXiZnzTFO5TzeOYHKeNuO6v3_EU_Gi_c,3568
|
|
503
504
|
deepeval/tracing/trace_test_manager.py,sha256=wt4y7EWTRc4Bw938-UFFtXHkdFFOrnx6JaIk7J5Iulw,555
|
|
504
|
-
deepeval/tracing/tracing.py,sha256=
|
|
505
|
+
deepeval/tracing/tracing.py,sha256=dx4JpiixkscEaYBR0LFpYCjeGgfZqiVlRCD-HblMo6g,46459
|
|
505
506
|
deepeval/tracing/types.py,sha256=WhnxefUc5I8jcAOBQ-tsZ8_zVZfGqSvCWHD5XUN6Ggw,6040
|
|
506
507
|
deepeval/tracing/utils.py,sha256=mdvhYAxDNsdnusaEXJd-c-_O2Jn6S3xSuzRvLO1Jz4U,5684
|
|
507
508
|
deepeval/utils.py,sha256=zy9RR0bt3YMzWVlJc5Rl6eU5RyeW2uEjMfwD1sdgPr4,23234
|
|
508
|
-
deepeval-3.7.
|
|
509
|
-
deepeval-3.7.
|
|
510
|
-
deepeval-3.7.
|
|
511
|
-
deepeval-3.7.
|
|
512
|
-
deepeval-3.7.
|
|
509
|
+
deepeval-3.7.3.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
|
|
510
|
+
deepeval-3.7.3.dist-info/METADATA,sha256=LvW_cE214Ta8lMgI-eQC3cnlO0c6yHyYw1L4B4AHxaM,18743
|
|
511
|
+
deepeval-3.7.3.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
|
|
512
|
+
deepeval-3.7.3.dist-info/entry_points.txt,sha256=NoismUQfwLOojSGZmBrdcpwfaoFRAzUhBvZD3UwOKog,95
|
|
513
|
+
deepeval-3.7.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|