deepeval 3.7.1__py3-none-any.whl → 3.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/human_eval/human_eval.py +2 -1
- deepeval/dataset/dataset.py +35 -11
- deepeval/dataset/utils.py +2 -0
- deepeval/evaluate/compare.py +6 -2
- deepeval/metrics/utils.py +3 -0
- deepeval/models/__init__.py +2 -0
- deepeval/models/mlllms/__init__.py +1 -0
- deepeval/models/mlllms/azure_model.py +334 -0
- deepeval/synthesizer/config.py +9 -0
- deepeval/synthesizer/schema.py +23 -0
- deepeval/synthesizer/synthesizer.py +1137 -2
- deepeval/synthesizer/templates/__init__.py +11 -2
- deepeval/synthesizer/templates/template.py +554 -1
- deepeval/synthesizer/templates/template_extraction.py +32 -0
- deepeval/synthesizer/templates/template_prompt.py +262 -0
- deepeval/tracing/context.py +3 -0
- deepeval/tracing/tracing.py +22 -11
- {deepeval-3.7.1.dist-info → deepeval-3.7.3.dist-info}/METADATA +2 -1
- {deepeval-3.7.1.dist-info → deepeval-3.7.3.dist-info}/RECORD +23 -22
- {deepeval-3.7.1.dist-info → deepeval-3.7.3.dist-info}/entry_points.txt +1 -1
- {deepeval-3.7.1.dist-info → deepeval-3.7.3.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.1.dist-info → deepeval-3.7.3.dist-info}/WHEEL +0 -0
deepeval/_version.py
CHANGED
@@ -1 +1 @@
-__version__: str = "3.7.1"
+__version__: str = "3.7.3"

deepeval/benchmarks/human_eval/human_eval.py
CHANGED

@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
         self.predictions: Optional[pd.DataFrame] = None
         self.task_scores: Optional[pd.DataFrame] = None
         self.overall_score: Optional[float] = None
-        self.verbose_mode: bool =
+        self.verbose_mode: bool = verbose_mode

     def evaluate(
         self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
                     task.value,
                     golden.input,
                     prediction,
+                    task_correct,
                     golden.expected_output,
                     score,
                 )

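The `verbose_mode` change means the flag passed to the `HumanEval` constructor is now actually stored on the instance instead of being dropped. A minimal, hedged usage sketch (the constructor arguments besides `verbose_mode` and `evaluate`'s `k` are assumptions, not shown in this diff):

```python
# Hedged sketch: HumanEval now persists verbose_mode, so the flag controls
# per-problem logging during evaluate(). "my_model" is a placeholder for any
# DeepEvalBaseLLM implementation.
from deepeval.benchmarks import HumanEval

benchmark = HumanEval(verbose_mode=False)
# benchmark.evaluate(model=my_model, k=1)
```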
deepeval/dataset/dataset.py
CHANGED
@@ -189,17 +189,35 @@ class EvaluationDataset:
         test_case._dataset_alias = self._alias
         test_case._dataset_id = self._id
         if isinstance(test_case, LLMTestCase):
+            if self._conversational_goldens or self._conversational_test_cases:
+                raise TypeError(
+                    "You cannot add 'LLMTestCase' to a multi-turn dataset."
+                )
             test_case._dataset_rank = len(self._llm_test_cases)
             self._llm_test_cases.append(test_case)
         elif isinstance(test_case, ConversationalTestCase):
+            if self._goldens or self._llm_test_cases:
+                raise TypeError(
+                    "You cannot add 'ConversationalTestCase' to a single-turn dataset."
+                )
+            self._multi_turn = True
             test_case._dataset_rank = len(self._conversational_test_cases)
             self._conversational_test_cases.append(test_case)

     def add_golden(self, golden: Union[Golden, ConversationalGolden]):
-        if
-        self.
-
+        if isinstance(golden, Golden):
+            if self._conversational_goldens or self._conversational_test_cases:
+                raise TypeError(
+                    "You cannot add 'Golden' to a multi-turn dataset."
+                )
             self._add_golden(golden)
+        else:
+            if self._goldens or self._llm_test_cases:
+                raise TypeError(
+                    "You cannot add 'ConversationalGolden' to a single-turn dataset."
+                )
+            self._multi_turn = True
+            self._add_conversational_golden(golden)

     def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
         if isinstance(golden, Golden):
@@ -224,16 +242,16 @@ class EvaluationDataset:
         file_path: str,
         input_col_name: str,
         actual_output_col_name: str,
-        expected_output_col_name: Optional[str] =
-        context_col_name: Optional[str] =
+        expected_output_col_name: Optional[str] = "expected_output",
+        context_col_name: Optional[str] = "context",
         context_col_delimiter: str = ";",
-        retrieval_context_col_name: Optional[str] =
+        retrieval_context_col_name: Optional[str] = "retrieval_context",
         retrieval_context_col_delimiter: str = ";",
-        tools_called_col_name: Optional[str] =
+        tools_called_col_name: Optional[str] = "tools_called",
         tools_called_col_delimiter: str = ";",
-        expected_tools_col_name: Optional[str] =
+        expected_tools_col_name: Optional[str] = "expected_tools",
         expected_tools_col_delimiter: str = ";",
-        additional_metadata_col_name: Optional[str] =
+        additional_metadata_col_name: Optional[str] = "additional_metadata",
     ):
         """
         Load test cases from a CSV file.
@@ -379,6 +397,7 @@ class EvaluationDataset:
         retrieval_context_key_name: Optional[str] = None,
         tools_called_key_name: Optional[str] = None,
         expected_tools_key_name: Optional[str] = None,
+        addtional_metadata_key_name: Optional[str] = None,
         encoding_type: str = "utf-8",
     ):
         """
@@ -431,6 +450,7 @@ class EvaluationDataset:
             tools_called = [ToolCall(**tool) for tool in tools_called_data]
             expected_tools_data = json_obj.get(expected_tools_key_name, [])
             expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
+            # additional_metadata = json_obj.get(addtional_metadata_key_name)

             self.add_test_case(
                 LLMTestCase(
@@ -441,6 +461,7 @@ class EvaluationDataset:
                     retrieval_context=retrieval_context,
                     tools_called=tools_called,
                     expected_tools=expected_tools,
+                    # additional_metadata=additional_metadata,
                 )
             )

@@ -460,8 +481,8 @@ class EvaluationDataset:
         expected_tools_col_delimiter: str = ";",
         comments_key_name: str = "comments",
         name_key_name: str = "name",
-        source_file_col_name: Optional[str] =
-        additional_metadata_col_name: Optional[str] =
+        source_file_col_name: Optional[str] = "source_file",
+        additional_metadata_col_name: Optional[str] = "additional_metadata",
         scenario_col_name: Optional[str] = "scenario",
         turns_col_name: Optional[str] = "turns",
         expected_outcome_col_name: Optional[str] = "expected_outcome",
@@ -587,6 +608,7 @@ class EvaluationDataset:
                     context=context,
                     comments=comments,
                     name=name,
+                    additional_metadata=additional_metadata,
                 )
             )
         else:
@@ -645,6 +667,7 @@ class EvaluationDataset:
             comments = json_obj.get(comments_key_name)
             name = json_obj.get(name_key_name)
             parsed_turns = parse_turns(turns) if turns else []
+            additional_metadata = json_obj.get(additional_metadata_key_name)

             self._multi_turn = True
             self.goldens.append(
@@ -656,6 +679,7 @@ class EvaluationDataset:
                     context=context,
                     comments=comments,
                     name=name,
+                    additional_metadata=additional_metadata,
                 )
             )
         else:

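These guards make an `EvaluationDataset` strictly single-turn or multi-turn: once one kind of test case or golden has been added, adding the other kind raises a `TypeError`. A hedged sketch of the new behaviour (test-case field values are illustrative; the constructors are assumed to match deepeval's documented single- and multi-turn test case APIs):

```python
# Hedged sketch of the single-turn / multi-turn guard introduced in 3.7.3.
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase, ConversationalTestCase, Turn

dataset = EvaluationDataset()
dataset.add_test_case(LLMTestCase(input="Hi", actual_output="Hello"))

# The dataset is now single-turn, so a conversational case is rejected:
try:
    dataset.add_test_case(
        ConversationalTestCase(turns=[Turn(role="user", content="Hi")])
    )
except TypeError as e:
    print(e)  # "You cannot add 'ConversationalTestCase' to a single-turn dataset."
```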
deepeval/dataset/utils.py
CHANGED
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
             "retrieval_context": test_case.retrieval_context,
             "tools_called": test_case.tools_called,
             "expected_tools": test_case.expected_tools,
+            "additional_metadata": test_case.additional_metadata,
         }
         goldens.append(Golden(**golden))
     return goldens
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
             "expected_outcome": test_case.expected_outcome,
             "user_description": test_case.user_description,
             "context": test_case.context,
+            "additional_metadata": test_case.additional_metadata,
         }
         goldens.append(ConversationalGolden(**golden))
     return goldens

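Both converters now carry `additional_metadata` from a test case into the golden they produce. A hedged illustration; the exact signature of the internal helper is an assumption beyond the fact that it consumes test cases and returns goldens:

```python
# Hedged illustration: metadata attached to a test case now survives
# conversion into a Golden (it was dropped in 3.7.1).
from deepeval.dataset.utils import convert_test_cases_to_goldens
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What is 2 + 2?",
    actual_output="4",
    additional_metadata={"source": "math-smoke-test"},
)
goldens = convert_test_cases_to_goldens([test_case])
print(goldens[0].additional_metadata)  # {'source': 'math-smoke-test'}
```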
deepeval/evaluate/compare.py
CHANGED
@@ -502,10 +502,14 @@ def wrap_up_experiment(

     try:
         api = Api()
-        experiment_request = PostExperimentRequest(
+        experiment_request = PostExperimentRequest(
+            testRuns=test_runs, name=name
+        )

         try:
-            body = experiment_request.model_dump(
+            body = experiment_request.model_dump(
+                by_alias=True, exclude_none=True
+            )
         except AttributeError:
             body = experiment_request.dict(by_alias=True, exclude_none=True)
         json_str = json.dumps(body, cls=TestRunEncoder)

deepeval/metrics/utils.py
CHANGED
@@ -25,6 +25,7 @@ from deepeval.models import (
     MultimodalOpenAIModel,
     MultimodalGeminiModel,
     MultimodalOllamaModel,
+    MultimodalAzureOpenAIMLLMModel,
     AmazonBedrockModel,
     LiteLLMModel,
     KimiModel,
@@ -514,6 +515,8 @@ def initialize_multimodal_model(
         return MultimodalGeminiModel(), True
     if should_use_ollama_model():
         return MultimodalOllamaModel(), True
+    elif should_use_azure_openai():
+        return MultimodalAzureOpenAIMLLMModel(model_name=model), True
     elif isinstance(model, str) or model is None:
         return MultimodalOpenAIModel(model=model), True
     raise TypeError(

deepeval/models/__init__.py
CHANGED
@@ -21,6 +21,7 @@ from deepeval.models.mlllms import (
     MultimodalOpenAIModel,
     MultimodalOllamaModel,
     MultimodalGeminiModel,
+    MultimodalAzureOpenAIMLLMModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -48,6 +49,7 @@ __all__ = [
     "MultimodalOpenAIModel",
     "MultimodalOllamaModel",
     "MultimodalGeminiModel",
+    "MultimodalAzureOpenAIMLLMModel",
     "OpenAIEmbeddingModel",
     "AzureOpenAIEmbeddingModel",
     "LocalEmbeddingModel",

deepeval/models/mlllms/azure_model.py
ADDED

@@ -0,0 +1,334 @@
+from openai.types.chat.chat_completion import ChatCompletion
+from openai import AzureOpenAI, AsyncAzureOpenAI
+from typing import Optional, Tuple, Union, Dict, List
+from pydantic import BaseModel
+from io import BytesIO
+import base64
+
+from deepeval.models import DeepEvalBaseMLLM
+from deepeval.key_handler import ModelKeyValues, KEY_FILE_HANDLER
+from deepeval.test_case import MLLMImage
+from deepeval.models.llms.openai_model import (
+    structured_outputs_models,
+    json_mode_models,
+    model_pricing,
+)
+from deepeval.models.retry_policy import (
+    create_retry_decorator,
+    sdk_retries_for,
+)
+
+from deepeval.models.llms.utils import trim_and_load_json
+from deepeval.models.utils import parse_model_name
+from deepeval.constants import ProviderSlug as PS
+
+
+retry_azure = create_retry_decorator(PS.AZURE)
+
+
+class MultimodalAzureOpenAIMLLMModel(DeepEvalBaseMLLM):
+    def __init__(
+        self,
+        deployment_name: Optional[str] = None,
+        model_name: Optional[str] = None,
+        azure_openai_api_key: Optional[str] = None,
+        openai_api_version: Optional[str] = None,
+        azure_endpoint: Optional[str] = None,
+        temperature: float = 0,
+        generation_kwargs: Optional[Dict] = None,
+        **kwargs,
+    ):
+        # fetch Azure deployment parameters
+        model_name = model_name or KEY_FILE_HANDLER.fetch_data(
+            ModelKeyValues.AZURE_MODEL_NAME
+        )
+        self.deployment_name = deployment_name or KEY_FILE_HANDLER.fetch_data(
+            ModelKeyValues.AZURE_DEPLOYMENT_NAME
+        )
+        self.azure_openai_api_key = (
+            azure_openai_api_key
+            or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.AZURE_OPENAI_API_KEY)
+        )
+        self.openai_api_version = (
+            openai_api_version
+            or KEY_FILE_HANDLER.fetch_data(ModelKeyValues.OPENAI_API_VERSION)
+        )
+        self.azure_endpoint = azure_endpoint or KEY_FILE_HANDLER.fetch_data(
+            ModelKeyValues.AZURE_OPENAI_ENDPOINT
+        )
+        if temperature < 0:
+            raise ValueError("Temperature must be >= 0.")
+        self.temperature = temperature
+
+        # args and kwargs will be passed to the underlying model, in load_model function
+        self.kwargs = kwargs
+        self.generation_kwargs = generation_kwargs or {}
+        super().__init__(parse_model_name(model_name))
+
+    ###############################################
+    # Generate functions
+    ###############################################
+
+    @retry_azure
+    def generate(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        schema: Optional[BaseModel] = None,
+    ) -> Tuple[Union[str, BaseModel], float]:
+        client = self.load_model(async_mode=False)
+        prompt = self.generate_prompt(multimodal_input)
+
+        if schema:
+            if self.model_name in structured_outputs_models:
+                messages = [{"role": "user", "content": prompt}]
+                completion = client.beta.chat.completions.parse(
+                    model=self.deployment_name,
+                    messages=messages,
+                    response_format=schema,
+                    temperature=self.temperature,
+                )
+                structured_output: BaseModel = completion.choices[
+                    0
+                ].message.parsed
+                cost = self.calculate_cost(
+                    completion.usage.prompt_tokens,
+                    completion.usage.completion_tokens,
+                )
+                return structured_output, cost
+            if self.model_name in json_mode_models:
+                messages = [{"role": "user", "content": prompt}]
+                completion = client.beta.chat.completions.parse(
+                    model=self.deployment_name,
+                    messages=messages,
+                    response_format={"type": "json_object"},
+                    temperature=self.temperature,
+                )
+                json_output = trim_and_load_json(
+                    completion.choices[0].message.content
+                )
+                cost = self.calculate_cost(
+                    completion.usage.prompt_tokens,
+                    completion.usage.completion_tokens,
+                )
+                return schema.model_validate(json_output), cost
+        print("Loading model client:")
+        print(client.base_url)
+        completion = client.chat.completions.create(
+            model=self.deployment_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            **self.generation_kwargs,
+        )
+        output = completion.choices[0].message.content
+        cost = self.calculate_cost(
+            completion.usage.prompt_tokens, completion.usage.completion_tokens
+        )
+        if schema:
+            json_output = trim_and_load_json(output)
+            return schema.model_validate(json_output), cost
+        else:
+            return output, cost
+
+    @retry_azure
+    async def a_generate(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        schema: Optional[BaseModel] = None,
+    ) -> Tuple[Union[str, BaseModel], float]:
+        client = self.load_model(async_mode=True)
+        prompt = self.generate_prompt(multimodal_input)
+
+        if schema:
+            if self.model_name in structured_outputs_models:
+                messages = [{"role": "user", "content": prompt}]
+                completion = await client.beta.chat.completions.parse(
+                    model=self.deployment_name,
+                    messages=messages,
+                    response_format=schema,
+                    temperature=self.temperature,
+                )
+                structured_output: BaseModel = completion.choices[
+                    0
+                ].message.parsed
+                cost = self.calculate_cost(
+                    completion.usage.prompt_tokens,
+                    completion.usage.completion_tokens,
+                )
+                return structured_output, cost
+            if self.model_name in json_mode_models:
+                messages = [{"role": "user", "content": prompt}]
+                completion = await client.beta.chat.completions.parse(
+                    model=self.deployment_name,
+                    messages=messages,
+                    response_format={"type": "json_object"},
+                    temperature=self.temperature,
+                    **self.generation_kwargs,
+                )
+                json_output = trim_and_load_json(
+                    completion.choices[0].message.content
+                )
+                cost = self.calculate_cost(
+                    completion.usage.prompt_tokens,
+                    completion.usage.completion_tokens,
+                )
+                return schema.model_validate(json_output), cost
+
+        completion = await client.chat.completions.create(
+            model=self.deployment_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+            **self.generation_kwargs,
+        )
+        output = completion.choices[0].message.content
+        cost = self.calculate_cost(
+            completion.usage.prompt_tokens,
+            completion.usage.completion_tokens,
+        )
+        if schema:
+            json_output = trim_and_load_json(output)
+            return schema.model_validate(json_output), cost
+        else:
+            return output, cost
+
+    ###############################################
+    # Other generate functions
+    ###############################################
+
+    @retry_azure
+    def generate_raw_response(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        top_logprobs: int = 5,
+    ) -> Tuple[ChatCompletion, float]:
+        client = self.load_model(async_mode=False)
+        prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
+
+        # Generate completion
+        completion = client.chat.completions.create(
+            model=self.deployment_name,
+            messages=messages,
+            temperature=self.temperature,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+            **self.generation_kwargs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(input_tokens, output_tokens)
+
+        return completion, cost
+
+    @retry_azure
+    async def a_generate_raw_response(
+        self,
+        multimodal_input: List[Union[str, MLLMImage]],
+        top_logprobs: int = 5,
+    ) -> Tuple[ChatCompletion, float]:
+        client = self.load_model(async_mode=True)
+        prompt = self.generate_prompt(multimodal_input)
+        messages = [{"role": "user", "content": prompt}]
+
+        # Generate completion
+        completion = await client.chat.completions.create(
+            model=self.deployment_name,
+            messages=messages,
+            temperature=self.temperature,
+            logprobs=True,
+            top_logprobs=top_logprobs,
+            **self.generation_kwargs,
+        )
+        # Cost calculation
+        input_tokens = completion.usage.prompt_tokens
+        output_tokens = completion.usage.completion_tokens
+        cost = self.calculate_cost(input_tokens, output_tokens)
+
+        return completion, cost
+
+    ###############################################
+    # Utilities
+    ###############################################
+
+    def generate_prompt(
+        self, multimodal_input: List[Union[str, MLLMImage]] = []
+    ):
+        """Convert multimodal input into the proper message format for Azure OpenAI."""
+        prompt = []
+        for ele in multimodal_input:
+            if isinstance(ele, str):
+                prompt.append({"type": "text", "text": ele})
+            elif isinstance(ele, MLLMImage):
+                if ele.local:
+                    import PIL.Image
+
+                    image = PIL.Image.open(ele.url)
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{self.encode_pil_image(image)}"
+                        },
+                    }
+                else:
+                    visual_dict = {
+                        "type": "image_url",
+                        "image_url": {"url": ele.url},
+                    }
+                prompt.append(visual_dict)
+        return prompt
+
+    def encode_pil_image(self, pil_image):
+        """Encode a PIL image to base64 string."""
+        image_buffer = BytesIO()
+        if pil_image.mode in ("RGBA", "LA", "P"):
+            pil_image = pil_image.convert("RGB")
+        pil_image.save(image_buffer, format="JPEG")
+        image_bytes = image_buffer.getvalue()
+        base64_encoded_image = base64.b64encode(image_bytes).decode("utf-8")
+        return base64_encoded_image
+
+    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+        pricing = model_pricing.get(self.model_name, model_pricing["gpt-4.1"])
+        input_cost = input_tokens * pricing["input"]
+        output_cost = output_tokens * pricing["output"]
+        return input_cost + output_cost
+
+    ###############################################
+    # Model
+    ###############################################
+
+    def get_model_name(self):
+        return f"Azure OpenAI ({self.model_name})"
+
+    def load_model(self, async_mode: bool = False):
+        if not async_mode:
+            return self._build_client(AzureOpenAI)
+        return self._build_client(AsyncAzureOpenAI)
+
+    def _client_kwargs(self) -> Dict:
+        """
+        If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
+        If the user opts into SDK retries for 'azure' via DEEPEVAL_SDK_RETRY_PROVIDERS,
+        leave their retry settings as is.
+        """
+        kwargs = dict(self.kwargs or {})
+        if not sdk_retries_for(PS.AZURE):
+            kwargs["max_retries"] = 0
+        return kwargs
+
+    def _build_client(self, cls):
+        kw = dict(
+            api_key=self.azure_openai_api_key,
+            api_version=self.openai_api_version,
+            azure_endpoint=self.azure_endpoint,
+            azure_deployment=self.deployment_name,
+            **self._client_kwargs(),
+        )
+        try:
+            return cls(**kw)
+        except TypeError as e:
+            # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
+            if "max_retries" in str(e):
+                kw.pop("max_retries", None)
+                return cls(**kw)
+            raise

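The new class plugs Azure OpenAI deployments into deepeval's multimodal model interface. A hedged usage sketch, assuming credentials are supplied directly (they can also come from deepeval's key store, as the constructor above shows); all endpoint, key, and deployment values are placeholders:

```python
# Hedged usage sketch (not part of the package diff): placeholder credentials.
from deepeval.models import MultimodalAzureOpenAIMLLMModel
from deepeval.test_case import MLLMImage

model = MultimodalAzureOpenAIMLLMModel(
    deployment_name="my-gpt-4o-deployment",                   # placeholder
    model_name="gpt-4o",                                      # placeholder
    azure_openai_api_key="<AZURE_OPENAI_API_KEY>",            # placeholder
    openai_api_version="2024-08-01-preview",                  # placeholder
    azure_endpoint="https://my-resource.openai.azure.com/",   # placeholder
)

# generate() takes a mixed list of text and MLLMImage inputs and returns
# (output, cost), mirroring the implementation above.
output, cost = model.generate(
    [
        "Describe what is shown in this image.",
        MLLMImage(url="https://example.com/cat.png", local=False),
    ]
)
print(output, cost)
```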
deepeval/synthesizer/config.py
CHANGED
@@ -41,6 +41,15 @@ class StylingConfig:
     expected_output_format: Optional[str] = None


+@dataclass
+class ConversationalStylingConfig:
+    scenario_context: Optional[str] = None
+    conversational_task: Optional[str] = None
+    participant_roles: Optional[str] = None
+    scenario_format: Optional[str] = None
+    expected_outcome_format: Optional[str] = None
+
+
 @dataclass
 class ContextConstructionConfig:
     embedder: Optional[Union[str, DeepEvalBaseEmbeddingModel]] = None

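`ConversationalStylingConfig` mirrors `StylingConfig` for multi-turn generation. A hedged example of populating it; the field values are illustrative, and how the synthesizer consumes this config is not shown in this diff:

```python
# Hedged sketch: field values are invented; only the dataclass fields
# themselves come from the diff above.
from deepeval.synthesizer.config import ConversationalStylingConfig

styling = ConversationalStylingConfig(
    scenario_context="Customer support for a travel-booking app",
    conversational_task="Resolve a billing dispute over a double charge",
    participant_roles="frustrated customer; support agent",
    scenario_format="One-paragraph description of the situation",
    expected_outcome_format="Short statement of the resolution the agent should reach",
)
```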
deepeval/synthesizer/schema.py
CHANGED
@@ -58,3 +58,26 @@ class PromptStyling(BaseModel):
     scenario: str
     task: str
     input_format: str
+
+
+class ConversationalScenario(BaseModel):
+    scenario: str
+
+
+class ConversationalScenarioList(BaseModel):
+    data: List[ConversationalScenario]
+
+
+class RewrittenScenario(BaseModel):
+    rewritten_scenario: str
+
+
+class ScenarioFeedback(BaseModel):
+    score: float
+    feedback: str
+
+
+class ConversationalPromptStyling(BaseModel):
+    scenario_context: str
+    conversational_task: str
+    participant_roles: str