deepeval 3.6.6-py3-none-any.whl → 3.6.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
--- a/deepeval/models/embedding_models/local_embedding_model.py
+++ b/deepeval/models/embedding_models/local_embedding_model.py
@@ -1,5 +1,5 @@
 from openai import OpenAI, AsyncOpenAI
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from deepeval.key_handler import EmbeddingKeyValues, KEY_FILE_HANDLER
 from deepeval.models import DeepEvalBaseEmbeddingModel
@@ -15,25 +15,32 @@ retry_local = create_retry_decorator(PS.LOCAL)
 
 
 class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
-    def __init__(self, **kwargs):
-        self.base_url = KEY_FILE_HANDLER.fetch_data(
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        model: Optional[str] = None,
+        generation_kwargs: Optional[Dict] = None,
+        **client_kwargs,
+    ):
+        self.api_key = api_key or KEY_FILE_HANDLER.fetch_data(
+            EmbeddingKeyValues.LOCAL_EMBEDDING_API_KEY
+        )
+        self.base_url = base_url or KEY_FILE_HANDLER.fetch_data(
             EmbeddingKeyValues.LOCAL_EMBEDDING_BASE_URL
         )
-        model_name = KEY_FILE_HANDLER.fetch_data(
+        self.model_name = model or KEY_FILE_HANDLER.fetch_data(
             EmbeddingKeyValues.LOCAL_EMBEDDING_MODEL_NAME
         )
-        self.api_key = KEY_FILE_HANDLER.fetch_data(
-            EmbeddingKeyValues.LOCAL_EMBEDDING_API_KEY
-        )
-        self.kwargs = kwargs
-        super().__init__(model_name)
+        self.client_kwargs = client_kwargs or {}
+        self.generation_kwargs = generation_kwargs or {}
+        super().__init__(self.model_name)
 
     @retry_local
     def embed_text(self, text: str) -> List[float]:
         embedding_model = self.load_model()
         response = embedding_model.embeddings.create(
-            model=self.model_name,
-            input=[text],
+            model=self.model_name, input=[text], **self.generation_kwargs
         )
         return response.data[0].embedding
 
@@ -41,8 +48,7 @@ class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
     def embed_texts(self, texts: List[str]) -> List[List[float]]:
         embedding_model = self.load_model()
         response = embedding_model.embeddings.create(
-            model=self.model_name,
-            input=texts,
+            model=self.model_name, input=texts, **self.generation_kwargs
         )
         return [data.embedding for data in response.data]
 
@@ -50,8 +56,7 @@ class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
     async def a_embed_text(self, text: str) -> List[float]:
         embedding_model = self.load_model(async_mode=True)
         response = await embedding_model.embeddings.create(
-            model=self.model_name,
-            input=[text],
+            model=self.model_name, input=[text], **self.generation_kwargs
         )
         return response.data[0].embedding
 
@@ -59,8 +64,7 @@ class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
         embedding_model = self.load_model(async_mode=True)
         response = await embedding_model.embeddings.create(
-            model=self.model_name,
-            input=texts,
+            model=self.model_name, input=texts, **self.generation_kwargs
         )
         return [data.embedding for data in response.data]
 
@@ -76,27 +80,21 @@ class LocalEmbeddingModel(DeepEvalBaseEmbeddingModel):
             return self._build_client(OpenAI)
         return self._build_client(AsyncOpenAI)
 
-    def _client_kwargs(self) -> Dict:
-        """
-        If Tenacity manages retries, turn off OpenAI SDK retries to avoid double retrying.
-        If users opt into SDK retries via DEEPEVAL_SDK_RETRY_PROVIDERS=local, leave them enabled.
-        """
-        kwargs = dict(self.kwargs or {})
+    def _build_client(self, cls):
+        client_kwargs = self.client_kwargs.copy()
         if not sdk_retries_for(PS.LOCAL):
-            kwargs["max_retries"] = 0
-        return kwargs
+            client_kwargs["max_retries"] = 0
 
-    def _build_client(self, cls):
-        kw = dict(
+        client_init_kwargs = dict(
             api_key=self.api_key,
             base_url=self.base_url,
-            **self._client_kwargs(),
+            **client_kwargs,
         )
         try:
-            return cls(**kw)
+            return cls(**client_init_kwargs)
         except TypeError as e:
-            # Older OpenAI SDKs may not accept max_retries; drop and retry once.
+            # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
             if "max_retries" in str(e):
-                kw.pop("max_retries", None)
-                return cls(**kw)
+                client_init_kwargs.pop("max_retries", None)
+                return cls(**client_init_kwargs)
             raise
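
Note: the hunks above replace LocalEmbeddingModel's opaque **kwargs constructor with explicit parameters that fall back to KEY_FILE_HANDLER values when omitted, and split per-request options (generation_kwargs, forwarded to embeddings.create) from client options (remaining keyword arguments, forwarded to the OpenAI client). A minimal usage sketch of the new 3.6.8 signature; the key, endpoint, and model name below are placeholders:

    from deepeval.models.embedding_models.local_embedding_model import (
        LocalEmbeddingModel,
    )

    embedder = LocalEmbeddingModel(
        api_key="sk-local-placeholder",       # falls back to LOCAL_EMBEDDING_API_KEY
        base_url="http://localhost:8000/v1",  # falls back to LOCAL_EMBEDDING_BASE_URL
        model="nomic-embed-text",             # falls back to LOCAL_EMBEDDING_MODEL_NAME
        generation_kwargs={},                 # forwarded to embeddings.create(...)
        timeout=30,                           # remaining kwargs reach the OpenAI client
    )
    vector = embedder.embed_text("hello world")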
--- a/deepeval/models/embedding_models/ollama_embedding_model.py
+++ b/deepeval/models/embedding_models/ollama_embedding_model.py
@@ -1,5 +1,5 @@
 from ollama import Client, AsyncClient
-from typing import List
+from typing import List, Optional, Dict
 
 from deepeval.key_handler import EmbeddingKeyValues, KEY_FILE_HANDLER
 from deepeval.models import DeepEvalBaseEmbeddingModel
@@ -13,27 +13,28 @@ retry_ollama = create_retry_decorator(PS.OLLAMA)
 
 
 class OllamaEmbeddingModel(DeepEvalBaseEmbeddingModel):
-    def __init__(self, *args, **kwargs):
-        self.base_url = KEY_FILE_HANDLER.fetch_data(
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        host: Optional[str] = None,
+        generation_kwargs: Optional[Dict] = None,
+        **client_kwargs,
+    ):
+        self.host = host or KEY_FILE_HANDLER.fetch_data(
             EmbeddingKeyValues.LOCAL_EMBEDDING_BASE_URL
         )
-        model_name = KEY_FILE_HANDLER.fetch_data(
+        self.model_name = model or KEY_FILE_HANDLER.fetch_data(
             EmbeddingKeyValues.LOCAL_EMBEDDING_MODEL_NAME
         )
-        # TODO: This is not being used. Clean it up in consistency PR
-        self.api_key = KEY_FILE_HANDLER.fetch_data(
-            EmbeddingKeyValues.LOCAL_EMBEDDING_API_KEY
-        )
-        self.args = args
-        self.kwargs = kwargs
-        super().__init__(model_name)
+        self.client_kwargs = client_kwargs or {}
+        self.generation_kwargs = generation_kwargs or {}
+        super().__init__(self.model_name)
 
     @retry_ollama
     def embed_text(self, text: str) -> List[float]:
         embedding_model = self.load_model()
         response = embedding_model.embed(
-            model=self.model_name,
-            input=text,
+            model=self.model_name, input=text, **self.generation_kwargs
         )
         return response["embeddings"][0]
 
@@ -41,8 +42,7 @@ class OllamaEmbeddingModel(DeepEvalBaseEmbeddingModel):
     def embed_texts(self, texts: List[str]) -> List[List[float]]:
         embedding_model = self.load_model()
         response = embedding_model.embed(
-            model=self.model_name,
-            input=texts,
+            model=self.model_name, input=texts, **self.generation_kwargs
         )
         return response["embeddings"]
 
@@ -50,8 +50,7 @@ class OllamaEmbeddingModel(DeepEvalBaseEmbeddingModel):
     async def a_embed_text(self, text: str) -> List[float]:
         embedding_model = self.load_model(async_mode=True)
         response = await embedding_model.embed(
-            model=self.model_name,
-            input=text,
+            model=self.model_name, input=text, **self.generation_kwargs
         )
         return response["embeddings"][0]
 
@@ -59,8 +58,7 @@ class OllamaEmbeddingModel(DeepEvalBaseEmbeddingModel):
     async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
         embedding_model = self.load_model(async_mode=True)
         response = await embedding_model.embed(
-            model=self.model_name,
-            input=texts,
+            model=self.model_name, input=texts, **self.generation_kwargs
         )
         return response["embeddings"]
 
@@ -74,7 +72,7 @@ class OllamaEmbeddingModel(DeepEvalBaseEmbeddingModel):
         return self._build_client(AsyncClient)
 
     def _build_client(self, cls):
-        return cls(host=self.base_url, **self.kwargs)
+        return cls(host=self.host, **self.client_kwargs)
 
     def get_model_name(self):
         return f"{self.model_name} (Ollama)"
--- a/deepeval/models/embedding_models/openai_embedding_model.py
+++ b/deepeval/models/embedding_models/openai_embedding_model.py
@@ -19,27 +19,28 @@ default_openai_embedding_model = "text-embedding-3-small"
 
 
 class OpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
+
     def __init__(
         self,
         model: Optional[str] = None,
-        _openai_api_key: Optional[str] = None,
-        **kwargs,
+        openai_api_key: Optional[str] = None,
+        generation_kwargs: Optional[Dict] = None,
+        **client_kwargs,
     ):
-        model_name = model if model else default_openai_embedding_model
-        if model_name not in valid_openai_embedding_models:
+        self.openai_api_key = openai_api_key
+        self.model_name = model if model else default_openai_embedding_model
+        if self.model_name not in valid_openai_embedding_models:
             raise ValueError(
                 f"Invalid model. Available OpenAI Embedding models: {', '.join(valid_openai_embedding_models)}"
             )
-        self._openai_api_key = _openai_api_key
-        self.model_name = model_name
-        self.kwargs = kwargs
+        self.client_kwargs = client_kwargs or {}
+        self.generation_kwargs = generation_kwargs or {}
 
     @retry_openai
     def embed_text(self, text: str) -> List[float]:
         client = self.load_model(async_mode=False)
         response = client.embeddings.create(
-            input=text,
-            model=self.model_name,
+            input=text, model=self.model_name, **self.generation_kwargs
         )
         return response.data[0].embedding
 
@@ -47,8 +48,7 @@ class OpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
     def embed_texts(self, texts: List[str]) -> List[List[float]]:
         client = self.load_model(async_mode=False)
         response = client.embeddings.create(
-            input=texts,
-            model=self.model_name,
+            input=texts, model=self.model_name, **self.generation_kwargs
         )
         return [item.embedding for item in response.data]
 
@@ -56,8 +56,7 @@ class OpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
     async def a_embed_text(self, text: str) -> List[float]:
         client = self.load_model(async_mode=True)
         response = await client.embeddings.create(
-            input=text,
-            model=self.model_name,
+            input=text, model=self.model_name, **self.generation_kwargs
         )
         return response.data[0].embedding
 
@@ -65,8 +64,7 @@ class OpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
     async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
         client = self.load_model(async_mode=True)
         response = await client.embeddings.create(
-            input=texts,
-            model=self.model_name,
+            input=texts, model=self.model_name, **self.generation_kwargs
         )
         return [item.embedding for item in response.data]
 
@@ -82,27 +80,20 @@ class OpenAIEmbeddingModel(DeepEvalBaseEmbeddingModel):
             return self._build_client(OpenAI)
         return self._build_client(AsyncOpenAI)
 
-    def _client_kwargs(self) -> Dict:
-        """
-        If Tenacity is managing retries, force OpenAI SDK retries off to avoid double retries.
-        If the user opts into SDK retries for 'openai' via DEEPEVAL_SDK_RETRY_PROVIDERS,
-        leave their retry settings as is.
-        """
-        kwargs = dict(self.kwargs or {})
+    def _build_client(self, cls):
+        client_kwargs = self.client_kwargs.copy()
         if not sdk_retries_for(PS.OPENAI):
-            kwargs["max_retries"] = 0
-        return kwargs
+            client_kwargs["max_retries"] = 0
 
-    def _build_client(self, cls):
-        kw = dict(
-            api_key=self._openai_api_key,
-            **self._client_kwargs(),
+        client_init_kwargs = dict(
+            api_key=self.openai_api_key,
+            **client_kwargs,
         )
         try:
-            return cls(**kw)
+            return cls(**client_init_kwargs)
         except TypeError as e:
             # older OpenAI SDKs may not accept max_retries, in that case remove and retry once
             if "max_retries" in str(e):
-                kw.pop("max_retries", None)
-                return cls(**kw)
+                client_init_kwargs.pop("max_retries", None)
+                return cls(**client_init_kwargs)
             raise
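
Note: besides the new generation_kwargs, the keyword _openai_api_key is renamed to openai_api_key, so 3.6.6 callers that passed the old private-looking name by keyword must update. A sketch against the new signature; the key is a placeholder, and dimensions is a parameter embeddings.create() accepts for text-embedding-3 models:

    from deepeval.models.embedding_models.openai_embedding_model import (
        OpenAIEmbeddingModel,
    )

    embedder = OpenAIEmbeddingModel(
        model="text-embedding-3-small",
        openai_api_key="sk-placeholder",        # was _openai_api_key in 3.6.6
        generation_kwargs={"dimensions": 256},  # forwarded to embeddings.create(...)
    )
    vector = embedder.embed_text("hello world")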
--- a/deepeval/models/llms/grok_model.py
+++ b/deepeval/models/llms/grok_model.py
@@ -56,8 +56,8 @@ model_pricing = {
 class GrokModel(DeepEvalBaseLLM):
     def __init__(
         self,
-        api_key: Optional[str] = None,
         model: Optional[str] = None,
+        api_key: Optional[str] = None,
         temperature: float = 0,
         generation_kwargs: Optional[Dict] = None,
         **kwargs,
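
Note: swapping api_key and model keeps keyword callers working but silently changes the meaning of positional calls. A sketch with placeholder values:

    from deepeval.models.llms.grok_model import GrokModel

    # keyword arguments are unaffected by the reorder
    grok = GrokModel(model="grok-3", api_key="xai-placeholder")

    # positional calls change meaning: in 3.6.6 the first positional
    # argument was api_key; in 3.6.8 it is model
    grok = GrokModel("grok-3")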
--- a/deepeval/models/llms/openai_model.py
+++ b/deepeval/models/llms/openai_model.py
@@ -70,6 +70,8 @@ unsupported_log_probs_gpt_models = [
     "o1-mini-2024-09-12",
     "o3-mini",
     "o3-mini-2025-01-31",
+    "o4-mini",
+    "o4-mini-2025-04-16",
     "gpt-4.5-preview-2025-02-27",
     "gpt-5",
     "gpt-5-2025-08-07",
--- a/deepeval/openai/__init__.py
+++ b/deepeval/openai/__init__.py
@@ -1,37 +1,19 @@
-from importlib.machinery import SourceFileLoader
-import importlib.util
-import sys
-
-from deepeval.openai.patch import patch_openai
+try:
+    import openai  # noqa: F401
+except ImportError:
+    raise ModuleNotFoundError(
+        "Please install OpenAI to use this feature: 'pip install openai'"
+    )
 
 
-def load_and_patch_openai():
-    openai_spec = importlib.util.find_spec("openai")
-    if not openai_spec or not openai_spec.origin:
-        raise ImportError("Could not find the OpenAI package")
-    package_dirs = openai_spec.submodule_search_locations
-    loader = SourceFileLoader("deepeval_openai", openai_spec.origin)
-    new_spec = importlib.util.spec_from_loader(
-        "deepeval_openai",
-        loader,
-        origin=openai_spec.origin,
-        is_package=True,
-    )
-    deepeval_openai = importlib.util.module_from_spec(new_spec)
-    deepeval_openai.__path__ = package_dirs
-    sys.modules["deepeval_openai"] = deepeval_openai
-    loader.exec_module(deepeval_openai)
-    patch_openai(deepeval_openai)
-    return deepeval_openai
+try:
+    from openai import OpenAI, AsyncOpenAI  # noqa: F401
+except ImportError:
+    OpenAI = None  # type: ignore
+    AsyncOpenAI = None  # type: ignore
 
 
-patched_openai = load_and_patch_openai()
-openai = patched_openai
-OpenAI = patched_openai.OpenAI
-AsyncOpenAI = patched_openai.AsyncOpenAI
+if OpenAI or AsyncOpenAI:
+    from deepeval.openai.patch import patch_openai_classes
 
-__all__ = [
-    "openai",
-    "OpenAI",
-    "AsyncOpenAI",
-]
+    patch_openai_classes()
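
Note: deepeval.openai no longer re-executes the openai package under a deepeval_openai alias; it imports the real OpenAI and AsyncOpenAI classes and patches them in place via patch_openai_classes(). A sketch of the new import flow, assuming the patch wraps the client classes' completion/response methods (an inference from the function name; the patch body is not shown in this diff):

    # 3.6.6 style, which still works because the names are re-exported:
    #     from deepeval.openai import OpenAI
    # 3.6.8 patches the real classes as a side effect of the import:
    import deepeval.openai  # runs patch_openai_classes() if openai is installed

    from openai import OpenAI  # the ordinary class, now patched in place

    client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment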
--- a/deepeval/openai/extractors.py
+++ b/deepeval/openai/extractors.py
@@ -1,39 +1,36 @@
+import json
 from openai.types.chat import ChatCompletion, ParsedChatCompletion
-from typing import Optional, Union, List, Dict
+from typing import Any, Union, Dict
 from openai.types.responses import Response
-from pydantic import BaseModel
-import json
 
 from deepeval.test_case.llm_test_case import ToolCall
-
-
-class InputParameters(BaseModel):
-    model: Optional[str] = None
-    input: Optional[str] = None
-    instructions: Optional[str] = None
-    messages: Optional[List[Dict]] = None
-    tool_descriptions: Optional[Dict[str, str]] = None
-
-
-class OutputParameters(BaseModel):
-    output: Optional[str] = None
-    prompt_tokens: Optional[int] = None
-    completion_tokens: Optional[int] = None
-    tools_called: Optional[List[ToolCall]] = None
-
-
-def extract_input_parameters(
-    is_completion: bool, kwargs: Dict
+from deepeval.openai.utils import (
+    render_response_input,
+    stringify_multimodal_content,
+    render_messages,
+)
+from deepeval.openai.types import InputParameters, OutputParameters
+from deepeval.tracing.types import Message
+
+
+# guarding against errors to be compatible with legacy APIs
+def safe_extract_input_parameters(
+    is_completion: bool, kwargs: Dict[str, Any]
 ) -> InputParameters:
-    if is_completion:
-        return extract_input_parameters_from_completion(kwargs)
-    else:
-        return extract_input_parameters_from_response(kwargs)
+    try:
+        if is_completion:
+            return extract_input_parameters_from_completion(kwargs)
+        else:
+            return extract_input_parameters_from_response(kwargs)
+    except:
+        return InputParameters(model="NA")
 
 
-def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
+def extract_input_parameters_from_completion(
+    kwargs: Dict[str, Any],
+) -> InputParameters:
     model = kwargs.get("model")
-    messages = kwargs.get("messages")
+    messages = kwargs.get("messages") or []
     tools = kwargs.get("tools")
     tool_descriptions_map = (
         {
@@ -45,7 +42,7 @@ def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
     )
 
     # extract first user input from messages
-    input = ""
+    input_arg = ""
     user_messages = []
     for message in messages:
         role = message["role"]
@@ -53,20 +50,25 @@ def extract_input_parameters_from_completion(kwargs: Dict) -> InputParameters:
         if role == "user":
             user_messages.append(content)
     if len(user_messages) > 0:
-        input = user_messages[0]
+        input_arg = user_messages[0]
+
+    # render messages
+    messages = render_messages(messages)
 
     return InputParameters(
         model=model,
-        input=input,
+        input=stringify_multimodal_content(input_arg),
         messages=messages,
         tools=tools,
         tool_descriptions=tool_descriptions_map,
     )
 
 
-def extract_input_parameters_from_response(kwargs: Dict) -> InputParameters:
+def extract_input_parameters_from_response(
+    kwargs: Dict[str, Any],
+) -> InputParameters:
     model = kwargs.get("model")
-    input = kwargs.get("input")
+    input_payload = kwargs.get("input")
     instructions = kwargs.get("instructions")
     tools = kwargs.get("tools")
     tool_descriptions = (
@@ -74,35 +76,59 @@ def extract_input_parameters_from_response(kwargs: Dict) -> InputParameters:
         if tools is not None
         else None
     )
+    messages = []
+    if isinstance(input_payload, list):
+        messages = render_response_input(input_payload)
+    elif isinstance(input_payload, str):
+        messages = [
+            {
+                "role": "user",
+                "content": input_payload,
+            }
+        ]
+    if instructions:
+        messages.insert(
+            0,
+            {
+                "role": "system",
+                "content": instructions,
+            },
+        )
     return InputParameters(
         model=model,
-        input=input,
+        input=stringify_multimodal_content(input_payload),
+        messages=messages,
         instructions=instructions,
         tools=tools,
         tool_descriptions=tool_descriptions,
     )
 
 
-def extract_output_parameters(
+def safe_extract_output_parameters(
     is_completion: bool,
     response: Union[ChatCompletion, ParsedChatCompletion, Response],
     input_parameters: InputParameters,
 ) -> OutputParameters:
-    if is_completion:
-        return extract_output_parameters_from_completion(
-            response, input_parameters
-        )
-    else:
-        return extract_output_parameters_from_response(
-            response, input_parameters
-        )
+
+    # guarding against errors to be compatible with legacy APIs
+    try:
+        if is_completion:
+            return extract_output_parameters_from_completion(
+                response, input_parameters
+            )
+        else:
+            return extract_output_parameters_from_response(
+                response, input_parameters
+            )
+    except:
+        return OutputParameters()
 
 
 def extract_output_parameters_from_completion(
     completion: Union[ChatCompletion, ParsedChatCompletion],
     input_parameters: InputParameters,
 ) -> OutputParameters:
-    output = str(completion.choices[0].message.content)
+    output = str(completion.choices[0].message.content or "")
     prompt_tokens = completion.usage.prompt_tokens
     completion_tokens = completion.usage.completion_tokens
 
@@ -112,16 +138,21 @@ def extract_output_parameters_from_completion(
     if openai_tool_calls is not None:
         tools_called = []
         for tool_call in openai_tool_calls:
+            tool_descriptions = input_parameters.tool_descriptions or {}
             tools_called.append(
                 ToolCall(
                     name=tool_call.function.name,
                     input_parameters=json.loads(tool_call.function.arguments),
-                    description=input_parameters.tool_descriptions.get(
-                        tool_call.function.name
-                    ),
+                    description=tool_descriptions.get(tool_call.function.name),
                 )
             )
 
+    if not output and tools_called:
+        tool_calls = []
+        for tool_call in tools_called:
+            tool_calls.append(tool_call)
+        output = tool_calls
+
     return OutputParameters(
         output=output,
         prompt_tokens=prompt_tokens,
@@ -145,15 +176,19 @@ def extract_output_parameters_from_response(
         for tool_call in openai_raw_output:
             if tool_call.type != "function_call":
                 continue
+            tool_descriptions = input_parameters.tool_descriptions or {}
             tools_called.append(
                 ToolCall(
                     name=tool_call.name,
                     input_parameters=json.loads(tool_call.arguments),
-                    description=input_parameters.tool_descriptions.get(
-                        tool_call.name
-                    ),
+                    description=tool_descriptions.get(tool_call.name),
                )
            )
+    if not output and tools_called:
+        tool_calls = []
+        for tool_call in tools_called:
+            tool_calls.append(tool_call)
+        output = tool_calls
 
     return OutputParameters(
         output=output,
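
Note: the extraction entry points are renamed to safe_* and wrapped in bare try/except so that unexpected payload shapes degrade to placeholder objects instead of crashing the patched client. A sketch of the fallback, grounded in the hunks above; the message dict deliberately omits "role":

    from deepeval.openai.extractors import safe_extract_input_parameters

    # message["role"] raises KeyError inside
    # extract_input_parameters_from_completion ...
    params = safe_extract_input_parameters(
        is_completion=True,
        kwargs={"model": "gpt-4o-mini", "messages": [{"content": "hi"}]},
    )

    # ... which the wrapper converts into a placeholder instead of raising
    assert params.model == "NA"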