deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/openai/utils.py CHANGED
@@ -1,67 +1,39 @@
1
- from typing import List
1
+ import json
2
2
  import uuid
3
+ from typing import Any, List, Optional
3
4
 
4
5
  from deepeval.tracing.types import ToolSpan, TraceSpanStatus
5
- from deepeval.openai.extractors import InputParameters, OutputParameters
6
6
  from deepeval.tracing.context import current_span_context
7
- from deepeval.test_case import LLMTestCase
8
- from deepeval.metrics import BaseMetric
9
- from deepeval.tracing.types import TestCaseMetricPair
10
-
11
- openai_test_case_pairs: List[TestCaseMetricPair] = []
12
-
13
-
14
- def set_attr_path(obj, attr_path: str, value):
15
- *pre_path, final_attr = attr_path.split(".")
16
- for attr in pre_path:
17
- obj = getattr(obj, attr, None)
18
- if obj is None:
19
- return
20
- setattr(obj, final_attr, value)
21
-
22
-
23
- def get_attr_path(obj, attr_path: str):
24
- for attr in attr_path.split("."):
25
- obj = getattr(obj, attr, None)
26
- if obj is None:
27
- return None
28
- return obj
29
-
30
-
31
- def add_test_case(
32
- test_case: LLMTestCase,
33
- metrics: List[BaseMetric],
34
- input_parameters: InputParameters,
35
- ):
36
- openai_test_case_pairs.append(
37
- TestCaseMetricPair(
38
- test_case=test_case,
39
- metrics=metrics,
40
- hyperparameters=create_hyperparameters_map(input_parameters),
7
+ from deepeval.utils import shorten, len_long
8
+ from deepeval.openai.types import OutputParameters
9
+
10
+
11
+ _URL_MAX = 200
12
+ _JSON_MAX = max(
13
+ len_long(), 400
14
+ ) # <- make this bigger by increasing DEEPEVAL_MAXLEN_LONG above 400
15
+
16
+
17
+ def _compact_dump(value: Any) -> str:
18
+ try:
19
+ dumped = json.dumps(
20
+ value, ensure_ascii=False, default=str, separators=(",", ":")
41
21
  )
42
- )
43
-
44
-
45
- def create_hyperparameters_map(input_parameters: InputParameters):
46
- hyperparameters = {"model": input_parameters.model}
47
- if input_parameters.instructions:
48
- hyperparameters["system_prompt"] = input_parameters.instructions
49
- elif input_parameters.messages:
50
- system_messages = [
51
- m["content"]
52
- for m in input_parameters.messages
53
- if m["role"] == "system"
54
- ]
55
- if system_messages:
56
- hyperparameters["system_prompt"] = (
57
- system_messages[0]
58
- if len(system_messages) == 1
59
- else str(system_messages)
60
- )
61
- return hyperparameters
22
+ except Exception:
23
+ dumped = repr(value)
24
+ return shorten(dumped, max_len=_JSON_MAX)
25
+
26
+
27
+ def _fmt_url(url: Optional[str]) -> str:
28
+ if not url:
29
+ return ""
30
+ if url.startswith("data:"):
31
+ return "[data-uri]"
32
+ return shorten(url, max_len=_URL_MAX)
62
33
 
63
34
 
64
35
  def create_child_tool_spans(output_parameters: OutputParameters):
36
+
65
37
  if output_parameters.tools_called is None:
66
38
  return
67
39
 
@@ -84,3 +56,73 @@ def create_child_tool_spans(output_parameters: OutputParameters):
84
56
  }
85
57
  )
86
58
  current_span.children.append(tool_span)
59
+
60
+
61
+ def stringify_multimodal_content(content: Any) -> str:
62
+ """
63
+ Return a short, human-readable summary string for an OpenAI-style multimodal `content` value.
64
+
65
+ This is used to populate span summaries, such as `InputParameters.input`. It never raises and
66
+ never returns huge blobs.
67
+
68
+ Notes:
69
+ - Data URIs are redacted to "[data-uri]".
70
+ - Output is capped via `deepeval.utils.shorten` (configurable through settings).
71
+ - Fields that are not explicitly handled are returned as size-capped JSON dumps
72
+ - This string is for display/summary only, not intended to be parsable.
73
+
74
+ Args:
75
+ content: The value of an OpenAI message `content`, may be a str or list of typed parts,
76
+ or any nested structure.
77
+
78
+ Returns:
79
+ A short, readable `str` summary.
80
+ """
81
+ if content is None:
82
+ return ""
83
+ if isinstance(content, str):
84
+ return content
85
+ if isinstance(content, (bytes, bytearray)):
86
+ return f"[bytes:{len(content)}]"
87
+
88
+ # list of parts for Chat & Responses
89
+ if isinstance(content, list):
90
+ parts: List[str] = []
91
+ for part in content:
92
+ s = stringify_multimodal_content(part)
93
+ if s:
94
+ parts.append(s)
95
+ return "\n".join(parts)
96
+
97
+ # documented dict shapes (Chat & Responses)
98
+ if isinstance(content, dict):
99
+ t = content.get("type")
100
+
101
+ # Chat Completions
102
+ if t == "text":
103
+ return str(content.get("text", ""))
104
+ if t == "image_url":
105
+ image_url = content.get("image_url")
106
+ if isinstance(image_url, str):
107
+ url = image_url
108
+ else:
109
+ url = (image_url or {}).get("url") or content.get("url")
110
+ return f"[image:{_fmt_url(url)}]"
111
+
112
+ # Responses API variants
113
+ if t == "input_text":
114
+ return str(content.get("text", ""))
115
+ if t == "input_image":
116
+ image_url = content.get("image_url")
117
+ if isinstance(image_url, str):
118
+ url = image_url
119
+ else:
120
+ url = (image_url or {}).get("url") or content.get("url")
121
+ return f"[image:{_fmt_url(url)}]"
122
+
123
+ # readability for other input_* types we don't currently handle
124
+ if t and t.startswith("input_"):
125
+ return f"[{t}]"
126
+
127
+ # unknown dicts and types returned as shortened JSON
128
+ return _compact_dump(content)
deepeval/prompt/__init__.py CHANGED
@@ -1,3 +1,21 @@
1
1
  from .prompt import Prompt
2
+ from .api import (
3
+ PromptMessage,
4
+ ModelSettings,
5
+ ModelProvider,
6
+ Verbosity,
7
+ ReasoningEffort,
8
+ OutputType,
9
+ PromptInterpolationType,
10
+ )
2
11
 
3
- __all__ = ["Prompt"]
12
+ __all__ = [
13
+ "Prompt",
14
+ "PromptMessage",
15
+ "ModelSettings",
16
+ "ModelProvider",
17
+ "Verbosity",
18
+ "ReasoningEffort",
19
+ "OutputType",
20
+ "PromptInterpolationType",
21
+ ]
deepeval/prompt/api.py CHANGED
@@ -1,6 +1,119 @@
1
1
  from pydantic import BaseModel, Field, AliasChoices
2
2
  from enum import Enum
3
3
  from typing import List, Optional
4
+ from pydantic import TypeAdapter
5
+
6
+ ###################################
7
+ # Model Settings
8
+ ###################################
9
+
10
+
11
+ class ReasoningEffort(Enum):
12
+ MINIMAL = "MINIMAL"
13
+ LOW = "LOW"
14
+ MEDIUM = "MEDIUM"
15
+ HIGH = "HIGH"
16
+
17
+
18
+ class Verbosity(Enum):
19
+ LOW = "LOW"
20
+ MEDIUM = "MEDIUM"
21
+ HIGH = "HIGH"
22
+
23
+
24
+ class ModelProvider(Enum):
25
+ OPEN_AI = "OPEN_AI"
26
+ ANTHROPIC = "ANTHROPIC"
27
+ GEMINI = "GEMINI"
28
+ X_AI = "X_AI"
29
+ DEEPSEEK = "DEEPSEEK"
30
+ BEDROCK = "BEDROCK"
31
+
32
+
33
+ class ModelSettings(BaseModel):
34
+ provider: Optional[ModelProvider] = None
35
+ name: Optional[str] = None
36
+ temperature: Optional[float] = None
37
+ max_tokens: Optional[int] = Field(
38
+ default=None,
39
+ serialization_alias="maxTokens",
40
+ validation_alias=AliasChoices("max_tokens", "maxTokens"),
41
+ )
42
+ top_p: Optional[float] = Field(
43
+ default=None,
44
+ serialization_alias="topP",
45
+ validation_alias=AliasChoices("top_p", "topP"),
46
+ )
47
+ frequency_penalty: Optional[float] = Field(
48
+ default=None,
49
+ serialization_alias="frequencyPenalty",
50
+ validation_alias=AliasChoices("frequency_penalty", "frequencyPenalty"),
51
+ )
52
+ presence_penalty: Optional[float] = Field(
53
+ default=None,
54
+ serialization_alias="presencePenalty",
55
+ validation_alias=AliasChoices("presence_penalty", "presencePenalty"),
56
+ )
57
+ stop_sequence: Optional[List[str]] = Field(
58
+ default=None,
59
+ serialization_alias="stopSequence",
60
+ validation_alias=AliasChoices("stop_sequence", "stopSequence"),
61
+ )
62
+ reasoning_effort: Optional[ReasoningEffort] = Field(
63
+ default=None,
64
+ serialization_alias="reasoningEffort",
65
+ validation_alias=AliasChoices("reasoning_effort", "reasoningEffort"),
66
+ )
67
+ verbosity: Optional[Verbosity] = Field(
68
+ default=None,
69
+ serialization_alias="verbosity",
70
+ validation_alias=AliasChoices("verbosity", "verbosity"),
71
+ )
72
+
73
+
74
+ ###################################
75
+ # Output Settings
76
+ ###################################
77
+
78
+
79
+ class OutputType(Enum):
80
+ TEXT = "TEXT"
81
+ JSON = "JSON"
82
+ SCHEMA = "SCHEMA"
83
+
84
+
85
+ class SchemaDataType(Enum):
86
+ OBJECT = "OBJECT"
87
+ STRING = "STRING"
88
+ FLOAT = "FLOAT"
89
+ INTEGER = "INTEGER"
90
+ BOOLEAN = "BOOLEAN"
91
+ NULL = "NULL"
92
+
93
+
94
+ class OutputSchemaField(BaseModel):
95
+ id: str
96
+ type: SchemaDataType
97
+ name: str
98
+ required: Optional[bool] = False
99
+ parent_id: Optional[str] = Field(
100
+ default=None,
101
+ serialization_alias="parentId",
102
+ validation_alias=AliasChoices("parent_id", "parentId"),
103
+ )
104
+
105
+ class Config:
106
+ use_enum_values = True
107
+
108
+
109
+ class OutputSchema(BaseModel):
110
+ fields: Optional[List[OutputSchemaField]] = None
111
+ name: str
112
+
113
+
114
+ ###################################
115
+ # Prompt
116
+ ###################################
4
117
 
5
118
 
6
119
  class PromptInterpolationType(Enum):
@@ -16,6 +129,9 @@ class PromptMessage(BaseModel):
16
129
  content: str
17
130
 
18
131
 
132
+ PromptMessageList = TypeAdapter(List[PromptMessage])
133
+
134
+
19
135
  class PromptType(Enum):
20
136
  TEXT = "TEXT"
21
137
  LIST = "LIST"
@@ -53,6 +169,21 @@ class PromptHttpResponse(BaseModel):
53
169
  serialization_alias="interpolationType"
54
170
  )
55
171
  type: PromptType
172
+ model_settings: Optional[ModelSettings] = Field(
173
+ default=None,
174
+ serialization_alias="modelSettings",
175
+ validation_alias=AliasChoices("model_settings", "modelSettings"),
176
+ )
177
+ output_type: Optional[OutputType] = Field(
178
+ default=None,
179
+ serialization_alias="outputType",
180
+ validation_alias=AliasChoices("output_type", "outputType"),
181
+ )
182
+ output_schema: Optional[OutputSchema] = Field(
183
+ default=None,
184
+ serialization_alias="outputSchema",
185
+ validation_alias=AliasChoices("output_schema", "outputSchema"),
186
+ )
56
187
 
57
188
 
58
189
  class PromptPushRequest(BaseModel):
@@ -62,6 +193,35 @@ class PromptPushRequest(BaseModel):
62
193
  interpolation_type: PromptInterpolationType = Field(
63
194
  serialization_alias="interpolationType"
64
195
  )
196
+ model_settings: Optional[ModelSettings] = Field(
197
+ default=None, serialization_alias="modelSettings"
198
+ )
199
+ output_schema: Optional[OutputSchema] = Field(
200
+ default=None, serialization_alias="outputSchema"
201
+ )
202
+ output_type: Optional[OutputType] = Field(
203
+ default=None, serialization_alias="outputType"
204
+ )
205
+
206
+ class Config:
207
+ use_enum_values = True
208
+
209
+
210
+ class PromptUpdateRequest(BaseModel):
211
+ text: Optional[str] = None
212
+ messages: Optional[List[PromptMessage]] = None
213
+ interpolation_type: PromptInterpolationType = Field(
214
+ serialization_alias="interpolationType"
215
+ )
216
+ model_settings: Optional[ModelSettings] = Field(
217
+ default=None, serialization_alias="modelSettings"
218
+ )
219
+ output_schema: Optional[OutputSchema] = Field(
220
+ default=None, serialization_alias="outputSchema"
221
+ )
222
+ output_type: Optional[OutputType] = Field(
223
+ default=None, serialization_alias="outputType"
224
+ )
65
225
 
66
226
  class Config:
67
227
  use_enum_values = True