deepeval 3.7.9 → 3.8.0 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -145,6 +145,7 @@ def enter_current_context(
     progress: Optional[Progress] = None,
     pbar_callback_id: Optional[int] = None,
     uuid_str: Optional[str] = None,
+    fallback_trace_uuid: Optional[str] = None,
 ) -> BaseSpan:
     start_time = perf_counter()
     observe_kwargs = observe_kwargs or {}
@@ -159,12 +160,27 @@ def enter_current_context(
     parent_uuid: Optional[str] = None

     if parent_span:
-        parent_uuid = parent_span.uuid
-        trace_uuid = parent_span.trace_uuid
-    else:
+        # Validate that the parent span's trace is still active
+        if parent_span.trace_uuid in trace_manager.active_traces:
+            parent_uuid = parent_span.uuid
+            trace_uuid = parent_span.trace_uuid
+        else:
+            # Parent span references a dead trace - treat as if no parent
+            parent_span = None
+
+    if not parent_span:
         current_trace = current_trace_context.get()
-        if current_trace:
+        # IMPORTANT: Verify trace is still active, not just in context
+        # (a previous failed async operation might leave a dead trace in context)
+        if current_trace and current_trace.uuid in trace_manager.active_traces:
             trace_uuid = current_trace.uuid
+        elif (
+            fallback_trace_uuid
+            and fallback_trace_uuid in trace_manager.active_traces
+        ):
+            # In async contexts, ContextVar may not propagate. Use the fallback trace_uuid
+            # provided by the CallbackHandler to avoid creating duplicate traces.
+            trace_uuid = fallback_trace_uuid
         else:
             trace = trace_manager.start_new_trace(
                 metric_collection=metric_collection
@@ -258,11 +274,13 @@ def exit_current_context(

     current_span = current_span_context.get()

+    # In async contexts (LangChain/LangGraph), context variables don't propagate
+    # reliably across task boundaries. Fall back to direct span lookup.
     if not current_span or current_span.uuid != uuid_str:
-        print(
-            f"Error: Current span in context does not match the span being exited. Expected UUID: {uuid_str}, Got: {current_span.uuid if current_span else 'None'}"
-        )
-        return
+        current_span = trace_manager.get_span_by_uuid(uuid_str)
+        if not current_span:
+            # Span already removed or never existed
+            return

     current_span.end_time = end_time
     if exc_type is not None:
@@ -295,7 +313,12 @@ def exit_current_context(
         else:
             current_span_context.set(None)
     else:
+        # Try context first, then fall back to direct trace lookup for async contexts
         current_trace = current_trace_context.get()
+        if not current_trace and current_span.trace_uuid:
+            current_trace = trace_manager.get_trace_by_uuid(
+                current_span.trace_uuid
+            )
         if current_span.status == TraceSpanStatus.ERRORED and current_trace:
             current_trace.status = TraceSpanStatus.ERRORED
         if current_trace and current_trace.uuid == current_span.trace_uuid:
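
Taken together, these tracing changes implement one pattern: trust the span or trace stored in a ContextVar only if it is still registered as active, otherwise fall back to an explicit lookup (or to a caller-supplied `fallback_trace_uuid`). The sketch below illustrates that pattern in isolation; `active_traces`, `start_trace`, and `resolve_trace` are hypothetical stand-ins, not deepeval's actual `trace_manager` API.

```python
import uuid
from contextvars import ContextVar
from typing import Dict, Optional

# Hypothetical stand-ins for deepeval's trace_manager / current_trace_context,
# used only to illustrate the "context first, then registry fallback" pattern.
active_traces: Dict[str, dict] = {}
current_trace_var: ContextVar[Optional[str]] = ContextVar("current_trace", default=None)


def start_trace() -> str:
    trace_uuid = str(uuid.uuid4())
    active_traces[trace_uuid] = {"spans": []}
    return trace_uuid


def resolve_trace(fallback_trace_uuid: Optional[str] = None) -> str:
    # 1. Prefer the trace in the current context, but only if it is still active.
    ctx_trace = current_trace_var.get()
    if ctx_trace and ctx_trace in active_traces:
        return ctx_trace
    # 2. Otherwise use a caller-supplied fallback (e.g. a uuid carried by an async
    #    callback where the ContextVar did not propagate), if it is still active.
    if fallback_trace_uuid and fallback_trace_uuid in active_traces:
        return fallback_trace_uuid
    # 3. Last resort: start a fresh trace instead of attaching to a dead one.
    return start_trace()


root = start_trace()
# An async callback that lost the ContextVar but kept the uuid reuses the same trace:
assert resolve_trace(fallback_trace_uuid=root) == root
# With no context and no valid fallback, a new trace is created:
assert resolve_trace() != root
```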
deepeval/key_handler.py CHANGED
@@ -162,6 +162,13 @@ class ModelKeyValues(Enum):
     VLLM_API_KEY = "VLLM_API_KEY"
     VLLM_MODEL_NAME = "VLLM_MODEL_NAME"

+    # OpenRouter
+    USE_OPENROUTER_MODEL = "USE_OPENROUTER_MODEL"
+    OPENROUTER_MODEL_NAME = "OPENROUTER_MODEL_NAME"
+    OPENROUTER_COST_PER_INPUT_TOKEN = "OPENROUTER_COST_PER_INPUT_TOKEN"
+    OPENROUTER_COST_PER_OUTPUT_TOKEN = "OPENROUTER_COST_PER_OUTPUT_TOKEN"
+    OPENROUTER_API_KEY = "OPENROUTER_API_KEY"
+

 class EmbeddingKeyValues(Enum):
     # Azure OpenAI
@@ -174,7 +181,7 @@ class EmbeddingKeyValues(Enum):
     USE_LOCAL_EMBEDDINGS = "USE_LOCAL_EMBEDDINGS"
     LOCAL_EMBEDDING_MODEL_NAME = "LOCAL_EMBEDDING_MODEL_NAME"
     LOCAL_EMBEDDING_BASE_URL = "LOCAL_EMBEDDING_BASE_URL"
-    LOCAL_EMBEDDING_API_KEY = "LOCAL_EMBEDDING_API_KEY"
+    LOCAL_EMBEDDING_API_KEY = ("LOCAL_EMBEDDING_API_KEY",)


 class KeyFileHandler:
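
The new `ModelKeyValues` entries mirror the existing provider keys, which deepeval resolves from its key store or the environment. Below is a hedged sketch of configuring OpenRouter through environment variables; the exact lookup order and accepted values are not shown in this diff, so treat the assignments as illustrative only.

```python
import os

# Hedged sketch: these keys mirror the other providers in ModelKeyValues.
# How deepeval resolves them (environment vs. its key store) is not part of
# this diff, so the names and values below are illustrative.
os.environ["USE_OPENROUTER_MODEL"] = "YES"
os.environ["OPENROUTER_MODEL_NAME"] = "anthropic/claude-3-opus"  # provider/model format
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."
# Optional per-token prices so evaluation cost can be reported:
os.environ["OPENROUTER_COST_PER_INPUT_TOKEN"] = "0.0000150"
os.environ["OPENROUTER_COST_PER_OUTPUT_TOKEN"] = "0.0000750"
```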
@@ -2,6 +2,7 @@

 from openai.types.chat.chat_completion import ChatCompletion
 from typing import Optional, List, Tuple, Union, Dict, Type
+from rich.console import Console
 import math
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.g_eval.utils import (
@@ -11,6 +12,8 @@ from deepeval.metrics.g_eval.utils import (
     format_rubrics,
     validate_and_sort_rubrics,
     validate_criteria_and_evaluation_steps,
+    CONVERSATIONAL_G_EVAL_API_PARAMS,
+    construct_geval_upload_payload,
 )
 from deepeval.test_case import (
     TurnParams,
@@ -33,6 +36,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 import deepeval.metrics.conversational_g_eval.schema as cgschema
 from deepeval.metrics.api import metric_data_manager
+from deepeval.confident.api import Api, Endpoints, HttpMethods


 class ConversationalGEval(BaseConversationalMetric):
@@ -412,6 +416,37 @@ class ConversationalGEval(BaseConversationalMetric):
             self.success = False
             return self.success

+    def upload(self):
+        api = Api()
+
+        payload = construct_geval_upload_payload(
+            name=self.name,
+            evaluation_params=self.evaluation_params,
+            g_eval_api_params=CONVERSATIONAL_G_EVAL_API_PARAMS,
+            criteria=self.criteria,
+            evaluation_steps=self.evaluation_steps,
+            multi_turn=True,
+            rubric=self.rubric,
+        )
+
+        data, _ = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.METRICS_ENDPOINT,
+            body=payload,
+        )
+
+        metric_id = data.get("id")
+        self.metric_id = metric_id
+        console = Console()
+
+        if metric_id:
+            console.print(
+                "[rgb(5,245,141)]✓[/rgb(5,245,141)] Metric uploaded successfully "
+                f"(id: [bold]{metric_id}[/bold])"
+            )
+
+        return data
+
     @property
     def __name__(self):
         if self._include_g_eval_suffix:
@@ -1,7 +1,7 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

 import asyncio
-
+from rich.console import Console
 from typing import Optional, List, Tuple, Union, Type
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
@@ -32,9 +32,12 @@ from deepeval.metrics.g_eval.utils import (
     validate_criteria_and_evaluation_steps,
     number_evaluation_steps,
     get_score_range,
+    construct_geval_upload_payload,
+    G_EVAL_API_PARAMS,
 )
 from deepeval.metrics.api import metric_data_manager
 from deepeval.config.settings import get_settings
+from deepeval.confident.api import Api, Endpoints, HttpMethods


 class GEval(BaseMetric):
@@ -408,6 +411,37 @@ class GEval(BaseMetric):
             self.success = False
             return self.success

+    def upload(self):
+        api = Api()
+
+        payload = construct_geval_upload_payload(
+            name=self.name,
+            evaluation_params=self.evaluation_params,
+            g_eval_api_params=G_EVAL_API_PARAMS,
+            criteria=self.criteria,
+            evaluation_steps=self.evaluation_steps,
+            multi_turn=False,
+            rubric=self.rubric,
+        )
+
+        data, _ = api.send_request(
+            method=HttpMethods.POST,
+            endpoint=Endpoints.METRICS_ENDPOINT,
+            body=payload,
+        )
+
+        metric_id = data.get("id")
+        self.metric_id = metric_id
+        console = Console()
+
+        if metric_id:
+            console.print(
+                "[rgb(5,245,141)]✓[/rgb(5,245,141)] Metric uploaded successfully "
+                f"(id: [bold]{metric_id}[/bold])"
+            )
+
+        return data
+
     @property
     def __name__(self):
         if self._include_g_eval_suffix:
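
A minimal usage sketch for the new `upload()` method, assuming the documented `GEval` constructor and a configured Confident AI API key; the metric name and criteria below are made up for illustration.

```python
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Illustrative metric definition; upload() POSTs the payload built by
# construct_geval_upload_payload to Endpoints.METRICS_ENDPOINT, so a
# Confident AI API key must already be configured.
correctness = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually consistent with the expected output.",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

data = correctness.upload()   # returns the API response body
print(correctness.metric_id)  # set from data["id"] when the upload succeeds
```

`ConversationalGEval.upload()` follows the same flow, but serializes `TurnParams` through `CONVERSATIONAL_G_EVAL_API_PARAMS` and sends `multi_turn=True`.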
@@ -52,6 +52,71 @@ CONVERSATIONAL_G_EVAL_PARAMS = {
     TurnParams.SCENARIO: "Scenario",
 }

+G_EVAL_API_PARAMS = {
+    LLMTestCaseParams.INPUT: "input",
+    LLMTestCaseParams.ACTUAL_OUTPUT: "actualOutput",
+    LLMTestCaseParams.EXPECTED_OUTPUT: "expectedOutput",
+    LLMTestCaseParams.CONTEXT: "context",
+    LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrievalContext",
+    LLMTestCaseParams.EXPECTED_TOOLS: "expectedTools",
+    LLMTestCaseParams.TOOLS_CALLED: "toolsCalled",
+}
+
+CONVERSATIONAL_G_EVAL_API_PARAMS = {
+    TurnParams.ROLE: "role",
+    TurnParams.CONTENT: "content",
+    TurnParams.SCENARIO: "scenario",
+    TurnParams.EXPECTED_OUTCOME: "expectedOutcome",
+    TurnParams.RETRIEVAL_CONTEXT: "retrievalContext",
+    TurnParams.TOOLS_CALLED: "toolsCalled",
+}
+
+
+def construct_geval_upload_payload(
+    name: str,
+    evaluation_params: List[LLMTestCaseParams],
+    g_eval_api_params: Dict,
+    criteria: Optional[str] = None,
+    evaluation_steps: Optional[List[str]] = None,
+    multi_turn: bool = False,
+    rubric: Optional[List[Rubric]] = None,
+) -> Dict:
+    if not evaluation_params:
+        raise ValueError("GEval requires at least one evaluation parameter.")
+
+    unsupported_params = [
+        param for param in evaluation_params if param not in g_eval_api_params
+    ]
+    if unsupported_params:
+        raise ValueError(
+            "Unsupported evaluation params for GEval upload: "
+            + ", ".join(param.name for param in unsupported_params)
+        )
+
+    payload = {
+        "name": name,
+        "evaluationParams": [
+            g_eval_api_params[param] for param in evaluation_params
+        ],
+        "multiTurn": multi_turn,
+    }
+
+    if criteria is not None:
+        payload["criteria"] = criteria
+    else:
+        payload["evaluationSteps"] = evaluation_steps
+
+    if rubric is not None:
+        payload["rubric"] = [
+            {
+                "scoreRange": list(r.score_range),
+                "expectedOutcome": r.expected_outcome,
+            }
+            for r in rubric
+        ]
+
+    return payload
+

 def validate_criteria_and_evaluation_steps(
     criteria: Optional[str] = None,
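
To make the payload shape concrete, here is a hedged example of calling the new helper directly; the inputs are illustrative, and the `Rubric` import path is assumed from deepeval's documented usage rather than shown in this diff.

```python
from deepeval.metrics.g_eval import Rubric
from deepeval.metrics.g_eval.utils import (
    G_EVAL_API_PARAMS,
    construct_geval_upload_payload,
)
from deepeval.test_case import LLMTestCaseParams

payload = construct_geval_upload_payload(
    name="Correctness",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    g_eval_api_params=G_EVAL_API_PARAMS,
    criteria="Is the actual output factually correct?",
    rubric=[Rubric(score_range=(0, 5), expected_outcome="Mostly incorrect.")],
)

# Expected shape (criteria takes precedence, so evaluationSteps is omitted):
# {
#     "name": "Correctness",
#     "evaluationParams": ["input", "actualOutput"],
#     "multiTurn": False,
#     "criteria": "Is the actual output factually correct?",
#     "rubric": [{"scoreRange": [0, 5], "expectedOutcome": "Mostly incorrect."}],
# }
```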
@@ -16,6 +16,7 @@ from deepeval.models.llms import (
     GrokModel,
     DeepSeekModel,
     PortkeyModel,
+    OpenRouterModel,
 )
 from deepeval.models.embedding_models import (
     OpenAIEmbeddingModel,
@@ -44,4 +45,5 @@ __all__ = [
     "LocalEmbeddingModel",
     "OllamaEmbeddingModel",
     "PortkeyModel",
+    "OpenRouterModel",
 ]
@@ -10,6 +10,7 @@ from .kimi_model import KimiModel
 from .grok_model import GrokModel
 from .deepseek_model import DeepSeekModel
 from .portkey_model import PortkeyModel
+from .openrouter_model import OpenRouterModel

 __all__ = [
     "AzureOpenAIModel",
@@ -24,4 +25,5 @@ __all__ = [
     "GrokModel",
     "DeepSeekModel",
     "PortkeyModel",
+    "OpenRouterModel",
 ]
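
`OpenRouterModel` itself is not included in this diff, so the sketch below assumes its constructor mirrors the other provider wrappers (a model string in OpenRouter's `provider/model` format plus an API key); treat the keyword arguments as hypothetical.

```python
from deepeval.models import OpenRouterModel
from deepeval.metrics import AnswerRelevancyMetric

# Hypothetical keyword arguments: the constructor is not part of this diff,
# but the other provider wrappers accept a model string and an API key.
model = OpenRouterModel(
    model="anthropic/claude-3-opus",  # OpenRouter's provider/model format
    api_key="sk-or-...",
)

# Any DeepEvalBaseLLM subclass can be passed to a metric as its judge model.
metric = AnswerRelevancyMetric(model=model)
```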
@@ -3,6 +3,11 @@ from typing import Any, Callable, Union
 from deepeval.models.base_model import DeepEvalModelData


+DEFAULT_GPT_MODEL = "gpt-4.1"
+# OpenRouter uses provider/model format (e.g., "openai/gpt-4", "anthropic/claude-3-opus")
+# DeepEval does not validate OpenRouter model strings.
+DEFAULT_OPENROUTER_MODEL = f"openai/{DEFAULT_GPT_MODEL}"
+
 ModelDataFactory = Callable[[], DeepEvalModelData]
 ModelDataValue = Union[DeepEvalModelData, ModelDataFactory]

@@ -366,6 +371,24 @@ OPENAI_MODELS_DATA = ModelDataRegistry(
            input_price=1.25 / 1e6,
            output_price=10.00 / 1e6,
        ),
+        "gpt-5.1": make_model_data(
+            supports_log_probs=False,
+            supports_multimodal=True,
+            supports_structured_outputs=True,
+            supports_json=False,
+            supports_temperature=False,
+            input_price=1.25 / 1e6,
+            output_price=10.00 / 1e6,
+        ),
+        "gpt-5.2": make_model_data(
+            supports_log_probs=False,
+            supports_multimodal=True,
+            supports_structured_outputs=True,
+            supports_json=False,
+            supports_temperature=False,
+            input_price=1.75 / 1e6,
+            output_price=14.00 / 1e6,
+        ),
    }
 )

@@ -24,14 +24,13 @@ from deepeval.models.retry_policy import (
     sdk_retries_for,
 )
 from deepeval.models.llms.constants import (
+    DEFAULT_GPT_MODEL,
     OPENAI_MODELS_DATA,
 )


 retry_openai = create_retry_decorator(PS.OPENAI)

-default_gpt_model = "gpt-4.1"
-

 def _request_timeout_seconds() -> float:
     timeout = float(get_settings().DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS or 0)
@@ -70,7 +69,7 @@ class GPTModel(DeepEvalBaseLLM):

         model = model or settings.OPENAI_MODEL_NAME
         if model is None:
-            model = default_gpt_model
+            model = DEFAULT_GPT_MODEL

         cost_per_input_token = (
             cost_per_input_token
@@ -377,7 +376,9 @@ class GPTModel(DeepEvalBaseLLM):
     # Utilities #
     #############

-    def calculate_cost(self, input_tokens: int, output_tokens: int) -> float:
+    def calculate_cost(
+        self, input_tokens: int, output_tokens: int
+    ) -> Optional[float]:
         if self.model_data.input_price and self.model_data.output_price:
             input_cost = input_tokens * self.model_data.input_price
             output_cost = output_tokens * self.model_data.output_price
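
With the prices registered above, `calculate_cost` is a straight per-token multiplication, and the loosened `Optional[float]` return type reflects that models without registered prices presumably yield no cost. A quick arithmetic check, independent of deepeval:

```python
# Prices registered above for "gpt-5.2" (USD per token).
input_price = 1.75 / 1e6
output_price = 14.00 / 1e6

def calculate_cost(input_tokens: int, output_tokens: int):
    # Mirrors the logic shown in GPTModel.calculate_cost: only computes a total
    # when both prices are known, hence the Optional[float] annotation.
    if input_price and output_price:
        return input_tokens * input_price + output_tokens * output_price
    return None

# 10,000 prompt tokens and 2,000 completion tokens:
print(calculate_cost(10_000, 2_000))  # 0.0175 + 0.028 = 0.0455
```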