deepeval 3.5.5__py3-none-any.whl → 3.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.5.5"
+ __version__: str = "3.5.7"
deepeval/cli/main.py CHANGED
@@ -28,8 +28,6 @@ import typer
  from enum import Enum
  from pydantic import SecretStr
  from deepeval.key_handler import (
- KEY_FILE_HANDLER,
- KeyValues,
  EmbeddingKeyValues,
  ModelKeyValues,
  )
@@ -46,16 +44,9 @@ from deepeval.cli.utils import (
  render_login_message,
  upload_and_open_link,
  PROD,
- resolve_save_target,
- save_environ_to_store,
- unset_environ_in_store,
- switch_model_provider,
  )
  from deepeval.confident.api import (
- get_confident_api_key,
  is_confident,
- set_confident_api_key,
- CONFIDENT_API_KEY_ENV_VAR,
  )

  app = typer.Typer(name="deepeval")
@@ -109,7 +100,7 @@ def set_confident_region_command(
  # Add flag emojis based on region
  flag = "🇺🇸" if region == Regions.US else "🇪🇺"

- setting = get_settings()
+ settings = get_settings()
  with settings.edit(save=save) as edit_ctx:
  settings.CONFIDENT_REGION = region.value

@@ -282,23 +273,196 @@ def view():
  upload_and_open_link(_span=span)


- @app.command(name="enable-grpc-logging")
- def enable_grpc_logging(save: Optional[str] = None):
+ @app.command(name="set-debug")
+ def set_debug(
+ # Core verbosity
+ log_level: Optional[str] = typer.Option(
+ None,
+ "--log-level",
+ help="Global LOG_LEVEL (DEBUG|INFO|WARNING|ERROR|CRITICAL|NOTSET).",
+ ),
+ verbose: Optional[bool] = typer.Option(
+ None, "--verbose/--no-verbose", help="Toggle DEEPEVAL_VERBOSE_MODE."
+ ),
+ # Retry logging dials
+ retry_before_level: Optional[str] = typer.Option(
+ None,
+ "--retry-before-level",
+ help="Log level before a retry attempt (DEBUG|INFO|WARNING|ERROR|CRITICAL|NOTSET or numeric).",
+ ),
+ retry_after_level: Optional[str] = typer.Option(
+ None,
+ "--retry-after-level",
+ help="Log level after a retry attempt (DEBUG|INFO|WARNING|ERROR|CRITICAL|NOTSET or numeric).",
+ ),
+ # gRPC visibility
+ grpc: Optional[bool] = typer.Option(
+ None, "--grpc/--no-grpc", help="Toggle DEEPEVAL_GRPC_LOGGING."
+ ),
+ grpc_verbosity: Optional[str] = typer.Option(
+ None,
+ "--grpc-verbosity",
+ help="Set GRPC_VERBOSITY (DEBUG|INFO|ERROR|NONE).",
+ ),
+ grpc_trace: Optional[str] = typer.Option(
+ None,
+ "--grpc-trace",
+ help=(
+ "Set GRPC_TRACE to comma-separated tracer names or glob patterns "
+ "(e.g. 'tcp,http,secure_endpoint', '*' for all, 'list_tracers' to print available)."
+ ),
+ ),
+ # Confident tracing
+ trace_verbose: Optional[bool] = typer.Option(
+ None,
+ "--trace-verbose/--no-trace-verbose",
+ help="Enable / disable CONFIDENT_TRACE_VERBOSE.",
+ ),
+ trace_env: Optional[str] = typer.Option(
+ None,
+ "--trace-env",
+ help='Set CONFIDENT_TRACE_ENVIRONMENT ("development", "staging", "production", etc).',
+ ),
+ trace_flush: Optional[bool] = typer.Option(
+ None,
+ "--trace-flush/--no-trace-flush",
+ help="Enable / disable CONFIDENT_TRACE_FLUSH.",
+ ),
+ # Advanced / potentially surprising
+ error_reporting: Optional[bool] = typer.Option(
+ None,
+ "--error-reporting/--no-error-reporting",
+ help="Enable / disable ERROR_REPORTING.",
+ ),
+ ignore_errors: Optional[bool] = typer.Option(
+ None,
+ "--ignore-errors/--no-ignore-errors",
+ help="Enable / disable IGNORE_DEEPEVAL_ERRORS (not recommended in normal debugging).",
+ ),
+ # Persistence
+ save: Optional[str] = typer.Option(
+ None,
+ "--save",
+ help="Persist CLI parameters as environment variables in a dotenv file. "
+ "Usage: --save=dotenv[:path] (default: .env.local)",
+ ),
+ ):
  """
- Enable verbose gRPC logging for the current process.
- Pass --save=dotenv[:path] to persist it (optional).
+ Configure verbose debug behavior for DeepEval.
+
+ This command lets you mix-and-match verbosity flags (global LOG_LEVEL, verbose mode),
+ retry logger levels, gRPC wire logging, and Confident trace toggles. Values apply
+ immediately to the current process and can be persisted to a dotenv file with --save.
+
+ Examples:
+ deepeval set-debug --log-level DEBUG --verbose --grpc --retry-before-level DEBUG --retry-after-level INFO
+ deepeval set-debug --trace-verbose --trace-env staging --save dotenv:.env.local
  """
  settings = get_settings()
  with settings.edit(save=save) as edit_ctx:
- settings.DEEPEVAL_GRPC_LOGGING = True
+ # Core verbosity
+ if log_level is not None:
+ settings.LOG_LEVEL = log_level
+ if verbose is not None:
+ settings.DEEPEVAL_VERBOSE_MODE = verbose
+
+ # Retry logging
+ if retry_before_level is not None:
+ settings.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL = retry_before_level
+ if retry_after_level is not None:
+ settings.DEEPEVAL_RETRY_AFTER_LOG_LEVEL = retry_after_level
+
+ # gRPC
+ if grpc is not None:
+ settings.DEEPEVAL_GRPC_LOGGING = grpc
+ if grpc_verbosity is not None:
+ settings.GRPC_VERBOSITY = grpc_verbosity
+ if grpc_trace is not None:
+ settings.GRPC_TRACE = grpc_trace
+
+ # Confident tracing
+ if trace_verbose is not None:
+ settings.CONFIDENT_TRACE_VERBOSE = trace_verbose
+ if trace_env is not None:
+ settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
+ if trace_flush is not None:
+ settings.CONFIDENT_TRACE_FLUSH = trace_flush
+
+ # Advanced
+ if error_reporting is not None:
+ settings.ERROR_REPORTING = error_reporting
+ if ignore_errors is not None:
+ settings.IGNORE_DEEPEVAL_ERRORS = ignore_errors
+
+ handled, path, updated = edit_ctx.result
+
+ if not updated:
+ # no changes were made, so there is nothing to do.
+ return
+
+ if not handled and save is not None:
+ print("Unsupported --save option. Use --save=dotenv[:path].")
+ elif path:
+ print(
+ f"Saved environment variables to {path} (ensure it's git-ignored)."
+ )
+ else:
+ print(
+ "Settings updated for this session. To persist, use --save=dotenv[:path] "
+ "(default .env.local) or set DEEPEVAL_DEFAULT_SAVE=dotenv:.env.local"
+ )
+
+ print(":loud_sound: Debug options updated.")
+
+
+ @app.command(name="unset-debug")
+ def unset_debug(
+ save: Optional[str] = typer.Option(
+ None,
+ "--save",
+ help="Remove only the debug-related environment variables from a dotenv file. "
+ "Usage: --save=dotenv[:path] (default: .env.local)",
+ ),
+ ):
+ """
+ Restore default behavior by unsetting debug related variables.
+
+ Behavior:
+ - Resets LOG_LEVEL back to 'info'.
+ - Unsets DEEPEVAL_VERBOSE_MODE, retry log-level overrides, gRPC and Confident trace flags.
+ - If --save is provided (or DEEPEVAL_DEFAULT_SAVE is set), removes these keys from the target dotenv file.
+ """
+ settings = get_settings()
+ with settings.edit(save=save) as edit_ctx:
+ # Back to normal global level
+ settings.LOG_LEVEL = "info"
+ settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
+ settings.CONFIDENT_TRACE_VERBOSE = True
+
+ # Clear optional toggles/overrides
+ settings.DEEPEVAL_VERBOSE_MODE = None
+ settings.DEEPEVAL_RETRY_BEFORE_LOG_LEVEL = None
+ settings.DEEPEVAL_RETRY_AFTER_LOG_LEVEL = None
+
+ settings.DEEPEVAL_GRPC_LOGGING = None
+ settings.GRPC_VERBOSITY = None
+ settings.GRPC_TRACE = None
+
+ settings.CONFIDENT_TRACE_FLUSH = None
+
+ settings.ERROR_REPORTING = None
+ settings.IGNORE_DEEPEVAL_ERRORS = None

  handled, path, _ = edit_ctx.result

  if not handled and save is not None:
- # invalid --save format (unsupported)
  print("Unsupported --save option. Use --save=dotenv[:path].")
+ elif path:
+ print(f"Removed debug-related environment variables from {path}.")
  else:
- print("gRPC logging enabled.")
+ print("Debug settings reverted to defaults for this session.")
+
+ print(":mute: Debug options unset.")


  #############################################
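Editor's note: each `set-debug` flag maps onto the environment variable named in its help text, and only the flags you pass are touched. A minimal sketch of applying the same configuration by hand; the variable names come from the help strings above, while the `"1"` truthy encoding and the assumption that deepeval reads these from the process environment at import time are mine:

```python
# Hypothetical manual equivalent of:
#   deepeval set-debug --log-level DEBUG --verbose --grpc --grpc-verbosity DEBUG
import os

os.environ["LOG_LEVEL"] = "DEBUG"           # --log-level
os.environ["DEEPEVAL_VERBOSE_MODE"] = "1"   # --verbose (truthy encoding assumed)
os.environ["DEEPEVAL_GRPC_LOGGING"] = "1"   # --grpc
os.environ["GRPC_VERBOSITY"] = "DEBUG"      # --grpc-verbosity

import deepeval  # settings assumed to be read once the package is imported
```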
@@ -1336,7 +1500,7 @@ def set_gemini_model_env(
  )
  else:
  print(
- f":raising_hands: Congratulations! You're now using Gemini's model for all evals that require an LLM."
+ ":raising_hands: Congratulations! You're now using Gemini's model for all evals that require an LLM."
  )


deepeval/dataset/dataset.py CHANGED
@@ -458,6 +458,8 @@ class EvaluationDataset:
  tools_called_col_delimiter: str = ";",
  expected_tools_col_name: Optional[str] = "expected_tools",
  expected_tools_col_delimiter: str = ";",
+ comments_key_name: str = "comments",
+ name_key_name: str = "name",
  source_file_col_name: Optional[str] = None,
  additional_metadata_col_name: Optional[str] = None,
  scenario_col_name: Optional[str] = "scenario",
@@ -526,6 +528,8 @@ class EvaluationDataset:
  df, expected_tools_col_name, default=""
  )
  ]
+ comments = get_column_data(df, comments_key_name)
+ name = get_column_data(df, name_key_name)
  source_files = get_column_data(df, source_file_col_name)
  additional_metadatas = [
  ast.literal_eval(metadata) if metadata else None
@@ -546,6 +550,8 @@ class EvaluationDataset:
  retrieval_context,
  tools_called,
  expected_tools,
+ comments,
+ name,
  source_file,
  additional_metadata,
  scenario,
@@ -560,6 +566,8 @@ class EvaluationDataset:
  retrieval_contexts,
  tools_called,
  expected_tools,
+ comments,
+ name,
  source_files,
  additional_metadatas,
  scenarios,
@@ -569,7 +577,7 @@ class EvaluationDataset:
  ):
  if scenario:
  self._multi_turn = True
- parsed_turns = parse_turns(turns)
+ parsed_turns = parse_turns(turns) if turns else []
  self.goldens.append(
  ConversationalGolden(
  scenario=scenario,
@@ -577,6 +585,8 @@ class EvaluationDataset:
  expected_outcome=expected_outcome,
  user_description=user_description,
  context=context,
+ comments=comments,
+ name=name,
  )
  )
  else:
@@ -592,6 +602,8 @@ class EvaluationDataset:
  expected_tools=expected_tools,
  additional_metadata=additional_metadata,
  source_file=source_file,
+ comments=comments,
+ name=name,
  )
  )

@@ -605,6 +617,8 @@ class EvaluationDataset:
  retrieval_context_key_name: Optional[str] = "retrieval_context",
  tools_called_key_name: Optional[str] = "tools_called",
  expected_tools_key_name: Optional[str] = "expected_tools",
+ comments_key_name: str = "comments",
+ name_key_name: str = "name",
  source_file_key_name: Optional[str] = "source_file",
  additional_metadata_key_name: Optional[str] = "additional_metadata",
  scenario_key_name: Optional[str] = "scenario",
@@ -628,7 +642,8 @@ class EvaluationDataset:
  expected_outcome = json_obj.get(expected_outcome_key_name)
  user_description = json_obj.get(user_description_key_name)
  context = json_obj.get(context_key_name)
-
+ comments = json_obj.get(comments_key_name)
+ name = json_obj.get(name_key_name)
  parsed_turns = parse_turns(turns) if turns else []

  self._multi_turn = True
@@ -639,6 +654,8 @@ class EvaluationDataset:
  expected_outcome=expected_outcome,
  user_description=user_description,
  context=context,
+ comments=comments,
+ name=name,
  )
  )
  else:
@@ -649,6 +666,8 @@ class EvaluationDataset:
  retrieval_context = json_obj.get(retrieval_context_key_name)
  tools_called = json_obj.get(tools_called_key_name)
  expected_tools = json_obj.get(expected_tools_key_name)
+ comments = json_obj.get(comments_key_name)
+ name = json_obj.get(name_key_name)
  source_file = json_obj.get(source_file_key_name)
  additional_metadata = json_obj.get(additional_metadata_key_name)

@@ -663,6 +682,8 @@ class EvaluationDataset:
  tools_called=tools_called,
  expected_tools=expected_tools,
  additional_metadata=additional_metadata,
+ comments=comments,
+ name=name,
  source_file=source_file,
  )
  )
@@ -928,6 +949,8 @@ class EvaluationDataset:
  expected_outcome=golden.expected_outcome,
  user_description=golden.user_description,
  context=golden.context,
+ name=golden.name,
+ comments=golden.comments,
  )
  for golden in self.goldens
  ]
@@ -939,6 +962,8 @@ class EvaluationDataset:
  actual_output=golden.actual_output,
  retrieval_context=golden.retrieval_context,
  context=golden.context,
+ name=golden.name,
+ comments=golden.comments,
  source_file=golden.source_file,
  )
  for golden in self.goldens
@@ -981,6 +1006,8 @@ class EvaluationDataset:
  "expected_outcome": golden.expected_outcome,
  "user_description": golden.user_description,
  "context": golden.context,
+ "name": golden.name,
+ "comments": golden.comments,
  }
  for golden in goldens
  ]
@@ -992,6 +1019,8 @@ class EvaluationDataset:
  "expected_output": golden.expected_output,
  "retrieval_context": golden.retrieval_context,
  "context": golden.context,
+ "name": golden.name,
+ "comments": golden.comments,
  "source_file": golden.source_file,
  }
  for golden in goldens
@@ -1010,6 +1039,8 @@ class EvaluationDataset:
  "expected_outcome",
  "user_description",
  "context",
+ "name",
+ "comments",
  ]
  )
  for golden in goldens:
@@ -1030,6 +1061,8 @@ class EvaluationDataset:
  golden.expected_outcome,
  golden.user_description,
  context,
+ golden.name,
+ golden.comments,
  ]
  )
  else:
@@ -1040,6 +1073,8 @@ class EvaluationDataset:
  "expected_output",
  "retrieval_context",
  "context",
+ "name",
+ "comments",
  "source_file",
  ]
  )
@@ -1061,6 +1096,8 @@ class EvaluationDataset:
  golden.expected_output,
  retrieval_context,
  context,
+ golden.name,
+ golden.comments,
  golden.source_file,
  ]
  )
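Editor's note: with these hunks, `comments` and `name` now round-trip through both loading and saving of goldens. A minimal usage sketch; the keyword names are taken verbatim from the signatures above, while the loader method names (`add_goldens_from_csv_file`, `add_goldens_from_json_file`) and the positional file argument are assumptions about the public API:

```python
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()

# CSV: a "comments" and a "name" column are now read into each golden.
# Note the CSV loader reuses *_key_name keyword names, per the signature above.
dataset.add_goldens_from_csv_file(
    "goldens.csv",
    comments_key_name="comments",
    name_key_name="name",
)

# JSON: objects may carry "comments" and "name" keys.
dataset.add_goldens_from_json_file(
    "goldens.json",
    comments_key_name="comments",
    name_key_name="name",
)
```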
deepeval/evaluate/execute.py CHANGED
@@ -1219,12 +1219,16 @@ async def _a_execute_agentic_test_case(

  test_case = LLMTestCase(
  input=golden.input,
- actual_output=str(trace.output) if trace.output is not None else None,
- expected_output=trace.expected_output,
- context=trace.context,
- retrieval_context=trace.retrieval_context,
- tools_called=trace.tools_called,
- expected_tools=trace.expected_tools,
+ actual_output=(
+ str(current_trace.output)
+ if current_trace.output is not None
+ else None
+ ),
+ expected_output=current_trace.expected_output,
+ context=current_trace.context,
+ retrieval_context=current_trace.retrieval_context,
+ tools_called=current_trace.tools_called,
+ expected_tools=current_trace.expected_tools,
  additional_metadata=golden.additional_metadata,
  comments=golden.comments,
  name=golden.name,
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py CHANGED
@@ -1,6 +1,6 @@
  """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

- from typing import Optional, List, Tuple, Union
+ from typing import Optional, List, Tuple, Type, Union
  from deepeval.models import DeepEvalBaseMLLM
  from deepeval.metrics import BaseMultimodalMetric
  from deepeval.test_case import (
@@ -10,7 +10,10 @@ from deepeval.test_case import (
  from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
  MultimodalGEvalTemplate,
  )
- from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import *
+ from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import (
+ Steps,
+ ReasonScore,
+ )
  from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.utils import (
@@ -49,6 +52,9 @@ class MultimodalGEval(BaseMultimodalMetric):
  async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
+ evaluation_template: Type[
+ MultimodalGEvalTemplate
+ ] = MultimodalGEvalTemplate,
  _include_g_eval_suffix: bool = True,
  ):
  validate_criteria_and_evaluation_steps(criteria, evaluation_steps)
@@ -65,6 +71,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  self.async_mode = async_mode
  self.verbose_mode = verbose_mode
  self._include_g_eval_suffix = _include_g_eval_suffix
+ self.evaluation_template = evaluation_template

  def measure(
  self,
@@ -167,7 +174,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  g_eval_params_str = construct_g_eval_params_string(
  self.evaluation_params
  )
- prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
+ prompt = self.evaluation_template.generate_evaluation_steps(
  criteria=self.criteria, parameters=g_eval_params_str
  )
  if self.using_native_model:
@@ -190,7 +197,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  g_eval_params_str = construct_g_eval_params_string(
  self.evaluation_params
  )
- prompt = MultimodalGEvalTemplate.generate_evaluation_steps(
+ prompt = self.evaluation_template.generate_evaluation_steps(
  criteria=self.criteria, parameters=g_eval_params_str
  )
  if self.using_native_model:
@@ -218,7 +225,7 @@ class MultimodalGEval(BaseMultimodalMetric):

  if not self.strict_mode:
  rubric_str = format_rubrics(self.rubric) if self.rubric else None
- prompt = MultimodalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=number_evaluation_steps(self.evaluation_steps),
  test_case_list=test_case_list,
  parameters=g_eval_params_str,
@@ -227,11 +234,15 @@ class MultimodalGEval(BaseMultimodalMetric):
  _additional_context=_additional_context,
  )
  else:
- prompt = MultimodalGEvalTemplate.generate_strict_evaluation_results(
- evaluation_steps=number_evaluation_steps(self.evaluation_steps),
- test_case_list=test_case_list,
- parameters=g_eval_params_str,
- _additional_context=_additional_context,
+ prompt = (
+ self.evaluation_template.generate_strict_evaluation_results(
+ evaluation_steps=number_evaluation_steps(
+ self.evaluation_steps
+ ),
+ test_case_list=test_case_list,
+ parameters=g_eval_params_str,
+ _additional_context=_additional_context,
+ )
  )
  try:
  # don't use log probabilities for unsupported gpt models
@@ -256,7 +267,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except Exception:
  return score, reason
  except (
  AttributeError
@@ -289,7 +300,7 @@ class MultimodalGEval(BaseMultimodalMetric):

  if not self.strict_mode:
  rubric_str = format_rubrics(self.rubric) if self.rubric else None
- prompt = MultimodalGEvalTemplate.generate_evaluation_results(
+ prompt = self.evaluation_template.generate_evaluation_results(
  evaluation_steps=number_evaluation_steps(self.evaluation_steps),
  test_case_list=test_case_list,
  parameters=g_eval_params_str,
@@ -298,11 +309,15 @@ class MultimodalGEval(BaseMultimodalMetric):
  _additional_context=_additional_context,
  )
  else:
- prompt = MultimodalGEvalTemplate.generate_strict_evaluation_results(
- evaluation_steps=number_evaluation_steps(self.evaluation_steps),
- test_case_list=test_case_list,
- parameters=g_eval_params_str,
- _additional_context=_additional_context,
+ prompt = (
+ self.evaluation_template.generate_strict_evaluation_results(
+ evaluation_steps=number_evaluation_steps(
+ self.evaluation_steps
+ ),
+ test_case_list=test_case_list,
+ parameters=g_eval_params_str,
+ _additional_context=_additional_context,
+ )
  )

  try:
@@ -326,7 +341,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  score, res
  )
  return weighted_summed_score, reason
- except:
+ except Exception:
  return score, reason
  except AttributeError:
  # This catches the case where a_generate_raw_response doesn't exist.
@@ -352,7 +367,7 @@ class MultimodalGEval(BaseMultimodalMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except Exception:
  self.success = False
  return self.success
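Editor's note: since every prompt now routes through `self.evaluation_template`, a custom template can be swapped in by subclassing. A minimal sketch; the override signature is inferred from the call sites above, and the constructor arguments other than `evaluation_template` follow the usual G-Eval pattern rather than anything stated in this diff:

```python
from deepeval.metrics import MultimodalGEval
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
    MultimodalGEvalTemplate,
)
from deepeval.test_case import MLLMTestCaseParams


class TerseMultimodalGEvalTemplate(MultimodalGEvalTemplate):
    """Hypothetical template that appends a house rule to the stock prompt."""

    @staticmethod
    def generate_evaluation_steps(criteria, parameters):
        base = MultimodalGEvalTemplate.generate_evaluation_steps(
            criteria=criteria, parameters=parameters
        )
        return base + "\nKeep each evaluation step under 15 words."


metric = MultimodalGEval(
    name="Coherence",
    criteria="Does the answer stay coherent with the supplied image?",
    evaluation_params=[
        MLLMTestCaseParams.INPUT,
        MLLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    evaluation_template=TerseMultimodalGEvalTemplate,
)
```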
deepeval/openai_agents/agent.py CHANGED
@@ -164,13 +164,12 @@ class _ObservedModel(Model):
  ):

  if isinstance(event, ResponseCompletedEvent):
- observer.result = (
- event.response.output_text
- ) # TODO: support other response types
+ observer.result = make_json_serializable(
+ event.response.output
+ )

  yield event

- observer.__exit__(None, None, None)
  except Exception as e:
  observer.__exit__(type(e), e, e.__traceback__)
  raise
deepeval/openai_agents/callback_handler.py CHANGED
@@ -1,9 +1,13 @@
  from deepeval.tracing.tracing import (
  Observer,
  current_span_context,
+ trace_manager,
  )
  from deepeval.openai_agents.extractors import *
  from deepeval.tracing.context import current_trace_context
+ from deepeval.tracing.utils import make_json_serializable
+ from time import perf_counter
+ from deepeval.tracing.types import TraceSpanStatus

  try:
  from agents.tracing import Span, Trace, TracingProcessor
@@ -33,14 +37,51 @@ def _check_openai_agents_available():
  class DeepEvalTracingProcessor(TracingProcessor):
  def __init__(self) -> None:
  _check_openai_agents_available()
- self.root_span_observers: dict[str, Observer] = {}
  self.span_observers: dict[str, Observer] = {}

  def on_trace_start(self, trace: "Trace") -> None:
- pass
+ trace_dict = trace.export()
+ _trace_uuid = trace_dict.get("id")
+ _thread_id = trace_dict.get("group_id")
+ _trace_name = trace_dict.get("workflow_name")
+ _trace_metadata = trace_dict.get("metadata")
+
+ if _thread_id or _trace_metadata:
+ _trace = trace_manager.start_new_trace(trace_uuid=str(_trace_uuid))
+ _trace.thread_id = str(_thread_id)
+ _trace.name = str(_trace_name)
+ _trace.metadata = make_json_serializable(_trace_metadata)
+ current_trace_context.set(_trace)
+
+ trace_manager.add_span( # adds a dummy root span
+ BaseSpan(
+ uuid=_trace_uuid,
+ trace_uuid=_trace_uuid,
+ parent_uuid=None,
+ start_time=perf_counter(),
+ name=_trace_name,
+ status=TraceSpanStatus.IN_PROGRESS,
+ children=[],
+ )
+ )
+ else:
+ current_trace = current_trace_context.get()
+ if current_trace:
+ current_trace.name = str(_trace_name)

  def on_trace_end(self, trace: "Trace") -> None:
- pass
+ trace_dict = trace.export()
+ _trace_uuid = trace_dict.get("id")
+ _thread_id = trace_dict.get("group_id")
+ _trace_name = trace_dict.get("workflow_name")
+ _trace_metadata = trace_dict.get("metadata")
+
+ if _thread_id or _trace_metadata:
+ trace_manager.remove_span(
+ _trace_uuid
+ ) # removing the dummy root span
+ trace_manager.end_trace(_trace_uuid)
+ current_trace_context.set(None)

  def on_span_start(self, span: "Span") -> None:
  if not span.started_at:
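Editor's note: now that `on_trace_start`/`on_trace_end` materialize a DeepEval trace, the processor still has to be registered with the Agents SDK tracing pipeline. A sketch, assuming the SDK's standard `add_trace_processor` hook and top-level import path:

```python
from agents import add_trace_processor  # OpenAI Agents SDK hook (assumed import path)
from deepeval.openai_agents.callback_handler import DeepEvalTracingProcessor

# Register once at startup; per the hunk above, traces that carry a group_id or
# metadata are exported to DeepEval with thread_id, name, and metadata attached.
add_trace_processor(DeepEvalTracingProcessor())
```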
deepeval/openai_agents/runner.py CHANGED
@@ -109,10 +109,9 @@ class Runner(AgentsRunner):
  metric_collection=metric_collection,
  metrics=metrics,
  func_name="run",
- function_kwargs={"input": input},
+ function_kwargs={"input": input}, # also set below
  ) as observer:
  update_trace_attributes(
- input=input,
  name=name,
  tags=tags,
  metadata=metadata,
@@ -123,7 +122,8 @@ class Runner(AgentsRunner):
  )
  current_span = current_span_context.get()
  current_trace = current_trace_context.get()
- current_trace.input = input
+ if not current_trace.input:
+ current_trace.input = input
  if current_span:
  current_span.input = input
  res = await super().run(
@@ -138,8 +138,9 @@ class Runner(AgentsRunner):
  session=session,
  **kwargs, # backwards compatibility
  )
+ current_trace_thread_id = current_trace_context.get().thread_id
  _output = None
- if thread_id:
+ if current_trace_thread_id:
  _output = res.final_output
  else:
  _output = str(res)
@@ -170,30 +171,30 @@ class Runner(AgentsRunner):
  **kwargs,
  ) -> RunResult:
  is_agents_available()
- input_val = input
-
- update_trace_attributes(
- input=input_val,
- name=name,
- tags=tags,
- metadata=metadata,
- thread_id=thread_id,
- user_id=user_id,
- metric_collection=metric_collection,
- metrics=metrics,
- )

  with Observer(
  span_type="custom",
  metric_collection=metric_collection,
  metrics=metrics,
  func_name="run_sync",
- function_kwargs={"input": input_val},
+ function_kwargs={"input": input}, # also set below
  ) as observer:
+ update_trace_attributes(
+ name=name,
+ tags=tags,
+ metadata=metadata,
+ thread_id=thread_id,
+ user_id=user_id,
+ metric_collection=metric_collection,
+ metrics=metrics,
+ )
+
  current_span = current_span_context.get()
  current_trace = current_trace_context.get()
+ if not current_trace.input:
+ current_trace.input = input
  if current_span:
- current_span.input = input_val
+ current_span.input = input
  res = super().run_sync(
  starting_agent,
  input,
@@ -206,8 +207,9 @@ class Runner(AgentsRunner):
  session=session,
  **kwargs, # backwards compatibility
  )
+ current_trace_thread_id = current_trace_context.get().thread_id
  _output = None
- if thread_id:
+ if current_trace_thread_id:
  _output = res.final_output
  else:
  _output = str(res)
@@ -250,7 +252,6 @@ class Runner(AgentsRunner):
  observer.__enter__()

  update_trace_attributes(
- input=input,
  name=name,
  tags=tags,
  metadata=metadata,
@@ -259,6 +260,9 @@ class Runner(AgentsRunner):
  user_id=user_id,
  metric_collection=metric_collection,
  metrics=metrics,
+ current_trace = current_trace_context.get()
+ if not current_trace.input:
+ current_trace.input = input

  current_span = current_span_context.get()
  if current_span:
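Editor's note: the output branch now keys off the trace's `thread_id` rather than the raw keyword, so final-output handling is consistent however the thread id reached the trace. A usage sketch; the keyword names come from the wrapper above, while the `Runner` export path is an assumption:

```python
from agents import Agent
from deepeval.openai_agents import Runner  # assumed export path

agent = Agent(name="Assistant", instructions="Answer concisely.")

# thread_id is written onto the current trace at start and, per the hunk above,
# read back from the trace when deciding how to record the run's output.
result = Runner.run_sync(
    agent,
    "What is the capital of France?",
    thread_id="conversation-123",
)
print(result.final_output)
```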
deepeval/scorer/scorer.py CHANGED
@@ -223,7 +223,7 @@ class Scorer:
  Right now we are using score_one method under the hood. Instead of scoring multiple predictions for faithfulness.
  """
  try:
- from deepeval.models import SummaCModels
+ from deepeval.models.summac_model import SummaCModels
  except Exception as e:
  print(f"SummaCZS model can not be loaded.\n{e}")

@@ -326,7 +326,7 @@ class Scorer:
  from sentence_transformers import util

  try:
- from deepeval.models import (
+ from deepeval.models.answer_relevancy_model import (
  AnswerRelevancyModel,
  CrossEncoderAnswerRelevancyModel,
  )
deepeval-3.5.5.dist-info/METADATA → deepeval-3.5.7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deepeval
- Version: 3.5.5
+ Version: 3.5.7
  Summary: The LLM Evaluation Framework
  Home-page: https://github.com/confident-ai/deepeval
  License: Apache-2.0
deepeval-3.5.5.dist-info/RECORD → deepeval-3.5.7.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
  deepeval/__init__.py,sha256=6fsb813LD_jNhqR-xZnSdE5E-KsBbC3tc4oIg5ZMgTw,2115
- deepeval/_version.py,sha256=CJwAeAyMGnIxrkmBn8fpG6bwbVBsUZaTrtwbstM2LgA,27
+ deepeval/_version.py,sha256=5PpKL25tWtYxTPc0_se2v49WDFVCzYaCu8yogWsx_qQ,27
  deepeval/annotation/__init__.py,sha256=ZFhUVNNuH_YgQSZJ-m5E9iUb9TkAkEV33a6ouMDZ8EI,111
  deepeval/annotation/annotation.py,sha256=3j3-syeJepAcEj3u3e4T_BeRDzNr7yXGDIoNQGMKpwQ,2298
  deepeval/annotation/api.py,sha256=EYN33ACVzVxsFleRYm60KB4Exvff3rPJKt1VBuuX970,2147
@@ -132,7 +132,7 @@ deepeval/benchmarks/winogrande/template.py,sha256=tDwH8NpNF9x7FbDmQw45XaW1LNqGBV
  deepeval/benchmarks/winogrande/winogrande.py,sha256=_4irJkRPw3c-Ufo-hM4cHpPKUoxozedFQpok9n0csTg,5644
  deepeval/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deepeval/cli/dotenv_handler.py,sha256=7PtVjCNUZKAXsVJQxznsLexad7y8x-gQ195xAxmv4gA,2468
- deepeval/cli/main.py,sha256=60FsOU9OGRX49eSSuJkePf8kArSor3-QbeqAZ2bDWCE,51298
+ deepeval/cli/main.py,sha256=keY6Ik3h2PLmwFMiNUU8mWEtAGsR8mjzWmYedq0k25w,57296
  deepeval/cli/server.py,sha256=cOm9xiYcPYB9GDeFQw9-Iawf9bNfOqftZs7q7mO_P7I,1979
  deepeval/cli/test.py,sha256=kSIFMRTAfVzBJ4OitwvT829-ylV7UzPMP57P2DePS-Q,5482
  deepeval/cli/types.py,sha256=_7KdthstHNc-JKCWrfpDQCf_j8h9PMxh0qJCHmVXJr0,310
@@ -147,7 +147,7 @@ deepeval/config/utils.py,sha256=gSOVv18Tx1R72GucbdQesbZLFL-Y9EzbS4p7qd2w_xE,3799
  deepeval/constants.py,sha256=Qe-es-WDPJndgBspEQXxddDCVanrAu03YWCpXsUkdo0,1368
  deepeval/dataset/__init__.py,sha256=rcum_VjBXu8eisCdr6sl84BgoZUs3x0tYbB2PnPtHGY,212
  deepeval/dataset/api.py,sha256=ZxkEqAF4nZH_Ys_1f5r9N2LFI_vBcAJxt8eJm7Mplpw,831
- deepeval/dataset/dataset.py,sha256=T2rzGGKeCjIkkhXY0ofnWh13W6gjjdjat9uVHCmhGFI,49493
+ deepeval/dataset/dataset.py,sha256=dDWTSPWN8i_mZBOAgZt0r5Id6q6aeDf8jAKxv81mP1o,51113
  deepeval/dataset/golden.py,sha256=T-rTk4Hw1tANx_Iimv977F6Y4QK3s5OIB4PecU5FJDM,2338
  deepeval/dataset/test_run_tracer.py,sha256=5CdpDvhzkEEBRyqWi6egocaxiN6IRS3XfbACxEQZQeM,2544
  deepeval/dataset/types.py,sha256=CWeOIBPK2WdmRUqjFa9gfN-w2da0r8Ilzl3ToDpJQoQ,558
@@ -158,7 +158,7 @@ deepeval/evaluate/api.py,sha256=rkblH0ZFAAdyuF0Ymh7JE1pIJPR9yFuPrn9SQaCEQp4,435
  deepeval/evaluate/compare.py,sha256=tdSJY4E7YJ_zO3dzvpwngZHLiUI2YQcTWJOLI83htsQ,9855
  deepeval/evaluate/configs.py,sha256=QfWjaWNxLsgEe8-5j4PIs5WcSyEckiWt0qdpXSpl57M,928
  deepeval/evaluate/evaluate.py,sha256=NPAJ2iJqJI_RurXKUIC0tft_ozYMIKwZf5iPfmnNhQc,10412
- deepeval/evaluate/execute.py,sha256=45m3w3QSAWVHRNTSqLZcpUI1bA_qRFWIGu292WKTjcA,87953
+ deepeval/evaluate/execute.py,sha256=fJLBl45Vf4rA4Pm7k932TG-0BNIvf90klQyurXb-b_4,88057
  deepeval/evaluate/types.py,sha256=IGZ3Xsj0UecPI3JNeTpJaK1gDvlepokfCmHwtItIW9M,831
  deepeval/evaluate/utils.py,sha256=kkliSGzuICeUsXDtlMMPfN95dUKlqarNhfciSffd4gI,23143
  deepeval/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -308,7 +308,7 @@ deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfuln
  deepeval/metrics/multimodal_metrics/multimodal_faithfulness/schema.py,sha256=b-WtfA7zq4TgQiuqqNEMf7jmohnWBMW4opChHyg49Gc,414
  deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py,sha256=9EWRC-Wiyr_UEMPfpuTcX2tvsjPxSRY4n_lClcsK6vw,8389
  deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py,sha256=gZ7Q4vF12PLGhbHhOUAl9LIFWDOc9-GKhu3ly_LOkQ0,13997
+ deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py,sha256=YR2SMmUwVPe8epth2PWtG6UB4vnInBZrTaeeVyF9VHA,14428
  deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py,sha256=H_9-iA1BXJwbPKrGEZBqxDO_En4sjXI8_xKSNYc-hnk,167
  deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py,sha256=6kIC4vTtRxUBCyafjyWLZg5WhVHxsRy-m2Mv7OGbgV0,5235
  deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py,sha256=UgY46c1mudFoOglbrrJsXnSrdiJGGRFqpDvrAAZWwV0,2189
@@ -395,11 +395,11 @@ deepeval/openai/extractors.py,sha256=q062nlYKuPVwqfLFYCD1yWv7xHF1U_XrYdAp5ve2l_E
  deepeval/openai/patch.py,sha256=tPDqXaBScBJveM9P5xLT_mVwkubw0bOey-efvdjZIfg,7466
  deepeval/openai/utils.py,sha256=-84VZGUsnzRkYAFWc_DGaGuQTDCUItk0VtUTdjtSxg4,2748
  deepeval/openai_agents/__init__.py,sha256=u-e9laod3LyPfLcI5lr7Yhk8ArfWvlpr-D4_idWIt0A,321
- deepeval/openai_agents/agent.py,sha256=PYOhLELRXfGAP_fje70X3Ovm3WjF24mQYWdwrobwcr4,6173
- deepeval/openai_agents/callback_handler.py,sha256=-tOXJ3SMKqH5u41cB_g7FBjaX5qAuqVAaAv7vQtiBVc,3025
+ deepeval/openai_agents/agent.py,sha256=gZcmfqTgrQaJV8g6ChmmdpyArEp6oDIqHSaYPDEd344,6100
+ deepeval/openai_agents/callback_handler.py,sha256=jrV2Uv9FjfU1BQQe6V_ltT3QS8ZcalxMbqzJI2vvJXo,4713
  deepeval/openai_agents/extractors.py,sha256=0jZxwgY1NQ3mMxVWPpLcMpKlbj-aYV7rwuzRzG8hdZs,11529
  deepeval/openai_agents/patch.py,sha256=zSmRV5yOReHC6IylhT93SM1nQpmH3sEWfYcJqa_iM84,3684
- deepeval/openai_agents/runner.py,sha256=pRwe6DX6kpXia6btl4TAWlnXpk88MsQfM8yWkGufyk8,10608
+ deepeval/openai_agents/runner.py,sha256=U8Kh4jHhDIYVkIIxytcGCKRFHdgxxhpATHd9jnbh1Eg,10999
  deepeval/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deepeval/plugins/plugin.py,sha256=_dwsdx4Dg9DbXxK3f7zJY4QWTJQWc7QE1HmIg2Zjjag,1515
  deepeval/progress_context.py,sha256=ZSKpxrE9sdgt9G3REKnVeXAv7GJXHHVGgLynpG1Pudw,3557
@@ -410,7 +410,7 @@ deepeval/prompt/utils.py,sha256=Gk0zj_9BK8MQccs8GmiC8o-YVtkou6ZJEz8kWgW5Mog,1678
  deepeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deepeval/red_teaming/README.md,sha256=BY5rAdpp3-sMMToEKwq0Nsd9ivkGDzPE16DeDb8GY7U,154
  deepeval/scorer/__init__.py,sha256=hTvtoV3a4l0dSBjERm-jX7jveTtKZXK0c9JerQo0T_w,27
- deepeval/scorer/scorer.py,sha256=8kp5wXbIlem-8ucyyA6ew_sIeR77HoypW_s4wKmxGUU,18312
+ deepeval/scorer/scorer.py,sha256=EmXo1wEMMAL2it8WxNJ4cTqZLCH1ad4BY2VewoX6b10,18348
  deepeval/simulator/__init__.py,sha256=wkyevg9nh46rsVnVrBjY3K5bHlkqjwx4TtrTfyjDCO0,96
  deepeval/simulator/conversation_simulator.py,sha256=Ojng2ZoM31p7GVWEkiT44PE926eEzRoVJP5eRb1yrQI,24262
  deepeval/simulator/schema.py,sha256=16X2-m92plP52YTd-dvECt_-6gsz0U4j7Ut3UdI6gKY,252
@@ -461,8 +461,8 @@ deepeval/tracing/tracing.py,sha256=b-0T3W6lAEOEGhODx0e-yIwBkm5V46EDNAWS9lcWkD0,4
  deepeval/tracing/types.py,sha256=l_utWKerNlE5H3mOKpeUJLsvpP3cMyjH7HRANNgTmSQ,5306
  deepeval/tracing/utils.py,sha256=w_kdhuyBCygllnbqLpDdKJqpJo42t3ZMlGhNicV2A8c,6467
  deepeval/utils.py,sha256=r8tV_NYJSi6ib-oQw6cLw3L7ZSe4KIJVJc1ng6-kDX4,17179
- deepeval-3.5.5.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
- deepeval-3.5.5.dist-info/METADATA,sha256=Js_9nOjXPh0YQOokcbPvquIW7lBRQMphrLHTYZ8-pAE,18721
- deepeval-3.5.5.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
- deepeval-3.5.5.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
- deepeval-3.5.5.dist-info/RECORD,,
+ deepeval-3.5.7.dist-info/LICENSE.md,sha256=0ATkuLv6QgsJTBODUHC5Rak_PArA6gv2t7inJzNTP38,11352
+ deepeval-3.5.7.dist-info/METADATA,sha256=NldO1OinDSv_gGUP-kkFk1zpMGXKTceoYMtF92XGbgs,18721
+ deepeval-3.5.7.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
+ deepeval-3.5.7.dist-info/entry_points.txt,sha256=fVr8UphXTfJe9I2rObmUtfU3gkSrYeM0pLy-NbJYg10,94
+ deepeval-3.5.7.dist-info/RECORD,,