deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
1
- __version__: str = "3.6.6"
1
+ __version__: str = "3.6.8"
@@ -121,6 +121,7 @@ class EquityMedQA(DeepEvalBaseBenchmark):
121
121
  score = metric.measure(
122
122
  LLMTestCase(input=golden.input, actual_output=prediction),
123
123
  _show_indicator=False,
124
+ _log_metric_to_confident=False,
124
125
  )
125
126
  flipped_score = (
126
127
  1 - metric.score if metric.score in [0, 1] else metric.score
deepeval/cli/main.py CHANGED
@@ -328,6 +328,31 @@ def set_debug(
328
328
  "--trace-flush/--no-trace-flush",
329
329
  help="Enable / disable CONFIDENT_TRACE_FLUSH.",
330
330
  ),
331
+ trace_sample_rate: Optional[float] = typer.Option(
332
+ None,
333
+ "--trace-sample-rate",
334
+ help="Set CONFIDENT_TRACE_SAMPLE_RATE.",
335
+ ),
336
+ metric_logging_verbose: Optional[bool] = typer.Option(
337
+ None,
338
+ "--metric-logging-verbose/--no-metric-logging-verbose",
339
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_VERBOSE.",
340
+ ),
341
+ metric_logging_flush: Optional[bool] = typer.Option(
342
+ None,
343
+ "--metric-logging-flush/--no-metric-logging-flush",
344
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_FLUSH.",
345
+ ),
346
+ metric_logging_sample_rate: Optional[float] = typer.Option(
347
+ None,
348
+ "--metric-logging-sample-rate",
349
+ help="Set CONFIDENT_METRIC_LOGGING_SAMPLE_RATE.",
350
+ ),
351
+ metric_logging_enabled: Optional[bool] = typer.Option(
352
+ None,
353
+ "--metric-logging-enabled/--no-metric-logging-enabled",
354
+ help="Enable / disable CONFIDENT_METRIC_LOGGING_ENABLED.",
355
+ ),
331
356
  # Advanced / potentially surprising
332
357
  error_reporting: Optional[bool] = typer.Option(
333
358
  None,
@@ -387,6 +412,20 @@ def set_debug(
387
412
  settings.CONFIDENT_TRACE_ENVIRONMENT = trace_env
388
413
  if trace_flush is not None:
389
414
  settings.CONFIDENT_TRACE_FLUSH = trace_flush
415
+ if trace_sample_rate is not None:
416
+ settings.CONFIDENT_TRACE_SAMPLE_RATE = trace_sample_rate
417
+
418
+ # Confident metrics
419
+ if metric_logging_verbose is not None:
420
+ settings.CONFIDENT_METRIC_LOGGING_VERBOSE = metric_logging_verbose
421
+ if metric_logging_flush is not None:
422
+ settings.CONFIDENT_METRIC_LOGGING_FLUSH = metric_logging_flush
423
+ if metric_logging_sample_rate is not None:
424
+ settings.CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = (
425
+ metric_logging_sample_rate
426
+ )
427
+ if metric_logging_enabled is not None:
428
+ settings.CONFIDENT_METRIC_LOGGING_ENABLED = metric_logging_enabled
390
429
 
391
430
  # Advanced
392
431
  if error_reporting is not None:
@@ -438,6 +477,8 @@ def unset_debug(
438
477
  settings.LOG_LEVEL = "info"
439
478
  settings.CONFIDENT_TRACE_ENVIRONMENT = "development"
440
479
  settings.CONFIDENT_TRACE_VERBOSE = True
480
+ settings.CONFIDENT_METRIC_LOGGING_VERBOSE = True
481
+ settings.CONFIDENT_METRIC_LOGGING_ENABLED = True
441
482
 
442
483
  # Clear optional toggles/overrides
443
484
  settings.DEEPEVAL_VERBOSE_MODE = None
@@ -449,6 +490,7 @@ def unset_debug(
449
490
  settings.GRPC_TRACE = None
450
491
 
451
492
  settings.CONFIDENT_TRACE_FLUSH = None
493
+ settings.CONFIDENT_METRIC_LOGGING_FLUSH = None
452
494
 
453
495
  settings.ERROR_REPORTING = None
454
496
  settings.IGNORE_DEEPEVAL_ERRORS = None
deepeval/confident/api.py CHANGED
@@ -87,6 +87,7 @@ class Endpoints(Enum):
87
87
  DATASET_ALIAS_QUEUE_ENDPOINT = "/v1/datasets/:alias/queue"
88
88
 
89
89
  TEST_RUN_ENDPOINT = "/v1/test-run"
90
+ METRIC_DATA_ENDPOINT = "/v1/metric-data"
90
91
  TRACES_ENDPOINT = "/v1/traces"
91
92
  ANNOTATIONS_ENDPOINT = "/v1/annotations"
92
93
  PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
@@ -337,10 +337,17 @@ class Settings(BaseSettings):
337
337
  SKIP_DEEPEVAL_MISSING_PARAMS: Optional[bool] = None
338
338
  DEEPEVAL_VERBOSE_MODE: Optional[bool] = None
339
339
  ENABLE_DEEPEVAL_CACHE: Optional[bool] = None
340
+
340
341
  CONFIDENT_TRACE_FLUSH: Optional[bool] = None
341
342
  CONFIDENT_TRACE_ENVIRONMENT: Optional[str] = "development"
342
343
  CONFIDENT_TRACE_VERBOSE: Optional[bool] = True
343
- CONFIDENT_SAMPLE_RATE: Optional[float] = 1.0
344
+ CONFIDENT_TRACE_SAMPLE_RATE: Optional[float] = 1.0
345
+
346
+ CONFIDENT_METRIC_LOGGING_FLUSH: Optional[bool] = None
347
+ CONFIDENT_METRIC_LOGGING_VERBOSE: Optional[bool] = True
348
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE: Optional[float] = 1.0
349
+ CONFIDENT_METRIC_LOGGING_ENABLED: Optional[bool] = True
350
+
344
351
  OTEL_EXPORTER_OTLP_ENDPOINT: Optional[AnyUrl] = None
345
352
 
346
353
  #
@@ -355,6 +362,12 @@ class Settings(BaseSettings):
355
362
  None # per-attempt timeout. Set 0/None to disable
356
363
  )
357
364
 
365
+ #
366
+ # Async Document Pipelines
367
+ #
368
+
369
+ DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING: conint(ge=1) = 2
370
+
358
371
  #
359
372
  # Async Task Configuration
360
373
  #
@@ -484,7 +497,8 @@ class Settings(BaseSettings):
484
497
  "OPENAI_COST_PER_INPUT_TOKEN",
485
498
  "OPENAI_COST_PER_OUTPUT_TOKEN",
486
499
  "TEMPERATURE",
487
- "CONFIDENT_SAMPLE_RATE",
500
+ "CONFIDENT_TRACE_SAMPLE_RATE",
501
+ "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE",
488
502
  mode="before",
489
503
  )
490
504
  @classmethod
@@ -496,13 +510,17 @@ class Settings(BaseSettings):
496
510
  return None
497
511
  return float(v)
498
512
 
499
- @field_validator("CONFIDENT_SAMPLE_RATE")
513
+ @field_validator(
514
+ "CONFIDENT_TRACE_SAMPLE_RATE", "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
515
+ )
500
516
  @classmethod
501
517
  def _validate_sample_rate(cls, v):
502
518
  if v is None:
503
519
  return None
504
520
  if not (0.0 <= float(v) <= 1.0):
505
- raise ValueError("CONFIDENT_SAMPLE_RATE must be between 0 and 1")
521
+ raise ValueError(
522
+ "CONFIDENT_TRACE_SAMPLE_RATE or CONFIDENT_METRIC_LOGGING_SAMPLE_RATE must be between 0 and 1"
523
+ )
506
524
  return float(v)
507
525
 
508
526
  @field_validator("DEEPEVAL_DEFAULT_SAVE", mode="before")
deepeval/constants.py CHANGED
@@ -9,9 +9,16 @@ LOGIN_PROMPT = "\n✨👀 Looking for a place for your LLM test data to live
9
9
 
10
10
  CONFIDENT_TRACE_VERBOSE = "CONFIDENT_TRACE_VERBOSE"
11
11
  CONFIDENT_TRACE_FLUSH = "CONFIDENT_TRACE_FLUSH"
12
- CONFIDENT_SAMPLE_RATE = "CONFIDENT_SAMPLE_RATE"
12
+ CONFIDENT_TRACE_SAMPLE_RATE = "CONFIDENT_TRACE_SAMPLE_RATE"
13
13
  CONFIDENT_TRACE_ENVIRONMENT = "CONFIDENT_TRACE_ENVIRONMENT"
14
14
  CONFIDENT_TRACING_ENABLED = "CONFIDENT_TRACING_ENABLED"
15
+
16
+ CONFIDENT_METRIC_LOGGING_VERBOSE = "CONFIDENT_METRIC_LOGGING_VERBOSE"
17
+ CONFIDENT_METRIC_LOGGING_FLUSH = "CONFIDENT_METRIC_LOGGING_FLUSH"
18
+ CONFIDENT_METRIC_LOGGING_SAMPLE_RATE = "CONFIDENT_METRIC_LOGGING_SAMPLE_RATE"
19
+ CONFIDENT_METRIC_LOGGING_ENABLED = "CONFIDENT_METRIC_LOGGING_ENABLED"
20
+
21
+
15
22
  CONFIDENT_OPEN_BROWSER = "CONFIDENT_OPEN_BROWSER"
16
23
  CONFIDENT_TEST_CASE_BATCH_SIZE = "CONFIDENT_TEST_CASE_BATCH_SIZE"
17
24
 
@@ -49,7 +49,7 @@ from deepeval.utils import (
49
49
  from deepeval.test_run import (
50
50
  global_test_run_manager,
51
51
  )
52
- from deepeval.openai.utils import openai_test_case_pairs
52
+
53
53
  from deepeval.tracing import trace_manager
54
54
  from deepeval.tracing.tracing import EVAL_DUMMY_SPAN_NAME
55
55
 
@@ -1248,16 +1248,7 @@ class EvaluationDataset:
1248
1248
  display_config.file_output_dir,
1249
1249
  )
1250
1250
 
1251
- # update hyperparameters
1252
- test_run = global_test_run_manager.get_test_run()
1253
- if len(openai_test_case_pairs) > 0:
1254
- raw_hyperparameters = openai_test_case_pairs[-1].hyperparameters
1255
- test_run.hyperparameters = process_hyperparameters(
1256
- raw_hyperparameters
1257
- )
1258
-
1259
- # clean up
1260
- openai_test_case_pairs.clear()
1251
+ # save test run
1261
1252
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
1262
1253
 
1263
1254
  # sandwich end trace for OTEL
deepeval/dataset/utils.py CHANGED
@@ -120,7 +120,7 @@ def format_turns(turns: List[Turn]) -> str:
120
120
  }
121
121
  res.append(cur_turn)
122
122
  try:
123
- return json.dumps(res)
123
+ return json.dumps(res, ensure_ascii=False)
124
124
  except Exception as e:
125
125
  raise ValueError(f"Error serializing turns: {e}")
126
126
 
deepeval/errors.py CHANGED
@@ -1,6 +1,24 @@
1
- class MissingTestCaseParamsError(Exception):
1
+ class DeepEvalError(Exception):
2
+ """Base class for framework-originated errors.
3
+ If raised and not handled, it will abort the current operation.
4
+ We may also stringify instances of this class and attach them to traces or spans to surface
5
+ non-fatal diagnostics while allowing the run to continue.
6
+ """
7
+
8
+
9
+ class UserAppError(Exception):
10
+ """Represents exceptions thrown by user LLM apps/tools.
11
+ We record these on traces or spans and keep the overall evaluation run alive.
12
+ """
13
+
14
+
15
+ class MissingTestCaseParamsError(DeepEvalError):
16
+ """Required test case fields are missing."""
17
+
2
18
  pass
3
19
 
4
20
 
5
- class MismatchedTestCaseInputsError(Exception):
21
+ class MismatchedTestCaseInputsError(DeepEvalError):
22
+ """Inputs provided to a metric or test case are inconsistent or invalid."""
23
+
6
24
  pass
@@ -28,7 +28,10 @@ from deepeval.evaluate.utils import (
28
28
  from deepeval.dataset import Golden
29
29
  from deepeval.prompt import Prompt
30
30
  from deepeval.test_case.utils import check_valid_test_cases_type
31
- from deepeval.test_run.hyperparameters import process_hyperparameters
31
+ from deepeval.test_run.hyperparameters import (
32
+ process_hyperparameters,
33
+ process_prompts,
34
+ )
32
35
  from deepeval.test_run.test_run import TEMP_FILE_PATH
33
36
  from deepeval.utils import (
34
37
  get_or_create_event_loop,
@@ -267,6 +270,7 @@ def evaluate(
267
270
 
268
271
  test_run = global_test_run_manager.get_test_run()
269
272
  test_run.hyperparameters = process_hyperparameters(hyperparameters)
273
+ test_run.prompts = process_prompts(hyperparameters)
270
274
  global_test_run_manager.save_test_run(TEMP_FILE_PATH)
271
275
  res = global_test_run_manager.wrap_up_test_run(
272
276
  run_duration, display_table=False