deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
@@ -555,7 +555,7 @@ class Synthesizer:
555
555
  include_expected_output=include_expected_output,
556
556
  max_goldens_per_context=max_goldens_per_context,
557
557
  source_files=source_files,
558
- index=index,
558
+ context_index=index,
559
559
  progress=progress,
560
560
  pbar_id=pbar_id,
561
561
  context_scores=_context_scores,
@@ -577,7 +577,7 @@ class Synthesizer:
577
577
  include_expected_output: bool,
578
578
  max_goldens_per_context: int,
579
579
  source_files: Optional[List[str]],
580
- index: int,
580
+ context_index: int,
581
581
  progress: Optional[Progress] = None,
582
582
  pbar_id: Optional[int] = None,
583
583
  context_scores: Optional[List[float]] = None,
@@ -599,7 +599,7 @@ class Synthesizer:
599
599
  # Add pbars
600
600
  pbar_generate_goldens_id = add_pbar(
601
601
  progress,
602
- f"\t⚡ Generating goldens from context #{index}",
602
+ f"\t⚡ Generating goldens from context #{context_index}",
603
603
  total=1 + max_goldens_per_context,
604
604
  )
605
605
  pbar_generate_inputs_id = add_pbar(
@@ -643,7 +643,7 @@ class Synthesizer:
643
643
 
644
644
  # Helper function to process each input in parallel
645
645
  async def process_input(
646
- index: int,
646
+ input_index: int,
647
647
  data: SyntheticData,
648
648
  progress: Optional[Progress] = None,
649
649
  ):
@@ -654,7 +654,7 @@ class Synthesizer:
654
654
  num_evolutions=self.evolution_config.num_evolutions,
655
655
  evolutions=self.evolution_config.evolutions,
656
656
  progress=progress,
657
- pbar_evolve_input_id=pbar_evolve_input_ids[index],
657
+ pbar_evolve_input_id=pbar_evolve_input_ids[input_index],
658
658
  remove_pbar=False,
659
659
  )
660
660
 
@@ -672,7 +672,7 @@ class Synthesizer:
672
672
  )
673
673
  evolved_input = res.input
674
674
  update_pbar(
675
- progress, pbar_evolve_input_ids[index], remove=False
675
+ progress, pbar_evolve_input_ids[input_index], remove=False
676
676
  )
677
677
 
678
678
  # Generate expected output
@@ -685,7 +685,7 @@ class Synthesizer:
685
685
  )
686
686
  expected_output = await self._a_generate(expected_output_prompt)
687
687
  update_pbar(
688
- progress, pbar_evolve_input_ids[index], remove=False
688
+ progress, pbar_evolve_input_ids[input_index], remove=False
689
689
  )
690
690
 
691
691
  # Create Golden
@@ -694,13 +694,14 @@ class Synthesizer:
694
694
  context=context,
695
695
  expected_output=expected_output,
696
696
  source_file=(
697
- source_files[index]
698
- if source_files is not None and index < len(source_files)
697
+ source_files[context_index]
698
+ if source_files is not None
699
+ and context_index < len(source_files)
699
700
  else None
700
701
  ),
701
702
  additional_metadata={
702
703
  "evolutions": evolutions_used,
703
- "synthetic_input_quality": scores[index],
704
+ "synthetic_input_quality": scores[input_index],
704
705
  # "context_quality": (
705
706
  # context_scores[data_index]
706
707
  # if context_scores is not None
@@ -898,6 +899,7 @@ class Synthesizer:
898
899
  update_pbar(progress, pbar_id)
899
900
 
900
901
  # Evolve inputs
902
+ evolved_prompts = []
901
903
  for i, data in enumerate(synthetic_data):
902
904
  pbar_evolve_input_id = add_pbar(
903
905
  progress,
@@ -911,14 +913,16 @@ class Synthesizer:
911
913
  progress=progress,
912
914
  pbar_evolve_input_id=pbar_evolve_input_id,
913
915
  )
916
+ evolved_prompts.append(evolved_prompt)
914
917
  update_pbar(progress, pbar_id)
915
918
 
916
919
  # Synthesize Goldens
917
- golden = Golden(
918
- input=evolved_prompt,
919
- additional_metadata={"evolutions": evolutions_used},
920
- )
921
- goldens.append(golden)
920
+ for evolved_prompt in evolved_prompts:
921
+ golden = Golden(
922
+ input=evolved_prompt,
923
+ additional_metadata={"evolutions": evolutions_used},
924
+ )
925
+ goldens.append(golden)
922
926
 
923
927
  # Wrap up Synthesis
924
928
  self.synthetic_goldens.extend(goldens)
@@ -0,0 +1,131 @@
1
+ from typing import Union, Optional
2
+ import os
3
+
4
+ from deepeval.test_run.api import (
5
+ LLMApiTestCase,
6
+ ConversationalApiTestCase,
7
+ TurnApi,
8
+ TraceApi,
9
+ )
10
+ from deepeval.test_case import (
11
+ LLMTestCase,
12
+ ConversationalTestCase,
13
+ MLLMTestCase,
14
+ Turn,
15
+ )
16
+ from deepeval.constants import PYTEST_RUN_TEST_NAME
17
+
18
+
19
+ def create_api_turn(turn: Turn, index: int) -> TurnApi:
20
+ return TurnApi(
21
+ role=turn.role,
22
+ content=turn.content,
23
+ user_id=turn.user_id,
24
+ retrievalContext=turn.retrieval_context,
25
+ toolsCalled=turn.tools_called,
26
+ additionalMetadata=turn.additional_metadata,
27
+ order=index,
28
+ )
29
+
30
+
31
+ def create_api_test_case(
32
+ test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
33
+ trace: Optional[TraceApi] = None,
34
+ index: Optional[int] = None,
35
+ ) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
36
+ if isinstance(test_case, ConversationalTestCase):
37
+ order = (
38
+ test_case._dataset_rank
39
+ if test_case._dataset_rank is not None
40
+ else index
41
+ )
42
+ if test_case.name:
43
+ name = test_case.name
44
+ else:
45
+ name = os.getenv(
46
+ PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
47
+ )
48
+
49
+ api_test_case = ConversationalApiTestCase(
50
+ name=name,
51
+ success=True,
52
+ metricsData=[],
53
+ runDuration=0,
54
+ evaluationCost=None,
55
+ order=order,
56
+ scenario=test_case.scenario,
57
+ expectedOutcome=test_case.expected_outcome,
58
+ userDescription=test_case.user_description,
59
+ context=test_case.context,
60
+ tags=test_case.tags,
61
+ comments=test_case.comments,
62
+ additionalMetadata=test_case.additional_metadata,
63
+ )
64
+ api_test_case.turns = [
65
+ create_api_turn(
66
+ turn=turn,
67
+ index=index,
68
+ )
69
+ for index, turn in enumerate(test_case.turns)
70
+ ]
71
+
72
+ return api_test_case
73
+ else:
74
+ order = (
75
+ test_case._dataset_rank
76
+ if test_case._dataset_rank is not None
77
+ else index
78
+ )
79
+
80
+ success = True
81
+ if test_case.name is not None:
82
+ name = test_case.name
83
+ else:
84
+ name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
85
+ metrics_data = []
86
+
87
+ if isinstance(test_case, LLMTestCase):
88
+ api_test_case = LLMApiTestCase(
89
+ name=name,
90
+ input=test_case.input,
91
+ actualOutput=test_case.actual_output,
92
+ expectedOutput=test_case.expected_output,
93
+ context=test_case.context,
94
+ retrievalContext=test_case.retrieval_context,
95
+ toolsCalled=test_case.tools_called,
96
+ expectedTools=test_case.expected_tools,
97
+ tokenCost=test_case.token_cost,
98
+ completionTime=test_case.completion_time,
99
+ tags=test_case.tags,
100
+ success=success,
101
+ metricsData=metrics_data,
102
+ runDuration=None,
103
+ evaluationCost=None,
104
+ order=order,
105
+ additionalMetadata=test_case.additional_metadata,
106
+ comments=test_case.comments,
107
+ trace=trace,
108
+ )
109
+ elif isinstance(test_case, MLLMTestCase):
110
+ api_test_case = LLMApiTestCase(
111
+ name=name,
112
+ input="",
113
+ multimodalInput=test_case.input,
114
+ multimodalActualOutput=test_case.actual_output,
115
+ multimodalExpectedOutput=test_case.expected_output,
116
+ multimodalRetrievalContext=test_case.retrieval_context,
117
+ multimodalContext=test_case.context,
118
+ toolsCalled=test_case.tools_called,
119
+ expectedTools=test_case.expected_tools,
120
+ tokenCost=test_case.token_cost,
121
+ completionTime=test_case.completion_time,
122
+ success=success,
123
+ metricsData=metrics_data,
124
+ runDuration=None,
125
+ evaluationCost=None,
126
+ order=order,
127
+ additionalMetadata=test_case.additional_metadata,
128
+ comments=test_case.comments,
129
+ )
130
+ # llm_test_case_lookup_map[instance_id] = api_test_case
131
+ return api_test_case
@@ -122,7 +122,9 @@ class ToolCall(BaseModel):
122
122
 
123
123
  # Handle nested fields like input_parameters
124
124
  if self.input_parameters:
125
- formatted_input = json.dumps(self.input_parameters, indent=4)
125
+ formatted_input = json.dumps(
126
+ self.input_parameters, indent=4, ensure_ascii=False
127
+ )
126
128
  formatted_input = self._indent_nested_field(
127
129
  "input_parameters", formatted_input
128
130
  )
@@ -130,7 +132,9 @@ class ToolCall(BaseModel):
130
132
 
131
133
  # Handle nested fields like output
132
134
  if isinstance(self.output, dict):
133
- formatted_output = json.dumps(self.output, indent=4)
135
+ formatted_output = json.dumps(
136
+ self.output, indent=4, ensure_ascii=False
137
+ )
134
138
  formatted_output = self._indent_nested_field(
135
139
  "output", formatted_output
136
140
  )
@@ -8,6 +8,7 @@ from .test_run import (
8
8
  LLMApiTestCase,
9
9
  ConversationalApiTestCase,
10
10
  TestRunManager,
11
+ PromptData,
11
12
  )
12
13
 
13
14
  from .hooks import on_test_run_end, invoke_test_run_end_hook
@@ -1,13 +1,15 @@
1
- from typing import Union, Dict
2
-
1
+ from typing import Union, Dict, Optional, List
3
2
  from deepeval.test_run import global_test_run_manager
4
3
  from deepeval.prompt import Prompt
5
4
  from deepeval.prompt.api import PromptApi
6
5
  from deepeval.test_run.test_run import TEMP_FILE_PATH
6
+ from deepeval.confident.api import is_confident
7
+ from deepeval.test_run.test_run import PromptData
7
8
 
8
9
 
9
10
  def process_hyperparameters(
10
- hyperparameters,
11
+ hyperparameters: Optional[Dict] = None,
12
+ verbose: bool = True,
11
13
  ) -> Union[Dict[str, Union[str, int, float, PromptApi]], None]:
12
14
  if hyperparameters is None:
13
15
  return None
@@ -16,6 +18,7 @@ def process_hyperparameters(
16
18
  raise TypeError("Hyperparameters must be a dictionary or None")
17
19
 
18
20
  processed_hyperparameters = {}
21
+ prompts_version_id_map = {}
19
22
 
20
23
  for key, value in hyperparameters.items():
21
24
  if not isinstance(key, str):
@@ -30,14 +33,21 @@ def process_hyperparameters(
30
33
  )
31
34
 
32
35
  if isinstance(value, Prompt):
33
- if value._prompt_version_id is not None and value._type is not None:
36
+ prompt_key = f"{value.alias}_{value.version}"
37
+ if value._prompt_version_id is not None and value.type is not None:
34
38
  processed_hyperparameters[key] = PromptApi(
35
39
  id=value._prompt_version_id,
36
- type=value._type,
40
+ type=value.type,
37
41
  )
38
- else:
39
- raise ValueError(
40
- f"Cannot log Prompt where template was not pulled from Confident AI. Please import your prompt on Confident AI to continue."
42
+ elif is_confident():
43
+ if prompt_key not in prompts_version_id_map:
44
+ value.push(_verbose=verbose)
45
+ prompts_version_id_map[prompt_key] = (
46
+ value._prompt_version_id
47
+ )
48
+ processed_hyperparameters[key] = PromptApi(
49
+ id=prompts_version_id_map[prompt_key],
50
+ type=value.type,
41
51
  )
42
52
  else:
43
53
  processed_hyperparameters[key] = str(value)
@@ -64,3 +74,32 @@ def log_hyperparameters(func):
64
74
 
65
75
  # Return the wrapper function to be used as the decorator
66
76
  return wrapper
77
+
78
+
79
+ def process_prompts(
80
+ hyperparameters: Dict[str, Union[str, int, float, Prompt]],
81
+ ) -> List[PromptData]:
82
+ prompts = []
83
+ if not hyperparameters:
84
+ return prompts
85
+ seen_prompts = set()
86
+ prompt_objects = [
87
+ value for value in hyperparameters.values() if isinstance(value, Prompt)
88
+ ]
89
+ for prompt in prompt_objects:
90
+ prompt_version = prompt.version if is_confident() else None
91
+ prompt_key = f"{prompt.alias}_{prompt_version}"
92
+ if prompt_key in seen_prompts:
93
+ continue
94
+ seen_prompts.add(prompt_key)
95
+ prompt_data = PromptData(
96
+ alias=prompt.alias,
97
+ version=prompt_version,
98
+ text_template=prompt.text_template,
99
+ messages_template=prompt.messages_template,
100
+ model_settings=prompt.model_settings,
101
+ output_type=prompt.output_type,
102
+ interpolation_type=prompt.interpolation_type,
103
+ )
104
+ prompts.append(prompt_data)
105
+ return prompts