deepeval 3.7.2__py3-none-any.whl → 3.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/human_eval/human_eval.py +2 -1
  3. deepeval/cli/test.py +1 -1
  4. deepeval/config/settings.py +102 -13
  5. deepeval/dataset/dataset.py +35 -11
  6. deepeval/dataset/utils.py +2 -0
  7. deepeval/evaluate/configs.py +1 -1
  8. deepeval/evaluate/execute.py +4 -1
  9. deepeval/metrics/answer_relevancy/template.py +4 -4
  10. deepeval/metrics/argument_correctness/template.py +2 -2
  11. deepeval/metrics/bias/template.py +3 -3
  12. deepeval/metrics/contextual_precision/template.py +6 -6
  13. deepeval/metrics/contextual_recall/template.py +2 -2
  14. deepeval/metrics/contextual_relevancy/template.py +3 -3
  15. deepeval/metrics/conversation_completeness/template.py +2 -2
  16. deepeval/metrics/conversational_dag/templates.py +4 -4
  17. deepeval/metrics/conversational_g_eval/template.py +4 -3
  18. deepeval/metrics/dag/templates.py +4 -4
  19. deepeval/metrics/faithfulness/template.py +4 -4
  20. deepeval/metrics/hallucination/template.py +4 -4
  21. deepeval/metrics/misuse/template.py +2 -2
  22. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +7 -7
  23. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +6 -6
  24. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +2 -2
  25. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +3 -3
  26. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +9 -9
  27. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +4 -4
  28. deepeval/metrics/non_advice/template.py +2 -2
  29. deepeval/metrics/pii_leakage/template.py +2 -2
  30. deepeval/metrics/prompt_alignment/template.py +4 -4
  31. deepeval/metrics/role_violation/template.py +2 -2
  32. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  33. deepeval/metrics/toxicity/template.py +4 -4
  34. deepeval/metrics/turn_relevancy/template.py +2 -2
  35. deepeval/metrics/utils.py +3 -0
  36. deepeval/models/__init__.py +2 -0
  37. deepeval/models/embedding_models/azure_embedding_model.py +28 -15
  38. deepeval/models/embedding_models/local_embedding_model.py +23 -10
  39. deepeval/models/embedding_models/ollama_embedding_model.py +8 -6
  40. deepeval/models/embedding_models/openai_embedding_model.py +18 -2
  41. deepeval/models/llms/anthropic_model.py +17 -5
  42. deepeval/models/llms/azure_model.py +30 -18
  43. deepeval/models/llms/deepseek_model.py +22 -12
  44. deepeval/models/llms/gemini_model.py +120 -87
  45. deepeval/models/llms/grok_model.py +23 -16
  46. deepeval/models/llms/kimi_model.py +23 -12
  47. deepeval/models/llms/litellm_model.py +63 -25
  48. deepeval/models/llms/local_model.py +26 -18
  49. deepeval/models/llms/ollama_model.py +17 -7
  50. deepeval/models/llms/openai_model.py +22 -17
  51. deepeval/models/llms/portkey_model.py +132 -0
  52. deepeval/models/mlllms/__init__.py +1 -0
  53. deepeval/models/mlllms/azure_model.py +343 -0
  54. deepeval/models/mlllms/gemini_model.py +102 -73
  55. deepeval/models/mlllms/ollama_model.py +40 -9
  56. deepeval/models/mlllms/openai_model.py +65 -14
  57. deepeval/models/utils.py +48 -3
  58. deepeval/optimization/__init__.py +13 -0
  59. deepeval/optimization/adapters/__init__.py +2 -0
  60. deepeval/optimization/adapters/deepeval_scoring_adapter.py +588 -0
  61. deepeval/optimization/aggregates.py +14 -0
  62. deepeval/optimization/configs.py +34 -0
  63. deepeval/optimization/copro/configs.py +31 -0
  64. deepeval/optimization/copro/loop.py +837 -0
  65. deepeval/optimization/gepa/__init__.py +7 -0
  66. deepeval/optimization/gepa/configs.py +115 -0
  67. deepeval/optimization/gepa/loop.py +677 -0
  68. deepeval/optimization/miprov2/configs.py +134 -0
  69. deepeval/optimization/miprov2/loop.py +785 -0
  70. deepeval/optimization/mutations/__init__.py +0 -0
  71. deepeval/optimization/mutations/prompt_rewriter.py +458 -0
  72. deepeval/optimization/policies/__init__.py +16 -0
  73. deepeval/optimization/policies/selection.py +166 -0
  74. deepeval/optimization/policies/tie_breaker.py +67 -0
  75. deepeval/optimization/prompt_optimizer.py +462 -0
  76. deepeval/optimization/simba/__init__.py +0 -0
  77. deepeval/optimization/simba/configs.py +33 -0
  78. deepeval/optimization/simba/loop.py +983 -0
  79. deepeval/optimization/simba/types.py +15 -0
  80. deepeval/optimization/types.py +361 -0
  81. deepeval/optimization/utils.py +598 -0
  82. deepeval/prompt/prompt.py +10 -5
  83. deepeval/test_run/cache.py +2 -0
  84. deepeval/test_run/test_run.py +6 -1
  85. deepeval/tracing/context.py +3 -0
  86. deepeval/tracing/tracing.py +22 -11
  87. deepeval/utils.py +24 -0
  88. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/METADATA +1 -1
  89. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/RECORD +92 -66
  90. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/entry_points.txt +1 -1
  91. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/LICENSE.md +0 -0
  92. {deepeval-3.7.2.dist-info → deepeval-3.7.4.dist-info}/WHEEL +0 -0
deepeval/_version.py CHANGED
@@ -1 +1 @@
- __version__: str = "3.7.2"
+ __version__: str = "3.7.4"
deepeval/benchmarks/human_eval/human_eval.py CHANGED
@@ -92,7 +92,7 @@ class HumanEval(DeepEvalBaseBenchmark):
  self.predictions: Optional[pd.DataFrame] = None
  self.task_scores: Optional[pd.DataFrame] = None
  self.overall_score: Optional[float] = None
- self.verbose_mode: bool = (False,)
+ self.verbose_mode: bool = verbose_mode

  def evaluate(
  self, model: DeepEvalBaseLLM, *args, k: int = 1, **kwargs
@@ -123,6 +123,7 @@ class HumanEval(DeepEvalBaseBenchmark):
  task.value,
  golden.input,
  prediction,
+ task_correct,
  golden.expected_output,
  score,
  )
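Note on the first hunk: verbose_mode was previously assigned the tuple (False,), which is always truthy; it now takes the constructor argument. A minimal caller-side sketch, assuming verbose_mode is a HumanEval constructor parameter (implied by the fixed line, not shown in this diff):

    from deepeval.benchmarks import HumanEval

    # With the fix, verbose_mode=False actually disables verbose output;
    # before, self.verbose_mode held the tuple (False,), which is truthy.
    benchmark = HumanEval(verbose_mode=False)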
deepeval/cli/test.py CHANGED
@@ -160,7 +160,7 @@ def run(
  pytest_args.extend(["--identifier", identifier])

  # Add the deepeval plugin file to pytest arguments
- pytest_args.extend(["-p", "plugins"])
+ pytest_args.extend(["-p", "deepeval"])
  # Append the extra arguments collected by allow_extra_args=True
  # Pytest will raise its own error if the arguments are invalid (error:
  if ctx.args:
deepeval/config/settings.py CHANGED
@@ -49,6 +49,8 @@ _DEPRECATED_TO_OVERRIDE = {
  "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS": "DEEPEVAL_PER_ATTEMPT_TIMEOUT_SECONDS_OVERRIDE",
  "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS": "DEEPEVAL_TASK_GATHER_BUFFER_SECONDS_OVERRIDE",
  }
+ # Track which secrets we've warned about when loading from the legacy keyfile
+ _LEGACY_KEYFILE_SECRET_WARNED: set[str] = set()


  def _find_legacy_enum(env_key: str):
@@ -88,6 +90,82 @@ _is_secret_key(settings: "Settings", env_key: str) -> bool:
  return False


+ def _merge_legacy_keyfile_into_env() -> None:
+ """
+ Backwards compatibility: merge values from the legacy .deepeval/.deepeval
+ JSON keystore into os.environ for known Settings fields, without
+ overwriting existing process env vars.
+
+ This runs before we compute the Settings env fingerprint so that Pydantic
+ can see these values on first construction.
+
+ Precedence: process env -> dotenv -> legacy json
+ """
+ # if somebody really wants to skip this behavior
+ if parse_bool(os.getenv("DEEPEVAL_DISABLE_LEGACY_KEYFILE"), default=False):
+ return
+
+ from deepeval.constants import HIDDEN_DIR, KEY_FILE
+ from deepeval.key_handler import (
+ KeyValues,
+ ModelKeyValues,
+ EmbeddingKeyValues,
+ SECRET_KEYS,
+ )
+
+ key_path = Path(HIDDEN_DIR) / KEY_FILE
+
+ try:
+ with key_path.open("r", encoding="utf-8") as f:
+ try:
+ data = json.load(f)
+ except json.JSONDecodeError:
+ # Corrupted file -> ignore, same as KeyFileHandler
+ return
+ except FileNotFoundError:
+ # No legacy store -> nothing to merge
+ return
+
+ if not isinstance(data, dict):
+ return
+
+ # Map JSON keys (enum .value) -> env keys (enum .name)
+ mapping: Dict[str, str] = {}
+ for enum in (KeyValues, ModelKeyValues, EmbeddingKeyValues):
+ for member in enum:
+ mapping[member.value] = member.name
+
+ for json_key, raw in data.items():
+ env_key = mapping.get(json_key)
+ if not env_key:
+ continue
+
+ # Process env always wins
+ if env_key in os.environ:
+ continue
+ if raw is None:
+ continue
+
+ # Mirror the legacy warning semantics for secrets, but only once per key
+ if (
+ json_key in SECRET_KEYS
+ and json_key not in _LEGACY_KEYFILE_SECRET_WARNED
+ ):
+ logger.warning(
+ "Reading secret '%s' from legacy %s/%s. "
+ "Persisting API keys in plaintext is deprecated. "
+ "Move this to your environment (.env / .env.local). "
+ "This fallback will be removed in a future release.",
+ json_key,
+ HIDDEN_DIR,
+ KEY_FILE,
+ )
+ _LEGACY_KEYFILE_SECRET_WARNED.add(json_key)
+
+ # Let Settings validators coerce types; we just inject the raw string
+ os.environ[env_key] = str(raw)
+
+
  def _read_env_file(path: Path) -> Dict[str, str]:
  if not path.exists():
  return {}
@@ -258,6 +336,7 @@ class Settings(BaseSettings):
  GOOGLE_GENAI_USE_VERTEXAI: Optional[bool] = None
  GOOGLE_CLOUD_PROJECT: Optional[str] = None
  GOOGLE_CLOUD_LOCATION: Optional[str] = None
+ GOOGLE_SERVICE_ACCOUNT_KEY: Optional[str] = None
  # Grok
  USE_GROK_MODEL: Optional[bool] = None
  GROK_API_KEY: Optional[SecretStr] = None
@@ -291,6 +370,12 @@ class Settings(BaseSettings):
  OPENAI_MODEL_NAME: Optional[str] = None
  OPENAI_COST_PER_INPUT_TOKEN: Optional[float] = None
  OPENAI_COST_PER_OUTPUT_TOKEN: Optional[float] = None
+ # PortKey
+ USE_PORTKEY_MODEL: Optional[bool] = None
+ PORTKEY_API_KEY: Optional[SecretStr] = None
+ PORTKEY_MODEL_NAME: Optional[str] = None
+ PORTKEY_BASE_URL: Optional[AnyUrl] = None
+ PORTKEY_PROVIDER_NAME: Optional[str] = None
  # Vertex AI
  VERTEX_AI_MODEL_NAME: Optional[str] = None
  # VLLM
@@ -516,29 +601,30 @@
  "CONFIDENT_OPEN_BROWSER",
  "CONFIDENT_TRACE_FLUSH",
  "CONFIDENT_TRACE_VERBOSE",
+ "CUDA_LAUNCH_BLOCKING",
+ "DEEPEVAL_VERBOSE_MODE",
+ "DEEPEVAL_GRPC_LOGGING",
+ "DEEPEVAL_DISABLE_DOTENV",
+ "DEEPEVAL_TELEMETRY_OPT_OUT",
+ "DEEPEVAL_UPDATE_WARNING_OPT_IN",
+ "ENABLE_DEEPEVAL_CACHE",
+ "ERROR_REPORTING",
+ "GOOGLE_GENAI_USE_VERTEXAI",
+ "IGNORE_DEEPEVAL_ERRORS",
+ "SKIP_DEEPEVAL_MISSING_PARAMS",
+ "TOKENIZERS_PARALLELISM",
+ "TRANSFORMERS_NO_ADVISORY_WARNINGS",
  "USE_OPENAI_MODEL",
  "USE_AZURE_OPENAI",
  "USE_LOCAL_MODEL",
  "USE_GEMINI_MODEL",
- "GOOGLE_GENAI_USE_VERTEXAI",
  "USE_MOONSHOT_MODEL",
  "USE_GROK_MODEL",
  "USE_DEEPSEEK_MODEL",
  "USE_LITELLM",
  "USE_AZURE_OPENAI_EMBEDDING",
  "USE_LOCAL_EMBEDDINGS",
- "DEEPEVAL_GRPC_LOGGING",
- "DEEPEVAL_DISABLE_DOTENV",
- "DEEPEVAL_TELEMETRY_OPT_OUT",
- "DEEPEVAL_UPDATE_WARNING_OPT_IN",
- "TOKENIZERS_PARALLELISM",
- "TRANSFORMERS_NO_ADVISORY_WARNINGS",
- "CUDA_LAUNCH_BLOCKING",
- "ERROR_REPORTING",
- "IGNORE_DEEPEVAL_ERRORS",
- "SKIP_DEEPEVAL_MISSING_PARAMS",
- "DEEPEVAL_VERBOSE_MODE",
- "ENABLE_DEEPEVAL_CACHE",
+ "USE_PORTKEY_MODEL",
  mode="before",
  )
  @classmethod
@@ -1008,6 +1094,9 @@ _settings_lock = threading.RLock()


  def _calc_env_fingerprint() -> str:
+ # Pull legacy .deepeval JSON-based settings into the process env before hashing
+ _merge_legacy_keyfile_into_env()
+
  env = os.environ.copy()
  # must hash in a stable order.
  keys = sorted(
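Note on the settings hunks: 3.7.4 adds PortKey-backed settings and merges the legacy .deepeval/.deepeval JSON keystore into the process environment before the env fingerprint is computed, with precedence process env > dotenv > legacy JSON. A minimal sketch of driving the new fields from the environment; the variable names are taken from the diff, the values are placeholders, and accepted boolean spellings depend on the Settings "before" validators:

    import os

    # New PortKey settings introduced in this release (values are illustrative).
    os.environ["USE_PORTKEY_MODEL"] = "1"
    os.environ["PORTKEY_API_KEY"] = "<your-portkey-api-key>"
    os.environ["PORTKEY_MODEL_NAME"] = "<model-name>"
    os.environ["PORTKEY_BASE_URL"] = "https://api.portkey.ai/v1"  # or your gateway URL
    os.environ["PORTKEY_PROVIDER_NAME"] = "<provider>"

    # Opt out of the legacy keyfile fallback entirely; values already present
    # in os.environ always win over the legacy JSON keystore.
    os.environ["DEEPEVAL_DISABLE_LEGACY_KEYFILE"] = "1"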
deepeval/dataset/dataset.py CHANGED
@@ -189,17 +189,35 @@ class EvaluationDataset:
  test_case._dataset_alias = self._alias
  test_case._dataset_id = self._id
  if isinstance(test_case, LLMTestCase):
+ if self._conversational_goldens or self._conversational_test_cases:
+ raise TypeError(
+ "You cannot add 'LLMTestCase' to a multi-turn dataset."
+ )
  test_case._dataset_rank = len(self._llm_test_cases)
  self._llm_test_cases.append(test_case)
  elif isinstance(test_case, ConversationalTestCase):
+ if self._goldens or self._llm_test_cases:
+ raise TypeError(
+ "You cannot add 'ConversationalTestCase' to a single-turn dataset."
+ )
+ self._multi_turn = True
  test_case._dataset_rank = len(self._conversational_test_cases)
  self._conversational_test_cases.append(test_case)

  def add_golden(self, golden: Union[Golden, ConversationalGolden]):
- if self._multi_turn:
- self._add_conversational_golden(golden)
- else:
+ if isinstance(golden, Golden):
+ if self._conversational_goldens or self._conversational_test_cases:
+ raise TypeError(
+ "You cannot add 'Golden' to a multi-turn dataset."
+ )
  self._add_golden(golden)
+ else:
+ if self._goldens or self._llm_test_cases:
+ raise TypeError(
+ "You cannot add 'ConversationalGolden' to a single-turn dataset."
+ )
+ self._multi_turn = True
+ self._add_conversational_golden(golden)

  def _add_golden(self, golden: Union[Golden, ConversationalGolden]):
  if isinstance(golden, Golden):
@@ -224,16 +242,16 @@ class EvaluationDataset:
  file_path: str,
  input_col_name: str,
  actual_output_col_name: str,
- expected_output_col_name: Optional[str] = None,
- context_col_name: Optional[str] = None,
+ expected_output_col_name: Optional[str] = "expected_output",
+ context_col_name: Optional[str] = "context",
  context_col_delimiter: str = ";",
- retrieval_context_col_name: Optional[str] = None,
+ retrieval_context_col_name: Optional[str] = "retrieval_context",
  retrieval_context_col_delimiter: str = ";",
- tools_called_col_name: Optional[str] = None,
+ tools_called_col_name: Optional[str] = "tools_called",
  tools_called_col_delimiter: str = ";",
- expected_tools_col_name: Optional[str] = None,
+ expected_tools_col_name: Optional[str] = "expected_tools",
  expected_tools_col_delimiter: str = ";",
- additional_metadata_col_name: Optional[str] = None,
+ additional_metadata_col_name: Optional[str] = "additional_metadata",
  ):
  """
  Load test cases from a CSV file.
@@ -379,6 +397,7 @@ class EvaluationDataset:
  retrieval_context_key_name: Optional[str] = None,
  tools_called_key_name: Optional[str] = None,
  expected_tools_key_name: Optional[str] = None,
+ addtional_metadata_key_name: Optional[str] = None,
  encoding_type: str = "utf-8",
  ):
  """
@@ -431,6 +450,7 @@ class EvaluationDataset:
  tools_called = [ToolCall(**tool) for tool in tools_called_data]
  expected_tools_data = json_obj.get(expected_tools_key_name, [])
  expected_tools = [ToolCall(**tool) for tool in expected_tools_data]
+ # additional_metadata = json_obj.get(addtional_metadata_key_name)

  self.add_test_case(
  LLMTestCase(
@@ -441,6 +461,7 @@ class EvaluationDataset:
  retrieval_context=retrieval_context,
  tools_called=tools_called,
  expected_tools=expected_tools,
+ # additional_metadata=additional_metadata,
  )
  )

@@ -460,8 +481,8 @@ class EvaluationDataset:
  expected_tools_col_delimiter: str = ";",
  comments_key_name: str = "comments",
  name_key_name: str = "name",
- source_file_col_name: Optional[str] = None,
- additional_metadata_col_name: Optional[str] = None,
+ source_file_col_name: Optional[str] = "source_file",
+ additional_metadata_col_name: Optional[str] = "additional_metadata",
  scenario_col_name: Optional[str] = "scenario",
  turns_col_name: Optional[str] = "turns",
  expected_outcome_col_name: Optional[str] = "expected_outcome",
@@ -587,6 +608,7 @@ class EvaluationDataset:
  context=context,
  comments=comments,
  name=name,
+ additional_metadata=additional_metadata,
  )
  )
  else:
@@ -645,6 +667,7 @@ class EvaluationDataset:
  comments = json_obj.get(comments_key_name)
  name = json_obj.get(name_key_name)
  parsed_turns = parse_turns(turns) if turns else []
+ additional_metadata = json_obj.get(additional_metadata_key_name)

  self._multi_turn = True
  self.goldens.append(
@@ -656,6 +679,7 @@ class EvaluationDataset:
  context=context,
  comments=comments,
  name=name,
+ additional_metadata=additional_metadata,
  )
  )
  else:
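Note on the dataset hunks: add_test_case and add_golden now raise TypeError when single-turn and multi-turn items are mixed, instead of routing on the dataset's _multi_turn flag. A minimal sketch of the new behavior; the import paths follow deepeval's public API and the error message is taken verbatim from the diff:

    from deepeval.dataset import EvaluationDataset, Golden
    from deepeval.test_case import ConversationalTestCase, Turn

    dataset = EvaluationDataset()
    dataset.add_golden(Golden(input="What is the capital of France?"))

    # Adding a multi-turn test case to a dataset that already holds
    # single-turn goldens now raises instead of silently mixing the two.
    try:
        dataset.add_test_case(
            ConversationalTestCase(turns=[Turn(role="user", content="Hi")])
        )
    except TypeError as exc:
        print(exc)  # You cannot add 'ConversationalTestCase' to a single-turn dataset.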
deepeval/dataset/utils.py CHANGED
@@ -24,6 +24,7 @@ def convert_test_cases_to_goldens(
  "retrieval_context": test_case.retrieval_context,
  "tools_called": test_case.tools_called,
  "expected_tools": test_case.expected_tools,
+ "additional_metadata": test_case.additional_metadata,
  }
  goldens.append(Golden(**golden))
  return goldens
@@ -70,6 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
  "expected_outcome": test_case.expected_outcome,
  "user_description": test_case.user_description,
  "context": test_case.context,
+ "additional_metadata": test_case.additional_metadata,
  }
  goldens.append(ConversationalGolden(**golden))
  return goldens
deepeval/evaluate/configs.py CHANGED
@@ -7,7 +7,7 @@ from deepeval.test_run.test_run import TestRunResultDisplay
  @dataclass
  class AsyncConfig:
  run_async: bool = True
- throttle_value: int = 0
+ throttle_value: float = 0
  max_concurrent: int = 20

  def __post_init__(self):
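Note on the AsyncConfig hunk: throttle_value is now typed float, so sub-second throttling between async test-case launches is representable. A small sketch using the dataclass from the file above; passing it to evaluate() via async_config= follows existing deepeval usage and is not part of this diff:

    from deepeval.evaluate.configs import AsyncConfig

    # throttle_value now accepts fractions of a second between test-case launches.
    async_config = AsyncConfig(run_async=True, throttle_value=0.5, max_concurrent=10)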
deepeval/evaluate/execute.py CHANGED
@@ -718,6 +718,8 @@ async def a_execute_test_cases(
  "Gather timed out after %.1fs. Some metrics may be marked as timed out.",
  _gather_timeout(),
  )
+ if not error_config.ignore_errors:
+ raise

  else:
  for test_case in test_cases:
@@ -803,7 +805,8 @@ async def a_execute_test_cases(
  if not t.done():
  t.cancel()
  await asyncio.gather(*tasks, return_exceptions=True)
- raise
+ if not error_config.ignore_errors:
+ raise

  return test_results
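Note on the execute.py hunks: the re-raise after a gather timeout or task cancellation is now gated on error_config.ignore_errors, so ignoring errors also suppresses these late failures and returns whatever results were collected. A sketch of the flag being consulted; ErrorConfig living in deepeval.evaluate.configs next to AsyncConfig is an assumption from existing deepeval usage, not shown in this diff:

    from deepeval.evaluate.configs import ErrorConfig

    # With ignore_errors=True, the async execution path swallows the gather
    # timeout / cancellation instead of re-raising it.
    error_config = ErrorConfig(ignore_errors=True)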
 
@@ -49,12 +49,12 @@ Expected JSON format:
  "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": <explanation_for_irrelevance>
+ "reason": <explanation_for_irrelevance>,
+ "verdict": "no"
  }},
  {{
- "verdict": "idk",
- "reason": <explanation_for_ambiguity>
+ "reason": <explanation_for_ambiguity>,
+ "verdict": "idk"
  }}
  ]
  }}
deepeval/metrics/argument_correctness/template.py CHANGED
@@ -70,8 +70,8 @@ class ArgumentCorrectnessTemplate:
  "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023."
+ "reason": "Recommending romantic Parisian comedies does not help find the highest temperature in 2023.",
+ "verdict": "no"
  }}
  ]
  }}
@@ -64,15 +64,15 @@ Example JSON:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement."
+ "reason": "The opinion "Government meddling in healthcare bloats costs and quashes innovation" reveals a political bias, emphasizing negative views on government involvement.",
+ "verdict": "yes"
  }},
  {{
  "verdict": "no"
  }},
  {{
  "verdict": "no"
- }},
+ }}
  ]
  }}

@@ -19,16 +19,16 @@ Example:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'"
+ "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
+ "verdict": "yes"
  }},
  {{
- "verdict": "yes",
- "reason": "The text verifies that the prize was indeed won in 1968."
+ "reason": "The text verifies that the prize was indeed won in 1968.",
+ "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize."
+ "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
+ "verdict": "no"
  }}
  ]
  }}
@@ -55,8 +55,8 @@ IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' k
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "..."
+ "reason": "...",
+ "verdict": "yes"
  }},
  ...
  ]
@@ -55,13 +55,13 @@ Example:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
  "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
+ "verdict": "yes"
  }},
  {{
- "verdict": "no",
  "statement": "There was a cat.",
- "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
+ "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
+ "verdict": "no"
  }}
  ]
  }}
@@ -86,8 +86,8 @@ User wants to tell the assistant something.

  Example JSON:
  {{
- "verdict": "no",
- "reason": "The user wanted to tell the assistant something but the LLM not only refused to answer but replied 'Oh ok, in that case should you need anything just let me know!', which is completely irrelevant and doesn't satisfy the user at all. "
+ "reason": "The user wanted to tell the assistant something but the LLM not only refused to answer but replied 'Oh ok, in that case should you need anything just let me know!', which is completely irrelevant and doesn't satisfy the user at all.",
+ "verdict": "no"
  }}
  ===== END OF EXAMPLE ======

deepeval/metrics/conversational_dag/templates.py CHANGED
@@ -77,8 +77,8 @@ class ConversationalBinaryJudgementTemplate:

  Example:
  {{
- "verdict": true,
- "reason": "The assistant provided a clear and direct answer in response to every user query."
+ "reason": "The assistant provided a clear and direct answer in response to every user query.",
+ "verdict": true
  }}
  **
  JSON:
@@ -108,8 +108,8 @@ class ConversationalNonBinaryJudgementTemplate:

  Example:
  {{
- "verdict": "{options[1]}",
- "reason": "The assistant partially addressed the user’s issue but missed clarifying their follow-up question."
+ "reason": "The assistant partially addressed the user's issue but missed clarifying their follow-up question.",
+ "verdict": "{options[1]}"
  }}
  **
  JSON:
@@ -70,7 +70,8 @@ JSON:
  ---
  Example JSON:
  {{
- "score": 0,
- "reason": "Your concise and informative reason here."
+ "reason": "Your concise and informative reason here.",
+ "score": 0
  }}
- """
+
+ JSON:"""
deepeval/metrics/dag/templates.py CHANGED
@@ -63,8 +63,8 @@ class BinaryJudgementTemplate:
  IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
  Example JSON:
  {{
- "verdict": True,
- "reason": "..."
+ "reason": "...",
+ "verdict": True
  }}
  **

@@ -85,8 +85,8 @@ class NonBinaryJudgementTemplate:
  IMPORTANT: Please make sure to only return a json with two keys: 'verdict' {options} and 'reason' providing the reason.
  Example JSON:
  {{
- "verdict": {options},
- "reason": "..."
+ "reason": "...",
+ "verdict": {options}
  }}
  **

@@ -83,12 +83,12 @@ Expected JSON format:
  "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": <explanation_for_contradiction>
+ "reason": <explanation_for_contradiction>,
+ "verdict": "no"
  }},
  {{
- "verdict": "idk",
- "reason": <explanation_for_uncertainty>
+ "reason": <explanation_for_uncertainty>,
+ "verdict": "idk"
  }}
  ]
  }}
@@ -17,12 +17,12 @@ Example:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
+ "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
+ "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
+ "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969.",
+ "verdict": "no"
  }}
  ]
  }}
@@ -40,8 +40,8 @@ Example JSON:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "This request falls outside the {domain} domain and should be handled by a different specialist."
+ "reason": "This request falls outside the {domain} domain and should be handled by a different specialist.",
+ "verdict": "yes"
  }},
  {{
  "verdict": "no"
deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py CHANGED
@@ -50,19 +50,19 @@ class MultimodalAnswerRelevancyTemplate:
  {{
  "verdicts": [
  {{
- "verdict": "no",
- "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake."
+ "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
+ "verdict": "no"
  }},
  {{
- "verdict": "idk",
- "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant."
+ "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
+ "verdict": "idk"
  }},
  {{
- "verdict": "idk",
- "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant."
+ "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
+ "verdict": "idk"
  }},
  {{
- "verdict": "yes",
+ "verdict": "yes"
  }}
  ]
  }}
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py CHANGED
@@ -27,16 +27,16 @@ class MultiModalContextualPrecisionTemplate:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'"
+ "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
+ "verdict": "yes"
  }},
  {{
- "verdict": "yes",
- "reason": "The text verifies that the prize was indeed won in 1968."
+ "reason": "The text verifies that the prize was indeed won in 1968.",
+ "verdict": "yes"
  }},
  {{
- "verdict": "no",
- "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize."
+ "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
+ "verdict": "no"
  }}
  ]
  }}
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py CHANGED
@@ -66,8 +66,8 @@ class MultimodalContextualRecallTemplate:
  {{
  "verdicts": [
  {{
- "verdict": "yes",
- "reason": "..."
+ "reason": "...",
+ "verdict": "yes"
  }},
  ...
  ]