ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry, and is provided for informational purposes only.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (60)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +58 -17
  8. wxo_agentic_evaluation/inference_backend.py +32 -17
  9. wxo_agentic_evaluation/llm_user.py +2 -1
  10. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  11. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  12. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  13. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  14. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  15. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  16. wxo_agentic_evaluation/quick_eval.py +342 -0
  17. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  18. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  19. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  20. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  21. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  22. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  40. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  41. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  42. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  46. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  47. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  48. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  49. wxo_agentic_evaluation/service_instance.py +2 -2
  50. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  51. wxo_agentic_evaluation/tool_planner.py +3 -1
  52. wxo_agentic_evaluation/type.py +33 -2
  53. wxo_agentic_evaluation/utils/__init__.py +0 -1
  54. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  55. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  56. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  57. wxo_agentic_evaluation/utils/utils.py +167 -5
  58. ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
  59. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,188 @@
+ import json
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from pydantic import BaseModel
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField
+ from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
+ from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import MetricPrompt
+ from wxo_agentic_evaluation.referenceless_eval.prompt.runner import (
+     AsyncGen,
+     Prompt,
+     PromptAndSchema,
+     PromptResult,
+     PromptRunner,
+     SyncGen,
+ )
+
+
+ class MetricRunResult(BaseModel):
+     """
+     Structured result for a single metric invocation.
+     """
+
+     metric_name: str
+     jsonschema: Dict[str, Any]
+     prompt: Prompt
+     raw_response: Any
+     numeric_thresholds_checks: Dict[str, bool]
+     error: Optional[str]
+     is_important: bool
+     importance_reason: Optional[str]
+     is_correct: bool
+     correctness_reason: Optional[str]
+     is_issue: bool
+
+
+ class MetricRunner:
+     """
+     Orchestrates running multiple metrics via LLM calls.
+     """
+
+     def __init__(
+         self, entries: Optional[List[Tuple[MetricPrompt, Dict[str, Any]]]] = None
+     ) -> None:
+         """
+         Args:
+             entries: Optional list of (MetricPrompt, user_kwargs) pairs.
+         """
+         self.entries: List[Dict[str, Any]] = []
+         if entries:
+             for mp, kw in entries:
+                 self.add(mp, kw)
+
+     def add(self, metric_prompt: MetricPrompt, user_kwargs: Dict[str, Any]) -> None:
+         """
+         Add a metric to run.
+
+         Args:
+             metric_prompt: MetricPrompt instance.
+             user_kwargs: Dict of variables to render the user template.
+         """
+         messages = metric_prompt.build_messages(user_kwargs)
+         self.entries.append(
+             {
+                 "metric_prompt": metric_prompt,
+                 "user_kwargs": user_kwargs,
+                 "messages": messages,
+                 "schema": metric_prompt.metric.to_jsonschema(),
+             }
+         )
+
+     def remove(self, index: int) -> None:
+         """Remove the entry at the given index."""
+         self.entries.pop(index)
+
+     def clear(self) -> None:
+         """Remove all entries."""
+         self.entries.clear()
+
+     def _assemble_prompts(self) -> List[PromptAndSchema]:
+         return [(e["messages"], e["schema"]) for e in self.entries]
+
+     def _process_results(
+         self, prompt_results: List[PromptResult]
+     ) -> List[MetricRunResult]:
+         """
+         Combine PromptResult with metric parsing, threshold checks,
+         importance and correctness determinations.
+         """
+         results: List[MetricRunResult] = []
+
+         for entry, pr in zip(self.entries, prompt_results):
+             mp: MetricPrompt = entry["metric_prompt"]
+             metric: Metric = mp.metric
+
+             # default values
+             numeric_thresholds_checks: Dict[str, bool] = {}
+             err = pr.error
+             is_imp = False
+             imp_reason = None
+             is_corr = False
+             corr_reason = None
+             data = None
+
+             if pr.error is None:
+                 try:
+                     # parse raw response into JSON-compatible dict
+                     raw = pr.response
+                     if isinstance(raw, str):
+                         data = json.loads(raw)
+                     else:
+                         data = raw
+
+                     # numeric threshold checks
+                     for field in metric.fields:
+                         if isinstance(field, NumericField):
+                             val = data.get(field.name)
+                             ok = False
+                             if isinstance(val, (int, float)):
+                                 ok = field.is_within_threshold(val)
+                             numeric_thresholds_checks[field.name] = ok
+
+                     # importance and correctness
+                     is_imp, imp_reason = metric.is_important(data)
+                     is_corr, corr_reason = metric.is_correct(data)
+
+                 except Exception as e:
+                     err = str(e)
+
+             # Build the result model
+             result = MetricRunResult(
+                 metric_name=metric.name,
+                 jsonschema=entry["schema"],
+                 prompt=pr.prompt,
+                 raw_response=data,
+                 numeric_thresholds_checks=numeric_thresholds_checks,
+                 error=err,
+                 is_important=is_imp,
+                 importance_reason=imp_reason,
+                 is_correct=is_corr,
+                 correctness_reason=corr_reason,
+                 is_issue=is_imp and not is_corr,
+             )
+             results.append(result)
+
+         return results
+
+     def run_all(
+         self,
+         gen_fn: SyncGen,
+         prompt_param_name: str = "prompt",
+         schema_param_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> List[MetricRunResult]:
+         """
+         Run all metrics using a synchronous single-prompt generator.
+         """
+         prompts = self._assemble_prompts()
+         runner = PromptRunner(prompts)
+         pr_results = runner.run_all(
+             gen_fn,
+             prompt_param_name=prompt_param_name,
+             schema_param_name=schema_param_name,
+             **kwargs,
+         )
+         return self._process_results(pr_results)
+
+     async def run_async(
+         self,
+         async_fn: AsyncGen,
+         max_parallel: int = 10,
+         prompt_param_name: str = "prompt",
+         schema_param_name: Optional[str] = None,
+         **kwargs: Any,
+     ) -> List[MetricRunResult]:
+         """
+         Run all metrics using asynchronous single-prompt generation.
+         """
+         prompts = self._assemble_prompts()
+         runner = PromptRunner(prompts)
+         pr_results = await runner.run_async(
+             async_fn,
+             max_parallel=max_parallel,
+             prompt_param_name=prompt_param_name,
+             schema_param_name=schema_param_name,
+             **kwargs,
+         )
+         return self._process_results(pr_results)
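
Not part of the diff: a rough usage sketch of the new MetricRunner, assuming the import paths mirror the file layout listed above and that PromptRunner (wxo_agentic_evaluation/referenceless_eval/prompt/runner.py, not shown in this excerpt) passes the rendered chat messages to the generator under the `prompt` keyword. The `fake_generate` function is a hypothetical stand-in for a real LLM call.

    import json

    from wxo_agentic_evaluation.referenceless_eval.metrics.metrics_runner import MetricRunner
    from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import RelevancePrompt


    def fake_generate(prompt, **kwargs):
        # Stand-in for an LLM call; returns a JSON string shaped like the
        # Relevance metric's schema.
        return json.dumps(
            {
                "explanation": "The answer names the capital directly.",
                "evidence": "Paris is the capital of France.",
                "output": 0.9,
                "confidence": 0.8,
                "correction": {},
            }
        )


    runner = MetricRunner()
    runner.add(
        RelevancePrompt(),
        {
            "context": "What is the capital of France?",
            "response": "Paris is the capital of France.",
        },
    )
    for result in runner.run_all(fake_generate):
        print(result.metric_name, result.is_correct, result.numeric_thresholds_checks)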
@@ -0,0 +1,409 @@
+ import json
+ from typing import Any, Dict, List, Optional, Tuple, Type
+
+ import jsonschema
+ from jinja2 import BaseLoader, Environment, Template
+ from pydantic import BaseModel, ValidationError, create_model
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics.field import (
+     CorrectionField,
+     EvidenceField,
+     ExplanationField,
+     NumericField,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.metrics.metric import Metric
+ from wxo_agentic_evaluation.referenceless_eval.metrics.utils import (
+     remove_threshold_fields,
+     validate_template_context,
+ )
+
+ # Jinja2 environment for string templates
+ _jinja_env = Environment(loader=BaseLoader(), autoescape=False)
+
+
+ class MetricPrompt:
+     """
+     Combines a Metric with system and user prompt templates, plus optional few-shot examples.
+
+     Attributes:
+         metric: Metric instance describing the schema to validate outputs.
+         system_template: Jinja2 Template for the system message.
+         user_template: Jinja2 Template for the user message.
+         examples: List of (user_kwargs, output_dict) pairs.
+     """
+
+     def __init__(
+         self,
+         metric: Metric,
+         system_template: str,
+         user_template: str,
+         *,
+         system_kwargs_defaults: Optional[Dict[str, Any]] = None,
+     ) -> None:
+         """
+         Args:
+             metric: Metric whose JSONSchema will be used for validation.
+             system_template: Template string for the system message; may reference keys.
+             user_template: Template string for the user message; may reference user_kwargs keys.
+             system_kwargs_defaults: Optional default context for system template.
+         """
+         if not isinstance(system_template, str):
+             raise TypeError("system_template must be a string")
+         if not isinstance(user_template, str):
+             raise TypeError("user_template must be a string")
+         if not system_kwargs_defaults:
+             system_kwargs_defaults = {}  # Default to empty dict if None
+         if not isinstance(system_kwargs_defaults, dict):
+             raise TypeError("system_kwargs_defaults must be a dict")
+         if not isinstance(metric, Metric):
+             raise TypeError("metric must be an instance of Metric")
+
+         self._system_template_str: str = system_template
+         self._user_template_str: str = user_template
+
+         # Compile Jinja2 templates
+         self._system_tmpl: Template = _jinja_env.from_string(system_template)
+         self._user_tmpl: Template = _jinja_env.from_string(user_template)
+
+         # Store defaults for system context
+         # This allows overriding system context without modifying the template
+         # during prompt building
+         self.system_kwargs_defaults: Dict[str, Any] = system_kwargs_defaults.copy()
+
+         # Initialize examples list
+         # This will hold (user_kwargs, output) pairs for few-shot prompting
+         self.examples: List[Tuple[Dict[str, Any], Dict[str, Any]]] = []
+
+         # Store the metric for validation
+         # This allows the prompt to validate example outputs against the metric's schema
+         self.metric = metric
+
+     # --- Getters and Setters ---
+
+     def get_system_template(self) -> str:
+         return self._system_tmpl.source
+
+     def set_system_template(self, template_str: str) -> None:
+         self._system_tmpl = _jinja_env.from_string(template_str)
+
+     def get_user_template(self) -> str:
+         return self._user_tmpl.source
+
+     def set_user_template(self, template_str: str) -> None:
+         """
+         Setting a new user template clears existing examples.
+         """
+         self._user_tmpl = _jinja_env.from_string(template_str)
+         self.examples.clear()
+
+     def get_system_kwargs_defaults(self) -> Dict[str, Any]:
+         return dict(self.system_kwargs_defaults)
+
+     def set_system_kwargs_defaults(self, defaults: Dict[str, Any]) -> None:
+         self.system_kwargs_defaults = defaults
+
+     # --- Example Management ---
+
+     def add_example(self, user_kwargs: Dict[str, Any], output: Dict[str, Any]) -> None:
+         """
+         Add a few-shot example.
+
+         Validates that `output` adheres to this.metric's JSONSchema.
+
+         Args:
+             user_kwargs: Variables for rendering the user_template.
+             output: Dict matching the metric's schema.
+
+         Raises:
+             ValidationError if output invalid.
+         """
+         schema = self.metric.to_jsonschema()
+         # 1) JSONSchema structural validation
+         jsonschema.validate(instance=output, schema=schema)
+         # 2) Pydantic type/enum validation
+         Model: Type[BaseModel] = self._build_response_model()
+         try:
+             Model.model_validate(output)
+         except ValidationError as e:
+             raise ValueError(f"Example output failed validation: {e}")
+         self.examples.append((user_kwargs, output))
+
+     # --- Prompt Building ---
+
+     def build_messages(
+         self,
+         user_kwargs: Dict[str, Any],
+         system_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Build the full chat messages sequence:
+
+         1. System message rendered with:
+            - metric_jsonschema
+            - plus any system_kwargs (overrides defaults)
+         2. For each example:
+            - User message from user_template with example user_kwargs
+            - Assistant message: JSON dump of example output
+         3. Final user message with provided user_kwargs
+
+         Args:
+             user_kwargs: Variables for the final user prompt.
+             system_kwargs: Optional overrides for system template context.
+
+         Returns:
+             List of {"role": "...", "content": "..."} dicts.
+         """
+         msgs: List[Dict[str, str]] = []
+         # Prepare system context
+         ctx = self.system_kwargs_defaults
+         ctx["metric_jsonschema"] = json.dumps(
+             remove_threshold_fields(self.metric.to_jsonschema())
+         )
+
+         if system_kwargs:
+             ctx.update(system_kwargs)
+
+         # 1) System message
+         sys_text = self._system_tmpl.render(**ctx)
+         msgs.append({"role": "system", "content": sys_text})
+
+         try:
+             # 2) Few-shot examples
+             for ex_user_kwargs, ex_output in self.examples:
+                 ex_user_kwargs_parsed = {
+                     k: json.dumps(d) for k, d in ex_user_kwargs.items()
+                 }
+                 user_text = self._user_tmpl.render(**ex_user_kwargs_parsed)
+                 msgs.append({"role": "user", "content": user_text})
+                 assistant_text = json.dumps(ex_output, indent=None)
+                 msgs.append({"role": "assistant", "content": assistant_text})
+
+             # 3) Final user message
+             final_user_kwargs_parsed = {}
+             for key, obj in user_kwargs.items():
+                 final_user_kwargs_parsed[key] = json.dumps(obj)
+             final_user = self._user_tmpl.render(**final_user_kwargs_parsed)
+         except Exception as e:
+             raise e
+
+         msgs.append({"role": "user", "content": final_user})
+
+         return msgs
+
+     def build_messages(
+         self,
+         user_kwargs: Dict[str, Any],
+         system_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> List[Dict[str, str]]:
+         """
+         Build the full chat messages sequence:
+
+         1. System message rendered with:
+            - metric_jsonschema
+            - plus any system_kwargs (overrides defaults)
+         2. For each example:
+            - User message from user_template with example user_kwargs
+            - Assistant message: JSON dump of example output
+         3. Final user message with provided user_kwargs
+
+         Args:
+             user_kwargs: Variables for the final user prompt.
+             system_kwargs: Optional overrides for system template context.
+
+         Returns:
+             List of {"role": "...", "content": "..."} dicts.
+         """
+         msgs: List[Dict[str, str]] = []
+
+         # Prepare system context
+         ctx = self.system_kwargs_defaults.copy()
+         ctx["metric_jsonschema"] = json.dumps(
+             remove_threshold_fields(self.metric.to_jsonschema())
+         )
+         if system_kwargs:
+             ctx.update(system_kwargs)
+
+         # Validate and render system message
+         validate_template_context(
+             _jinja_env, self._system_template_str, ctx, "system_template"
+         )
+
+         sys_text = self._system_tmpl.render(**ctx)
+         msgs.append({"role": "system", "content": sys_text})
+
+         try:
+             # Few-shot examples
+             for ex_user_kwargs, ex_output in self.examples:
+                 ex_user_kwargs_parsed = {
+                     k: json.dumps(d) for k, d in ex_user_kwargs.items()
+                 }
+                 validate_template_context(
+                     _jinja_env,
+                     self._user_template_str,
+                     ex_user_kwargs_parsed,
+                     "user_template (example)",
+                 )
+                 user_text = self._user_tmpl.render(**ex_user_kwargs_parsed)
+                 msgs.append({"role": "user", "content": user_text})
+
+                 assistant_text = json.dumps(ex_output, indent=None)
+                 msgs.append({"role": "assistant", "content": assistant_text})
+
+             # Final user message
+             final_user_kwargs_parsed = {
+                 k: json.dumps(obj) for k, obj in user_kwargs.items()
+             }
+             validate_template_context(
+                 _jinja_env,
+                 self._user_template_str,
+                 final_user_kwargs_parsed,
+                 "user_template (final)",
+             )
+             # Render final user message
+             final_user = self._user_tmpl.render(**final_user_kwargs_parsed)
+
+         except Exception as e:
+             raise e
+
+         msgs.append({"role": "user", "content": final_user})
+         return msgs
+
+     def _build_response_model(self) -> Type[BaseModel]:
+         """
+         Dynamically construct a Pydantic model matching metric.to_jsonschema().
+         Used to enforce types beyond JSONSchema.
+         """
+         schema = self.metric.to_jsonschema()
+         props = schema.get("properties", {})
+         fields: Dict[str, Tuple[Any, Any]] = {}
+         for name, subs in props.items():
+             jtype = subs.get("type")
+             # map JSONSchema types -> Python types
+             if name in schema.get("required", []):
+                 secondary_type = ...
+             else:
+                 secondary_type = None
+
+             if jtype == "integer":
+                 py = (int, secondary_type)
+             elif jtype == "number":
+                 py = (float, secondary_type)
+             elif jtype == "string":
+                 py = (str, secondary_type)
+             elif jtype == "boolean":
+                 py = (bool, secondary_type)
+             elif jtype == "object":
+                 py = (dict, secondary_type)
+             else:
+                 py = (Any, secondary_type)
+             # handle enums
+             if "enum" in subs:
+                 from typing import Literal
+
+                 enum_vals = subs["enum"]
+                 py = (Literal[tuple(enum_vals)], secondary_type)
+
+             # handle additional properties
+             if subs.get("additionalProperties", False):
+                 # If additionalProperties is true, we allow any type
+                 py = (Dict[str, Any], secondary_type)
+             fields[name] = py
+
+         Model = create_model(schema.get("title", "ResponseModel"), **fields)
+         return Model
+
+
+ # --- Example Subclass: RelevancePrompt ---
+
+
+ class RelevanceMetric(Metric):
+     """
+     Metric for assessing relevance of a response to its context.
+     """
+
+     def __init__(self) -> None:
+         desc = "Rate how relevant the response is to the given context on a 0-1 scale."
+         super().__init__(
+             name="Relevance",
+             description=desc,
+             fields=[
+                 ExplanationField(
+                     name="explanation",
+                     json_type="string",
+                     description="Why the response is or is not relevant, step by step.",
+                 ),
+                 EvidenceField(
+                     name="evidence",
+                     json_type="string",
+                     description="Portion of context or response that supports your relevance rating.",
+                 ),
+                 NumericField(
+                     name="output",
+                     json_type="number",
+                     description="Relevance score from 0.0 (not relevant) to 1.0 (fully relevant).",
+                     jsonschema_extra={"minimum": 0.0, "maximum": 1.0},
+                     extra_params={"threshold_low": 0.0, "threshold_high": 1.0},
+                 ),
+                 NumericField(
+                     name="confidence",
+                     json_type="number",
+                     description="Confidence in your relevance judgment (0.0-1.0).",
+                     jsonschema_extra={"minimum": 0.0, "maximum": 1.0},
+                     extra_params={"threshold_low": 0.0, "threshold_high": 1.0},
+                 ),
+                 CorrectionField(
+                     name="correction",
+                     json_type="object",
+                     description="If relevance is low, suggest how to improve relevance.",
+                 ),
+             ],
+         )
+
+
+ class RelevancePrompt(MetricPrompt):
+     """
+     Prompt builder specialized for the RelevanceMetric.
+     Provides default templates and example usage.
+     """
+
+     def __init__(self) -> None:
+         metric = RelevanceMetric()
+         system_tmpl = (
+             "You are an expert judge that assesses response relevance. "
+             "Here is the JSONSchema for your response:\n"
+             "{{ metric_jsonschema }}"
+         )
+         user_tmpl = (
+             "Context: {{ context }}\n"
+             "Response: {{ response }}\n"
+             "Provide your evaluation as JSON as specified in the system prompt."
+         )
+         super().__init__(metric, system_tmpl, user_tmpl)
+
+         # Initialize default few-shot examples
+         self.add_example(
+             {
+                 "context": "The sky is blue.",
+                 "response": "The sky appears azure due to Rayleigh scattering.",
+             },
+             {
+                 "evidence": "The sky appears azure due to Rayleigh scattering.",
+                 "explanation": "The response directly addresses sky color by naming scattering physics.",
+                 "output": 1.0,
+                 "confidence": 0.9,
+                 "correction": {},
+             },
+         )
+         self.add_example(
+             {
+                 "context": "What is the capital of France?",
+                 "response": "The moon orbits Earth every 27 days.",
+             },
+             {
+                 "evidence": "The moon orbits Earth every 27 days.",
+                 "explanation": "The response is about lunar orbit, unrelated to capitals.",
+                 "output": 0.0,
+                 "confidence": 0.8,
+                 "correction": {"suggestion": "The capital of France is Paris."},
+             },
+         )
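
Not part of the diff: a minimal sketch of how the prompt side composes on its own, again assuming the import path mirrors the file layout listed above. RelevancePrompt ships two few-shot examples, so the built message list is a system turn, two user/assistant example pairs, then the final user turn.

    from wxo_agentic_evaluation.referenceless_eval.metrics.prompt import RelevancePrompt

    prompt = RelevancePrompt()
    messages = prompt.build_messages(
        {"context": "What is the capital of France?", "response": "Paris."}
    )
    # messages[0] is the system turn carrying the metric's JSONSchema with the
    # threshold_* fields stripped; the final element is the rendered user turn.
    for m in messages:
        print(m["role"], m["content"][:60])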
@@ -0,0 +1,42 @@
+ from typing import Any, Dict
+
+ from jinja2 import Environment, meta
+
+
+ def remove_threshold_fields(schema: dict) -> dict:
+     """
+     Recursively removes 'threshold_low' and 'threshold_high' fields from a JSON schema.
+     """
+     if isinstance(schema, dict):
+         # Remove the threshold fields if present
+         schema.pop("threshold_low", None)
+         schema.pop("threshold_high", None)
+         # Recurse into nested dictionaries and lists
+         for key, value in schema.items():
+             if isinstance(value, dict):
+                 schema[key] = remove_threshold_fields(value)
+             elif isinstance(value, list):
+                 schema[key] = [
+                     remove_threshold_fields(item) if isinstance(item, dict) else item
+                     for item in value
+                 ]
+     return schema
+
+
+ def validate_template_context(
+     env: Environment,
+     template_str: str,
+     context: Dict[str, Any],
+     template_name: str = "",
+ ):
+     parsed = env.parse(template_str)
+     required_vars = meta.find_undeclared_variables(parsed)
+     missing_or_empty = [
+         var
+         for var in required_vars
+         if var not in context or context[var] in (None, "", [], {}, ())
+     ]
+     if missing_or_empty:
+         raise ValueError(
+             f"Missing or empty variables in template '{template_name or 'unnamed'}': {missing_or_empty}"
+         )
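
Not part of the diff: a small illustration of the two helpers above. remove_threshold_fields mutates and returns the schema with threshold_low/threshold_high removed at any nesting depth; validate_template_context raises ValueError when a template variable is missing or empty.

    from jinja2 import BaseLoader, Environment

    from wxo_agentic_evaluation.referenceless_eval.metrics.utils import (
        remove_threshold_fields,
        validate_template_context,
    )

    env = Environment(loader=BaseLoader(), autoescape=False)

    # Passes: the only template variable is present and non-empty.
    validate_template_context(env, "Context: {{ context }}", {"context": "The sky is blue."}, "demo")

    # Would raise ValueError: 'response' is referenced by the template but missing here.
    # validate_template_context(env, "{{ context }} / {{ response }}", {"context": "x"}, "demo")

    schema = {
        "type": "object",
        "properties": {
            "output": {"type": "number", "threshold_low": 0.0, "threshold_high": 1.0},
        },
    }
    print(remove_threshold_fields(schema))
    # -> {'type': 'object', 'properties': {'output': {'type': 'number'}}}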