ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (63)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD +97 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +58 -17
  8. wxo_agentic_evaluation/inference_backend.py +32 -17
  9. wxo_agentic_evaluation/llm_user.py +2 -1
  10. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  11. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  12. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  13. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  14. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  15. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  16. wxo_agentic_evaluation/quick_eval.py +342 -0
  17. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  18. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  19. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  20. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  21. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  22. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  40. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  41. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  42. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  46. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  47. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  48. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +114 -0
  49. wxo_agentic_evaluation/service_instance.py +2 -2
  50. wxo_agentic_evaluation/service_provider/__init__.py +15 -6
  51. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -3
  52. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +138 -0
  53. wxo_agentic_evaluation/service_provider/watsonx_provider.py +11 -4
  54. wxo_agentic_evaluation/tool_planner.py +3 -1
  55. wxo_agentic_evaluation/type.py +33 -2
  56. wxo_agentic_evaluation/utils/__init__.py +0 -1
  57. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  58. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  59. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  60. wxo_agentic_evaluation/utils/utils.py +167 -5
  61. ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
  62. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/WHEEL +0 -0
  63. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/metrics/field.py
@@ -0,0 +1,258 @@
+from __future__ import annotations
+
+from abc import ABC
+from typing import Any, Dict, List, Literal, Optional, Type, TypeVar
+
+from pydantic import BaseModel
+from pydantic import Field as PydanticField
+from pydantic import PrivateAttr, model_validator
+
+JSONType = Literal["integer", "number", "string", "boolean", "object", "array"]
+TField = TypeVar("TField", bound="BaseField")
+BaseFieldRegistry: List[Type[BaseField]] = []
+
+
+class BaseField(BaseModel, ABC):
+    """
+    Abstract representation of a single metric field.
+
+    Attributes:
+        name: Identifier of the field (used as JSON key).
+        json_type: JSON Schema type of the field.
+        description: Human-friendly description of the field's purpose.
+        jsonschema_extra: Additional JSONSchema keywords (e.g., enum, pattern).
+        extra_params: Non-JSONSchema attributes (e.g., thresholds).
+    """
+
+    name: str
+    json_type: JSONType
+    description: str = PydanticField(
+        "No description provided. Please specify what this field represents.",
+        description="A clear description of this field's meaning.",
+    )
+    jsonschema_extra: Dict[str, Any] = PydanticField(
+        default_factory=dict,
+        description="Additional JSONSchema constraints for this field.",
+    )
+    extra_params: Dict[str, Any] = PydanticField(
+        default_factory=dict,
+        description="Extra parameters not included in the JSONSchema (e.g., thresholds).",
+    )
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        if not getattr(cls, "__abstract__", False):
+            BaseFieldRegistry.insert(0, cls)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        """Override in subclasses to signal compatibility with a JSONSchema snippet."""
+        return False
+
+    @classmethod
+    def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> BaseField:
+        """
+        Instantiate the appropriate Field subclass from a JSONSchema property.
+        The first subclass whose `can_handle` returns True is used.
+        Falls back to GenericField.
+        """
+        for field_cls in BaseFieldRegistry:
+            if field_cls.can_handle(name, schema):
+                desc = schema.get("description", "")
+                extra = {
+                    k: v for k, v in schema.items() if k not in ("type", "description")
+                }
+                return field_cls(
+                    name=name,
+                    json_type=schema.get("type", "string"),
+                    description=desc,
+                    jsonschema_extra=extra,
+                    extra_params={},
+                )
+        return GenericField(
+            name=name,
+            json_type=schema.get("type", "string"),
+            description=schema.get("description", ""),
+            jsonschema_extra={
+                k: v for k, v in schema.items() if k not in ("type", "description")
+            },
+            extra_params={},
+        )
+
+    def to_jsonschema(self) -> Dict[str, Any]:
+        return {
+            "type": self.json_type,
+            "description": self.description,
+            **self.jsonschema_extra,
+        }
+
+    # --- Getters and Setters ---
+
+    def get_name(self) -> str:
+        return self.name
+
+    def set_name(self, name: str) -> None:
+        self.name = name
+
+    def get_description(self) -> str:
+        return self.description
+
+    def set_description(self, description: str) -> None:
+        self.description = description
+
+    def get_jsonschema_extra(self) -> Dict[str, Any]:
+        return dict(self.jsonschema_extra)
+
+    def set_jsonschema_extra(self, extra: Dict[str, Any]) -> None:
+        self.jsonschema_extra = extra
+
+    def get_extra_param(self, key: str) -> Any:
+        return self.extra_params.get(key)
+
+    def set_extra_param(self, key: str, value: Any) -> None:
+        self.extra_params[key] = value
+
+
+class NumericField(BaseField):
+    """
+    Numeric field (integer or number) with optional thresholds.
+    The `extra_params` dict may include:
+      - threshold_low: minimal acceptable value (for validation)
+      - threshold_high: maximal acceptable value
+    """
+
+    threshold_low: Optional[float] = PydanticField(
+        None, description="Lower bound for correctness checks (not in JSONSchema)."
+    )
+    threshold_high: Optional[float] = PydanticField(
+        None, description="Upper bound for correctness checks (not in JSONSchema)."
+    )
+
+    __abstract__ = False
+
+    @model_validator(mode="before")
+    def extract_thresholds(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        extra = values.get("jsonschema_extra", {})
+        if "threshold_low" in extra:
+            values["threshold_low"] = extra["threshold_low"]
+        if "threshold_high" in extra:
+            values["threshold_high"] = extra["threshold_high"]
+        return values
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return schema.get("type") in ("integer", "number")
+
+    @classmethod
+    def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> NumericField:
+        """
+        Create a NumericField from a JSONSchema property.
+        """
+        return NumericField(
+            name=name,
+            json_type=schema.get("type", "number"),
+            description=schema.get("description", ""),
+            jsonschema_extra={
+                k: v for k, v in schema.items() if k not in ("type", "description")
+            },
+            extra_params={},
+        )
+
+    def to_jsonschema(self) -> Dict[str, Any]:
+        return super().to_jsonschema()
+
+    def is_within_threshold(self, value: float) -> bool:
+        if self.threshold_low is not None and value < self.threshold_low:
+            return False
+        if self.threshold_high is not None and value > self.threshold_high:
+            return False
+        return True
+
+
+class EnumField(BaseField):
+    """
+    Field whose value must be one of a fixed set of options.
+    Expects `jsonschema_extra["enum"]` to be a list of allowed values.
+    """
+
+    __abstract__ = False
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return "enum" in schema
+
+
+class ExplanationField(BaseField):
+    """
+    Free-form explanation of the metric's reasoning.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            "A detailed, step-by-step explanation of the reasoning behind the metric's value.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return name.lower() == "explanation" and schema.get("type") == "string"
+
+
+class EvidenceField(BaseField):
+    """
+    The specific quote or reference that supports the metric's evaluation.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            "The exact quote or reference from the input or context that justifies the metric's value.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return name.lower() == "evidence" and schema.get("type") == "string"
+
+
+class CorrectionField(BaseField):
+    """
+    A structured suggestion (as JSON) for correcting or improving the output.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            "A JSON-formatted suggestion for how to correct or improve the output if needed.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return name.lower() == "correction" and schema.get("type") == "object"
+
+
+class GenericField(BaseField):
+    """
+    Fallback field type for any property not handled by other classes.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            f"A generic field named '{data.get('name')}' of type {data.get('json_type')}.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return True
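
As a usage sketch (illustrative only, not part of the diff): the snippet below constructs a NumericField from this new module directly and exercises its threshold helper. It assumes the wheel is installed so the module resolves under its package path; the field name and threshold values are arbitrary examples.

from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField

# threshold_low/threshold_high are plain model fields, so passing them
# explicitly is the unambiguous way to enable is_within_threshold checks.
confidence = NumericField(
    name="confidence",
    json_type="number",
    description="Confidence in the metric's output value.",
    jsonschema_extra={"minimum": 0.0, "maximum": 1.0},
    threshold_low=0.5,
    threshold_high=1.0,
)

print(confidence.to_jsonschema())
# {'type': 'number', 'description': '...', 'minimum': 0.0, 'maximum': 1.0}
print(confidence.is_within_threshold(0.7))  # True
print(confidence.is_within_threshold(0.3))  # False: below threshold_low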
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py
@@ -0,0 +1,333 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, TypeVar
+
+from wxo_agentic_evaluation.referenceless_eval.metrics.field import (
+    BaseField,
+    CorrectionField,
+    EvidenceField,
+    ExplanationField,
+    NumericField,
+)
+
+TMetric = TypeVar("TMetric", bound="Metric")
+
+
+class Metric:
+    """
+    Abstract representation of an evaluation metric composed of multiple fields.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        fields: Optional[List[BaseField]] = None,
+        required: Optional[List[str]] = None,
+        additional_properties: bool = True,
+    ) -> None:
+        """
+        Args:
+            name: Unique metric identifier.
+            description: Full description of what this metric measures.
+            fields: List of BaseField instances composing this metric.
+            required: List of field names that must appear in results.
+                Defaults to all provided fields.
+        """
+        self.name = name
+        self.description = description
+        self.fields: List[BaseField] = fields or []
+        self.additional_properties = additional_properties
+        # Determine required fields
+        if required is not None:
+            self.required_fields: Set[str] = set(required)
+        else:
+            self.required_fields: Set[str] = {f.name for f in self.fields}
+
+        # Validate required_fields
+        known = {f.name for f in self.fields}
+        missing = self.required_fields - known
+        if missing:
+            raise ValueError(
+                f"Required fields {missing} not among metric fields {known}"
+            )
+
+    def to_jsonschema(self) -> Dict[str, Any]:
+        """
+        Build a JSONSchema representation of this metric.
+
+        Returns:
+            A dict with keys:
+              - title: self.name
+              - description: self.description
+              - type: "object"
+              - properties: mapping field.name → field.to_jsonschema()
+              - required: list of required field names
+        """
+        props: Dict[str, Any] = {f.name: f.to_jsonschema() for f in self.fields}
+        return {
+            "title": self.name,
+            "description": self.description,
+            "type": "object",
+            "properties": props,
+            "required": sorted(self.required_fields),
+            "additionalProperties": self.additional_properties,
+        }
+
+    def add_field(self, field: BaseField, required: bool = True) -> None:
+        """
+        Add a new field to this metric.
+
+        Args:
+            field: BaseField instance.
+            required: Whether this field must appear in results.
+        """
+        if any(f.name == field.name for f in self.fields):
+            raise ValueError(f"Field '{field.name}' already defined")
+        self.fields.append(field)
+        if required:
+            self.required_fields.add(field.name)
+
+    def remove_field(self, name: str) -> None:
+        """
+        Remove a field by name.
+
+        Args:
+            name: Name of field to remove.
+        """
+        self.fields = [f for f in self.fields if f.name != name]
+        self.required_fields.discard(name)
+
+    @classmethod
+    def from_jsonschema(cls: Type[TMetric], schema: Dict[str, Any]) -> Metric:
+        """
+        Reconstruct a Metric from a JSONSchema dict.
+
+        Args:
+            schema: dict with 'title', 'description', 'properties', 'required'.
+
+        Returns:
+            Metric instance with fields populated.
+        """
+        name: str = schema.get("title", "")
+        description: str = schema.get("description", "")
+        props: Dict[str, Any] = schema.get("properties", {})
+        required: List[str] = schema.get("required", [])
+        additional_props: bool = schema.get("additionalProperties", True)
+        fields: List[BaseField] = []
+        for fname, fschema in props.items():
+            # If type is number or integer, use NumericField
+            if fschema.get("type") in ("number", "integer"):
+                field = NumericField.from_jsonschema(fname, fschema)
+            else:
+                field = BaseField.from_jsonschema(fname, fschema)
+            fields.append(field)
+        return cls(
+            name=name,
+            description=description,
+            fields=fields,
+            required=required,
+            additional_properties=additional_props,
+        )
+
+    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'important' if its confidence lies within the defined confidence thresholds.
+
+        Args:
+            result: Parsed metric result with at least 'confidence'.
+
+        Returns:
+            (important: bool, reason: Optional[str])
+        """
+        try:
+            conf = float(result.get("confidence", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid confidence value"
+        # locate the confidence field
+        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        if isinstance(conf_field, NumericField):
+            ok = conf_field.is_within_threshold(conf)
+            reason = (
+                None
+                if ok
+                else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Confidence field not defined"
+
+    def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'correct' if it is important AND its output lies within thresholds.
+
+        Args:
+            result: Parsed metric result with 'output' and 'confidence'.
+
+        Returns:
+            (correct: bool, reason: Optional[str])
+        """
+        important, imp_reason = self.is_important(result)
+        if not important:
+            return True, f"Not important: {imp_reason}"
+        # check output
+        try:
+            val = float(result.get("output", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid output value"
+        out_field = next((f for f in self.fields if f.name == "output"), None)
+        if isinstance(out_field, NumericField):
+            ok = out_field.is_within_threshold(val)
+            reason = (
+                None
+                if ok
+                else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Output field not defined"
+
+    def parse_response(self, response: str) -> Dict[str, Any]:
+        """
+        Parse a raw response string into a structured dict.
+
+        Args:
+            response: Raw response string.
+
+        Returns:
+            Parsed response as a dict.
+        """
+        # Default implementation: assume JSON string
+        try:
+            return json.loads(response)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Failed to parse response: {e}") from e
+
+
+class StandardMetric(Metric):
+    """
+    A standard metric with common fields:
+      - explanation: string, detailed reasoning.
+      - evidence: string, supporting quote or reference.
+      - output: numeric value within specified range.
+      - confidence: numeric confidence within specified range.
+      - correction: object, structured suggestion for improvement.
+    Also provides convenience methods `is_important` and `is_correct`.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        *,
+        output_range: Tuple[float, float] = (0.0, 1.0),
+        confidence_range: Tuple[float, float] = (0.0, 1.0),
+    ) -> None:
+        """
+        Args:
+            name: Metric identifier.
+            description: Explanation of what the metric measures.
+            output_range: (min, max) allowed for the 'output' field.
+            confidence_range: (min, max) for the 'confidence' field.
+
+        Fields created:
+          - explanation: "A detailed, step-by-step explanation of the reasoning."
+          - evidence: "The exact quote or evidence supporting the reasoning."
+          - output: numeric in output_range
+          - confidence: numeric in confidence_range
+          - correction: structured suggestion if output below threshold
+        """
+        # Prepare fields
+        min_out, max_out = output_range
+        min_conf, max_conf = confidence_range
+
+        explanation = ExplanationField(
+            name="explanation",
+            json_type="string",
+            description="A detailed, step-by-step explanation of the reasoning behind the output value.",
+        )
+        evidence = EvidenceField(
+            name="evidence",
+            json_type="string",
+            description="The exact quote or reference that supports the output value.",
+        )
+        output = NumericField(
+            name="output",
+            json_type=(
+                "number"
+                if isinstance(min_out, float) or isinstance(max_out, float)
+                else "integer"
+            ),
+            description=f"Primary numeric score for this metric (range {min_out} to {max_out}).",
+            jsonschema_extra={"minimum": min_out, "maximum": max_out},
+            extra_params={"threshold_low": min_out, "threshold_high": max_out},
+        )
+        confidence = NumericField(
+            name="confidence",
+            json_type="number",
+            description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
+            jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
+            extra_params={"threshold_low": min_conf, "threshold_high": max_conf},
+        )
+        correction = CorrectionField(
+            name="correction",
+            json_type="object",
+            description="Structured suggestion for how to correct or improve the output if needed.",
+        )
+
+        fields = [explanation, evidence, output, confidence, correction]
+        super().__init__(name=name, description=description, fields=fields)
+
+    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'important' if its confidence lies within the defined confidence thresholds.
+
+        Args:
+            result: Parsed metric result with at least 'confidence'.
+
+        Returns:
+            (important: bool, reason: Optional[str])
+        """
+        try:
+            conf = float(result.get("confidence", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid confidence value"
+        # locate the confidence field
+        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        if isinstance(conf_field, NumericField):
+            ok = conf_field.is_within_threshold(conf)
+            reason = (
+                None
+                if ok
+                else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Confidence field not defined"
+
+    def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'correct' if it is important AND its output lies within thresholds.
+
+        Args:
+            result: Parsed metric result with 'output' and 'confidence'.
+
+        Returns:
+            (correct: bool, reason: Optional[str])
+        """
+        important, imp_reason = self.is_important(result)
+        if not important:
+            return True, f"Not important: {imp_reason}"
+        # check output
+        try:
+            val = float(result.get("output", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid output value"
+        out_field = next((f for f in self.fields if f.name == "output"), None)
+        if isinstance(out_field, NumericField):
+            ok = out_field.is_within_threshold(val)
+            reason = (
+                None
+                if ok
+                else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Output field not defined"
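
A companion sketch (again illustrative only, not part of the diff) showing the round trip StandardMetric → JSONSchema → Metric and response parsing; the metric name and the response values are arbitrary examples.

from wxo_agentic_evaluation.referenceless_eval.metrics.metric import (
    Metric,
    StandardMetric,
)

metric = StandardMetric(
    name="faithfulness",
    description="Whether the answer is grounded in the provided context.",
)
schema = metric.to_jsonschema()
print(sorted(schema["properties"]))
# ['confidence', 'correction', 'evidence', 'explanation', 'output']

# Rebuild an equivalent Metric from the schema alone; numeric properties
# come back as NumericField, everything else goes through the field registry.
rebuilt = Metric.from_jsonschema(schema)

# parse_response expects a JSON string and raises ValueError on bad JSON.
result = rebuilt.parse_response(
    '{"output": 0.9, "confidence": 0.8, "explanation": "reasoning here", '
    '"evidence": "quoted span", "correction": {}}'
)
ok, reason = rebuilt.is_correct(result)  # gated on is_important first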