ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of ibm-watsonx-orchestrate-evaluation-framework has been flagged as a potentially problematic release.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD +97 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +58 -17
- wxo_agentic_evaluation/inference_backend.py +32 -17
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +114 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/__init__.py +15 -6
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -3
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +138 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +11 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ wxo_agentic_evaluation/referenceless_eval/metrics/field.py
@@ -0,0 +1,258 @@
+from __future__ import annotations
+
+from abc import ABC
+from typing import Any, Dict, List, Literal, Optional, Type, TypeVar
+
+from pydantic import BaseModel
+from pydantic import Field as PydanticField
+from pydantic import PrivateAttr, model_validator
+
+JSONType = Literal["integer", "number", "string", "boolean", "object", "array"]
+TField = TypeVar("TField", bound="BaseField")
+BaseFieldRegistry: List[Type[BaseField]] = []
+
+
+class BaseField(BaseModel, ABC):
+    """
+    Abstract representation of a single metric field.
+
+    Attributes:
+        name: Identifier of the field (used as JSON key).
+        json_type: JSON Schema type of the field.
+        description: Human-friendly description of the field's purpose.
+        jsonschema_extra: Additional JSONSchema keywords (e.g., enum, pattern).
+        extra_params: Non-JSONSchema attributes (e.g., thresholds).
+    """
+
+    name: str
+    json_type: JSONType
+    description: str = PydanticField(
+        "No description provided. Please specify what this field represents.",
+        description="A clear description of this field's meaning.",
+    )
+    jsonschema_extra: Dict[str, Any] = PydanticField(
+        default_factory=dict,
+        description="Additional JSONSchema constraints for this field.",
+    )
+    extra_params: Dict[str, Any] = PydanticField(
+        default_factory=dict,
+        description="Extra parameters not included in the JSONSchema (e.g., thresholds).",
+    )
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        if not getattr(cls, "__abstract__", False):
+            BaseFieldRegistry.insert(0, cls)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        """Override in subclasses to signal compatibility with a JSONSchema snippet."""
+        return False
+
+    @classmethod
+    def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> BaseField:
+        """
+        Instantiate the appropriate Field subclass from a JSONSchema property.
+        The first subclass whose `can_handle` returns True is used.
+        Falls back to GenericField.
+        """
+        for field_cls in BaseFieldRegistry:
+            if field_cls.can_handle(name, schema):
+                desc = schema.get("description", "")
+                extra = {
+                    k: v for k, v in schema.items() if k not in ("type", "description")
+                }
+                return field_cls(
+                    name=name,
+                    json_type=schema.get("type", "string"),
+                    description=desc,
+                    jsonschema_extra=extra,
+                    extra_params={},
+                )
+        return GenericField(
+            name=name,
+            json_type=schema.get("type", "string"),
+            description=schema.get("description", ""),
+            jsonschema_extra={
+                k: v for k, v in schema.items() if k not in ("type", "description")
+            },
+            extra_params={},
+        )
+
+    def to_jsonschema(self) -> Dict[str, Any]:
+        return {
+            "type": self.json_type,
+            "description": self.description,
+            **self.jsonschema_extra,
+        }
+
+    # --- Getters and Setters ---
+
+    def get_name(self) -> str:
+        return self.name
+
+    def set_name(self, name: str) -> None:
+        self.name = name
+
+    def get_description(self) -> str:
+        return self.description
+
+    def set_description(self, description: str) -> None:
+        self.description = description
+
+    def get_jsonschema_extra(self) -> Dict[str, Any]:
+        return dict(self.jsonschema_extra)
+
+    def set_jsonschema_extra(self, extra: Dict[str, Any]) -> None:
+        self.jsonschema_extra = extra
+
+    def get_extra_param(self, key: str) -> Any:
+        return self.extra_params.get(key)
+
+    def set_extra_param(self, key: str, value: Any) -> None:
+        self.extra_params[key] = value
+
+
+class NumericField(BaseField):
+    """
+    Numeric field (integer or number) with optional thresholds.
+    The `extra_params` dict may include:
+    - threshold_low: minimal acceptable value (for validation)
+    - threshold_high: maximal acceptable value
+    """
+
+    threshold_low: Optional[float] = PydanticField(
+        None, description="Lower bound for correctness checks (not in JSONSchema)."
+    )
+    threshold_high: Optional[float] = PydanticField(
+        None, description="Upper bound for correctness checks (not in JSONSchema)."
+    )
+
+    __abstract__ = False
+
+    @model_validator(mode="before")
+    def extract_thresholds(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        extra = values.get("jsonschema_extra", {})
+        if "threshold_low" in extra:
+            values["threshold_low"] = extra["threshold_low"]
+        if "threshold_high" in extra:
+            values["threshold_high"] = extra["threshold_high"]
+        return values
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return schema.get("type") in ("integer", "number")
+
+    @classmethod
+    def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> NumericField:
+        """
+        Create a NumericField from a JSONSchema property.
+        """
+        return NumericField(
+            name=name,
+            json_type=schema.get("type", "number"),
+            description=schema.get("description", ""),
+            jsonschema_extra={
+                k: v for k, v in schema.items() if k not in ("type", "description")
+            },
+            extra_params={},
+        )
+
+    def to_jsonschema(self) -> Dict[str, Any]:
+        return super().to_jsonschema()
+
+    def is_within_threshold(self, value: float) -> bool:
+        if self.threshold_low is not None and value < self.threshold_low:
+            return False
+        if self.threshold_high is not None and value > self.threshold_high:
+            return False
+        return True
+
+
+class EnumField(BaseField):
+    """
+    Field whose value must be one of a fixed set of options.
+    Expects `jsonschema_extra["enum"]` to be a list of allowed values.
+    """
+
+    __abstract__ = False
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return "enum" in schema
+
+
+class ExplanationField(BaseField):
+    """
+    Free-form explanation of the metric's reasoning.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            "A detailed, step-by-step explanation of the reasoning behind the metric's value.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return name.lower() == "explanation" and schema.get("type") == "string"
+
+
+class EvidenceField(BaseField):
+    """
+    The specific quote or reference that supports the metric's evaluation.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            "The exact quote or reference from the input or context that justifies the metric's value.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return name.lower() == "evidence" and schema.get("type") == "string"
+
+
+class CorrectionField(BaseField):
+    """
+    A structured suggestion (as JSON) for correcting or improving the output.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            "A JSON-formatted suggestion for how to correct or improve the output if needed.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return name.lower() == "correction" and schema.get("type") == "object"
+
+
+class GenericField(BaseField):
+    """
+    Fallback field type for any property not handled by other classes.
+    """
+
+    __abstract__ = False
+
+    def __init__(self, **data: Any):
+        data.setdefault(
+            "description",
+            f"A generic field named '{data.get('name')}' of type {data.get('json_type')}.",
+        )
+        super().__init__(**data)
+
+    @classmethod
+    def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+        return True
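For orientation, here is a short usage sketch (my addition, not part of the package diff). One behavior worth noting as written: `__init_subclass__` prepends each concrete subclass to `BaseFieldRegistry`, so `GenericField`, defined last, ends up first in the registry, and its always-true `can_handle` means `BaseField.from_jsonschema` falls through to the generic type; callers that want a `NumericField` construct it directly, as `Metric.from_jsonschema` in the next file does.

```python
# Illustrative sketch (not part of the package diff).
from wxo_agentic_evaluation.referenceless_eval.metrics.field import (
    BaseField,
    GenericField,
    NumericField,
)

schema = {
    "type": "number",
    "description": "Primary score",
    "minimum": 0.0,
    "maximum": 1.0,
    "threshold_low": 0.5,
    "threshold_high": 1.0,
}

# Registry dispatch: GenericField was registered last, sits at index 0,
# and its can_handle() always returns True, so the generic fallback wins.
field = BaseField.from_jsonschema("output", schema)
assert isinstance(field, GenericField)

# Constructing a NumericField directly picks up the thresholds: the
# "before" validator copies threshold_low/threshold_high out of
# jsonschema_extra into the model fields.
num = NumericField.from_jsonschema("output", schema)
assert num.is_within_threshold(0.7)
assert not num.is_within_threshold(0.2)
```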
--- /dev/null
+++ wxo_agentic_evaluation/referenceless_eval/metrics/metric.py
@@ -0,0 +1,333 @@
+from __future__ import annotations
+
+import json
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, TypeVar
+
+from wxo_agentic_evaluation.referenceless_eval.metrics.field import (
+    BaseField,
+    CorrectionField,
+    EvidenceField,
+    ExplanationField,
+    NumericField,
+)
+
+TMetric = TypeVar("TMetric", bound="Metric")
+
+
+class Metric:
+    """
+    Abstract representation of an evaluation metric composed of multiple fields.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        fields: Optional[List[BaseField]] = None,
+        required: Optional[List[str]] = None,
+        additional_properties: bool = True,
+    ) -> None:
+        """
+        Args:
+            name: Unique metric identifier.
+            description: Full description of what this metric measures.
+            fields: List of BaseField instances composing this metric.
+            required: List of field names that must appear in results.
+                Defaults to all provided fields.
+        """
+        self.name = name
+        self.description = description
+        self.fields: List[BaseField] = fields or []
+        self.additional_properties = additional_properties
+        # Determine required fields
+        if required is not None:
+            self.required_fields: Set[str] = set(required)
+        else:
+            self.required_fields: Set[str] = {f.name for f in self.fields}
+
+        # Validate required_fields
+        known = {f.name for f in self.fields}
+        missing = self.required_fields - known
+        if missing:
+            raise ValueError(
+                f"Required fields {missing} not among metric fields {known}"
+            )
+
+    def to_jsonschema(self) -> Dict[str, Any]:
+        """
+        Build a JSONSchema representation of this metric.
+
+        Returns:
+            A dict with keys:
+            - title: self.name
+            - description: self.description
+            - type: "object"
+            - properties: mapping field.name → field.to_jsonschema()
+            - required: list of required field names
+        """
+        props: Dict[str, Any] = {f.name: f.to_jsonschema() for f in self.fields}
+        return {
+            "title": self.name,
+            "description": self.description,
+            "type": "object",
+            "properties": props,
+            "required": sorted(self.required_fields),
+            "additionalProperties": self.additional_properties,
+        }
+
+    def add_field(self, field: BaseField, required: bool = True) -> None:
+        """
+        Add a new field to this metric.
+
+        Args:
+            field: BaseField instance.
+            required: Whether this field must appear in results.
+        """
+        if any(f.name == field.name for f in self.fields):
+            raise ValueError(f"Field '{field.name}' already defined")
+        self.fields.append(field)
+        if required:
+            self.required_fields.add(field.name)
+
+    def remove_field(self, name: str) -> None:
+        """
+        Remove a field by name.
+
+        Args:
+            name: Name of field to remove.
+        """
+        self.fields = [f for f in self.fields if f.name != name]
+        self.required_fields.discard(name)
+
+    @classmethod
+    def from_jsonschema(cls: Type[TMetric], schema: Dict[str, Any]) -> Metric:
+        """
+        Reconstruct a Metric from a JSONSchema dict.
+
+        Args:
+            schema: dict with 'title', 'description', 'properties', 'required'.
+
+        Returns:
+            Metric instance with fields populated.
+        """
+        name: str = schema.get("title", "")
+        description: str = schema.get("description", "")
+        props: Dict[str, Any] = schema.get("properties", {})
+        required: List[str] = schema.get("required", [])
+        additional_props: bool = schema.get("additionalProperties", True)
+        fields: List[BaseField] = []
+        for fname, fschema in props.items():
+            # If type is number or integer, use NumericField
+            if fschema.get("type") in ("number", "integer"):
+                field = NumericField.from_jsonschema(fname, fschema)
+            else:
+                field = BaseField.from_jsonschema(fname, fschema)
+            fields.append(field)
+        return cls(
+            name=name,
+            description=description,
+            fields=fields,
+            required=required,
+            additional_properties=additional_props,
+        )
+
+    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'important' if its confidence lies within the defined confidence thresholds.
+
+        Args:
+            result: Parsed metric result with at least 'confidence'.
+
+        Returns:
+            (important: bool, reason: Optional[str])
+        """
+        try:
+            conf = float(result.get("confidence", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid confidence value"
+        # locate the confidence field
+        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        if isinstance(conf_field, NumericField):
+            ok = conf_field.is_within_threshold(conf)
+            reason = (
+                None
+                if ok
+                else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Confidence field not defined"
+
+    def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'correct' if it is important AND its output lies within thresholds.
+
+        Args:
+            result: Parsed metric result with 'output' and 'confidence'.
+
+        Returns:
+            (correct: bool, reason: Optional[str])
+        """
+        important, imp_reason = self.is_important(result)
+        if not important:
+            return True, f"Not important: {imp_reason}"
+        # check output
+        try:
+            val = float(result.get("output", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid output value"
+        out_field = next((f for f in self.fields if f.name == "output"), None)
+        if isinstance(out_field, NumericField):
+            ok = out_field.is_within_threshold(val)
+            reason = (
+                None
+                if ok
+                else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Output field not defined"
+
+    def parse_response(self, response: str) -> Dict[str, Any]:
+        """
+        Parse a raw response string into a structured dict.
+
+        Args:
+            response: Raw response string.
+
+        Returns:
+            Parsed response as a dict.
+        """
+        # Default implementation: assume JSON string
+        try:
+            return json.loads(response)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Failed to parse response: {e}") from e
+
+
+class StandardMetric(Metric):
+    """
+    A standard metric with common fields:
+    - explanation: string, detailed reasoning.
+    - evidence: string, supporting quote or reference.
+    - output: numeric value within specified range.
+    - confidence: numeric confidence within specified range.
+    - correction: object, structured suggestion for improvement.
+    Also provides convenience methods `is_important` and `is_correct`.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        *,
+        output_range: Tuple[float, float] = (0.0, 1.0),
+        confidence_range: Tuple[float, float] = (0.0, 1.0),
+    ) -> None:
+        """
+        Args:
+            name: Metric identifier.
+            description: Explanation of what the metric measures.
+            output_range: (min, max) allowed for the 'output' field.
+            confidence_range: (min, max) for the 'confidence' field.
+
+        Fields created:
+            - explanation: "A detailed, step-by-step explanation of the reasoning."
+            - evidence: "The exact quote or evidence supporting the reasoning."
+            - output: numeric in output_range
+            - confidence: numeric in confidence_range
+            - correction: structured suggestion if output below threshold
+        """
+        # Prepare fields
+        min_out, max_out = output_range
+        min_conf, max_conf = confidence_range
+
+        explanation = ExplanationField(
+            name="explanation",
+            json_type="string",
+            description="A detailed, step-by-step explanation of the reasoning behind the output value.",
+        )
+        evidence = EvidenceField(
+            name="evidence",
+            json_type="string",
+            description="The exact quote or reference that supports the output value.",
+        )
+        output = NumericField(
+            name="output",
+            json_type=(
+                "number"
+                if isinstance(min_out, float) or isinstance(max_out, float)
+                else "integer"
+            ),
+            description=f"Primary numeric score for this metric (range {min_out} to {max_out}).",
+            jsonschema_extra={"minimum": min_out, "maximum": max_out},
+            extra_params={"threshold_low": min_out, "threshold_high": max_out},
+        )
+        confidence = NumericField(
+            name="confidence",
+            json_type="number",
+            description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
+            jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
+            extra_params={"threshold_low": min_conf, "threshold_high": max_conf},
+        )
+        correction = CorrectionField(
+            name="correction",
+            json_type="object",
+            description="Structured suggestion for how to correct or improve the output if needed.",
+        )
+
+        fields = [explanation, evidence, output, confidence, correction]
+        super().__init__(name=name, description=description, fields=fields)
+
+    def is_important(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'important' if its confidence lies within the defined confidence thresholds.
+
+        Args:
+            result: Parsed metric result with at least 'confidence'.
+
+        Returns:
+            (important: bool, reason: Optional[str])
+        """
+        try:
+            conf = float(result.get("confidence", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid confidence value"
+        # locate the confidence field
+        conf_field = next((f for f in self.fields if f.name == "confidence"), None)
+        if isinstance(conf_field, NumericField):
+            ok = conf_field.is_within_threshold(conf)
+            reason = (
+                None
+                if ok
+                else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Confidence field not defined"
+
+    def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+        """
+        A result is 'correct' if it is important AND its output lies within thresholds.
+
+        Args:
+            result: Parsed metric result with 'output' and 'confidence'.
+
+        Returns:
+            (correct: bool, reason: Optional[str])
+        """
+        important, imp_reason = self.is_important(result)
+        if not important:
+            return True, f"Not important: {imp_reason}"
+        # check output
+        try:
+            val = float(result.get("output", 0.0))
+        except (TypeError, ValueError):
+            return False, "Invalid output value"
+        out_field = next((f for f in self.fields if f.name == "output"), None)
+        if isinstance(out_field, NumericField):
+            ok = out_field.is_within_threshold(val)
+            reason = (
+                None
+                if ok
+                else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
+            )
+            return ok, reason
+        return False, "Output field not defined"
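And a minimal end-to-end sketch for `StandardMetric` (again my addition; the metric name and response values are invented). Note that, as written, `StandardMetric` supplies its thresholds via `extra_params` while `NumericField`'s validator reads them from `jsonschema_extra`, so `threshold_low`/`threshold_high` appear to stay `None` here and the range checks in `is_correct` are effectively permissive.

```python
# Illustrative sketch (not part of the package diff).
import json

from wxo_agentic_evaluation.referenceless_eval.metrics.metric import StandardMetric

metric = StandardMetric(
    name="groundedness",
    description="Whether the answer is supported by the retrieved context.",
    output_range=(0.0, 1.0),
    confidence_range=(0.0, 1.0),
)

# The JSONSchema can be handed to an LLM to request structured output.
schema = metric.to_jsonschema()
assert schema["type"] == "object"
assert set(schema["required"]) == {
    "explanation", "evidence", "output", "confidence", "correction",
}

# Parse a fabricated raw model response, then apply the checks.
raw = json.dumps({
    "explanation": "The answer restates the cited passage.",
    "evidence": "...as stated in section 2...",
    "output": 0.9,
    "confidence": 0.8,
    "correction": {},
})
result = metric.parse_response(raw)
ok, reason = metric.is_correct(result)
print(ok, reason)  # (True, None) here
```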