ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/metrics/field.py (new file)
@@ -0,0 +1,266 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ from typing import Any, Dict, List, Literal, Optional, Type, TypeVar
+
+ from pydantic import BaseModel
+ from pydantic import Field as PydanticField
+ from pydantic import PrivateAttr, model_validator
+
+ JSONType = Literal["integer", "number", "string", "boolean", "object", "array"]
+ TField = TypeVar("TField", bound="BaseField")
+ BaseFieldRegistry: List[Type[BaseField]] = []
+
+
+ class BaseField(BaseModel, ABC):
+     """
+     Abstract representation of a single metric field.
+
+     Attributes:
+         name: Identifier of the field (used as JSON key).
+         json_type: JSON Schema type of the field.
+         description: Human-friendly description of the field's purpose.
+         jsonschema_extra: Additional JSONSchema keywords (e.g., enum, pattern).
+         extra_params: Non-JSONSchema attributes (e.g., thresholds).
+     """
+
+     name: str
+     json_type: JSONType
+     description: str = PydanticField(
+         "No description provided. Please specify what this field represents.",
+         description="A clear description of this field's meaning.",
+     )
+     jsonschema_extra: Dict[str, Any] = PydanticField(
+         default_factory=dict,
+         description="Additional JSONSchema constraints for this field.",
+     )
+     extra_params: Dict[str, Any] = PydanticField(
+         default_factory=dict,
+         description="Extra parameters not included in the JSONSchema (e.g., thresholds).",
+     )
+
+     def __init_subclass__(cls, **kwargs):
+         super().__init_subclass__(**kwargs)
+         if not getattr(cls, "__abstract__", False):
+             BaseFieldRegistry.insert(0, cls)
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         """Override in subclasses to signal compatibility with a JSONSchema snippet."""
+         return False
+
+     @classmethod
+     def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> BaseField:
+         """
+         Instantiate the appropriate Field subclass from a JSONSchema property.
+         The first subclass whose `can_handle` returns True is used.
+         Falls back to GenericField.
+         """
+         for field_cls in BaseFieldRegistry:
+             if field_cls.can_handle(name, schema):
+                 desc = schema.get("description", "")
+                 extra = {
+                     k: v
+                     for k, v in schema.items()
+                     if k not in ("type", "description")
+                 }
+                 return field_cls(
+                     name=name,
+                     json_type=schema.get("type", "string"),
+                     description=desc,
+                     jsonschema_extra=extra,
+                     extra_params={},
+                 )
+         return GenericField(
+             name=name,
+             json_type=schema.get("type", "string"),
+             description=schema.get("description", ""),
+             jsonschema_extra={
+                 k: v
+                 for k, v in schema.items()
+                 if k not in ("type", "description")
+             },
+             extra_params={},
+         )
+
+     def to_jsonschema(self) -> Dict[str, Any]:
+         return {
+             "type": self.json_type,
+             "description": self.description,
+             **self.jsonschema_extra,
+         }
+
+     # --- Getters and Setters ---
+
+     def get_name(self) -> str:
+         return self.name
+
+     def set_name(self, name: str) -> None:
+         self.name = name
+
+     def get_description(self) -> str:
+         return self.description
+
+     def set_description(self, description: str) -> None:
+         self.description = description
+
+     def get_jsonschema_extra(self) -> Dict[str, Any]:
+         return dict(self.jsonschema_extra)
+
+     def set_jsonschema_extra(self, extra: Dict[str, Any]) -> None:
+         self.jsonschema_extra = extra
+
+     def get_extra_param(self, key: str) -> Any:
+         return self.extra_params.get(key)
+
+     def set_extra_param(self, key: str, value: Any) -> None:
+         self.extra_params[key] = value
+
+
+ class NumericField(BaseField):
+     """
+     Numeric field (integer or number) with optional thresholds.
+     The `extra_params` dict may include:
+       - threshold_low: minimal acceptable value (for validation)
+       - threshold_high: maximal acceptable value
+     """
+
+     threshold_low: Optional[float] = PydanticField(
+         None,
+         description="Lower bound for correctness checks (not in JSONSchema).",
+     )
+     threshold_high: Optional[float] = PydanticField(
+         None,
+         description="Upper bound for correctness checks (not in JSONSchema).",
+     )
+
+     __abstract__ = False
+
+     @model_validator(mode="before")
+     def extract_thresholds(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+         extra = values.get("jsonschema_extra", {})
+         if "threshold_low" in extra:
+             values["threshold_low"] = extra["threshold_low"]
+         if "threshold_high" in extra:
+             values["threshold_high"] = extra["threshold_high"]
+         return values
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         return schema.get("type") in ("integer", "number")
+
+     @classmethod
+     def from_jsonschema(cls, name: str, schema: Dict[str, Any]) -> NumericField:
+         """
+         Create a NumericField from a JSONSchema property.
+         """
+         return NumericField(
+             name=name,
+             json_type=schema.get("type", "number"),
+             description=schema.get("description", ""),
+             jsonschema_extra={
+                 k: v
+                 for k, v in schema.items()
+                 if k not in ("type", "description")
+             },
+             extra_params={},
+         )
+
+     def to_jsonschema(self) -> Dict[str, Any]:
+         return super().to_jsonschema()
+
+     def is_within_threshold(self, value: float) -> bool:
+         if self.threshold_low is not None and value < self.threshold_low:
+             return False
+         if self.threshold_high is not None and value > self.threshold_high:
+             return False
+         return True
+
+
+ class EnumField(BaseField):
+     """
+     Field whose value must be one of a fixed set of options.
+     Expects `jsonschema_extra["enum"]` to be a list of allowed values.
+     """
+
+     __abstract__ = False
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         return "enum" in schema
+
+
+ class ExplanationField(BaseField):
+     """
+     Free-form explanation of the metric's reasoning.
+     """
+
+     __abstract__ = False
+
+     def __init__(self, **data: Any):
+         data.setdefault(
+             "description",
+             "A detailed, step-by-step explanation of the reasoning behind the metric's value.",
+         )
+         super().__init__(**data)
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         return name.lower() == "explanation" and schema.get("type") == "string"
+
+
+ class EvidenceField(BaseField):
+     """
+     The specific quote or reference that supports the metric's evaluation.
+     """
+
+     __abstract__ = False
+
+     def __init__(self, **data: Any):
+         data.setdefault(
+             "description",
+             "The exact quote or reference from the input or context that justifies the metric's value.",
+         )
+         super().__init__(**data)
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         return name.lower() == "evidence" and schema.get("type") == "string"
+
+
+ class CorrectionField(BaseField):
+     """
+     A structured suggestion (as JSON) for correcting or improving the output.
+     """
+
+     __abstract__ = False
+
+     def __init__(self, **data: Any):
+         data.setdefault(
+             "description",
+             "A JSON-formatted suggestion for how to correct or improve the output if needed.",
+         )
+         super().__init__(**data)
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         return name.lower() == "correction" and schema.get("type") == "object"
+
+
+ class GenericField(BaseField):
+     """
+     Fallback field type for any property not handled by other classes.
+     """
+
+     __abstract__ = False
+
+     def __init__(self, **data: Any):
+         data.setdefault(
+             "description",
+             f"A generic field named '{data.get('name')}' of type {data.get('json_type')}.",
+         )
+         super().__init__(**data)
+
+     @classmethod
+     def can_handle(cls, name: str, schema: Dict[str, Any]) -> bool:
+         return True
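
For orientation when reviewing this new module (wxo_agentic_evaluation/referenceless_eval/metrics/field.py), here is a minimal usage sketch. It assumes the 1.1.8b0 wheel is installed; the "output" field name and the schema snippet are illustrative, not taken from the package.

from wxo_agentic_evaluation.referenceless_eval.metrics.field import NumericField

# Build a numeric field from a JSONSchema property snippet.
schema = {
    "type": "number",
    "description": "Primary score between 0 and 1.",
    "minimum": 0.0,
    "maximum": 1.0,
}
score = NumericField.from_jsonschema("output", schema)

# Round-trips back to JSONSchema; minimum/maximum ride along in jsonschema_extra.
print(score.to_jsonschema())
# {'type': 'number', 'description': 'Primary score between 0 and 1.',
#  'minimum': 0.0, 'maximum': 1.0}

# Thresholds are kept out of the JSONSchema; an unset threshold means "no bound",
# so set both explicitly to use is_within_threshold as a range check.
score.threshold_low, score.threshold_high = 0.0, 1.0
print(score.is_within_threshold(0.4))  # True
print(score.is_within_threshold(1.5))  # False
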
wxo_agentic_evaluation/referenceless_eval/metrics/metric.py (new file)
@@ -0,0 +1,344 @@
+ from __future__ import annotations
+
+ import json
+ from typing import Any, Dict, List, Optional, Set, Tuple, Type, TypeVar
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics.field import (
+     BaseField,
+     CorrectionField,
+     EvidenceField,
+     ExplanationField,
+     NumericField,
+ )
+
+ TMetric = TypeVar("TMetric", bound="Metric")
+
+
+ class Metric:
+     """
+     Abstract representation of an evaluation metric composed of multiple fields.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         description: str,
+         fields: Optional[List[BaseField]] = None,
+         required: Optional[List[str]] = None,
+         additional_properties: bool = True,
+     ) -> None:
+         """
+         Args:
+             name: Unique metric identifier.
+             description: Full description of what this metric measures.
+             fields: List of BaseField instances composing this metric.
+             required: List of field names that must appear in results.
+                 Defaults to all provided fields.
+         """
+         self.name = name
+         self.description = description
+         self.fields: List[BaseField] = fields or []
+         self.additional_properties = additional_properties
+         # Determine required fields
+         if required is not None:
+             self.required_fields: Set[str] = set(required)
+         else:
+             self.required_fields: Set[str] = {f.name for f in self.fields}
+
+         # Validate required_fields
+         known = {f.name for f in self.fields}
+         missing = self.required_fields - known
+         if missing:
+             raise ValueError(
+                 f"Required fields {missing} not among metric fields {known}"
+             )
+
+     def to_jsonschema(self) -> Dict[str, Any]:
+         """
+         Build a JSONSchema representation of this metric.
+
+         Returns:
+             A dict with keys:
+               - title: self.name
+               - description: self.description
+               - type: "object"
+               - properties: mapping field.name → field.to_jsonschema()
+               - required: list of required field names
+         """
+         props: Dict[str, Any] = {f.name: f.to_jsonschema() for f in self.fields}
+         return {
+             "title": self.name,
+             "description": self.description,
+             "type": "object",
+             "properties": props,
+             "required": sorted(self.required_fields),
+             "additionalProperties": self.additional_properties,
+         }
+
+     def add_field(self, field: BaseField, required: bool = True) -> None:
+         """
+         Add a new field to this metric.
+
+         Args:
+             field: BaseField instance.
+             required: Whether this field must appear in results.
+         """
+         if any(f.name == field.name for f in self.fields):
+             raise ValueError(f"Field '{field.name}' already defined")
+         self.fields.append(field)
+         if required:
+             self.required_fields.add(field.name)
+
+     def remove_field(self, name: str) -> None:
+         """
+         Remove a field by name.
+
+         Args:
+             name: Name of field to remove.
+         """
+         self.fields = [f for f in self.fields if f.name != name]
+         self.required_fields.discard(name)
+
+     @classmethod
+     def from_jsonschema(cls: Type[TMetric], schema: Dict[str, Any]) -> Metric:
+         """
+         Reconstruct a Metric from a JSONSchema dict.
+
+         Args:
+             schema: dict with 'title', 'description', 'properties', 'required'.
+
+         Returns:
+             Metric instance with fields populated.
+         """
+         name: str = schema.get("title", "")
+         description: str = schema.get("description", "")
+         props: Dict[str, Any] = schema.get("properties", {})
+         required: List[str] = schema.get("required", [])
+         additional_props: bool = schema.get("additionalProperties", True)
+         fields: List[BaseField] = []
+         for fname, fschema in props.items():
+             # If type is number or integer, use NumericField
+             if fschema.get("type") in ("number", "integer"):
+                 field = NumericField.from_jsonschema(fname, fschema)
+             else:
+                 field = BaseField.from_jsonschema(fname, fschema)
+             fields.append(field)
+         return cls(
+             name=name,
+             description=description,
+             fields=fields,
+             required=required,
+             additional_properties=additional_props,
+         )
+
+     def is_important(
+         self, result: Dict[str, Any]
+     ) -> Tuple[bool, Optional[str]]:
+         """
+         A result is 'important' if its confidence lies within the defined confidence thresholds.
+
+         Args:
+             result: Parsed metric result with at least 'confidence'.
+
+         Returns:
+             (important: bool, reason: Optional[str])
+         """
+         try:
+             conf = float(result.get("confidence", 0.0))
+         except (TypeError, ValueError):
+             return False, "Invalid confidence value"
+         # locate the confidence field
+         conf_field = next(
+             (f for f in self.fields if f.name == "confidence"), None
+         )
+         if isinstance(conf_field, NumericField):
+             ok = conf_field.is_within_threshold(conf)
+             reason = (
+                 None
+                 if ok
+                 else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
+             )
+             return ok, reason
+         return False, "Confidence field not defined"
+
+     def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+         """
+         A result is 'correct' if it is important AND its output lies within thresholds.
+
+         Args:
+             result: Parsed metric result with 'output' and 'confidence'.
+
+         Returns:
+             (correct: bool, reason: Optional[str])
+         """
+         important, imp_reason = self.is_important(result)
+         if not important:
+             return True, f"Not important: {imp_reason}"
+         # check output
+         try:
+             val = float(result.get("output", 0.0))
+         except (TypeError, ValueError):
+             return False, "Invalid output value"
+         out_field = next((f for f in self.fields if f.name == "output"), None)
+         if isinstance(out_field, NumericField):
+             ok = out_field.is_within_threshold(val)
+             reason = (
+                 None
+                 if ok
+                 else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
+             )
+             return ok, reason
+         return False, "Output field not defined"
+
+     def parse_response(self, response: str) -> Dict[str, Any]:
+         """
+         Parse a raw response string into a structured dict.
+
+         Args:
+             response: Raw response string.
+
+         Returns:
+             Parsed response as a dict.
+         """
+         # Default implementation: assume JSON string
+         try:
+             return json.loads(response)
+         except json.JSONDecodeError as e:
+             raise ValueError(f"Failed to parse response: {e}") from e
+
+
+ class StandardMetric(Metric):
+     """
+     A standard metric with common fields:
+       - explanation: string, detailed reasoning.
+       - evidence: string, supporting quote or reference.
+       - output: numeric value within specified range.
+       - confidence: numeric confidence within specified range.
+       - correction: object, structured suggestion for improvement.
+     Also provides convenience methods `is_important` and `is_correct`.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         description: str,
+         *,
+         output_range: Tuple[float, float] = (0.0, 1.0),
+         confidence_range: Tuple[float, float] = (0.0, 1.0),
+     ) -> None:
+         """
+         Args:
+             name: Metric identifier.
+             description: Explanation of what the metric measures.
+             output_range: (min, max) allowed for the 'output' field.
+             confidence_range: (min, max) for the 'confidence' field.
+
+         Fields created:
+           - explanation: "A detailed, step-by-step explanation of the reasoning."
+           - evidence: "The exact quote or evidence supporting the reasoning."
+           - output: numeric in output_range
+           - confidence: numeric in confidence_range
+           - correction: structured suggestion if output below threshold
+         """
+         # Prepare fields
+         min_out, max_out = output_range
+         min_conf, max_conf = confidence_range
+
+         explanation = ExplanationField(
+             name="explanation",
+             json_type="string",
+             description="A detailed, step-by-step explanation of the reasoning behind the output value.",
+         )
+         evidence = EvidenceField(
+             name="evidence",
+             json_type="string",
+             description="The exact quote or reference that supports the output value.",
+         )
+         output = NumericField(
+             name="output",
+             json_type=(
+                 "number"
+                 if isinstance(min_out, float) or isinstance(max_out, float)
+                 else "integer"
+             ),
+             description=f"Primary numeric score for this metric (range {min_out} to {max_out}).",
+             jsonschema_extra={"minimum": min_out, "maximum": max_out},
+             extra_params={"threshold_low": min_out, "threshold_high": max_out},
+         )
+         confidence = NumericField(
+             name="confidence",
+             json_type="number",
+             description=f"Confidence in the output value (range {min_conf} to {max_conf}).",
+             jsonschema_extra={"minimum": min_conf, "maximum": max_conf},
+             extra_params={
+                 "threshold_low": min_conf,
+                 "threshold_high": max_conf,
+             },
+         )
+         correction = CorrectionField(
+             name="correction",
+             json_type="object",
+             description="Structured suggestion for how to correct or improve the output if needed.",
+         )
+
+         fields = [explanation, evidence, output, confidence, correction]
+         super().__init__(name=name, description=description, fields=fields)
+
+     def is_important(
+         self, result: Dict[str, Any]
+     ) -> Tuple[bool, Optional[str]]:
+         """
+         A result is 'important' if its confidence lies within the defined confidence thresholds.
+
+         Args:
+             result: Parsed metric result with at least 'confidence'.
+
+         Returns:
+             (important: bool, reason: Optional[str])
+         """
+         try:
+             conf = float(result.get("confidence", 0.0))
+         except (TypeError, ValueError):
+             return False, "Invalid confidence value"
+         # locate the confidence field
+         conf_field = next(
+             (f for f in self.fields if f.name == "confidence"), None
+         )
+         if isinstance(conf_field, NumericField):
+             ok = conf_field.is_within_threshold(conf)
+             reason = (
+                 None
+                 if ok
+                 else f"Confidence {conf} outside [{conf_field.threshold_low},{conf_field.threshold_high}]"
+             )
+             return ok, reason
+         return False, "Confidence field not defined"
+
+     def is_correct(self, result: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
+         """
+         A result is 'correct' if it is important AND its output lies within thresholds.
+
+         Args:
+             result: Parsed metric result with 'output' and 'confidence'.
+
+         Returns:
+             (correct: bool, reason: Optional[str])
+         """
+         important, imp_reason = self.is_important(result)
+         if not important:
+             return True, f"Not important: {imp_reason}"
+         # check output
+         try:
+             val = float(result.get("output", 0.0))
+         except (TypeError, ValueError):
+             return False, "Invalid output value"
+         out_field = next((f for f in self.fields if f.name == "output"), None)
+         if isinstance(out_field, NumericField):
+             ok = out_field.is_within_threshold(val)
+             reason = (
+                 None
+                 if ok
+                 else f"Output {val} outside [{out_field.threshold_low},{out_field.threshold_high}]"
+             )
+             return ok, reason
+         return False, "Output field not defined"
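
A companion sketch for this file (wxo_agentic_evaluation/referenceless_eval/metrics/metric.py): building a StandardMetric and running its checks end to end. The metric name, description, and raw judge response below are illustrative, not taken from the package.

import json

from wxo_agentic_evaluation.referenceless_eval.metrics.metric import StandardMetric

metric = StandardMetric(
    name="groundedness",
    description="Whether the answer is supported by the retrieved context.",
    output_range=(0.0, 1.0),
    confidence_range=(0.0, 1.0),
)

# The generated JSONSchema exposes the five standard fields.
print(sorted(metric.to_jsonschema()["properties"]))
# ['confidence', 'correction', 'evidence', 'explanation', 'output']

# parse_response expects a JSON string; is_important / is_correct then apply
# the NumericField threshold checks to 'confidence' and 'output'.
raw = json.dumps({
    "explanation": "The answer quotes the retrieved context verbatim.",
    "evidence": "see paragraph 2 of the context",
    "output": 0.9,
    "confidence": 0.8,
    "correction": {},
})
result = metric.parse_response(raw)
print(metric.is_important(result))  # (True, None)
print(metric.is_correct(result))    # (True, None)
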