ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py
@@ -0,0 +1,562 @@
+ from __future__ import annotations
+
+ import json
+ from types import NoneType
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from pydantic import BaseModel, Field, ValidationError, model_validator
+ from typing_extensions import Self
+
+ from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
+
+ # ----------------------------------------------------------------------
+ # 1. Function-Call Metric Models
+ # ----------------------------------------------------------------------
+
+
+ class FunctionCallMetric(BaseModel):
+     """
+     Function-call metric: a single metric name, schema, and examples.
+     """
+
+     name: str = Field(
+         ..., description="Name of the metric (e.g. 'function_selection')."
+     )
+     jsonschema: Dict[str, Any] = Field(
+         ..., description="JSON Schema dict for this metric's output."
+     )
+     examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = (
+         Field(
+             None,
+             description=(
+                 "List of example inputs and outputs for this metric; "
+                 "each example is a dict with 'user_kwargs' and 'output' keys."
+             ),
+         )
+     )
+
+
+ # ----------------------------------------------------------------------
+ # 2. Static-Check Models (Optional)
+ # ----------------------------------------------------------------------
+
+
+ class StaticMetricResult(BaseModel):
+     """
+     Result of a single static (schema-based) check.
+     """
+
+     description: str = Field(
+         ...,
+         description="Human-readable description of this static validation check.",
+     )
+     valid: bool = Field(
+         ..., description="True if this static check passed; False otherwise."
+     )
+     explanation: Optional[str] = Field(
+         None,
+         description=(
+             "If valid==False, a detailed explanation of why the check failed; "
+             "otherwise None."
+         ),
+     )
+
+
+ class StaticResult(BaseModel):
+     """
+     Aggregated results of static (schema-based) checks for one function call.
+     """
+
+     metrics: Dict[str, StaticMetricResult] = Field(
+         ...,
+         description=(
+             "Mapping from each static-check name to its StaticMetricResult."
+         ),
+     )
+     final_decision: bool = Field(
+         ...,
+         description=(
+             "Overall outcome: False if any metric.valid is False; True only if all pass."
+         ),
+     )
+
+
+ # ----------------------------------------------------------------------
+ # 3. Semantic Metric Result Models
+ # ----------------------------------------------------------------------
+
+
+ class SemanticMetricResult(BaseModel):
+     """
+     Wraps a single metric evaluation result returned by MetricRunner.
+     """
+
+     metric_name: str = Field(
+         ..., description="Identifier (name) of the evaluated metric."
+     )
+     jsonschema: Dict[str, Any] = Field(
+         ..., description="JSON Schema dict that was used to validate output."
+     )
+     prompt: Union[str, List[Dict[str, str]]] = Field(
+         ...,
+         description=(
+             "The actual prompt sent to the LLM—either a plain string "
+             "or a list of {'role','content'} messages."
+         ),
+     )
+     raw_response: Any = Field(
+         ..., description="Raw response returned by the LLM client."
+     )
+     numeric_thresholds_checks: Dict[str, bool] = Field(
+         ...,
+         description=(
+             "For every numeric field in the metric, a boolean indicating "
+             "whether the parsed value fell within its [low, high] thresholds."
+         ),
+     )
+     is_important: bool = Field(
+         ...,
+         description=(
+             "True if the metric's confidence field met its importance threshold; "
+             "False otherwise."
+         ),
+     )
+     importance_reason: Optional[str] = Field(
+         None,
+         description=(
+             "If is_important==False, a textual reason (e.g. 'confidence too low'); "
+             "otherwise None."
+         ),
+     )
+     error: Optional[str] = Field(
+         None,
+         description=(
+             "Error message if prompt generation or parsing failed; "
+             "otherwise None."
+         ),
+     )
+     is_correct: bool = Field(
+         ...,
+         description=(
+             "True if both importance and the metric's primary value field "
+             "fell within thresholds; False otherwise."
+         ),
+     )
+     correctness_reason: Optional[str] = Field(
+         None,
+         description=(
+             "If is_correct==False, a textual reason why the value or confidence "
+             "fell outside thresholds; otherwise None."
+         ),
+     )
+     is_issue: bool = Field(
+         ...,
+         description=(
+             "True if is_correct==False and is_important==True; False otherwise."
+         ),
+     )
+
+     @model_validator(mode="after")
+     def raw_response_json(self) -> Self:
+         if isinstance(self.raw_response, str):
+             self.raw_response = json.loads(self.raw_response)
+
+         return self
+
+     @classmethod
+     def from_runner(cls, rr: MetricRunResult) -> "SemanticMetricResult":
+         """
+         Construct from an internal MetricRunResult instance.
+         """
+         # first construct the object from what MetricRunner gave us
+         data = rr.model_dump()
+         inst: SemanticMetricResult = cls(**data)
+
+         return inst
+
+     @property
+     def output_value(self) -> Optional[float]:
+         """
+         Convenience accessor for the metric's primary 'output' numeric field,
+         if present and parsed successfully.
+         """
+         if self.raw_response and isinstance(
+             self.raw_response.get("output"), (int, float)
+         ):
+             return float(self.raw_response["output"])
+         return None
+
+     @property
+     def normalized_output(self) -> Optional[float]:
+         """
+         Linearly scale 'output' into [0,1] according to its schema min/max.
+         """
+         out = self.output_value
+         subs = self.jsonschema.get("properties", {}).get("output", {})
+         low = subs.get("minimum", 0.0)
+         high = subs.get("maximum", 1.0)
+         if out is None or high == low:
+             return None
+         return (out - low) / (high - low)
+
+
+ class SemanticCategoryResult(BaseModel):
+     """
+     Collection of SemanticMetricResults for a single category:
+       - general
+       - function_selection
+       - parameter
+     """
+
+     metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
+         None,
+         description=(
+             "Mapping metric_name -> SemanticMetricResult for this category."
+         ),
+     )
+     avg_score: Optional[float] = Field(
+         None,
+         description=(
+             "Average of the 'output' values across all metrics whose "
+             "confidence was within thresholds (is_important==True)."
+         ),
+     )
+
+     @classmethod
+     def from_results(
+         cls, results: List[MetricRunResult]
+     ) -> "SemanticCategoryResult":
+         """
+         Build a category result from a list of MetricRunResult objects.
+         """
+         # 1) build per-metric results
+         mapping: Dict[str, SemanticMetricResult] = {
+             r.metric_name: SemanticMetricResult.from_runner(r) for r in results
+         }
+
+         # 2) compute normalized‐output average over 'important' metrics only
+         norms: List[float] = []
+         for m in mapping.values():
+             norm = m.normalized_output
+             if norm is not None and m.is_important:
+                 norms.append(norm)
+
+         avg = (sum(norms) / len(norms)) if norms else None
+         return cls(metrics=mapping, avg_score=avg)
+
+
+ class SemanticResult(BaseModel):
+     """
+     Aggregated semantic metrics across all categories for one function call.
+     """
+
+     general: Optional[SemanticCategoryResult] = Field(
+         None,
+         description=(
+             "Results of general tool-call metrics, if any; otherwise None."
+         ),
+     )
+     function_selection: Optional[SemanticCategoryResult] = Field(
+         None,
+         description=(
+             "Results of function-selection metrics, if any; otherwise None."
+         ),
+     )
+     parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
+         None,
+         description=(
+             "Parameter-level results, keyed by parameter name, each with its metrics."
+         ),
+     )
+     transform: Optional[Dict[str, TransformResult]] = Field(
+         None,
+         description=(
+             "Optional per-parameter transformation results: "
+             "mapping parameter_name -> TransformResult."
+         ),
+     )
+
+
+ # ----------------------------------------------------------------------
+ # 4. Transformation Result Model
+ # ----------------------------------------------------------------------
+
+
+ class TransformResult(BaseModel):
+     """
+     Result of unit-extraction and code-based transformation checks for one parameter.
+     """
+
+     units: Dict[str, Any] = Field(
+         ...,
+         description=(
+             "Extracted unit info: keys 'user_units', 'user_value', and 'spec_units'."
+         ),
+     )
+     generated_code: str = Field(
+         ...,
+         description="The Python code snippet returned by the LLM for unit conversion.",
+     )
+     execution_success: bool = Field(
+         ...,
+         description="True if generated_code executed without error and matched values.",
+     )
+     correct: bool = Field(
+         ...,
+         description=(
+             "False if execution_success is True but the transformation "
+             "was incorrect; True if the transformation was correct or was not executed."
+         ),
+     )
+     execution_output: Any = Field(
+         None,
+         description="The actual output of executing the transformation code.",
+     )
+     correction: Optional[str] = Field(
+         None,
+         description="Correction explanation if execution succeeded but the transformation was incorrect.",
+     )
+     error: Optional[str] = Field(
+         None,
+         description=(
+             "Error message if code generation or execution failed; "
+             "otherwise None."
+         ),
+     )
+
+
+ # ----------------------------------------------------------------------
+ # 5. Pipeline I/O Models
+ # ----------------------------------------------------------------------
+
+
+ class FunctionCallInput(BaseModel):
+     """
+     Input bundle for the function-calling pipeline.
+     """
+
+     conversation_context: Union[str, List[Dict]] = Field(
+         ...,
+         description=(
+             "Either a single user text string or a list of chat messages "
+             "with {'role','content'}."
+         ),
+     )
+     tools_inventory: List[ToolSpec] = Field(
+         ...,
+         description=(
+             "List of available tools; each entry must at least include "
+             "'name' and argument schema."
+         ),
+     )
+     tool_call: ToolCall = Field(
+         ...,
+         description=(
+             "Proposed function call dict: {\n"
+             " 'name': '<function_name>',\n"
+             " 'args': {<param>:<value>, ...}\n"
+             "}."
+         ),
+     )
+
+
+ class PipelineResult(BaseModel):
+     """
+     Final output of the function-calling pipeline for one tool call.
+     """
+
+     inputs: FunctionCallInput = Field(
+         ..., description="Echo of the pipeline inputs."
+     )
+     static: Optional[StaticResult] = Field(
+         None, description="Static schema-validation results, if enabled."
+     )
+     semantic: SemanticResult = Field(
+         ..., description="All semantic metric results by category."
+     )
+     overall_valid: bool = Field(
+         ...,
+         description=(
+             "True if all semantic metrics passed (is_correct==True) "
+             "and, if present, all transformations succeeded."
+         ),
+     )
+     overall_avg_score: Optional[float] = Field(
+         None,
+         description=(
+             "Average of the three category avg_scores "
+             "(general, function_selection, parameter) where available."
+         ),
+     )
+
+     @model_validator(mode="after")
+     def compute_overall(self) -> Self:
+         """
+         After validation, compute overall_valid as AND of:
+           • all semantic is_correct flags
+           • if transform exists: all execution_success flags
+         """
+         static: StaticResult = self.static
+         if static:
+             # static checks
+             ok = static.final_decision
+
+         sem: SemanticResult = self.semantic
+         if sem:
+             # semantic checks
+             if sem.general and sem.general.metrics:
+                 for m in sem.general.metrics.values():
+                     if not m.is_correct:
+                         ok = False
+             if sem.function_selection and sem.function_selection.metrics:
+                 for m in sem.function_selection.metrics.values():
+                     if not m.is_correct:
+                         ok = False
+             if sem.parameter:
+                 for cat in sem.parameter.values():
+                     if cat and cat.metrics:
+                         for m in cat.metrics.values():
+                             if not m.is_correct:
+                                 ok = False
+
+             # transformation checks (if any)
+             trans: Optional[Dict[str, TransformResult]] = sem.transform
+             if trans:
+                 for tr in trans.values():
+                     if not tr.correct:
+                         ok = False
+
+             # compute overall_avg_score from category averages
+             cat_avgs: List[float] = []
+             for cat in (sem.general, sem.function_selection):
+                 if cat and cat.avg_score is not None:
+                     cat_avgs.append(cat.avg_score)
+             # for parameters, average the per‐param avg_scores
+             if sem.parameter:
+                 param_avgs = [
+                     cat.avg_score
+                     for cat in sem.parameter.values()
+                     if cat.avg_score is not None
+                 ]
+                 if param_avgs:
+                     cat_avgs.append(sum(param_avgs) / len(param_avgs))
+
+             self.overall_avg_score = (
+                 sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+             )
+         self.overall_valid = ok
+         return self
+
+
+ # ----------------------------------------------------------------------
+ # 6. API Specification & Call Models
+ # ----------------------------------------------------------------------
+
+
+ # Map primitive spec-types to Python types (optional helper)
+ SPEC_TYPES: Dict[str, Any] = {
+     "any": str,
+     "array": list,
+     "bigint": int,
+     "boolean": bool,
+     "byte": int,
+     "char": str,
+     "dict": dict,
+     "double": float,
+     "float": float,
+     "hashtable": dict,
+     "hashmap": dict,
+     "integer": int,
+     "int": int,
+     "list": list,
+     "long": int,
+     "number": float,
+     "null": NoneType,
+     "object": dict,
+     "string": str,
+     "tuple": tuple,
+     "uint": int,
+     "ulong": int,
+     "unsigned": int,
+     "void": NoneType,
+ }
+
+
+ class FunctionDefinition(BaseModel):
+     """
+     Wraps an OpenAI-style function definition for function-calling clients.
+     """
+
+     name: str = Field(..., description="Function name as expected by the LLM.")
+     description: Optional[str] = Field(
+         None, description="Human-readable description of the function."
+     )
+     parameters: Dict[str, Any] = Field(
+         ...,
+         description=(
+             "JSON-Schema object describing all parameters; either a dict "
+             "or a FunctionParameter model."
+         ),
+     )
+
+
+ class ToolSpec(BaseModel):
+     """
+     OpenAI tool specification wrapper, matching function-calling API.
+     """
+
+     type: Literal["function"] = Field(
+         "function",
+         description="Must be 'function' for OpenAI function-calling.",
+     )
+     function: FunctionDefinition = Field(
+         ..., description="Underlying function definition or raw dict."
+     )
+
+
+ class ToolFunctionCall(BaseModel):
+     """
+     Parsed representation of an LLM's function call response.
+     """
+
+     name: str = Field(
+         ..., description="Name of the function the LLM chose to call."
+     )
+     arguments: str = Field(
+         ..., description="JSON-encoded string of the call's arguments."
+     )
+     parsed_arguments: Dict[str, Any] = Field(
+         default_factory=dict,
+         description="Parsed JSON arguments, available after validation.",
+     )
+
+     @model_validator(mode="after")
+     def _parse_arguments(self) -> Self:
+         """
+         After model construction, parse the `arguments` JSON string
+         into `parsed_arguments`, or raise a ValidationError.
+         """
+         try:
+             raw = self.arguments
+             self.parsed_arguments = json.loads(raw)
+         except json.JSONDecodeError as e:
+             raise ValidationError(f"Invalid JSON in arguments: {e}") from e
+         return self
+
+
+ class ToolCall(BaseModel):
+     """
+     Full OpenAI function call object (for v1 function-calling API).
+     """
+
+     id: Optional[str] = Field(
+         None,
+         description=("Optional unique identifier for this function call."),
+     )
+     type: Literal["function"] = Field(
+         "function",
+         description="Must be 'function' for OpenAI function calls.",
+     )
+     function: ToolFunctionCall = Field(
+         ..., description="Nested function name+arguments object or raw dict."
+     )
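
A minimal usage sketch of how these models compose, not taken from the package or its documentation: it assumes the 1.1.8b0 wheel is installed, and the tool name, argument values, and metric scores are illustrative placeholders. Only the import path and the model and field names come from the hunk above. ToolFunctionCall parses its JSON `arguments` string, SemanticMetricResult parses a string `raw_response`, and PipelineResult's compute_overall validator recomputes overall_valid and overall_avg_score after construction.

    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
        FunctionCallInput,
        FunctionDefinition,
        PipelineResult,
        SemanticCategoryResult,
        SemanticMetricResult,
        SemanticResult,
        StaticMetricResult,
        StaticResult,
        ToolCall,
        ToolFunctionCall,
        ToolSpec,
    )

    # A hypothetical inventory entry and the tool call proposed by the agent.
    weather_tool = ToolSpec(
        function=FunctionDefinition(
            name="get_weather",
            description="Look up the current weather for a city.",
            parameters={
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        )
    )
    call = ToolCall(
        function=ToolFunctionCall(name="get_weather", arguments='{"city": "Paris"}')
    )
    inputs = FunctionCallInput(
        conversation_context="What's the weather in Paris?",
        tools_inventory=[weather_tool],
        tool_call=call,
    )

    # One hand-written semantic metric result; the string raw_response is
    # loaded into a dict by the raw_response_json validator.
    selection_metric = SemanticMetricResult(
        metric_name="function_selection",
        jsonschema={"properties": {"output": {"minimum": 0, "maximum": 1}}},
        prompt="(rendered metric prompt)",
        raw_response='{"output": 0.9}',
        numeric_thresholds_checks={"output": True},
        is_important=True,
        is_correct=True,
        is_issue=False,
    )

    result = PipelineResult(
        inputs=inputs,
        static=StaticResult(
            metrics={
                "arguments_match_schema": StaticMetricResult(
                    description="Arguments conform to the tool's JSON schema.",
                    valid=True,
                )
            },
            final_decision=True,
        ),
        semantic=SemanticResult(
            function_selection=SemanticCategoryResult(
                metrics={"function_selection": selection_metric},
                avg_score=selection_metric.normalized_output,
            )
        ),
        overall_valid=True,  # recomputed by the compute_overall validator
    )
    print(result.overall_valid, result.overall_avg_score)  # True 0.9
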
wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py
@@ -0,0 +1,3 @@
+ from .metric import Metric, StandardMetric
+ from .metrics_runner import MetricRunner, MetricRunResult
+ from .prompt import MetricPrompt, RelevancePrompt
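
These re-exports make MetricRunResult importable from the referenceless_eval.metrics package itself, which is the import used by pipeline/types.py above. A short hedged sketch of how downstream code might bridge the two; the summarize_category helper is hypothetical, and only the import paths and SemanticCategoryResult.from_results come from this diff:

    from typing import List, Optional

    from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
        SemanticCategoryResult,
    )


    def summarize_category(results: List[MetricRunResult]) -> Optional[float]:
        # Hypothetical helper: fold raw runner results into the per-category
        # aggregate defined in pipeline/types.py; avg_score averages the
        # normalized outputs of metrics whose is_important flag is True.
        category = SemanticCategoryResult.from_results(results)
        return category.avg_score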