ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (60):
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +58 -17
  8. wxo_agentic_evaluation/inference_backend.py +32 -17
  9. wxo_agentic_evaluation/llm_user.py +2 -1
  10. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  11. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  12. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  13. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  14. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  15. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  16. wxo_agentic_evaluation/quick_eval.py +342 -0
  17. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  18. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  19. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  20. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  21. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  22. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  40. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  41. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  42. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  46. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  47. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  48. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  49. wxo_agentic_evaluation/service_instance.py +2 -2
  50. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  51. wxo_agentic_evaluation/tool_planner.py +3 -1
  52. wxo_agentic_evaluation/type.py +33 -2
  53. wxo_agentic_evaluation/utils/__init__.py +0 -1
  54. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  55. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  56. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  57. wxo_agentic_evaluation/utils/utils.py +167 -5
  58. ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
  59. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py
@@ -0,0 +1,547 @@
+from __future__ import annotations
+
+import json
+from types import NoneType
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Union,
+)
+from typing_extensions import Self
+
+from pydantic import BaseModel, Field, ValidationError, model_validator
+
+from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
+
+# ----------------------------------------------------------------------
+# 1. Function-Call Metric Models
+# ----------------------------------------------------------------------
+
+
+class FunctionCallMetric(BaseModel):
+    """
+    Function-call metric: a single metric name, schema, and examples.
+    """
+
+    name: str = Field(
+        ..., description="Name of the metric (e.g. 'function_selection')."
+    )
+    jsonschema: Dict[str, Any] = Field(
+        ..., description="JSON Schema dict for this metric's output."
+    )
+    examples: Optional[List[Dict[Literal["user_kwargs", "output"], Any]]] = Field(
+        None,
+        description=(
+            "List of example inputs and outputs for this metric; "
+            "each example is a dict with 'user_kwargs' and 'output' keys."
+        ),
+    )
+
+
+# ----------------------------------------------------------------------
+# 2. Static-Check Models (Optional)
+# ----------------------------------------------------------------------
+
+
+class StaticMetricResult(BaseModel):
+    """
+    Result of a single static (schema-based) check.
+    """
+
+    description: str = Field(
+        ..., description="Human-readable description of this static validation check."
+    )
+    valid: bool = Field(
+        ..., description="True if this static check passed; False otherwise."
+    )
+    explanation: Optional[str] = Field(
+        None,
+        description=(
+            "If valid==False, a detailed explanation of why the check failed; "
+            "otherwise None."
+        ),
+    )
+
+
+class StaticResult(BaseModel):
+    """
+    Aggregated results of static (schema-based) checks for one function call.
+    """
+
+    metrics: Dict[str, StaticMetricResult] = Field(
+        ...,
+        description=("Mapping from each static-check name to its StaticMetricResult."),
+    )
+    final_decision: bool = Field(
+        ...,
+        description=(
+            "Overall outcome: False if any metric.valid is False; True only if all pass."
+        ),
+    )
+
+
+# ----------------------------------------------------------------------
+# 3. Semantic Metric Result Models
+# ----------------------------------------------------------------------
+
+
+class SemanticMetricResult(BaseModel):
+    """
+    Wraps a single metric evaluation result returned by MetricRunner.
+    """
+
+    metric_name: str = Field(
+        ..., description="Identifier (name) of the evaluated metric."
+    )
+    jsonschema: Dict[str, Any] = Field(
+        ..., description="JSON Schema dict that was used to validate output."
+    )
+    prompt: Union[str, List[Dict[str, str]]] = Field(
+        ...,
+        description=(
+            "The actual prompt sent to the LLM—either a plain string "
+            "or a list of {'role','content'} messages."
+        ),
+    )
+    raw_response: Any = Field(
+        ..., description="Raw response returned by the LLM client."
+    )
+    numeric_thresholds_checks: Dict[str, bool] = Field(
+        ...,
+        description=(
+            "For every numeric field in the metric, a boolean indicating "
+            "whether the parsed value fell within its [low, high] thresholds."
+        ),
+    )
+    is_important: bool = Field(
+        ...,
+        description=(
+            "True if the metric's confidence field met its importance threshold; "
+            "False otherwise."
+        ),
+    )
+    importance_reason: Optional[str] = Field(
+        None,
+        description=(
+            "If is_important==False, a textual reason (e.g. 'confidence too low'); "
+            "otherwise None."
+        ),
+    )
+    error: Optional[str] = Field(
+        None,
+        description=(
+            "Error message if prompt generation or parsing failed; " "otherwise None."
+        ),
+    )
+    is_correct: bool = Field(
+        ...,
+        description=(
+            "True if both importance and the metric's primary value field "
+            "fell within thresholds; False otherwise."
+        ),
+    )
+    correctness_reason: Optional[str] = Field(
+        None,
+        description=(
+            "If is_correct==False, a textual reason why the value or confidence "
+            "fell outside thresholds; otherwise None."
+        ),
+    )
+    is_issue: bool = Field(
+        ...,
+        description=(
+            "True if is_correct==False and is_important==True; False otherwise."
+        ),
+    )
+
+    @model_validator(mode='after')
+    def raw_response_json(self) -> Self:
+        if isinstance(self.raw_response, str):
+            self.raw_response = json.loads(self.raw_response)
+
+        return self
+
+    @classmethod
+    def from_runner(cls, rr: MetricRunResult) -> "SemanticMetricResult":
+        """
+        Construct from an internal MetricRunResult instance.
+        """
+        # first construct the object from what MetricRunner gave us
+        data = rr.model_dump()
+        inst: SemanticMetricResult = cls(**data)
+
+        return inst
+
+    @property
+    def output_value(self) -> Optional[float]:
+        """
+        Convenience accessor for the metric's primary 'output' numeric field,
+        if present and parsed successfully.
+        """
+        if self.raw_response and isinstance(
+            self.raw_response.get("output"), (int, float)
+        ):
+            return float(self.raw_response["output"])
+        return None
+
+    @property
+    def normalized_output(self) -> Optional[float]:
+        """
+        Linearly scale 'output' into [0,1] according to its schema min/max.
+        """
+        out = self.output_value
+        subs = self.jsonschema.get("properties", {}).get("output", {})
+        low = subs.get("minimum", 0.0)
+        high = subs.get("maximum", 1.0)
+        if out is None or high == low:
+            return None
+        return (out - low) / (high - low)
+
+
+class SemanticCategoryResult(BaseModel):
+    """
+    Collection of SemanticMetricResults for a single category:
+      - general
+      - function_selection
+      - parameter
+    """
+
+    metrics: Optional[Dict[str, SemanticMetricResult]] = Field(
+        None,
+        description=("Mapping metric_name -> SemanticMetricResult for this category."),
+    )
+    avg_score: Optional[float] = Field(
+        None,
+        description=(
+            "Average of the 'output' values across all metrics whose "
+            "confidence was within thresholds (is_important==True)."
+        ),
+    )
+
+    @classmethod
+    def from_results(cls, results: List[MetricRunResult]) -> "SemanticCategoryResult":
+        """
+        Build a category result from a list of MetricRunResult objects.
+        """
+        # 1) build per-metric results
+        mapping: Dict[str, SemanticMetricResult] = {
+            r.metric_name: SemanticMetricResult.from_runner(r) for r in results
+        }
+
+        # 2) compute normalized-output average over 'important' metrics only
+        norms: List[float] = []
+        for m in mapping.values():
+            norm = m.normalized_output
+            if norm is not None and m.is_important:
+                norms.append(norm)
+
+        avg = (sum(norms) / len(norms)) if norms else None
+        return cls(metrics=mapping, avg_score=avg)
+
+
+class SemanticResult(BaseModel):
+    """
+    Aggregated semantic metrics across all categories for one function call.
+    """
+
+    general: Optional[SemanticCategoryResult] = Field(
+        None,
+        description=("Results of general tool-call metrics, if any; otherwise None."),
+    )
+    function_selection: Optional[SemanticCategoryResult] = Field(
+        None,
+        description=("Results of function-selection metrics, if any; otherwise None."),
+    )
+    parameter: Optional[Dict[str, SemanticCategoryResult]] = Field(
+        None,
+        description=(
+            "Parameter-level results, keyed by parameter name, each with its metrics."
+        ),
+    )
+    transform: Optional[Dict[str, TransformResult]] = Field(
+        None,
+        description=(
+            "Optional per-parameter transformation results: "
+            "mapping parameter_name -> TransformResult."
+        ),
+    )
+
+
+# ----------------------------------------------------------------------
+# 4. Transformation Result Model
+# ----------------------------------------------------------------------
+
+
+class TransformResult(BaseModel):
+    """
+    Result of unit-extraction and code-based transformation checks for one parameter.
+    """
+
+    units: Dict[str, Any] = Field(
+        ...,
+        description=(
+            "Extracted unit info: keys 'user_units', 'user_value', and 'spec_units'."
+        ),
+    )
+    generated_code: str = Field(
+        ...,
+        description="The Python code snippet returned by the LLM for unit conversion.",
+    )
+    execution_success: bool = Field(
+        ...,
+        description="True if generated_code executed without error and matched values.",
+    )
+    correct: bool = Field(
+        ...,
+        description=(
+            "False if execution_success is True but the transformation "
+            "was incorrect; True if the transformation was correct or was not executed."
+        ),
+    )
+    execution_output: Any = Field(
+        None, description="The actual output of executing the transformation code."
+    )
+    correction: Optional[str] = Field(
+        None,
+        description="Correction explanation if execution succeeded but the transformation was incorrect.",
+    )
+    error: Optional[str] = Field(
+        None,
+        description=(
+            "Error message if code generation or execution failed; " "otherwise None."
+        ),
+    )
+
+
+# ----------------------------------------------------------------------
+# 5. Pipeline I/O Models
+# ----------------------------------------------------------------------
+
+
+class FunctionCallInput(BaseModel):
+    """
+    Input bundle for the function-calling pipeline.
+    """
+
+    conversation_context: Union[str, List[Dict]] = Field(
+        ...,
+        description=(
+            "Either a single user text string or a list of chat messages "
+            "with {'role','content'}."
+        ),
+    )
+    tools_inventory: List[ToolSpec] = Field(
+        ...,
+        description=(
+            "List of available tools; each entry must at least include "
+            "'name' and argument schema."
+        ),
+    )
+    tool_call: ToolCall = Field(
+        ...,
+        description=(
+            "Proposed function call dict: {\n"
+            " 'name': '<function_name>',\n"
+            " 'args': {<param>:<value>, ...}\n"
+            "}."
+        ),
+    )
+
+
+class PipelineResult(BaseModel):
+    """
+    Final output of the function-calling pipeline for one tool call.
+    """
+
+    inputs: FunctionCallInput = Field(..., description="Echo of the pipeline inputs.")
+    static: Optional[StaticResult] = Field(
+        None, description="Static schema-validation results, if enabled."
+    )
+    semantic: SemanticResult = Field(
+        ..., description="All semantic metric results by category."
+    )
+    overall_valid: bool = Field(
+        ...,
+        description=(
+            "True if all semantic metrics passed (is_correct==True) "
+            "and, if present, all transformations succeeded."
+        ),
+    )
+    overall_avg_score: Optional[float] = Field(
+        None,
+        description=(
+            "Average of the three category avg_scores "
+            "(general, function_selection, parameter) where available."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def compute_overall(cls, values: PipelineResult) -> PipelineResult:
+        """
+        After validation, compute overall_valid as AND of:
+          • all semantic is_correct flags
+          • if transform exists: all execution_success flags
+        """
+        static: StaticResult = values.static
+        if static:
+            # static checks
+            ok = static.final_decision
+
+        sem: SemanticResult = values.semantic
+        if sem:
+            # semantic checks
+            if sem.general and sem.general.metrics:
+                for m in sem.general.metrics.values():
+                    if not m.is_correct:
+                        ok = False
+            if sem.function_selection and sem.function_selection.metrics:
+                for m in sem.function_selection.metrics.values():
+                    if not m.is_correct:
+                        ok = False
+            if sem.parameter:
+                for cat in sem.parameter.values():
+                    if cat and cat.metrics:
+                        for m in cat.metrics.values():
+                            if not m.is_correct:
+                                ok = False
+
+            # transformation checks (if any)
+            trans: Optional[Dict[str, TransformResult]] = sem.transform
+            if trans:
+                for tr in trans.values():
+                    if not tr.correct:
+                        ok = False
+
+            # compute overall_avg_score from category averages
+            cat_avgs: List[float] = []
+            for cat in (sem.general, sem.function_selection):
+                if cat and cat.avg_score is not None:
+                    cat_avgs.append(cat.avg_score)
+            # for parameters, average the per-param avg_scores
+            if sem.parameter:
+                param_avgs = [
+                    cat.avg_score
+                    for cat in sem.parameter.values()
+                    if cat.avg_score is not None
+                ]
+                if param_avgs:
+                    cat_avgs.append(sum(param_avgs) / len(param_avgs))
+
+        values.overall_avg_score = sum(cat_avgs) / len(cat_avgs) if cat_avgs else None
+        values.overall_valid = ok
+        return values
+
+
+# ----------------------------------------------------------------------
+# 6. API Specification & Call Models
+# ----------------------------------------------------------------------
+
+
+# Map primitive spec-types to Python types (optional helper)
+SPEC_TYPES: Dict[str, Any] = {
+    "any": str,
+    "array": list,
+    "bigint": int,
+    "boolean": bool,
+    "byte": int,
+    "char": str,
+    "dict": dict,
+    "double": float,
+    "float": float,
+    "hashtable": dict,
+    "hashmap": dict,
+    "integer": int,
+    "int": int,
+    "list": list,
+    "long": int,
+    "number": float,
+    "null": NoneType,
+    "object": dict,
+    "string": str,
+    "tuple": tuple,
+    "uint": int,
+    "ulong": int,
+    "unsigned": int,
+    "void": NoneType,
+}
+
+
+class FunctionDefinition(BaseModel):
+    """
+    Wraps an OpenAI-style function definition for function-calling clients.
+    """
+
+    name: str = Field(..., description="Function name as expected by the LLM.")
+    description: Optional[str] = Field(
+        None, description="Human-readable description of the function."
+    )
+    parameters: Dict[str, Any] = Field(
+        ...,
+        description=(
+            "JSON-Schema object describing all parameters; either a dict "
+            "or a FunctionParameter model."
+        ),
+    )
+
+
+class ToolSpec(BaseModel):
+    """
+    OpenAI tool specification wrapper, matching function-calling API.
+    """
+
+    type: Literal["function"] = Field(
+        "function",
+        description="Must be 'function' for OpenAI function-calling.",
+    )
+    function: FunctionDefinition = Field(
+        ..., description="Underlying function definition or raw dict."
+    )
+
+
+class ToolFunctionCall(BaseModel):
+    """
+    Parsed representation of an LLM's function call response.
+    """
+
+    name: str = Field(..., description="Name of the function the LLM chose to call.")
+    arguments: str = Field(
+        ..., description="JSON-encoded string of the call's arguments."
+    )
+    parsed_arguments: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Parsed JSON arguments, available after validation.",
+    )
+
+    @model_validator(mode="after")
+    def _parse_arguments(cls, values: ToolFunctionCall) -> ToolFunctionCall:
+        """
+        After model construction, parse the `arguments` JSON string
+        into `parsed_arguments`, or raise a ValidationError.
+        """
+        try:
+            raw = values.arguments
+            values.parsed_arguments = json.loads(raw)
+        except json.JSONDecodeError as e:
+            raise ValidationError(f"Invalid JSON in arguments: {e}") from e
+        return values
+
+
+class ToolCall(BaseModel):
+    """
+    Full OpenAI function call object (for v1 function-calling API).
+    """
+
+    id: Optional[str] = Field(
+        None,
+        description=("Optional unique identifier for this function call."),
+    )
+    type: Literal["function"] = Field(
+        "function",
+        description="Must be 'function' for OpenAI function calls.",
+    )
+    function: ToolFunctionCall = Field(
+        ..., description="Nested function name+arguments object or raw dict."
+    )
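
For orientation, the sketch below shows how the new pipeline types compose. It is an editorial illustration, not code shipped in the package: the class names, field names, and module path are taken from the hunk above and from item 39 in the file list, while the example tool, the argument values, and the variable names are invented.

import json

from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
    FunctionCallInput,
    FunctionDefinition,
    ToolCall,
    ToolFunctionCall,
    ToolSpec,
)

# One OpenAI-style tool definition for the inventory (hypothetical example tool).
weather_tool = ToolSpec(
    function=FunctionDefinition(
        name="get_weather",
        description="Look up the current weather for a city.",
        parameters={
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    )
)

# The proposed call; `arguments` is a JSON string, which the model validator
# parses into `parsed_arguments` after construction.
call = ToolCall(
    id="call_0",
    function=ToolFunctionCall(name="get_weather", arguments=json.dumps({"city": "Rome"})),
)

# Bundle everything the referenceless function-calling pipeline expects.
bundle = FunctionCallInput(
    conversation_context=[{"role": "user", "content": "What is the weather in Rome?"}],
    tools_inventory=[weather_tool],
    tool_call=call,
)
print(bundle.tool_call.function.parsed_arguments)  # expected: {'city': 'Rome'}

Per the hunk above, the normalized_output property rescales each metric's raw output into [0, 1] as (output - minimum) / (maximum - minimum), and SemanticCategoryResult averages those values over the metrics whose is_important flag is True.
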
wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py
@@ -0,0 +1,3 @@
+from .metric import Metric, StandardMetric
+from .metrics_runner import MetricRunner, MetricRunResult
+from .prompt import MetricPrompt, RelevancePrompt
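
For context, a minimal sketch of how these re-exports connect to the pipeline types above. Only the re-exported names and the from_results classmethod shown in types.py are taken from the diff; the rest is illustrative.

from wxo_agentic_evaluation.referenceless_eval.metrics import MetricRunResult
from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
    SemanticCategoryResult,
)

# SemanticCategoryResult.from_results consumes the MetricRunResult objects that a
# MetricRunner produces and averages the normalized outputs of the metrics whose
# confidence passed the importance threshold (is_important == True).
results = []  # list of MetricRunResult, e.g. collected from a MetricRunner run
category = SemanticCategoryResult.from_results(results)
print(category.avg_score)  # None until at least one "important" metric reports an output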