ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py
@@ -0,0 +1,245 @@
+ import json
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Tuple, Union
+
+ from pydantic import ValidationError
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_call.general import (
+     GeneralMetricsPrompt,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.function_selection.function_selection import (
+     FunctionSelectionPrompt,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.metrics import (
+     Metric,
+     MetricPrompt,
+ )
+
+ PromptType = Union[
+     GeneralMetricsPrompt,
+     FunctionSelectionPrompt,
+ ]
+
+
+ # Enum for prompt kinds
+ class PromptKind(str, Enum):
+     GENERAL = "general"
+     FUNCTION_SELECTION = "function_selection"
+     PARAMETER = "parameter"
+
+
+ # Map enum → Prompt class
+ _PROMPT_CLASS_MAP: Dict[PromptKind, Any] = {
+     PromptKind.GENERAL: GeneralMetricsPrompt,
+     PromptKind.FUNCTION_SELECTION: FunctionSelectionPrompt,
+ }
+
+
+ class LoaderError(Exception):
+     """Raised when prompt loading fails."""
+
+
+ def load_prompts_from_jsonl(
+     path: Union[str, Path],
+     kind: PromptKind,
+ ) -> List[PromptType]:
+     """
+     Load prompts from a JSONL file.
+
+     Args:
+         path: .jsonl file path.
+         kind: PromptKind value.
+
+     Returns:
+         List of PromptType, each with its examples loaded.
+
+     Raises:
+         LoaderError on I/O, parse, or validation errors.
+     """
+     PromptCls = _PROMPT_CLASS_MAP.get(kind)
+     if PromptCls is None:
+         raise LoaderError(f"Unknown PromptKind: {kind}")
+
+     p = Path(path)
+     if not p.is_file():
+         raise LoaderError(f"File not found: {path}")
+
+     prompts: List[PromptType] = []
+     for lineno, raw in enumerate(
+         p.read_text(encoding="utf-8").splitlines(), start=1
+     ):
+         if not raw.strip():
+             continue
+         try:
+             rec = json.loads(raw)
+         except json.JSONDecodeError as e:
+             raise LoaderError(f"{path}:{lineno} invalid JSON: {e}") from e
+
+         # Extract
+         try:
+             schema = rec["jsonschema"]
+             examples = rec.get("examples", [])
+             description = rec.get("description", schema.get("description", ""))
+         except KeyError as e:
+             raise LoaderError(f"{path}:{lineno} missing key {e}") from e
+
+         # Build metric
+         try:
+             metric = Metric.from_jsonschema(schema)
+             metric.description = description
+         except Exception as e:
+             raise LoaderError(f"{path}:{lineno} invalid schema: {e}") from e
+
+         # Instantiate prompt
+         prompt: MetricPrompt
+         try:
+             prompt = PromptCls(
+                 metric=metric, task_description=metric.description
+             )
+         except TypeError:
+             prompt = PromptCls(metric=metric)
+
+         # Load examples
+         for ex_idx, ex in enumerate(examples, start=1):
+             try:
+                 user_kwargs = ex["user_kwargs"]
+                 output = ex["output"]
+             except KeyError as e:
+                 raise LoaderError(
+                     f"{path}:{lineno}, example {ex_idx} missing {e}"
+                 ) from e
+             try:
+                 prompt.add_example(user_kwargs, output)
+             except (ValidationError, ValueError) as e:
+                 raise LoaderError(
+                     f"{path}:{lineno}, example {ex_idx} invalid: {e}"
+                 ) from e
+
+         prompts.append(prompt)
+
+     return prompts
+
+
+ def load_prompts_from_list(
+     records: Iterable[Dict[str, Any]], kind: PromptKind
+ ) -> List[PromptType]:
+     """
+     Load prompts from an in-memory list of dicts, same structure as JSONL.
+
+     Args:
+         records: Iterable of dicts with keys {schema, thresholds, examples, description}.
+         kind: PromptKind value.
+
+     Returns:
+         List of PromptType.
+
+     Raises:
+         LoaderError on missing data or validation failures.
+     """
+     PromptCls = _PROMPT_CLASS_MAP.get(kind)
+     if PromptCls is None:
+         raise LoaderError(f"Unknown PromptKind: {kind}")
+
+     prompts: List[PromptType] = []
+     for idx, rec in enumerate(records, start=1):
+         # same logic as JSONL loader
+         try:
+             schema = rec["jsonschema"]
+             examples = rec.get("examples", [])
+             description = schema.get("description", rec.get("name", ""))
+         except KeyError as e:
+             raise LoaderError(f"Record {idx} missing key {e}") from e
+
+         try:
+             metric = Metric.from_jsonschema(schema)
+             metric.description = description
+         except Exception as e:
+             raise LoaderError(f"Record {idx} invalid schema: {e}") from e
+
+         try:
+             prompt = PromptCls(
+                 metric=metric, task_description=rec["task_description"]
+             )
+         except TypeError:
+             prompt = PromptCls(metric=metric)
+
+         for ex_idx, ex in enumerate(examples, start=1):
+             try:
+                 user_kwargs = ex["user_kwargs"]
+                 output = ex["output"]
+             except KeyError as e:
+                 raise LoaderError(
+                     f"Record {idx}, example {ex_idx} missing {e}"
+                 ) from e
+             try:
+                 prompt.add_example(user_kwargs, output)
+             except (ValidationError, ValueError) as e:
+                 raise LoaderError(
+                     f"Record {idx}, example {ex_idx} invalid: {e}"
+                 ) from e
+
+         prompts.append(prompt)
+
+     return prompts
+
+
+ def load_prompts_from_metrics(
+     metrics_with_examples: Iterable[Tuple[Metric, List[Dict[str, Any]]]],
+     kind: PromptKind,
+ ) -> List[PromptType]:
+     """
+     Instantiate prompts directly from Metric objects and example data.
+
+     Args:
+         metrics_with_examples: An iterable of (Metric instance, examples) tuples.
+             Each examples list item must be a dict with:
+             - "user_kwargs": Dict[str, Any]
+             - "output": Dict[str, Any]
+         kind: Which PromptKind to use (GENERAL, FUNCTION_SELECTION, PARAMETER).
+
+     Returns:
+         A list of PromptType, each with its few-shot examples loaded.
+
+     Raises:
+         LoaderError: on missing data or validation errors.
+     """
+     PromptCls = _PROMPT_CLASS_MAP.get(kind)
+     if PromptCls is None:
+         raise LoaderError(f"Unknown PromptKind: {kind}")
+
+     prompts: List[PromptType] = []
+     for idx, (metric, examples) in enumerate(metrics_with_examples, start=1):
+         if not isinstance(metric, Metric):
+             raise LoaderError(
+                 f"Item {idx}: expected a Metric instance, got {type(metric)}"
+             )
+
+         # Instantiate prompt with the metric's description as task_description
+         try:
+             prompt = PromptCls(
+                 metric=metric, task_description=metric.description
+             )
+         except TypeError:
+             # Fallback if constructor signature differs
+             prompt = PromptCls(metric=metric)
+
+         # Add each provided example
+         for ex_idx, ex in enumerate(examples or [], start=1):
+             if "user_kwargs" not in ex or "output" not in ex:
+                 raise LoaderError(
+                     f"Metric {metric.name}, example {ex_idx}: "
+                     "each example must include 'user_kwargs' and 'output'."
+                 )
+             user_kwargs = ex["user_kwargs"]
+             output = ex["output"]
+             try:
+                 prompt.add_example(user_kwargs, output)
+             except (ValidationError, ValueError) as e:
+                 raise LoaderError(
+                     f"Metric {metric.name}, example {ex_idx} invalid: {e}"
+                 ) from e
+
+         prompts.append(prompt)
+
+     return prompts
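
The three loaders above build few-shot metric prompts from the same record shape ("jsonschema", optional "examples" and "description"). A minimal usage sketch, assuming a local JSONL export of such records (the file name is illustrative, not one of the shipped metric files):

    from wxo_agentic_evaluation.referenceless_eval.function_calling.metrics.loader import (
        PromptKind,
        load_prompts_from_jsonl,
    )

    # Assumed input: each line of my_general_metrics.jsonl holds one record with
    # a "jsonschema" key and optional "examples"/"description", as parsed above.
    prompts = load_prompts_from_jsonl("my_general_metrics.jsonl", PromptKind.GENERAL)
    print(f"loaded {len(prompts)} general metric prompts")

Note that PromptKind.PARAMETER is declared but has no entry in _PROMPT_CLASS_MAP, so passing it currently raises LoaderError.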
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py
@@ -0,0 +1,106 @@
+ from typing import Any, Dict, List
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+     ToolCall,
+     ToolSpec,
+ )
+
+ # ────────────────────────────────────────────────────────────────────────────────
+ # Adapter definitions
+ # ────────────────────────────────────────────────────────────────────────────────
+
+
+ class BaseAdapter:
+     """Abstract adapter to unify different API spec and call representations."""
+
+     def get_tools_inventory(self) -> List[Dict[str, Any]]:
+         raise NotImplementedError
+
+     def get_tools_inventory_summary(self) -> List[Dict[str, Any]]:
+         raise NotImplementedError
+
+     def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
+         raise NotImplementedError
+
+     def get_call_dict(self) -> Dict[str, Any]:
+         raise NotImplementedError
+
+     def get_function_name(self) -> str:
+         raise NotImplementedError
+
+     def get_parameters(self) -> Dict[str, Any]:
+         raise NotImplementedError
+
+     def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
+         raise NotImplementedError
+
+
+ class OpenAIAdapter(BaseAdapter):
+     """Adapter for ToolSpec + ToolCall inputs."""
+
+     def __init__(self, specs: List[ToolSpec], call: ToolCall):
+         self.specs = specs
+         self.call = call
+
+     def get_tools_inventory(self) -> List[Dict[str, Any]]:
+         return [spec.model_dump() for spec in self.specs]
+
+     def get_tools_inventory_summary(self) -> List[Dict[str, Any]]:
+         return [
+             {
+                 "tool_name": spec.function.name,
+                 "tool_description": spec.function.description,
+                 "tool_parameters": {
+                     prop_name: prop_d["type"]
+                     for prop_name, prop_d in spec.function.parameters.get(
+                         "properties", {}
+                     ).items()
+                 },
+             }
+             for spec in self.specs
+         ]
+
+     def get_tool_spec(self, tool_name: str) -> Dict[str, Any]:
+         tool = next(
+             (t for t in self.specs if t.function.name == tool_name), None
+         )
+         return tool.function.model_dump() if tool else {}
+
+     def get_call_dict(self) -> Dict[str, Any]:
+         call_dict = {
+             "id": self.call.id,
+             "type": "function",
+             "function": {
+                 "name": self.call.function.name,
+                 "arguments": self.call.function.arguments,
+             },
+         }
+         return call_dict
+
+     def get_function_name(self) -> str:
+         return self.call.function.name
+
+     def get_parameters(self) -> Dict[str, Any]:
+         return self.call.function.parsed_arguments
+
+     def get_param_spec_snippet(self, param_name: str) -> Dict[str, Any]:
+         spec = next(
+             (
+                 s
+                 for s in self.specs
+                 if s.function.name == self.get_function_name()
+             ),
+             None,
+         )
+         if not spec:
+             return {"type": "object", "properties": {}, "required": []}
+         props = spec.function.parameters.get(
+             "properties", spec.function.parameters
+         )
+         if param_name not in props:
+             return {"type": "object", "properties": {}, "required": []}
+         return {
+             "type": "object",
+             "properties": {param_name: props[param_name]},
+             "required": [param_name],
+         }
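
A minimal usage sketch for OpenAIAdapter. The ToolSpec/ToolCall payload shape is assumed from the attribute accesses above (OpenAI-style function.name/description/parameters and function.arguments); the actual Pydantic models live in pipeline/types.py, which is not shown in this hunk:

    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.adapters import (
        OpenAIAdapter,
    )
    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
        ToolCall,
        ToolSpec,
    )

    # Assumed OpenAI-style payloads; field names mirror what the adapter reads.
    spec = ToolSpec.model_validate({
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    })
    call = ToolCall.model_validate({
        "id": "call_1",
        "type": "function",
        "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
    })

    adapter = OpenAIAdapter(specs=[spec], call=call)
    adapter.get_function_name()             # "get_weather"
    adapter.get_tools_inventory_summary()   # names, descriptions, parameter types
    adapter.get_param_spec_snippet("city")  # one-parameter JSON Schema snippet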
wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py
@@ -0,0 +1,291 @@
+ import importlib.resources
+ import json
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union
+
+ from wxo_agentic_evaluation.referenceless_eval.function_calling import metrics
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.semantic_checker import (
+     SemanticChecker,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.static_checker import (
+     evaluate_static,
+ )
+ from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.types import (
+     FunctionCallInput,
+     FunctionCallMetric,
+     PipelineResult,
+     SemanticResult,
+     StaticMetricResult,
+     StaticResult,
+     ToolCall,
+     ToolSpec,
+ )
+ from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
+     LLMKitWrapper,
+ )
+
+
+ def metrics_dir():
+     path = importlib.resources.files(metrics)
+     return path
+
+
+ # Default metric JSON paths
+ _METRICS_DIR = metrics_dir()
+ _DEFAULT_GENERAL = _METRICS_DIR / "function_call" / "general_metrics.json"
+ _DEFAULT_GENERAL_RUNTIME = (
+     _METRICS_DIR / "function_call" / "general_metrics_runtime.json"
+ )
+ _DEFAULT_FUNCSEL = (
+     _METRICS_DIR / "function_selection" / "function_selection_metrics.json"
+ )
+ _DEFAULT_FUNCSEL_RUNTIME = (
+     _METRICS_DIR
+     / "function_selection"
+     / "function_selection_metrics_runtime.json"
+ )
+ _DEFAULT_PARAM = _METRICS_DIR / "parameter" / "parameter_metrics.json"
+ _DEFAULT_PARAM_RUNTIME = (
+     _METRICS_DIR / "parameter" / "parameter_metrics_runtime.json"
+ )
+
+
+ class ReflectionPipeline:
+     """
+     High-level orchestration for function-call reflection.
+
+     Modes:
+       • static_only: schema checks
+       • semantic_only: LLM metrics + transforms
+       • run: full static -> semantic -> assemble -> PipelineResult
+
+     Supports sync, custom JSON overrides, and any registered LLM.
+     runtime_pipeline: if set to true, use faster prompts (no actionable recommendations, shorter explanations)
+     """
+
+     def __init__(
+         self,
+         metrics_client: LLMKitWrapper,
+         codegen_client: Optional[LLMKitWrapper] = None,
+         general_metrics: Optional[
+             Union[Path, List[FunctionCallMetric], List[str]]
+         ] = _DEFAULT_GENERAL_RUNTIME,
+         function_metrics: Optional[
+             Union[Path, List[FunctionCallMetric], List[str]]
+         ] = _DEFAULT_FUNCSEL_RUNTIME,
+         parameter_metrics: Optional[
+             Union[Path, List[FunctionCallMetric], List[str]]
+         ] = _DEFAULT_PARAM_RUNTIME,
+         transform_enabled: Optional[bool] = False,
+         runtime_pipeline: Optional[bool] = True,
+         use_examples: Optional[bool] = True,
+     ):
+
+         self.metrics_client = metrics_client
+         if codegen_client is None:
+             self.codegen_client = metrics_client
+         else:
+             self.codegen_client = codegen_client
+
+         self.general_metrics = general_metrics
+         self.function_metrics = function_metrics
+         self.parameter_metrics = parameter_metrics
+
+         metrics_definitions = []
+
+         for metrics, default_path in [
+             (
+                 self.general_metrics,
+                 (
+                     _DEFAULT_GENERAL_RUNTIME
+                     if runtime_pipeline
+                     else _DEFAULT_GENERAL
+                 ),
+             ),
+             (
+                 self.function_metrics,
+                 (
+                     _DEFAULT_FUNCSEL_RUNTIME
+                     if runtime_pipeline
+                     else _DEFAULT_FUNCSEL
+                 ),
+             ),
+             (
+                 self.parameter_metrics,
+                 _DEFAULT_PARAM_RUNTIME if runtime_pipeline else _DEFAULT_PARAM,
+             ),
+         ]:
+             if not metrics:
+                 metrics_definitions.append(None)
+                 continue
+
+             # Handle metric names list
+             if isinstance(metrics, list) and all(
+                 isinstance(x, str) for x in metrics
+             ):
+                 # Load the default JSON file
+                 if not default_path.is_file():
+                     raise FileNotFoundError(
+                         f"Default metrics file not found: {default_path}"
+                     )
+
+                 with default_path.open("r") as f_in:
+                     all_metrics = json.load(f_in)
+
+                 # Filter metrics by name
+                 filtered_metrics = [
+                     metric
+                     for metric in all_metrics
+                     if metric.get("name") in metrics
+                 ]
+
+                 # Remove examples from prompts if requested
+                 if not use_examples:
+                     for metric in filtered_metrics:
+                         metric.pop("examples", None)
+
+                 if len(filtered_metrics) != len(metrics):
+                     found_names = {
+                         metric.get("name") for metric in filtered_metrics
+                     }
+                     missing = set(metrics) - found_names
+                     raise ValueError(f"Metrics not found: {missing}")
+
+                 metrics_definitions.append(filtered_metrics)
+                 continue
+
+             # Handle Path or List[FunctionCallMetric] (existing logic)
+             if not isinstance(metrics, (Path, list)):
+                 raise TypeError(
+                     "metrics must be Path, List[FunctionCallMetric], List[str], or None"
+                 )
+             if isinstance(metrics, list) and all(
+                 isinstance(x, FunctionCallMetric) for x in metrics
+             ):
+                 metrics_definitions.append(
+                     [metric.model_dump() for metric in metrics]
+                 )
+             else:
+                 if not metrics.is_file():
+                     raise FileNotFoundError(
+                         f"Metrics file not found: {metrics}"
+                     )
+                 metrics_definitions.append(
+                     [
+                         json.loads(json_obj)
+                         for json_obj in metrics.read_text(
+                             encoding="utf8"
+                         ).splitlines()
+                         if json_obj.strip()
+                     ]
+                 )
+
+         gen_defs, fun_defs, par_defs = None, None, None
+
+         if metrics_definitions:
+             gen_defs = metrics_definitions[0]
+         if len(metrics_definitions) >= 2:
+             fun_defs = metrics_definitions[1]
+         if len(metrics_definitions) >= 3:
+             par_defs = metrics_definitions[2]
+
+         # 3) Initialize semantic checker
+         self.semantic_checker = SemanticChecker(
+             general_metrics=gen_defs,
+             function_metrics=fun_defs,
+             parameter_metrics=par_defs,
+             metrics_client=self.metrics_client,
+             codegen_client=self.codegen_client,
+             transform_enabled=transform_enabled,
+         )
+
+     @staticmethod
+     def static_only(
+         inventory: List[ToolSpec],
+         call: ToolCall,
+     ) -> StaticResult:
+         """
+         Run schema-based static checks.
+
+         Returns:
+             StaticResult with per-check results and final_decision.
+         """
+         try:
+             return evaluate_static(inventory, call)
+         except Exception as e:
+             return StaticResult(
+                 metrics={
+                     "json_schema_validation": StaticMetricResult(
+                         description="Invalid JSON schema",
+                         valid=False,
+                         explanation=f"error parsing JSON schema: {str(e)}",
+                     )
+                 },
+                 final_decision=False,
+             )
+
+     def semantic_sync(
+         self,
+         conversation: Union[str, List[Dict[str, str]]],
+         inventory: List[ToolSpec],
+         call: ToolCall,
+         retries: Optional[int] = 2,
+         transform_enabled: Optional[bool] = None,
+     ) -> SemanticResult:
+         """
+         Synchronous LLM-based semantic metrics (+ optional transforms).
+         """
+         # delegate to SemanticChecker
+         return self.semantic_checker.run_sync(
+             inventory,
+             call,
+             conversation,
+             retries=retries,
+             transform_enabled=transform_enabled,
+         )
+
+     def run_sync(
+         self,
+         conversation: Union[str, List[Dict[str, str]]],
+         inventory: List[ToolSpec],
+         call: ToolCall,
+         continue_on_static: Optional[bool] = False,
+         retries: Optional[int] = 1,
+         transform_enabled: Optional[bool] = None,
+     ) -> PipelineResult:
+         """
+         Full sync pipeline: static -> semantic -> assemble PipelineResult.
+         """
+         static_res = self.static_only(inventory, call)
+
+         if not static_res.final_decision and not continue_on_static:
+             inputs = FunctionCallInput(
+                 conversation_context=conversation,
+                 tools_inventory=inventory,
+                 tool_call=call,
+             )
+             return PipelineResult(
+                 inputs=inputs,
+                 static=static_res,
+                 semantic=SemanticResult(
+                     general=None,
+                     function_selection=None,
+                     parameter=None,
+                     transform=None,
+                 ),
+                 overall_valid=False,
+             )
+
+         semantic_res = self.semantic_sync(
+             conversation, inventory, call, retries, transform_enabled
+         )
+         return PipelineResult(
+             inputs=FunctionCallInput(
+                 conversation_context=conversation,
+                 tools_inventory=inventory,
+                 tool_call=call,
+             ),
+             static=static_res,
+             semantic=semantic_res,
+             overall_valid=True,
+         )
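
A minimal end-to-end sketch of ReflectionPipeline.run_sync. How the LLMKitWrapper metrics client is constructed is an assumption (the wrapper is added in service_provider/referenceless_provider_wrapper.py, not shown in this hunk); the pipeline calls and result fields come from the class above, and inventory/call can be built as in the adapter sketch earlier:

    from wxo_agentic_evaluation.referenceless_eval.function_calling.pipeline.pipeline import (
        ReflectionPipeline,
    )
    from wxo_agentic_evaluation.service_provider.referenceless_provider_wrapper import (
        LLMKitWrapper,
    )

    metrics_client = LLMKitWrapper(...)  # hypothetical construction; wraps the judge-LLM provider

    pipeline = ReflectionPipeline(
        metrics_client=metrics_client,
        runtime_pipeline=True,   # default: use the faster *_runtime.json metric prompts
        transform_enabled=False,
    )

    # inventory: List[ToolSpec], call: ToolCall under evaluation (see adapter sketch above)
    result = pipeline.run_sync(
        conversation=[{"role": "user", "content": "What's the weather in Paris?"}],
        inventory=inventory,
        call=call,
    )
    print(result.static.final_decision)  # schema-level verdict
    print(result.overall_valid)          # False when static checks fail and continue_on_static is False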