ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.

Files changed (60):
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/METADATA +103 -109
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info/RECORD +96 -0
  3. wxo_agentic_evaluation/analytics/tools/main.py +1 -18
  4. wxo_agentic_evaluation/analyze_run.py +358 -97
  5. wxo_agentic_evaluation/arg_configs.py +28 -1
  6. wxo_agentic_evaluation/description_quality_checker.py +149 -0
  7. wxo_agentic_evaluation/evaluation_package.py +58 -17
  8. wxo_agentic_evaluation/inference_backend.py +32 -17
  9. wxo_agentic_evaluation/llm_user.py +2 -1
  10. wxo_agentic_evaluation/metrics/metrics.py +22 -1
  11. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  12. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
  13. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  14. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  15. wxo_agentic_evaluation/prompt/template_render.py +34 -3
  16. wxo_agentic_evaluation/quick_eval.py +342 -0
  17. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
  18. wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
  19. wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
  20. wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
  21. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  22. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  23. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  24. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
  26. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  27. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  28. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  29. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  30. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  31. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  32. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
  33. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  34. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
  35. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
  36. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
  37. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
  38. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  39. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
  40. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  41. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
  42. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
  43. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
  44. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
  45. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
  46. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  47. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
  48. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +116 -0
  49. wxo_agentic_evaluation/service_instance.py +2 -2
  50. wxo_agentic_evaluation/service_provider/watsonx_provider.py +118 -4
  51. wxo_agentic_evaluation/tool_planner.py +3 -1
  52. wxo_agentic_evaluation/type.py +33 -2
  53. wxo_agentic_evaluation/utils/__init__.py +0 -1
  54. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
  55. wxo_agentic_evaluation/utils/rich_utils.py +174 -0
  56. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  57. wxo_agentic_evaluation/utils/utils.py +167 -5
  58. ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
  59. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/WHEEL +0 -0
  60. {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.0.9.dist-info}/top_level.txt +0 -0

wxo_agentic_evaluation/analyze_run.py
@@ -1,123 +1,384 @@
  import json
  import os
  import csv
- import rich
+ from jsonargparse import CLI
+ from pathlib import Path
+ from typing import List, Dict, Set, Optional
+
  from rich.text import Text
- from rich.panel import Panel
- from rich.layout import Layout
  from rich.table import Table
- from typing import List
- from wxo_agentic_evaluation.type import (
-     ExtendedMessage,
-     ContentType
- )
+ from rich.panel import Panel
+ from rich.console import Group
+ from rich.style import Style
+
+ from wxo_agentic_evaluation.type import ExtendedMessage, ContentType, ToolDefinition
  from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
  from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
- from jsonargparse import CLI
+ from wxo_agentic_evaluation.description_quality_checker import (
+     DescriptionQualityInspector,
+ )
+ from wxo_agentic_evaluation.utils.rich_utils import (
+     pretty_print,
+     warn,
+     is_ok,
+     print_done,
+     IncorrectParameterUtils,
+ )
+ from wxo_agentic_evaluation.utils.utils import (
+     add_line_seperator,
+ )


- def render(data: List[ExtendedMessage]):
-     conversation_lines = []
-     reason_lines = []
+ class Analyzer:

-     for entry in data:
-         msg = entry.message
-         role = msg.role
-         content = msg.content
-         reason = entry.reason
-         tool_name = None
-         if role == "user":
-             label = "👤 User"
-         elif role == "assistant" and msg.type == ContentType.tool_call:
-             if reason:
-                 label = "❌ Tool Call"
-                 tool_name = json.loads(msg.content)["name"]
+     def __init__(self):
+         self.analysis_cache: Dict[str, List[Text]] = (
+             {}
+         )  # the failing tools cached here won't be re-analyzed.
+         # tool_name -> description analysis
+
+     @staticmethod
+     def _generate_style_config():
+         return Style(
+             color="magenta",
+             blink=True,
+             bold=True,
+         )
+
+     def _split_cache(self, failing_tools: Set[str]) -> tuple[List[str], List[Text]]:
+
+         tools_to_analyze: List[str] = []
+         cached_lines: List[Text] = []
+         tools_analyzed: List[str] = []
+
+         for tool_name in sorted(failing_tools):
+             cached_analysis = self.analysis_cache.get(tool_name)
+             if cached_analysis:
+                 cached_lines.extend(cached_analysis)
+                 tools_analyzed.append(tool_name)
              else:
-                 label = "✅ Tool Call"
-         elif role == "assistant":
-             label = "🤖 Assistant"
-         else:
-             label = "📦 Unknown"
-
-         text_line = Text(f"{label}: {content}\n")
-         if reason:
-             text_line.stylize("bold red")
-             reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
-             reason_lines.append(Text(reason_text, style="red"))
-         conversation_lines.append(text_line)
-
-     conversation_panel = Panel(
-         Text().join(conversation_lines),
-         title="Conversation History",
-         border_style="blue",
-     )
-     reason_panel = Panel(
-         Text().join(reason_lines), title="Analysis Results", border_style="red"
-     )
-
-     layout = Layout()
-     layout.split_row(Layout(conversation_panel), Layout(reason_panel))
-
-     return layout
-
-
- def analyze(config: AnalyzeConfig):
-     summary = []
-     with open(os.path.join(config.data_path, "summary_metrics.csv"), "r") as f:
-         reader = csv.reader(f)
-         header = next(reader)
-         for row in reader:
-             summary.append(dict(zip(header, row)))
-
-     test_case_with_failed_tools = []
-     for entry in summary:
-         test_case_name = entry["dataset_name"]
-         if test_case_name.lower().strip() == "summary (average)":
-             continue
-         if not entry["is_success"] or float(entry["tool_calls_with_incorrect_parameter"]) > 0 or float(entry["tool_call_precision"]) < 1.0\
-             or float(entry["tool_call_recall"]) < 1.0:
-             test_case_with_failed_tools.append(entry)
-     if len(test_case_with_failed_tools) == 0:
-         header_table = Table(show_header=False, box=None)
-         header_table.add_row(f"No Tool Call Error found!")
-         header_panel = Panel(
-             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+                 tools_to_analyze.append(tool_name)
+
+         if tools_analyzed:
+             pretty_print(
+                 content=f"ℹ️ Loading cached analysis since these failing tools: {tools_analyzed} have been analyzed previously.",
+                 style="bold cyan",
+             )
+
+         return (
+             tools_to_analyze,
+             cached_lines
+         )
+
+
+     def analyze_failing_tool_description_quality(
+         self,
+         inspector: DescriptionQualityInspector,
+         tool_definition_path: str,
+         failing_tools: Set[str],
+     ) -> List[Text]:
+         """
+         :param tool_definition_path: Path to the tool definition file.
+         :param failing_tools: Set of tool names that failed.
+         :return: List of rich `Text` objects containing feedback for the customer.
+         """
+
+         pretty_print(
+             content=f"⚙️ Checking tool description quality for failing tools: {sorted(failing_tools)}",
+             style="bold cyan",
          )
-         rich.print(header_panel)

-     for test_case_entry in test_case_with_failed_tools:
-         test_case_name = test_case_entry["dataset_name"]
+         analysis_for_display: List[Text] = []
+
+         # Step 1: get tools not yet analyzed and cached analysis for tools analyzed previously
+         tools_to_analyze, cached_analysis = self._split_cache(failing_tools)
+         if cached_analysis:
+             analysis_for_display.extend(cached_analysis)
+
+         # Step 2: analyze cache misses
+         if tools_to_analyze:
+
+             failing_tool_definitions: List[ToolDefinition] = inspector.extract_tool_desc_from_tool_source(
+                 Path(tool_definition_path),
+                 tools_to_analyze,
+             )
+
+             if not failing_tool_definitions:
+                 analysis_for_display.append(
+                     warn(
+                         message=f"No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
+                     )
+                 )
+                 return analysis_for_display
+
+             missing_tools = self._get_tools_not_found_in_source(
+                 tools_to_analyze, failing_tool_definitions
+             )
+             if missing_tools:
+                 analysis_for_display.append(
+                     warn(
+                         message=f"Missing tool definitions for failed tools: '{missing_tools}' in the file: '{tool_definition_path}'"
+                     )
+                 )
+
+             for tool_definition in failing_tool_definitions:
+
+                 tool_analysis = self._analyze_tool_definition(
+                     inspector=inspector,
+                     tool_definition=tool_definition,
+                     tool_definition_path=tool_definition_path,
+                 )
+
+                 self.analysis_cache[tool_definition.tool_name] = tool_analysis
+                 analysis_for_display.extend(tool_analysis)
+
+         return analysis_for_display
+
+     def render(self, data: List[ExtendedMessage], tool_definition_path: Optional[str]) -> Group:
+         """
+         Render the conversation history and analysis results.
+         :param data: List of ExtendedMessage objects containing the conversation history.
+         :param tool_definition_path: Path to the tool definition file.
+         :return: A rich Group object containing the conversation and analysis results.
+         """
+         conversation_lines = []
+         reason_lines = []
+         failing_tools = []
+
+         for entry in data:
+             msg = entry.message
+             role = msg.role
+             content = msg.content
+             reason = entry.reason
+             tool_name = None
+             if msg.type == ContentType.tool_call or msg.type == ContentType.tool_response:
+                 tool_name = json.loads(msg.content)["name"]

-         test_case_path = os.path.join(
-             config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
+             if role == "user":
+                 label = "👤 User"
+             elif role == "assistant" and msg.type == ContentType.tool_call:
+                 if reason:
+                     label = "❌ Tool Call"
+
+                     if reason.get("reason") == "incorrect parameter":
+                         failing_tools.append(
+                             tool_name
+                         )  # create a list of failing tools for description quality analysis.
+                 else:
+                     label = "✅ Tool Call"
+             elif role == "assistant":
+                 label = "🤖 Assistant"
+             else:
+                 label = "📦 Unknown"
+
+             text_line = Text(f"{label}: {content}\n")
+             if reason:
+                 text_line.stylize("bold red")
+                 reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
+                 reason_lines.append(Text(reason_text, style="red"))
+             conversation_lines.append(text_line)
+
+         if failing_tools and tool_definition_path:
+
+             inspector = DescriptionQualityInspector()
+
+             description_quality_inspection_lines = (
+                 self.analyze_failing_tool_description_quality(
+                     inspector, tool_definition_path, set(failing_tools)
+                 )
+             )
+
+             print_done()
+
+             if description_quality_inspection_lines:
+                 reason_lines.extend(description_quality_inspection_lines)
+
+         conversation_panel = Panel(
+             Text().join(conversation_lines),
+             title="Conversation History",
+             border_style="blue",
          )
-         test_messages = []
-         with open(test_case_path, "r", encoding="utf-8") as f:
-             temp = json.load(f)
-             for entry in temp:
-                 msg = ExtendedMessage(**entry)
-                 test_messages.append(msg)
-
-         test_metrics_path = os.path.join(
-             config.data_path, "messages", f"{test_case_name}.metrics.json"
+         reason_panel = Panel(
+             Text().join(reason_lines),
+             title="Analysis Results",
+             border_style="red",
+         )
+
+         return Group(
+             conversation_panel,
+             reason_panel,
          )
-         with open(test_metrics_path, "r", encoding="utf-8") as f:
-             metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+     def analyze(self, config: AnalyzeConfig):
+         """
+         Analyze the results of the tool calls and routing metrics.
+         :param config: AnalyzeConfig object containing user provided paths for analysis.
+         """
+
+         def get_summary(summary_file_name: str = "summary_metrics.csv"):
+             summary = []
+
+             path_to_summary_file = os.path.join(config.data_path, summary_file_name)
+
+             with open(path_to_summary_file, "r") as f:
+                 reader = csv.reader(f)
+                 header = next(reader)
+                 for row in reader:
+                     summary.append(dict(zip(header, row)))
+
+             return summary
+
+         def get_test_messages(test_case_name):
+             test_messages = []
+
+             test_case_path = os.path.join(
+                 config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
+             )
+
+             with open(test_case_path, "r", encoding="utf-8") as f:
+                 temp = json.load(f)
+                 for entry in temp:
+                     msg = ExtendedMessage(**entry)
+                     test_messages.append(msg)
+
+             return test_messages
+
+         def get_metrics(test_case_name):
+             test_metrics_path = os.path.join(
+                 config.data_path, "messages", f"{test_case_name}.metrics.json"
+             )
+
+             with open(test_metrics_path, "r", encoding="utf-8") as f:
+                 metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+             return metrics
+
+         summary = get_summary()
+
+         test_case_with_failed_tools = self._get_test_case_with_failed_tools(
+             summary=summary
+         )
+
+         if len(test_case_with_failed_tools) == 0:
+             header_table = Table(show_header=False, box=None)
+
+             header_table.add_row("No Tool Call Error found!")
+
+             panel = Panel(
+                 header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+             )
+
+             pretty_print(panel)
+
+         for test_case_entry in test_case_with_failed_tools:
+             test_case_name = test_case_entry["dataset_name"]
+
+             test_messages = get_test_messages(test_case_name=test_case_name)
+
+             metrics: ToolCallAndRoutingMetrics = get_metrics(
+                 test_case_name=test_case_name
+             )
+
+             header_panel = self._create_header_analysis_panel(test_case_name, metrics)
+             pretty_print(header_panel)
+
+             tool_definition_path = config.tool_definition_path \
+                 if config.tool_definition_path else None
+
+             rendered_content = self.render(
+                 data=test_messages,
+                 tool_definition_path=tool_definition_path
+             )
+             pretty_print(rendered_content)
+
+             add_line_seperator(
+                 self._generate_style_config()
+             )
+
+     def _create_header_analysis_panel(
+         self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
+     ) -> Panel:
          header_table = Table(show_header=False, box=None)
+
          header_table.add_row(f"Test Case Name: {test_case_name}")
-         header_table.add_row((f"Expected Tool Calls: {metrics.expected_tool_calls}"))
+         header_table.add_row(f"Expected Tool Calls: {metrics.expected_tool_calls}")
          header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
          header_table.add_row(f"Text Match: {metrics.text_match.value}")
-         header_table.add_row(
-             f"Journey Success: {metrics.is_success}"
-         )
+         header_table.add_row(f"Journey Success: {metrics.is_success}")
+
          header_panel = Panel(
              header_table, title="[bold green]📋 Analysis Summary[/bold green]"
          )
-         rich.print(header_panel)
-         layout = render(test_messages)
-         rich.print(layout)

+         return header_panel
+
+     def _get_test_case_with_failed_tools(self, summary) -> List:
+
+         test_case_with_failed_tools = []
+
+         for entry in summary:
+             test_case_name = entry["dataset_name"]
+
+             if test_case_name.lower().strip() == "summary (average)":
+                 continue
+
+             if (
+                 not entry["is_success"]
+                 or float(entry["tool_calls_with_incorrect_parameter"]) > 0
+                 or float(entry["tool_call_precision"]) < 1.0
+                 or float(entry["tool_call_recall"]) < 1.0
+             ):
+
+                 test_case_with_failed_tools.append(entry)
+
+         return test_case_with_failed_tools
+
+     def _get_tools_not_found_in_source(
+         self,
+         tools_to_analyze: List[str],
+         failing_tool_definitions: List[ToolDefinition],
+     ) -> Set[str]:
+
+         return set(tools_to_analyze) - {
+             tool_def.tool_name for tool_def in failing_tool_definitions
+         }
+
+     def _analyze_tool_definition(
+         self,
+         inspector: DescriptionQualityInspector,
+         tool_definition: ToolDefinition,
+         tool_definition_path: str,
+     ) -> List[Text]:
+
+         tool_name = tool_definition.tool_name
+         tool_desc = tool_definition.tool_description
+
+         tool_analysis = []
+
+         # missing description
+         if tool_desc is None:
+             tool_analysis.extend(
+                 IncorrectParameterUtils.format_missing_description_message(
+                     tool_name=tool_name, tool_definition_path=tool_definition_path
+                 )
+             )
+             return tool_analysis
+
+         # bad description
+         if inspector.detect_bad_description(tool_definition):
+             tool_analysis.extend(
+                 IncorrectParameterUtils.format_bad_description_message(
+                     tool_name=tool_name, tool_desc=tool_desc
+                 )
+             )
+             return tool_analysis
+
+         # good description
+         tool_analysis.append(
+             is_ok(message=f"The description for the `{tool_name}` looks sufficient.")
+         )
+         return tool_analysis

  if __name__ == "__main__":
-     analyze(CLI(AnalyzeConfig, as_positional=False))
+     dummy_analyzer = Analyzer()
+     dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))
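
Side note on the render() change above: the old implementation returned a rich Layout with the two panels split side by side, while the new implementation returns a rich Group, which renders its children stacked vertically. A minimal sketch of the new rendering behavior, for illustration only (panel contents are placeholders):

    # Illustration only: rich's Group renders its children in sequence (top to bottom),
    # unlike the removed Layout.split_row, which placed the panels side by side.
    from rich import print as rich_print
    from rich.console import Group
    from rich.panel import Panel

    conversation_panel = Panel("<conversation goes here>", title="Conversation History", border_style="blue")
    reason_panel = Panel("<analysis goes here>", title="Analysis Results", border_style="red")

    rich_print(Group(conversation_panel, reason_panel))  # one panel rendered above the other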

wxo_agentic_evaluation/arg_configs.py
@@ -1,6 +1,6 @@
  import os
  from dataclasses import dataclass, field
- from typing import List
+ from typing import List, Optional, Union
  from wxo_agentic_evaluation import __file__

  root_dir = os.path.dirname(__file__)
@@ -43,9 +43,32 @@ class TestConfig:
      num_workers: int = 2


+ @dataclass
+ class AttackConfig:
+     attack_paths: List[str]
+     output_dir: str
+     auth_config: AuthConfig
+     provider_config: ProviderConfig = field(default_factory=ProviderConfig)
+     llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+     enable_verbose_logging: bool = True
+     enable_manual_user_input: bool = False
+     num_workers: int = 2
+
+
+ @dataclass
+ class AttackGeneratorConfig:
+     attacks_list: Union[List[str], str]
+     datasets_path: Union[List[str], str]
+     agents_path: str
+     target_agent_name: str
+     output_dir: str = None
+     max_variants: int = None
+
+
  @dataclass
  class AnalyzeConfig:
      data_path: str
+     tool_definition_path: Optional[str] = None


  @dataclass
@@ -77,6 +100,10 @@ class ChatRecordingConfig:
      max_retries: int = 5


+ @dataclass
+ class QuickEvalConfig(TestConfig):
+     tools_path: str = None
+
  @dataclass
  class BatchAnnotateConfig:
      allowed_tools: List[str]
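
The arg_configs.py hunks above add an optional tool_definition_path field to AnalyzeConfig; together with the Analyzer refactor, this is what enables the tool-description quality checks. A minimal sketch of driving the analysis programmatically rather than through the jsonargparse CLI entry point (the file paths below are hypothetical placeholders):

    # Sketch only; paths are made-up placeholders.
    from wxo_agentic_evaluation.analyze_run import Analyzer
    from wxo_agentic_evaluation.arg_configs import AnalyzeConfig

    config = AnalyzeConfig(
        data_path="results/run_01",                # directory holding summary_metrics.csv and messages/
        tool_definition_path="tools/my_tools.py",  # optional; enables description-quality checks
    )
    Analyzer().analyze(config)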

wxo_agentic_evaluation/description_quality_checker.py (new file)
@@ -0,0 +1,149 @@
+ import os
+ from pathlib import Path
+ from typing import List
+ import rich
+ from enum import Enum
+
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.prompt.template_render import BadToolDescriptionRenderer
+ from wxo_agentic_evaluation.tool_planner import (
+     parse_json_string,
+     extract_tool_signatures,
+     MISSING_DOCSTRING_PROMPT,
+ )
+ from wxo_agentic_evaluation.type import ToolDefinition
+ from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+ class ToolDescriptionIssue(Enum):
+     """
+     Represents the binary outcomes the LLM judge will classify in its assessment \
+     of the tool's description.
+     The presence of these issues in the tool's description indicates poor quality.
+     For more detail on what each issue indicates, please take a look at the template here: `wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2`.
+     """
+
+     # TODO: Priority-based weighting of issues.
+     CONTAINS_REDUNDANT_INFORMATION = "contains_redundant_information"
+     USES_VAGUE_LANGUAGE = "uses_vague_language"
+     DOES_NOT_HELP_IN_IDENTIFYING_TOOL_UNIQUELY = (
+         "does_not_help_in_identifying_tool_uniquely"
+     )
+     PROVIDES_NO_NEW_INFORMATION = "provides_no_new_information"
+     DOES_NOT_CONVEY_TOOL_PURPOSE = "does_not_convey_tool_purpose"
+
+
+ class DescriptionQualityInspector:
+     DEFAULT_CLASSIFICATION_THRESHOLD = (
+         40.0  # 2/5 issues detected. A higher score indicates a worse description.
+     )
+     CLASSIFICATION_SCORE_THRESHOLD = float(
+         os.getenv("CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD)
+     )
+
+     LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
+     LLM_PARAMS = {
+         "min_new_tokens": 128,
+         "decoding_method": "greedy",
+         "max_new_tokens": 512,
+     }
+
+     WORST_POSSIBLE_EVAL_OUTCOME = len(
+         ToolDescriptionIssue
+     )  # the final score used for classification is normalized against this value.
+
+     root_dir = os.path.dirname(__file__)
+     BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH = os.path.join(
+         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
+     )
+
+     def __init__(self, llm_client=None):
+         if llm_client is None:
+             llm_client = get_provider(
+                 model_id=self.LLM_MODEL_ID,
+                 params=self.LLM_PARAMS,
+             )
+         self.llm_client = llm_client
+         self.template = BadToolDescriptionRenderer(
+             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
+         )
+         self.cached_response = (
+             None  # this is used in the unit-tests for nuanced analysis of the response.
+         )
+
+     @staticmethod
+     def extract_tool_desc_from_tool_source(
+         tool_source: Path, failing_tools: List[str]
+     ) -> List[ToolDefinition]:
+         """
+         Parses the tool source file to extract the tool description.
+         Wraps the description along with the tool name, and args into a `ToolDefinition` for all `failing_tools`.
+         This `ToolDefinition` is later rendered into the judge's prompt template for evaluation.
+         Args:
+             tool_source (Path): The path to the tool source file/dir containing `.py` tools.
+             failing_tools (List[str]): List of tool names that failed during inference.
+         Returns:
+             List[ToolDefinition]: The extracted tool definition(s) or [] if the file contains no @tool decorators.
+         """
+         all_tool_data = extract_tool_signatures(tool_source)
+
+         tool_definitions = []
+         for tool_data in all_tool_data:
+             tool_name = tool_data["Function Name"]
+             if tool_name in failing_tools:
+                 tool_definitions.append(
+                     ToolDefinition(
+                         tool_name=tool_name,
+                         tool_description=(
+                             tool_data["Docstring"]
+                             if tool_data["Docstring"] != MISSING_DOCSTRING_PROMPT
+                             else None
+                         ),
+                         tool_params=tool_data["Arguments"],
+                     )
+                 )
+         return tool_definitions
+
+     def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+         """
+         Detects if a tool description is 'bad' using an LLM judge.
+         A 'bad' description is one that:
+         - does not describe the tool's functionality/use-case clearly
+         - does not provide sufficient detail for an agent to understand how to use the tool
+         - does not distinguish the tool from other tools
+         For the exact definition of a 'bad' description, refer to `ToolDescriptionIssue` Enum.
+         Args:
+             tool_definition (ToolDefinition): The definition of the tool to evaluate.
+         Returns:
+             bool: True if the description is 'bad', False otherwise.
+         """
+         prompt = self.template.render(tool_definition=tool_definition)
+         response = self.llm_client.query(prompt)
+
+         # parse JSON objects from cleaned text
+         json_objects = parse_json_string(response)
+
+         # pick the first JSON object
+         if json_objects:
+             response_data = json_objects[0]
+             self.cached_response = response_data
+         else:
+             return False  # likely some unexpected parsing issue, in this case - flags description as good.
+
+         # calculate weighted score
+         final_description_score = self._calculate_score(response_data=response_data)
+
+         return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
+
+     def _calculate_score(self, response_data: dict) -> float:
+         """
+         Calculates a final score for the tool description.
+         This score is used to finally classify a 'good' or 'bad' description.
+         :param response_data: Parsed JSON response returned by the LLM judge.
+         """
+         detected_issues = sum(
+             1
+             for issue in ToolDescriptionIssue
+             if response_data.get(issue.value, "FALSE").upper() == "TRUE"
+         )
+         return safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
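
To make the classification rule in _calculate_score concrete: the judge returns a TRUE/FALSE flag per ToolDescriptionIssue, the score is the fraction of detected issues scaled to 0-100, and a description is flagged as 'bad' once the score reaches CLASSIFICATION_SCORE_THRESHOLD (40.0 by default, i.e. two of the five issues). A standalone sketch of that arithmetic, restated here for illustration only:

    # Illustration only; standalone restatement of the scoring rule above.
    ISSUES = [
        "contains_redundant_information",
        "uses_vague_language",
        "does_not_help_in_identifying_tool_uniquely",
        "provides_no_new_information",
        "does_not_convey_tool_purpose",
    ]

    def description_score(judge_response: dict) -> float:
        detected = sum(
            1 for issue in ISSUES
            if judge_response.get(issue, "FALSE").upper() == "TRUE"
        )
        return detected / len(ISSUES) * 100  # 0 issues -> 0.0, all 5 -> 100.0

    judge_response = {"uses_vague_language": "TRUE", "does_not_convey_tool_purpose": "TRUE"}
    score = description_score(judge_response)  # 40.0
    is_bad = score >= 40.0                     # True: two detected issues hit the default threshold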