ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/compare_runs/model.py (new file)
@@ -0,0 +1,193 @@
+ import statistics
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional
+
+ from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+ @dataclass
+ class TestCaseEvaluationResult:
+     """Class representing a single test case evaluation result."""
+
+     name: str
+     text_match: Optional[str] = None
+     is_success: bool = False
+     total_steps: float = 0
+     llm_step: float = 0
+     total_tool_calls: float = 0
+     tool_call_precision: float = 0
+     tool_call_recall: float = 0
+     agent_routing_accuracy: float = 0
+     avg_resp_time: float = 0
+     failed_tool_calls: int = 0
+
+     # Store any additional metrics not explicitly defined
+     additional_metrics: Dict[str, Any] = field(default_factory=dict)
+
+     def matches_count(self, match_value: str = "Summary Matched") -> int:
+         """Check if this test case matches the specified value."""
+         return 1 if self.text_match == match_value else 0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert the test case result to a dictionary."""
+         result = {
+             "dataset_name": self.name,
+             "text_match": self.text_match,
+             "is_success": self.is_success,
+             "total_steps": self.total_steps,
+             "llm_step": self.llm_step,
+             "total_tool_calls": self.total_tool_calls,
+             "tool_call_precision": self.tool_call_precision,
+             "tool_call_recall": self.tool_call_recall,
+             "agent_routing_accuracy": self.agent_routing_accuracy,
+             "avg_resp_time": self.avg_resp_time,
+             "failed_tool_calls": self.failed_tool_calls,
+         }
+         # Add any additional metrics
+         result.update(self.additional_metrics)
+         return result
+
+
+ @dataclass
+ class EvaluationResult:
+     """Class representing a collection of test case evaluation results."""
+
+     test_case_results: Dict[str, TestCaseEvaluationResult]
+
+     @classmethod
+     def from_csv(cls, data: List[Dict[str, Any]]) -> "EvaluationResult":
+         """Create an EvaluationResult from CSV data."""
+         results = {}
+         for row in data:
+             name = row["dataset_name"]
+
+             # Extract standard fields
+             standard_fields = {
+                 "name": name,
+                 "text_match": row.get("text_match"),
+                 "is_success": row.get("is_success", False),
+                 "total_steps": row.get("total_steps", 0),
+                 "llm_step": row.get("llm_step", 0),
+                 "total_tool_calls": row.get("total_tool_calls", 0),
+                 "tool_call_precision": row.get("tool_call_precision", 0),
+                 "tool_call_recall": row.get("tool_call_recall", 0),
+                 "agent_routing_accuracy": row.get("agent_routing_accuracy", 0),
+                 "avg_resp_time": row.get("avg_resp_time", 0),
+                 "failed_tool_calls": row.get("failed_tool_calls", 0),
+             }
+
+             # Extract additional fields not in the standard set
+             additional_metrics = {}
+             for key, value in row.items():
+                 if key not in standard_fields and key != "dataset_name":
+                     additional_metrics[key] = value
+
+             # Create the test case result
+             result = TestCaseEvaluationResult(
+                 **standard_fields, additional_metrics=additional_metrics
+             )
+             results[name] = result
+
+         return cls(results)
+
+     def calculate_boolean_percent_true(
+         self, values: List[bool]
+     ) -> Dict[str, Any]:
+         """Calculate statistics for boolean values."""
+         return {
+             "mean": sum(1 for v in values if v) / len(values) if values else 0,
+             "count": len(values),
+             "true_count": sum(1 for v in values if v),
+             "false_count": sum(1 for v in values if not v),
+         }
+
+     def calculate_numeric_statistics(
+         self, values: List[float]
+     ) -> Dict[str, Any]:
+         """Calculate statistics for numeric values."""
+         try:
+             stats = {
+                 "mean": statistics.mean(values),
+                 "median": statistics.median(values),
+                 "min": min(values),
+                 "max": max(values),
+                 "count": len(values),
+             }
+             if len(values) > 1:
+                 stats["std_dev"] = statistics.stdev(values)
+             return stats
+         except statistics.StatisticsError:
+             # Handle empty lists or other statistical errors
+             return {"error": "Could not compute statistics"}
+
+     def compute_summary_statistics(self) -> Dict[str, Any]:
+         """Compute summary statistics for all test cases."""
+         stats = {}
+
+         if not self.test_case_results:
+             return stats
+
+         # Get all fields from the first test case
+         first_result = next(iter(self.test_case_results.values()))
+         first_dict = first_result.to_dict()
+
+         # Identify numeric and boolean columns
+         for key, value in first_dict.items():
+             if key == "dataset_name" or key == "text_match":
+                 continue
+
+             # Collect values for this field from all test cases
+             values = []
+             for result in self.test_case_results.values():
+                 result_dict = result.to_dict()
+                 if key in result_dict:
+                     values.append(result_dict[key])
+
+             # Calculate statistics based on value type
+             if values:
+                 if all(isinstance(v, bool) for v in values):
+                     stats[key] = self.calculate_boolean_percent_true(values)
+                 elif all(isinstance(v, (int, float)) for v in values):
+                     stats[key] = self.calculate_numeric_statistics(values)
+
+         # Count summary matches
+         match_count = sum(
+             result.matches_count() for result in self.test_case_results.values()
+         )
+         stats["summary_matched_count"] = {
+             "count": match_count,
+             "percentage": (
+                 round(match_count / len(self.test_case_results) * 100, 2)
+                 if self.test_case_results
+                 else 0
+             ),
+         }
+
+         return stats
+
+     @property
+     def test_count(self) -> int:
+         """Get the total number of test cases."""
+         return len(self.test_case_results)
+
+     @property
+     def summary_matched_count(self) -> int:
+         """Get the count of summary matched test cases."""
+         return sum(
+             result.matches_count() for result in self.test_case_results.values()
+         )
+
+     @property
+     def is_success_count(self) -> int:
+         """Get the count of successful test cases."""
+         return sum(
+             1 for result in self.test_case_results.values() if result.is_success
+         )
+
+     def summary_match_ratio(self) -> float:
+         """Calculate the ratio of summary matches to total tests."""
+         return safe_divide(self.summary_matched_count, self.test_count)
+
+     def is_success_ratio(self) -> float:
+         """Calculate the ratio of successful tests to total tests."""
+         return safe_divide(self.is_success_count, self.test_count)
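A minimal sketch of how the new EvaluationResult model might be consumed. It assumes a per-test-case CSV with a dataset_name column plus the metric columns shown above; the file name results.csv and the string-to-bool/float coercion are illustrative, not part of the package.

# Hypothetical usage sketch (not shipped in the wheel).
import csv

from wxo_agentic_evaluation.compare_runs.model import EvaluationResult

with open("results.csv", newline="") as f:  # illustrative path
    rows = list(csv.DictReader(f))

# csv.DictReader yields strings, so coerce the fields used below.
for row in rows:
    row["is_success"] = row.get("is_success", "").strip().lower() == "true"
    for key in ("tool_call_precision", "tool_call_recall", "avg_resp_time"):
        if row.get(key):
            row[key] = float(row[key])

result = EvaluationResult.from_csv(rows)
print(result.test_count, result.summary_match_ratio(), result.is_success_ratio())
print(result.compute_summary_statistics().get("tool_call_precision"))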
wxo_agentic_evaluation/data_annotator.py
@@ -1,16 +1,19 @@
- from wxo_agentic_evaluation.type import Message, EvaluationData
- from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
- from wxo_agentic_evaluation.service_provider import get_provider
- from wxo_agentic_evaluation.prompt.template_render import (
-     LlamaKeywordsGenerationTemplateRenderer,
- )
- from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
-
  import ast
- import json
  import collections
+ import json
  from typing import Dict, List, Optional

+ from wxo_agentic_evaluation.arg_configs import (
+     ChatRecordingConfig,
+     KeywordsGenerationConfig,
+ )
+ from wxo_agentic_evaluation.prompt.template_render import (
+     LlamaKeywordsGenerationTemplateRenderer,
+ )
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
+ from wxo_agentic_evaluation.type import Message, OrchestrateDataset
+
  ERROR_KEYWORDS = [
      "error",
      "erroneous",
@@ -113,11 +116,11 @@ class DataAnnotator:
          self,
          messages: List[Message],
          keywords_generation_config: KeywordsGenerationConfig,
-         initial_data: Optional[EvaluationData] = None,
+         initial_data: Optional[OrchestrateDataset] = None,
      ):
          self.messages = messages
          self.keywords_generation_config = keywords_generation_config
-         self.initial_data = initial_data or EvaluationData(
+         self.initial_data = initial_data or OrchestrateDataset(
              agent="",
              story="",
              starting_sentence=messages[0].content if messages else "",
@@ -143,7 +146,9 @@ class DataAnnotator:
          )
          return wrong_tool_response_id

-     def _process_tool_call_order(self, wrong_tool_response_id: list[str]) -> list[str]:
+     def _process_tool_call_order(
+         self, wrong_tool_response_id: list[str]
+     ) -> list[str]:
          """Process and order tool calls, skipping failed ones"""
          # gather all call ids that actually got a response
          valid_call_ids = {
@@ -221,16 +226,33 @@ class DataAnnotator:
          return goals, goal_details, previous

      def _process_summarization(
-         self, previous: str, goals: Dict, goal_details: List
+         self,
+         previous: str,
+         goals: Dict,
+         goal_details: List,
+         config: ChatRecordingConfig = None,
      ) -> None:
          """Process summarization step"""
          summarize_step = None
          # we assume single summary step at the end
+         extra_kwargs = {}
+         instance_url = getattr(config, "service_url", None)
+         token = getattr(config, "token", None)
+         if instance_url:
+             extra_kwargs["instance_url"] = instance_url
+         if token:
+             extra_kwargs["token"] = token
+
          for message in self.messages[::-1]:
              if message.role == "assistant":
                  provider = get_provider(
                      model_id=self.keywords_generation_config.model_id,
-                     params={"min_new_tokens": 0, "decoding_method": "greedy", "max_new_tokens": 256},
+                     params={
+                         "min_new_tokens": 0,
+                         "decoding_method": "greedy",
+                         "max_new_tokens": 256,
+                     },
+                     **extra_kwargs,
                  )
                  kw_generator = KeywordsGenerationLLM(
                      provider=provider,
@@ -248,15 +270,19 @@ class DataAnnotator:
                  goal_details.append(summarize_step)
                  break

-         if summarize_step:
-             goals[previous] = ["summarize"]
-         else:
+         if previous is None:
+             goals["summarize"] = []
+         elif summarize_step is None:
              goals[previous] = []
+         else:
+             goals[previous] = ["summarize"]

-     def generate(self) -> Dict:
+     def generate(self, config: ChatRecordingConfig = None) -> Dict:
          """Generate the final dataset"""
          goals, goal_details, previous = self._process_tool_calls()
-         self._process_summarization(previous, goals, goal_details)
+         self._process_summarization(
+             previous, goals, goal_details, config=config
+         )

          return {
              "agent": self.initial_data.agent,
wxo_agentic_evaluation/description_quality_checker.py (new file)
@@ -0,0 +1,178 @@
+ import os
+ from enum import Enum
+ from pathlib import Path
+ from typing import List
+
+ from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
+ from wxo_agentic_evaluation.prompt.template_render import (
+     BadToolDescriptionRenderer,
+ )
+ from wxo_agentic_evaluation.service_provider import get_provider
+ from wxo_agentic_evaluation.tool_planner import (
+     MISSING_DOCSTRING_PROMPT,
+     extract_tool_signatures,
+     parse_json_string,
+ )
+ from wxo_agentic_evaluation.type import ToolDefinition
+ from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+     get_provider_kwargs,
+ )
+ from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+ class ToolDescriptionIssue(Enum):
+     """
+     Represents the binary outcomes the LLM judge will classify in its assessment \
+     of the tool's description.
+     The presence of these issues in the tool's description indicates poor quality.
+     For more detail on what each issue indicates, please take a look at the template here: `wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2`.
+     """
+
+     # TODO: Priority-based weighting of issues.
+     CONTAINS_REDUNDANT_INFORMATION = "contains_redundant_information"
+     USES_VAGUE_LANGUAGE = "uses_vague_language"
+     DOES_NOT_HELP_IN_IDENTIFYING_TOOL_UNIQUELY = (
+         "does_not_help_in_identifying_tool_uniquely"
+     )
+     PROVIDES_NO_NEW_INFORMATION = "provides_no_new_information"
+     DOES_NOT_CONVEY_TOOL_PURPOSE = "does_not_convey_tool_purpose"
+
+
+ class DescriptionQualityInspector:
+     DEFAULT_CLASSIFICATION_THRESHOLD = 40.0  # 2/5 issues detected. A higher score indicates a worse description.
+     CLASSIFICATION_SCORE_THRESHOLD = float(
+         os.getenv(
+             "CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD
+         )
+     )
+
+     LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
+     LLM_PARAMS = {
+         "min_new_tokens": 128,
+         "decoding_method": "greedy",
+         "max_new_tokens": 512,
+     }
+
+     WORST_POSSIBLE_EVAL_OUTCOME = len(
+         ToolDescriptionIssue
+     )  # the final score used for classification is normalized against this value.
+
+     root_dir = os.path.dirname(__file__)
+     BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH = os.path.join(
+         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
+     )
+
+     DEFAULT_PROVIDER_KWARGS = {
+         "model_id": LLM_MODEL_ID,
+         "params": LLM_PARAMS,
+     }
+
+     def __init__(self, llm_client=None):
+
+         if llm_client is None:
+
+             provider_kwargs = get_provider_kwargs(
+                 **self.DEFAULT_PROVIDER_KWARGS,
+             )
+
+             llm_client = get_provider(
+                 **provider_kwargs,
+             )
+
+         self.llm_client = llm_client
+         self.template = BadToolDescriptionRenderer(
+             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
+         )
+         self.cached_response = None  # this is used in the unit-tests for nuanced analysis of the response.
+
+     @staticmethod
+     def extract_tool_desc_from_tool_source(
+         tool_source: Path, failing_tools: List[str]
+     ) -> List[ToolDefinition]:
+         """
+         Parses the tool source file to extract the tool description.
+         Wraps the description along with the tool name, and args into a `ToolDefinition` for all `failing_tools`.
+         This `ToolDefinition` is later rendered into the judge's prompt template for evaluation.
+         Args:
+             tool_source (Path): The path to the tool source file/dir containing `.py` tools.
+             failing_tools (List[str]): List of tool names that failed during inference.
+         Returns:
+             List[ToolDefinition]: The extracted tool definition(s) or [] if the file contains no @tool decorators.
+         """
+         all_tool_data = extract_tool_signatures(tool_source)
+
+         tool_definitions = []
+         for tool_data in all_tool_data:
+             tool_name = tool_data["Function Name"]
+             if tool_name in failing_tools:
+                 tool_definitions.append(
+                     ToolDefinition(
+                         tool_name=tool_name,
+                         tool_description=(
+                             tool_data["Docstring"]
+                             if tool_data["Docstring"]
+                             != MISSING_DOCSTRING_PROMPT
+                             else None
+                         ),
+                         tool_params=tool_data["Arguments"],
+                     )
+                 )
+         return tool_definitions
+
+     def detect_bad_description(
+         self, tool_definition: ToolDefinition
+     ) -> DescriptionQualityMetric:
+         """
+         Detects if a tool description is 'bad' using an LLM judge.
+         A 'bad' description is one that:
+         - does not describe the tool's functionality/use-case clearly
+         - does not provide sufficient detail for an agent to understand how to use the tool
+         - does not distinguish the tool from other tools
+         For the exact definition of a 'bad' description, refer to `ToolDescriptionIssue` Enum.
+         Args:
+             tool_definition (ToolDefinition): The definition of the tool to evaluate.
+         Returns:
+             bool: True if the description is 'bad', False otherwise.
+         """
+
+         if tool_definition.tool_description is None:
+             return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
+         prompt = self.template.render(tool_definition=tool_definition)
+         response = self.llm_client.query(prompt)
+
+         # parse JSON objects from cleaned text
+         json_objects = parse_json_string(response)
+
+         # pick the first JSON object
+         if json_objects:
+             response_data = json_objects[0]
+             self.cached_response = response_data
+         else:
+             return False  # likely some unexpected parsing issue, in this case - flags description as good.
+
+         # calculate weighted score
+         final_description_score = self._calculate_score(
+             response_data=response_data
+         )
+
+         return DescriptionQualityMetric(
+             tool_name=tool_definition.tool_name,
+             description_score=final_description_score,
+             threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+         )
+
+     def _calculate_score(self, response_data: dict) -> float:
+         """
+         Calculates a final score for the tool description.
+         This score is used to finally classify a 'good' or 'bad' description.
+         :param response_data: Parsed JSON response returned by the LLM judge.
+         """
+         detected_issues = sum(
+             1
+             for issue in ToolDescriptionIssue
+             if response_data.get(issue.value, "FALSE").upper() == "TRUE"
+         )
+         return (
+             safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
+         )
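A sketch of how the new description-quality check might be invoked, assuming provider credentials are already configured for the default LLM client; the tool-source path and tool name are illustrative. With five issue categories, each detected issue contributes 20 points, so two flagged issues yield 2/5 * 100 = 40.0, which meets the default threshold of 40.0.

# Hypothetical usage sketch (paths and tool names are illustrative).
from pathlib import Path

from wxo_agentic_evaluation.description_quality_checker import (
    DescriptionQualityInspector,
)

inspector = DescriptionQualityInspector()  # falls back to the default provider

tool_defs = DescriptionQualityInspector.extract_tool_desc_from_tool_source(
    tool_source=Path("tools/hr_tools.py"),    # illustrative path
    failing_tools=["get_time_off_schedule"],  # illustrative tool name
)

for tool_def in tool_defs:
    # Returns a DescriptionQualityMetric with the normalized score and threshold.
    metric = inspector.detect_bad_description(tool_def)
    print(metric)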
wxo_agentic_evaluation/evaluation.py (new file)
@@ -0,0 +1,50 @@
+ import json
+
+ from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+ from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
+     convert_otel_to_message,
+ )
+ from wxo_agentic_evaluation.type import EvaluationData, Message
+
+ with open(
+     "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
+     "r",
+ ) as f:
+     data = json.load(f)
+
+ tc_name = "collie_trial"
+
+
+ history = convert_otel_to_message(data["calls"][-1]["messages"])
+ for message in history:
+     print(f"{message.role}: {message.content}")
+
+
+ with open(
+     "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
+     "r",
+ ) as f:
+     gt = json.load(f)
+
+ tc_name = "collie_trial"
+
+ gt = EvaluationData.model_validate(gt)
+
+
+ evaluation_package = EvaluationPackage(
+     test_case_name=tc_name,
+     messages=history,
+     ground_truth=gt,
+     conversational_search_data=None,
+     resource_map=None,
+ )
+
+ (
+     keyword_semantic_matches,
+     knowledge_base_metrics,
+     messages_with_reason,
+     metrics,
+ ) = evaluation_package.generate_summary()
+
+
+ print(metrics)
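The new driver above hard-codes developer-local paths; a hedged sketch of how the same flow could be parameterized. The evaluate_otel_trace helper and its arguments are illustrative, while the EvaluationPackage and convert_otel_to_message calls mirror the diff above.

# Hypothetical parameterized variant of the driver above (paths are illustrative).
import json
from pathlib import Path

from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
    convert_otel_to_message,
)
from wxo_agentic_evaluation.type import EvaluationData


def evaluate_otel_trace(trace_path: Path, ground_truth_path: Path, test_case_name: str):
    """Convert an OTel trace to messages and score it against a ground-truth dataset."""
    data = json.loads(trace_path.read_text())
    history = convert_otel_to_message(data["calls"][-1]["messages"])
    gt = EvaluationData.model_validate(json.loads(ground_truth_path.read_text()))

    package = EvaluationPackage(
        test_case_name=test_case_name,
        messages=history,
        ground_truth=gt,
        conversational_search_data=None,
        resource_map=None,
    )
    # Same 4-tuple as the script above: keyword/semantic matches, knowledge-base
    # metrics, messages with reasons, and the aggregate metrics.
    return package.generate_summary()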