ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,193 @@
+import statistics
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+@dataclass
+class TestCaseEvaluationResult:
+    """Class representing a single test case evaluation result."""
+
+    name: str
+    text_match: Optional[str] = None
+    is_success: bool = False
+    total_steps: float = 0
+    llm_step: float = 0
+    total_tool_calls: float = 0
+    tool_call_precision: float = 0
+    tool_call_recall: float = 0
+    agent_routing_accuracy: float = 0
+    avg_resp_time: float = 0
+    failed_tool_calls: int = 0
+
+    # Store any additional metrics not explicitly defined
+    additional_metrics: Dict[str, Any] = field(default_factory=dict)
+
+    def matches_count(self, match_value: str = "Summary Matched") -> int:
+        """Check if this test case matches the specified value."""
+        return 1 if self.text_match == match_value else 0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the test case result to a dictionary."""
+        result = {
+            "dataset_name": self.name,
+            "text_match": self.text_match,
+            "is_success": self.is_success,
+            "total_steps": self.total_steps,
+            "llm_step": self.llm_step,
+            "total_tool_calls": self.total_tool_calls,
+            "tool_call_precision": self.tool_call_precision,
+            "tool_call_recall": self.tool_call_recall,
+            "agent_routing_accuracy": self.agent_routing_accuracy,
+            "avg_resp_time": self.avg_resp_time,
+            "failed_tool_calls": self.failed_tool_calls,
+        }
+        # Add any additional metrics
+        result.update(self.additional_metrics)
+        return result
+
+
+@dataclass
+class EvaluationResult:
+    """Class representing a collection of test case evaluation results."""
+
+    test_case_results: Dict[str, TestCaseEvaluationResult]
+
+    @classmethod
+    def from_csv(cls, data: List[Dict[str, Any]]) -> "EvaluationResult":
+        """Create an EvaluationResult from CSV data."""
+        results = {}
+        for row in data:
+            name = row["dataset_name"]
+
+            # Extract standard fields
+            standard_fields = {
+                "name": name,
+                "text_match": row.get("text_match"),
+                "is_success": row.get("is_success", False),
+                "total_steps": row.get("total_steps", 0),
+                "llm_step": row.get("llm_step", 0),
+                "total_tool_calls": row.get("total_tool_calls", 0),
+                "tool_call_precision": row.get("tool_call_precision", 0),
+                "tool_call_recall": row.get("tool_call_recall", 0),
+                "agent_routing_accuracy": row.get("agent_routing_accuracy", 0),
+                "avg_resp_time": row.get("avg_resp_time", 0),
+                "failed_tool_calls": row.get("failed_tool_calls", 0),
+            }
+
+            # Extract additional fields not in the standard set
+            additional_metrics = {}
+            for key, value in row.items():
+                if key not in standard_fields and key != "dataset_name":
+                    additional_metrics[key] = value
+
+            # Create the test case result
+            result = TestCaseEvaluationResult(
+                **standard_fields, additional_metrics=additional_metrics
+            )
+            results[name] = result
+
+        return cls(results)
+
+    def calculate_boolean_percent_true(
+        self, values: List[bool]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for boolean values."""
+        return {
+            "mean": sum(1 for v in values if v) / len(values) if values else 0,
+            "count": len(values),
+            "true_count": sum(1 for v in values if v),
+            "false_count": sum(1 for v in values if not v),
+        }
+
+    def calculate_numeric_statistics(
+        self, values: List[float]
+    ) -> Dict[str, Any]:
+        """Calculate statistics for numeric values."""
+        try:
+            stats = {
+                "mean": statistics.mean(values),
+                "median": statistics.median(values),
+                "min": min(values),
+                "max": max(values),
+                "count": len(values),
+            }
+            if len(values) > 1:
+                stats["std_dev"] = statistics.stdev(values)
+            return stats
+        except statistics.StatisticsError:
+            # Handle empty lists or other statistical errors
+            return {"error": "Could not compute statistics"}
+
+    def compute_summary_statistics(self) -> Dict[str, Any]:
+        """Compute summary statistics for all test cases."""
+        stats = {}
+
+        if not self.test_case_results:
+            return stats
+
+        # Get all fields from the first test case
+        first_result = next(iter(self.test_case_results.values()))
+        first_dict = first_result.to_dict()
+
+        # Identify numeric and boolean columns
+        for key, value in first_dict.items():
+            if key == "dataset_name" or key == "text_match":
+                continue
+
+            # Collect values for this field from all test cases
+            values = []
+            for result in self.test_case_results.values():
+                result_dict = result.to_dict()
+                if key in result_dict:
+                    values.append(result_dict[key])
+
+            # Calculate statistics based on value type
+            if values:
+                if all(isinstance(v, bool) for v in values):
+                    stats[key] = self.calculate_boolean_percent_true(values)
+                elif all(isinstance(v, (int, float)) for v in values):
+                    stats[key] = self.calculate_numeric_statistics(values)
+
+        # Count summary matches
+        match_count = sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+        stats["summary_matched_count"] = {
+            "count": match_count,
+            "percentage": (
+                round(match_count / len(self.test_case_results) * 100, 2)
+                if self.test_case_results
+                else 0
+            ),
+        }
+
+        return stats
+
+    @property
+    def test_count(self) -> int:
+        """Get the total number of test cases."""
+        return len(self.test_case_results)
+
+    @property
+    def summary_matched_count(self) -> int:
+        """Get the count of summary matched test cases."""
+        return sum(
+            result.matches_count() for result in self.test_case_results.values()
+        )
+
+    @property
+    def is_success_count(self) -> int:
+        """Get the count of successful test cases."""
+        return sum(
+            1 for result in self.test_case_results.values() if result.is_success
+        )
+
+    def summary_match_ratio(self) -> float:
+        """Calculate the ratio of summary matches to total tests."""
+        return safe_divide(self.summary_matched_count, self.test_count)
+
+    def is_success_ratio(self) -> float:
+        """Calculate the ratio of successful tests to total tests."""
+        return safe_divide(self.is_success_count, self.test_count)
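For orientation, a minimal usage sketch of the new result model added above. The import path is an assumption (the file list suggests this is the new compare_runs/model.py, the only file adding exactly 193 lines), and the sample values are invented.

# Hedged sketch: building an EvaluationResult by hand and reading its summary
# statistics. The import path below is inferred from the file list, not
# confirmed by the diff itself; the test-case values are illustrative.
from wxo_agentic_evaluation.compare_runs.model import (
    EvaluationResult,
    TestCaseEvaluationResult,
)

results = {
    "hr_agent_case_1": TestCaseEvaluationResult(
        name="hr_agent_case_1",
        text_match="Summary Matched",
        is_success=True,
        tool_call_precision=1.0,
        tool_call_recall=0.75,
        avg_resp_time=2.4,
    ),
    "hr_agent_case_2": TestCaseEvaluationResult(
        name="hr_agent_case_2",
        text_match="No Match",  # any value other than "Summary Matched" counts as a miss
        is_success=False,
        tool_call_precision=0.5,
        tool_call_recall=0.5,
        avg_resp_time=3.1,
    ),
}

evaluation = EvaluationResult(test_case_results=results)
print(evaluation.test_count)             # 2
print(evaluation.summary_match_ratio())  # 0.5, assuming safe_divide is a zero-guarded division
print(evaluation.compute_summary_statistics()["tool_call_recall"]["mean"])  # 0.625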
@@ -3,13 +3,16 @@ import collections
 import json
 from typing import Dict, List, Optional

-from wxo_agentic_evaluation.arg_configs import KeywordsGenerationConfig
+from wxo_agentic_evaluation.arg_configs import (
+    ChatRecordingConfig,
+    KeywordsGenerationConfig,
+)
 from wxo_agentic_evaluation.prompt.template_render import (
     LlamaKeywordsGenerationTemplateRenderer,
 )
 from wxo_agentic_evaluation.service_provider import get_provider
 from wxo_agentic_evaluation.service_provider.watsonx_provider import Provider
-from wxo_agentic_evaluation.type import EvaluationData, Message
+from wxo_agentic_evaluation.type import Message, OrchestrateDataset

 ERROR_KEYWORDS = [
     "error",
@@ -113,11 +116,11 @@ class DataAnnotator:
         self,
         messages: List[Message],
         keywords_generation_config: KeywordsGenerationConfig,
-        initial_data: Optional[EvaluationData] = None,
+        initial_data: Optional[OrchestrateDataset] = None,
     ):
         self.messages = messages
         self.keywords_generation_config = keywords_generation_config
-        self.initial_data = initial_data or EvaluationData(
+        self.initial_data = initial_data or OrchestrateDataset(
             agent="",
             story="",
             starting_sentence=messages[0].content if messages else "",
@@ -223,11 +226,23 @@ class DataAnnotator:
         return goals, goal_details, previous

     def _process_summarization(
-        self, previous: str, goals: Dict, goal_details: List
+        self,
+        previous: str,
+        goals: Dict,
+        goal_details: List,
+        config: ChatRecordingConfig = None,
     ) -> None:
         """Process summarization step"""
         summarize_step = None
         # we assume single summary step at the end
+        extra_kwargs = {}
+        instance_url = getattr(config, "service_url", None)
+        token = getattr(config, "token", None)
+        if instance_url:
+            extra_kwargs["instance_url"] = instance_url
+        if token:
+            extra_kwargs["token"] = token
+
         for message in self.messages[::-1]:
             if message.role == "assistant":
                 provider = get_provider(
@@ -237,6 +252,7 @@
                         "decoding_method": "greedy",
                         "max_new_tokens": 256,
                     },
+                    **extra_kwargs,
                )
                kw_generator = KeywordsGenerationLLM(
                    provider=provider,
@@ -261,10 +277,12 @@
            else:
                goals[previous] = ["summarize"]

-    def generate(self) -> Dict:
+    def generate(self, config: ChatRecordingConfig = None) -> Dict:
         """Generate the final dataset"""
         goals, goal_details, previous = self._process_tool_calls()
-        self._process_summarization(previous, goals, goal_details)
+        self._process_summarization(
+            previous, goals, goal_details, config=config
+        )

         return {
             "agent": self.initial_data.agent,
@@ -3,8 +3,7 @@ from enum import Enum
 from pathlib import Path
 from typing import List

-import rich
-
+from wxo_agentic_evaluation.metrics.metrics import DescriptionQualityMetric
 from wxo_agentic_evaluation.prompt.template_render import (
     BadToolDescriptionRenderer,
 )
@@ -15,6 +14,9 @@ from wxo_agentic_evaluation.tool_planner import (
     parse_json_string,
 )
 from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.gateway_provider_utils import (
+    get_provider_kwargs,
+)
 from wxo_agentic_evaluation.utils.utils import safe_divide


@@ -60,12 +62,23 @@ class DescriptionQualityInspector:
         root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
     )

+    DEFAULT_PROVIDER_KWARGS = {
+        "model_id": LLM_MODEL_ID,
+        "params": LLM_PARAMS,
+    }
+
     def __init__(self, llm_client=None):
+
         if llm_client is None:
+
+            provider_kwargs = get_provider_kwargs(
+                **self.DEFAULT_PROVIDER_KWARGS,
+            )
+
             llm_client = get_provider(
-                model_id=self.LLM_MODEL_ID,
-                params=self.LLM_PARAMS,
+                **provider_kwargs,
             )
+
         self.llm_client = llm_client
         self.template = BadToolDescriptionRenderer(
             self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
@@ -106,7 +119,9 @@
         )
         return tool_definitions

-    def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+    def detect_bad_description(
+        self, tool_definition: ToolDefinition
+    ) -> DescriptionQualityMetric:
         """
         Detects if a tool description is 'bad' using an LLM judge.
         A 'bad' description is one that:
@@ -119,6 +134,10 @@
         Returns:
             bool: True if the description is 'bad', False otherwise.
         """
+
+        if tool_definition.tool_description is None:
+            return DescriptionQualityMetric(tool_name=tool_definition.tool_name)
+
         prompt = self.template.render(tool_definition=tool_definition)
         response = self.llm_client.query(prompt)

@@ -137,7 +156,11 @@
             response_data=response_data
         )

-        return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
+        return DescriptionQualityMetric(
+            tool_name=tool_definition.tool_name,
+            description_score=final_description_score,
+            threshold=self.CLASSIFICATION_SCORE_THRESHOLD,
+        )

     def _calculate_score(self, response_data: dict) -> float:
         """
@@ -1,10 +1,15 @@
-from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
-from wxo_agentic_evaluation.otel_support.otel_message_conversion import convert_otel_to_message
-from wxo_agentic_evaluation.type import Message, EvaluationData
-
 import json

-with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json", "r") as f:
+from wxo_agentic_evaluation.evaluation_package import EvaluationPackage
+from wxo_agentic_evaluation.otel_support.otel_message_conversion import (
+    convert_otel_to_message,
+)
+from wxo_agentic_evaluation.type import EvaluationData, Message
+
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/collie_example.json",
+    "r",
+) as f:
     data = json.load(f)

 tc_name = "collie_trial"
@@ -15,7 +20,10 @@ for message in history:
     print(f"{message.role}: {message.content}")


-with open("/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json", "r") as f:
+with open(
+    "/Users/haodeqi/git/wxo-evaluation/src/wxo_agentic_evaluation/otel_support/data_simple.json",
+    "r",
+) as f:
     gt = json.load(f)

 tc_name = "collie_trial"
@@ -28,7 +36,7 @@ evaluation_package = EvaluationPackage(
     messages=history,
     ground_truth=gt,
     conversational_search_data=None,
-    resource_map=None
+    resource_map=None,
 )

 (
@@ -39,4 +47,4 @@
 ) = evaluation_package.generate_summary()


-print(metrics)
+print(metrics)