ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
  4. wxo_agentic_evaluation/analytics/tools/main.py +19 -25
  5. wxo_agentic_evaluation/analytics/tools/types.py +26 -11
  6. wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
  7. wxo_agentic_evaluation/analyze_run.py +1184 -97
  8. wxo_agentic_evaluation/annotate.py +7 -5
  9. wxo_agentic_evaluation/arg_configs.py +97 -5
  10. wxo_agentic_evaluation/base_user.py +25 -0
  11. wxo_agentic_evaluation/batch_annotate.py +97 -27
  12. wxo_agentic_evaluation/clients.py +103 -0
  13. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  14. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  15. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  16. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  17. wxo_agentic_evaluation/data_annotator.py +45 -19
  18. wxo_agentic_evaluation/description_quality_checker.py +178 -0
  19. wxo_agentic_evaluation/evaluation.py +50 -0
  20. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  21. wxo_agentic_evaluation/evaluation_package.py +544 -107
  22. wxo_agentic_evaluation/external_agent/__init__.py +18 -7
  23. wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
  24. wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
  25. wxo_agentic_evaluation/external_agent/types.py +8 -7
  26. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  27. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  28. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  29. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  30. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  31. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  32. wxo_agentic_evaluation/llm_matching.py +108 -5
  33. wxo_agentic_evaluation/llm_rag_eval.py +7 -4
  34. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  35. wxo_agentic_evaluation/llm_user.py +12 -6
  36. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  37. wxo_agentic_evaluation/main.py +128 -246
  38. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  39. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  40. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  41. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  42. wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
  43. wxo_agentic_evaluation/metrics/metrics.py +319 -16
  44. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  45. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  46. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  47. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  48. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  49. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  50. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  51. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  52. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  53. wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
  54. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
  55. wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
  56. wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
  57. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  58. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
  59. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  60. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
  61. wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
  62. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  63. wxo_agentic_evaluation/prompt/template_render.py +163 -12
  64. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  65. wxo_agentic_evaluation/quick_eval.py +384 -0
  66. wxo_agentic_evaluation/record_chat.py +132 -81
  67. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
  68. wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
  69. wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
  70. wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
  71. wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
  72. wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
  73. wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
  74. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
  75. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
  76. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
  77. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
  78. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  79. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
  80. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
  81. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
  82. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  83. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
  84. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
  85. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
  86. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
  87. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
  88. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
  89. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
  90. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
  91. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
  92. wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
  93. wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
  94. wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
  95. wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
  96. wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
  97. wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
  98. wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
  99. wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
  100. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
  101. wxo_agentic_evaluation/resource_map.py +6 -3
  102. wxo_agentic_evaluation/runner.py +329 -0
  103. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  104. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  105. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
  106. wxo_agentic_evaluation/scheduler.py +247 -0
  107. wxo_agentic_evaluation/service_instance.py +117 -26
  108. wxo_agentic_evaluation/service_provider/__init__.py +182 -17
  109. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  110. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
  111. wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
  112. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  113. wxo_agentic_evaluation/service_provider/provider.py +129 -10
  114. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
  115. wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
  116. wxo_agentic_evaluation/simluation_runner.py +125 -0
  117. wxo_agentic_evaluation/test_prompt.py +4 -4
  118. wxo_agentic_evaluation/tool_planner.py +141 -46
  119. wxo_agentic_evaluation/type.py +217 -14
  120. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  121. wxo_agentic_evaluation/utils/__init__.py +44 -3
  122. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  123. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  124. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  125. wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
  126. wxo_agentic_evaluation/utils/parsers.py +71 -0
  127. wxo_agentic_evaluation/utils/rich_utils.py +188 -0
  128. wxo_agentic_evaluation/utils/rouge_score.py +23 -0
  129. wxo_agentic_evaluation/utils/utils.py +514 -17
  130. wxo_agentic_evaluation/wxo_client.py +81 -0
  131. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
  132. ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
  133. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  134. {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
@@ -1,24 +1,179 @@
+ import csv
+ import glob
+ import json
+ import math
+ import os
+ import re
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
  from urllib.parse import urlparse
+
+ import rich
+ import yaml
+ from rich import box, print
  from rich.console import Console, Group
- from rich.table import Table
  from rich.panel import Panel
  from rich.rule import Rule
- from rich import box
- from rich import print
-
- from typing import List
+ from rich.style import Style
+ from rich.table import Table

  from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
- from wxo_agentic_evaluation.metrics.metrics import KnowledgeBaseMetricSummary
- from wxo_agentic_evaluation.type import ConversationalConfidenceThresholdScore
+ from wxo_agentic_evaluation.metrics.metrics import (
+     KnowledgeBaseMetricSummary,
+     ReferenceLessEvalMetrics,
+     ToolCallAndRoutingMetrics,
+ )
+ from wxo_agentic_evaluation.type import (
+     ConversationalConfidenceThresholdScore,
+     ExtendedMessage,
+     Message,
+ )

  console = Console()

+ RUN_FILE_RE = re.compile(
+     r"^(?P<base>.+)\.run(?P<run>\d+)\.(?P<kind>messages(?:\.analyze)?|metrics)\.json$"
+ )
+ N_A = "N/A"
+
+ # File name constants
+ REFERENCE_FILE_NAME = "reference"
+ EXPERIMENT_FILE_NAME = "experiment"
+
+
+ class AttackResultsTable:
+     def __init__(self, attack_results: dict):
+         self.table = Table(
+             title="Attack Results",
+             box=box.ROUNDED,
+             show_lines=True,
+         )
+         self.table.add_column("Attack Category", style="magenta")
+         self.table.add_column("Count", style="cyan")
+         self.table.add_column("Success Rate", style="green")
+
+         # Extract values
+         n_on_policy = attack_results.get("n_on_policy_attacks", 0)
+         n_off_policy = attack_results.get("n_off_policy_attacks", 0)
+         n_on_policy_successful = attack_results.get("n_on_policy_successful", 0)
+         n_off_policy_successful = attack_results.get(
+             "n_off_policy_successful", 0
+         )
+
+         # Calculate success rates
+         on_policy_rate = (
+             f"{round(100 * safe_divide(n_on_policy_successful, n_on_policy))}%"
+             if n_on_policy
+             else "0%"
+         )
+         off_policy_rate = (
+             f"{round(100 * safe_divide(n_off_policy_successful, n_off_policy))}%"
+             if n_off_policy
+             else "0%"
+         )
+
+         self.table.add_row("On Policy", str(n_on_policy), on_policy_rate)
+         self.table.add_row("Off Policy", str(n_off_policy), off_policy_rate)
+
+     def print(self):
+         console.print(self.table)
+
+
+ class TestCaseResources:
+     def __init__(self, output_dir: str):
+         """Todo flesh out for all resources that are saved"""
+         self.output_dir = Path(output_dir)
+
+     @property
+     def get_summary(self):
+         summary = []
+
+         with open(self.output_dir / "summary_metrics.csv", "r") as f:
+             reader = csv.reader(f)
+             header = next(reader)
+             for row in reader:
+                 summary.append(dict(zip(header, row)))
+
+         return summary
+
+     def get_analyze_messages(
+         self, test_case_name=None, path=None
+     ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
+         test_messages = []
+
+         if test_case_name:
+             path = os.path.join(
+                 self.output_dir,
+                 "messages",
+                 f"{test_case_name}.messages.analyze.json",
+             )
+
+         if not Path(str(path)).is_file():
+             rich.print(f"[r]No analyze file found at {path}")
+             raise Exception(f"No analyze file found at {path}")
+
+         with open(path, "r", encoding="utf-8") as f:
+             temp = json.load(f)
+             meta = None
+             if temp and isinstance(temp[-1], dict) and "meta" in temp[-1]:
+                 meta = temp[-1]["meta"]
+                 temp = temp[:-1]
+
+             for entry in temp:
+                 msg = ExtendedMessage(**entry)
+                 test_messages.append(msg)
+
+         return test_messages, meta
+
+     def get_messages(self, test_case_name=None, path=None) -> List[Message]:
+         test_messages = []
+
+         if test_case_name:
+             path = os.path.join(
+                 self.output_dir,
+                 "messages",
+                 f"{test_case_name}.messages.json",
+             )
+
+         if not Path(str(path)).is_file():
+             rich.print(f"[r]No messages file found at {path}")
+             raise Exception(f"No messages file found at {path}")
+
+         with open(path, "r", encoding="utf-8") as f:
+             temp = json.load(f)
+             for entry in temp:
+                 msg = Message(**entry)
+                 test_messages.append(msg)
+
+         return test_messages
+
+     def get_test_metrics(
+         self, test_case_name=None, path=None
+     ) -> ToolCallAndRoutingMetrics:
+         if test_case_name:
+             path = os.path.join(
+                 self.output_dir,
+                 "messages",
+                 f"{test_case_name}.metrics.json",
+             )
+
+         if not Path(str(path)).is_file():
+             rich.print(f"[r]No metrics file found at {path}")
+             raise Exception(f"No metrics file found at {path}")
+
+         with open(path, "r", encoding="utf-8") as f:
+             metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+         return metrics
+

  class AgentMetricsTable:
-     def __init__(self, data):
+     def __init__(self, data, title: Optional[str] = None):
+         if title is None:
+             title = "Agent Metrics"
          self.table = Table(
-             title="Agent Metrics",
+             title=title,
              box=box.ROUNDED,
              show_lines=True,
          )
@@ -39,7 +194,9 @@ class AgentMetricsTable:
          console.print(self.table)


- def create_table(data: List[dict]) -> AgentMetricsTable:
+ def create_table(
+     data: List[dict], title: Optional[str] = None
+ ) -> AgentMetricsTable:
      """
      Generate a Rich table from a list of dictionaries.
      Returns the AgentMetricsTable instance.
@@ -51,14 +208,55 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
          print("create_table() received an empty dataset. No table generated.")
          return None

-     return AgentMetricsTable(data)
+     return AgentMetricsTable(data, title=title)
+
+
+ def mean(vals: List[float]) -> float:
+     """
+     Calculate the mean of a list of values.
+
+     Args:
+         vals: List of values
+
+     Returns:
+         Mean value
+     """
+     return round(sum(vals) / len(vals), 2) if vals else 0.0
+
+
+ def to_pct(value: float | None, decimals: int = 0) -> str:
+     """
+     Convert a value to a percentage string.
+
+     Args:
+         value: Value to convert
+         decimals: Number of decimal places
+
+     Returns:
+         Percentage string
+     """
+     if value is None:
+         return "NA"
+     try:
+         return f"{round(float(value) * 100, decimals)}%"
+     except Exception:
+         return "NA"
+
+
+ def average(array) -> float:
+     if len(array) == 0:
+         return math.nan
+
+     else:
+         return sum(array) / len(array)


  def safe_divide(nom, denom):
      if denom == 0:
          return 0
      else:
-         return nom/denom
+         return nom / denom
+

  def is_saas_url(service_url: str) -> bool:
      hostname = urlparse(service_url).hostname
@@ -70,22 +268,132 @@ def is_ibm_cloud_url(service_url: str) -> bool:
      return ".cloud.ibm.com" in hostname


- def add_line_seperator():
-     console.print(Rule(style="grey42"))
+ def add_line_seperator(
+     style_config: Optional[Union[str, Style]] = None,
+     print=True,
+ ):
+     """
+     Adds a lined seperator provided the style config.
+     `print` is a boolean to indicate if the lined seperator should go to stdout immeadiatly or returned as an object.
+     Set `print` to False, the lined seperator is printed later as part of the pager view for example.
+     """
+
+     if not style_config:
+         style = "grey42"
+     else:
+         style = style_config
+
+     if print:
+         console.print(
+             Rule(
+                 style=style,
+             )
+         )
+     else:
+         return Rule(style=style, characters="==")
+
+
+ def get_reference_column(base_name: str) -> str:
+     """Generate a column name with the reference suffix."""
+     return f"{base_name}_{REFERENCE_FILE_NAME}"
+
+
+ def get_experiment_column(base_name: str) -> str:
+     """Generate a column name with the experiment suffix."""
+     return f"{base_name}_{EXPERIMENT_FILE_NAME}"
+
+
+ def get_diff_column(base_name: str) -> str:
+     """Generate a diff column name."""
+     return f"{base_name}_diff"
+
+
+ def get_column_value(
+     row: Dict[str, Any], base_name: str, file_type: str
+ ) -> Any:
+     """Get a value from a column with the appropriate suffix.
+
+     Args:
+         row: The data row
+         base_name: The base column name
+         file_type: Either 'reference' or 'experiment'
+
+     Returns:
+         The value from the column, or None if not found
+     """
+     if file_type.lower() == "reference":
+         key = get_reference_column(base_name)
+     elif file_type.lower() == "experiment":
+         key = get_experiment_column(base_name)
+     else:
+         raise ValueError(f"Invalid file_type: {file_type}")
+
+     return row.get(key)
+
+
+ def has_column_in_both(row: Dict[str, Any], base_name: str) -> bool:
+     """Check if a column exists with both reference and experiment suffixes."""
+     return (
+         get_reference_column(base_name) in row
+         and get_experiment_column(base_name) in row
+     )
+
+
+ def format_ratio(ratio: Optional[float]) -> str:
+     """Format a ratio as a percentage string."""
+     if ratio is None:
+         return "N/A"
+     return f"{ratio * 100:.1f}%"
+
+
+ def read_file(path: str, type: str = "csv") -> List[Dict[str, Any]]:
+     """Read a file and return its contents as a structured object."""
+     if type == "csv":
+         return read_csv_file(path)
+     elif type == "json":
+         # Add JSON reading logic if needed
+         raise NotImplementedError("JSON reading not yet implemented")
+     else:
+         raise ValueError(f"Unsupported file type: {type}")
+
+
+ def read_csv_file(file_path: str) -> List[Dict[str, Any]]:
+     """Read a CSV file and return a list of dictionaries."""
+     data = []
+     with open(file_path, "r") as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             # Convert numeric values to appropriate types
+             for key, value in row.items():
+                 if key == "dataset_name" or key == "text_match":
+                     continue
+                 elif key == "is_success":
+                     row[key] = value.lower() == "true"
+                 else:
+                     try:
+                         row[key] = float(value)
+                     except ValueError:
+                         pass
+             data.append(row)
+     return data


  class FaithfulnessTable:
      def __init__(
          self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
      ):
-         self.table = Table(title="Faithfulness", box=box.ROUNDED, show_lines=True)
+         self.table = Table(
+             title="Faithfulness", box=box.ROUNDED, show_lines=True
+         )

          self.table.add_column("Tool Call Id", style="blue")
          self.table.add_column("Faithfulness Score", style="blue3")
          self.table.add_column("Evidence", style="cyan")
          self.table.add_column("Reasoning", style="yellow3")

-         for tool_call_id, faithfulness in zip(tool_call_ids, faithfulness_metrics):
+         for tool_call_id, faithfulness in zip(
+             tool_call_ids, faithfulness_metrics
+         ):
              faithfulness = faithfulness.table()
              self.table.add_row(
                  tool_call_id,
@@ -139,7 +447,9 @@ class KnowledgePanel:
          self.confidence_scores = ConversationalSearchTable(
              confidence_scores, tool_call_id
          )
-         self.group = Group(self.faithfulness.table, self.confidence_scores.table)
+         self.group = Group(
+             self.faithfulness.table, self.confidence_scores.table
+         )

          # Panel acts as a section
          self.section = Panel(
@@ -183,3 +493,190 @@ class SummaryPanel:

      def print(self):
          console.print(self.table)
+
+
+ class Tokenizer:
+     PATTERN = r"""
+         \w+(?=n't)| # Words before n't contractions (e.g., "do" in "don't")
+         n't| # n't contractions themselves
+         \w+(?=')| # Words before apostrophes (e.g., "I" in "I'm")
+         '| # Apostrophes as separate tokens
+         \w+| # Regular words (letters, numbers, underscores)
+         [^\w\s] # Punctuation marks (anything that's not word chars or whitespace)
+     """
+
+     def __init__(self):
+         self.compiled_pattern = re.compile(
+             self.PATTERN, re.VERBOSE | re.IGNORECASE
+         )
+
+     def __call__(self, text: str) -> List[str]:
+         """
+         Tokenizes text by splitting on punctuation and handling contractions.
+
+         Args:
+             text: Input text to tokenize.
+
+         Returns:
+             List of tokenized words (lowercase, no punctuation).
+
+         Examples:
+             - "I'm fine" -> ['i', 'm', 'fine']
+             - "don't go" -> ['do', "n't", 'go']
+             - "Hello, world!" -> ['hello', 'world']
+         """
+
+         tokens = self.compiled_pattern.findall(text)
+
+         return self._clean_tokens(tokens)
+
+     def _clean_tokens(self, raw_tokens: List[str]) -> List[str]:
+         """
+         Applies some basic post-processing to tokenized messages.
+
+         Args:
+             raw_tokens: list of tokens extracted from a message.
+         """
+
+         filtered_tokens = [
+             token.lower()
+             for token in raw_tokens
+             if token.strip() and not (len(token) == 1 and not token.isalnum())
+         ]
+
+         return filtered_tokens
+
+
+ class ReferencelessEvalPanel:
+     def __init__(self, referenceless_metrics: List[ReferenceLessEvalMetrics]):
+         self.table = Table(
+             title="Quick Evaluation Summary Metrics",
+             box=box.ROUNDED,
+             show_lines=True,
+         )
+
+         self.table.add_column("Dataset", style="yellow", justify="center")
+         self.table.add_column(
+             "Tool Calls", style="deep_sky_blue1", justify="center"
+         )
+         self.table.add_column(
+             "Successful Tool Calls", style="magenta", justify="center"
+         )
+         self.table.add_column(
+             "Tool Calls Failed due to Schema Mismatch",
+             style="deep_sky_blue1",
+             justify="center",
+         )
+         self.table.add_column(
+             "Tool Calls Failed due to Hallucination",
+             style="magenta",
+             justify="center",
+         )
+
+         for metric in referenceless_metrics:
+             self.table.add_row(
+                 str(metric.dataset_name),
+                 str(metric.number_of_tool_calls),
+                 str(metric.number_of_successful_tool_calls),
+                 str(metric.number_of_static_failed_tool_calls),
+                 str(metric.number_of_semantic_failed_tool_calls),
+             )
+
+     def print(self):
+         console.print(self.table)
+
+
+ # Function to load messages from JSON file
+ def load_messages(file_path):
+     """TODO: replace in favor of TestCaseResources.get_messages(...)"""
+     with open(file_path, "r") as f:
+         try:
+             message_data = json.load(f)
+             messages = []
+             for msg in message_data:
+                 messages.append(Message.model_validate(msg))
+
+             return messages
+
+         except Exception as e:
+             print(file_path)
+             print(e)
+             return None
+
+
+ def load_agents_from_disk(agents_path: str):
+     agents_json = glob.glob(os.path.join(agents_path, "*.json"))
+     agents_yaml = glob.glob(os.path.join(agents_path, "*.yaml"))
+
+     agents = []
+
+     for agent_path in agents_json:
+         with open(agent_path, "r") as f:
+             agents.append(json.load(f))
+
+     for agent_path in agents_yaml:
+         with open(agent_path, "r") as f:
+             agents.append(yaml.safe_load(f))
+
+     return agents
+
+
+ def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
+     """
+     Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None}
+     (We only need analyze+metrics for this feature.)
+
+     `filter_run` only get gets the runs files for that run. If it is -1, then all run files are retrieved
+     For example, if there is `data3.run1.messages.json`, `data3.run2.messages.json`, and filter_run is 2, then,
+     the files related to only the second run are retrieved.
+
+     """
+     runs = defaultdict(
+         lambda: {"analyze": None, "metrics": None, "messages": None}
+     )
+     for fn in os.listdir(messages_dir):
+         m = RUN_FILE_RE.match(fn)
+         if not m or m.group("base") != dataset_base:
+             continue
+         run_id = int(m.group("run"))
+         if filter_run != -1 and run_id != filter_run:
+             continue
+
+         kind = m.group("kind")
+         full = os.path.join(messages_dir, fn)
+         if kind == "messages.analyze":
+             runs[run_id]["analyze"] = full
+         elif kind == "metrics":
+             runs[run_id]["metrics"] = full
+         elif kind == "messages":
+             runs[run_id]["messages"] = full
+     return runs
+
+
+ def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
+     """Todo remove in a later PR"""
+     with open(metrics_path, "r", encoding="utf-8") as f:
+         return ToolCallAndRoutingMetrics(**json.load(f))
+
+
+ def csv_dump(file_path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
+     """
+     Write rows to a CSV file.
+
+     Args:
+         file_path: Path to the output CSV file
+         rows: List of dictionaries representing CSV rows
+     """
+     if not rows:
+         return
+
+     # Ensure the parent directory exists
+     if isinstance(file_path, str):
+         file_path = Path(file_path)
+     file_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Write to CSV
+     with open(file_path, "w", newline="") as f:
+         writer = csv.DictWriter(f, fieldnames=rows[0].keys())
+         writer.writeheader()
+         writer.writerows(rows)
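For orientation, here is a minimal usage sketch of the run-file helpers added above (Tokenizer, list_run_files, load_run_metrics, csv_dump). It is not part of the diff: it assumes these helpers are importable from wxo_agentic_evaluation.utils.utils (the utils/utils.py entry in the file list) and that ToolCallAndRoutingMetrics is a Pydantic v2 model with model_dump(); the output/messages directory and the data3 base name are illustrative placeholders.

# Hypothetical sketch only; not part of the packaged diff.
from wxo_agentic_evaluation.utils.utils import (  # assumed module path
    Tokenizer,
    csv_dump,
    list_run_files,
    load_run_metrics,
)

# Map run_id -> {"analyze": ..., "metrics": ..., "messages": ...} for files named
# like data3.run1.messages.json / data3.run1.metrics.json under output/messages/.
runs = list_run_files("output/messages", "data3", filter_run=-1)

rows = []
for run_id, files in sorted(runs.items()):
    if files["metrics"]:
        metrics = load_run_metrics(files["metrics"])
        # Assumes a Pydantic v2 model; use .dict() instead on v1 models.
        rows.append({"run": run_id, **metrics.model_dump()})

# csv_dump() creates parent directories and derives the header from the first row.
csv_dump("output/summary_by_run.csv", rows)

# The regex-based Tokenizer lower-cases words and keeps contraction suffixes:
print(Tokenizer()("Don't panic, world!"))  # ['do', "n't", 'panic', 'world']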
@@ -0,0 +1,81 @@
+ import os
+ from typing import Any, Dict, Optional
+
+ import requests
+ import urllib3
+ from urllib3.exceptions import InsecureRequestWarning
+
+ from wxo_agentic_evaluation.service_instance import tenant_setup
+
+
+ class WXOClient:
+     def __init__(
+         self, service_url, api_key, env: Optional[Dict[str, Any]] = None
+     ):
+         self.service_url = service_url
+         self.api_key = api_key
+
+         ov = os.getenv("WO_SSL_VERIFY")
+         if ov and ov.strip().lower() in ("true", "false"):
+             self._verify_ssl = ov.strip().lower() == "true"
+         else:
+             v, bs = (env.get("verify") if env else None), (
+                 env.get("bypass_ssl") if env else None
+             )
+             self._verify_ssl = (
+                 False
+                 if (
+                     (bs is True)
+                     or (isinstance(bs, str) and bs.strip().lower() == "true")
+                     or (v is None)
+                     or (
+                         isinstance(v, str)
+                         and v.strip().lower() in {"none", "null"}
+                     )
+                 )
+                 else (v if isinstance(v, bool) else True)
+             )
+
+         if not self._verify_ssl:
+             urllib3.disable_warnings(InsecureRequestWarning)
+
+     def _get_headers(self) -> dict:
+         headers = {}
+         if self.api_key:
+             headers["Authorization"] = f"Bearer {self.api_key}"
+         return headers
+
+     def post(self, payload: dict, path: str, stream=False):
+         url = f"{self.service_url}/{path}"
+         return requests.post(
+             url=url,
+             headers=self._get_headers(),
+             json=payload,
+             stream=stream,
+             verify=self._verify_ssl,
+         )
+
+     def get(self, path: str, params: dict = None):
+         url = f"{self.service_url}/{path}"
+         return requests.get(
+             url,
+             params=params,
+             headers=self._get_headers(),
+             verify=self._verify_ssl,
+         )
+
+
+ def get_wxo_client(
+     service_url: Optional[str], tenant_name: str, token: Optional[str] = None
+ ) -> WXOClient:
+
+     token, resolved_url, env = tenant_setup(service_url, tenant_name)
+     service_url = service_url or resolved_url
+
+     if not (service_url and str(service_url).strip()):
+         raise ValueError(
+             f"service_url not provided and not found in config for tenant '{tenant_name}'"
+         )
+
+     wxo_client = WXOClient(service_url=service_url, api_key=token, env=env)
+     return wxo_client
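And a short, hedged sketch of how the new client module above might be used, assuming it is importable as wxo_agentic_evaluation.wxo_client (the wxo_client.py entry in the file list). The tenant name and request path are placeholders; only the behaviour visible in the diff (WO_SSL_VERIFY handling, bearer-token headers, thin requests wrappers) is relied on.

# Hypothetical sketch only; not part of the packaged diff.
import os

from wxo_agentic_evaluation.wxo_client import get_wxo_client  # assumed module path

# WO_SSL_VERIFY ("true"/"false") overrides the tenant env's verify/bypass_ssl settings.
os.environ["WO_SSL_VERIFY"] = "true"

# tenant_setup() resolves the token and service URL for the named tenant;
# "my-tenant" is a placeholder, and service_url=None falls back to the resolved URL.
client = get_wxo_client(service_url=None, tenant_name="my-tenant")

# Thin wrappers around requests.get/post with the Authorization header attached.
response = client.get("api/v1/some/endpoint", params={"limit": 10})  # placeholder path
response.raise_for_status()
print(response.json())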