ibm-watsonx-orchestrate-evaluation-framework 1.0.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/METADATA +53 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +38 -21
- wxo_agentic_evaluation/analytics/tools/main.py +19 -25
- wxo_agentic_evaluation/analytics/tools/types.py +26 -11
- wxo_agentic_evaluation/analytics/tools/ux.py +75 -31
- wxo_agentic_evaluation/analyze_run.py +1184 -97
- wxo_agentic_evaluation/annotate.py +7 -5
- wxo_agentic_evaluation/arg_configs.py +97 -5
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +97 -27
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +45 -19
- wxo_agentic_evaluation/description_quality_checker.py +178 -0
- wxo_agentic_evaluation/evaluation.py +50 -0
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +544 -107
- wxo_agentic_evaluation/external_agent/__init__.py +18 -7
- wxo_agentic_evaluation/external_agent/external_validate.py +49 -36
- wxo_agentic_evaluation/external_agent/performance_test.py +33 -22
- wxo_agentic_evaluation/external_agent/types.py +8 -7
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +108 -5
- wxo_agentic_evaluation/llm_rag_eval.py +7 -4
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +12 -6
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +128 -246
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +28 -2
- wxo_agentic_evaluation/metrics/metrics.py +319 -16
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +101 -0
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +29 -0
- wxo_agentic_evaluation/otel_support/tasks_test.py +1566 -0
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +59 -5
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +163 -12
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +384 -0
- wxo_agentic_evaluation/record_chat.py +132 -81
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +302 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +329 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +184 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +204 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +29 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +245 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +106 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +291 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +465 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +162 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +562 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +266 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +344 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +193 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +413 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +46 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +158 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +191 -0
- wxo_agentic_evaluation/resource_map.py +6 -3
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +88 -150
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +117 -26
- wxo_agentic_evaluation/service_provider/__init__.py +182 -17
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +628 -45
- wxo_agentic_evaluation/service_provider/ollama_provider.py +392 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +129 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +203 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +516 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/tool_planner.py +141 -46
- wxo_agentic_evaluation/type.py +217 -14
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +178 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/rich_utils.py +188 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +514 -17
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/METADATA +0 -380
- ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
|
@@ -1,24 +1,179 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import glob
|
|
3
|
+
import json
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
|
1
10
|
from urllib.parse import urlparse
|
|
11
|
+
|
|
12
|
+
import rich
|
|
13
|
+
import yaml
|
|
14
|
+
from rich import box, print
|
|
2
15
|
from rich.console import Console, Group
|
|
3
|
-
from rich.table import Table
|
|
4
16
|
from rich.panel import Panel
|
|
5
17
|
from rich.rule import Rule
|
|
6
|
-
from rich import
|
|
7
|
-
from rich import
|
|
8
|
-
|
|
9
|
-
from typing import List
|
|
18
|
+
from rich.style import Style
|
|
19
|
+
from rich.table import Table
|
|
10
20
|
|
|
11
21
|
from wxo_agentic_evaluation.metrics.llm_as_judge import Faithfulness
|
|
12
|
-
from wxo_agentic_evaluation.metrics.metrics import
|
|
13
|
-
|
|
22
|
+
from wxo_agentic_evaluation.metrics.metrics import (
|
|
23
|
+
KnowledgeBaseMetricSummary,
|
|
24
|
+
ReferenceLessEvalMetrics,
|
|
25
|
+
ToolCallAndRoutingMetrics,
|
|
26
|
+
)
|
|
27
|
+
from wxo_agentic_evaluation.type import (
|
|
28
|
+
ConversationalConfidenceThresholdScore,
|
|
29
|
+
ExtendedMessage,
|
|
30
|
+
Message,
|
|
31
|
+
)
|
|
14
32
|
|
|
15
33
|
console = Console()
|
|
16
34
|
|
|
35
|
+
# Matches per-run artifact names such as ``<base>.run<N>.messages.json``,
# ``<base>.run<N>.messages.analyze.json`` and ``<base>.run<N>.metrics.json``.
# Named groups: ``base`` (dataset name), ``run`` (run number), ``kind``.
RUN_FILE_RE = re.compile(
    r"^(?P<base>.+)"
    r"\.run(?P<run>\d+)"
    r"\.(?P<kind>messages(?:\.analyze)?|metrics)"
    r"\.json$"
)

# Placeholder text for a value that is not available.
N_A = "N/A"

# File name constants
REFERENCE_FILE_NAME = "reference"
EXPERIMENT_FILE_NAME = "experiment"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AttackResultsTable:
    """Rich table listing attack counts and success rates per category."""

    def __init__(self, attack_results: dict):
        """Build the table from an attack-results mapping.

        Args:
            attack_results: Mapping read for the keys ``n_on_policy_attacks``,
                ``n_off_policy_attacks``, ``n_on_policy_successful`` and
                ``n_off_policy_successful``. A missing key counts as 0.
        """
        summary = Table(title="Attack Results", box=box.ROUNDED, show_lines=True)
        for heading, colour in (
            ("Attack Category", "magenta"),
            ("Count", "cyan"),
            ("Success Rate", "green"),
        ):
            summary.add_column(heading, style=colour)

        for label, total_key, success_key in (
            ("On Policy", "n_on_policy_attacks", "n_on_policy_successful"),
            ("Off Policy", "n_off_policy_attacks", "n_off_policy_successful"),
        ):
            total = attack_results.get(total_key, 0)
            succeeded = attack_results.get(success_key, 0)
            # A falsy total short-circuits to "0%" before any division happens.
            if total:
                rate = f"{round(100 * safe_divide(succeeded, total))}%"
            else:
                rate = "0%"
            summary.add_row(label, str(total), rate)

        self.table = summary

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class TestCaseResources:
    """Reads the artifacts saved for an evaluation run from its output directory.

    Per-test-case files are looked up under ``<output_dir>/messages``.
    """

    def __init__(self, output_dir: str):
        """Todo flesh out for all resources that are saved"""
        self.output_dir = Path(output_dir)

    def _locate(self, test_case_name, path, suffix: str, label: str):
        """Resolve and validate the path of a per-test-case artifact.

        Args:
            test_case_name: When truthy, the path is derived from it and any
                explicit ``path`` is ignored.
            path: Explicit file path, used when ``test_case_name`` is falsy.
            suffix: File-name suffix appended to ``test_case_name``.
            label: Artifact kind used in the error message.

        Returns:
            The path of an existing file.

        Raises:
            FileNotFoundError: If no file exists at the resolved path.
                (A subclass of ``Exception``, which was raised previously.)
        """
        if test_case_name:
            path = os.path.join(
                self.output_dir, "messages", f"{test_case_name}.{suffix}"
            )

        if not Path(str(path)).is_file():
            rich.print(f"[r]No {label} file found at {path}")
            raise FileNotFoundError(f"No {label} file found at {path}")

        return path

    @property
    def get_summary(self):
        """Rows of ``summary_metrics.csv`` as a list of header-keyed dicts.

        All values are returned as strings. A file with no header line yields
        an empty list instead of raising ``StopIteration``.
        """
        # newline="" lets the csv module handle line endings itself.
        with open(self.output_dir / "summary_metrics.csv", "r", newline="") as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                return []
            return [dict(zip(header, row)) for row in reader]

    def get_analyze_messages(
        self, test_case_name=None, path=None
    ) -> Tuple[List[ExtendedMessage], Mapping[str, Any]]:
        """Load ``<test_case_name>.messages.analyze.json`` (or ``path``).

        Returns:
            A ``(messages, meta)`` tuple. ``meta`` is taken from a trailing
            ``{"meta": ...}`` entry when the file ends with one, else ``None``.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = self._locate(
            test_case_name, path, "messages.analyze.json", "analyze"
        )
        with open(path, "r", encoding="utf-8") as f:
            entries = json.load(f)

        meta = None
        # The last entry may carry metadata rather than a message.
        if entries and isinstance(entries[-1], dict) and "meta" in entries[-1]:
            meta = entries[-1]["meta"]
            entries = entries[:-1]

        return [ExtendedMessage(**entry) for entry in entries], meta

    def get_messages(self, test_case_name=None, path=None) -> List[Message]:
        """Load ``<test_case_name>.messages.json`` (or ``path``) as messages.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = self._locate(test_case_name, path, "messages.json", "messages")
        with open(path, "r", encoding="utf-8") as f:
            return [Message(**entry) for entry in json.load(f)]

    def get_test_metrics(
        self, test_case_name=None, path=None
    ) -> ToolCallAndRoutingMetrics:
        """Load ``<test_case_name>.metrics.json`` (or ``path``) as metrics.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        path = self._locate(test_case_name, path, "metrics.json", "metrics")
        with open(path, "r", encoding="utf-8") as f:
            return ToolCallAndRoutingMetrics(**json.load(f))
|
|
169
|
+
|
|
17
170
|
|
|
18
171
|
class AgentMetricsTable:
|
|
19
|
-
def __init__(self, data):
|
|
172
|
+
def __init__(self, data, title: Optional[str] = None):
|
|
173
|
+
if title is None:
|
|
174
|
+
title = "Agent Metrics"
|
|
20
175
|
self.table = Table(
|
|
21
|
-
title=
|
|
176
|
+
title=title,
|
|
22
177
|
box=box.ROUNDED,
|
|
23
178
|
show_lines=True,
|
|
24
179
|
)
|
|
@@ -39,7 +194,9 @@ class AgentMetricsTable:
|
|
|
39
194
|
console.print(self.table)
|
|
40
195
|
|
|
41
196
|
|
|
42
|
-
def create_table(
|
|
197
|
+
def create_table(
|
|
198
|
+
data: List[dict], title: Optional[str] = None
|
|
199
|
+
) -> AgentMetricsTable:
|
|
43
200
|
"""
|
|
44
201
|
Generate a Rich table from a list of dictionaries.
|
|
45
202
|
Returns the AgentMetricsTable instance.
|
|
@@ -51,14 +208,55 @@ def create_table(data: List[dict]) -> AgentMetricsTable:
|
|
|
51
208
|
print("create_table() received an empty dataset. No table generated.")
|
|
52
209
|
return None
|
|
53
210
|
|
|
54
|
-
return AgentMetricsTable(data)
|
|
211
|
+
return AgentMetricsTable(data, title=title)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def mean(vals: List[float]) -> float:
    """
    Return the arithmetic mean of ``vals`` rounded to two decimals.

    Args:
        vals: List of values

    Returns:
        Mean value, or 0.0 when ``vals`` is empty
    """
    if not vals:
        return 0.0

    total = sum(vals)
    return round(total / len(vals), 2)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def to_pct(value: float | None, decimals: int = 0) -> str:
|
|
228
|
+
"""
|
|
229
|
+
Convert a value to a percentage string.
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
value: Value to convert
|
|
233
|
+
decimals: Number of decimal places
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
Percentage string
|
|
237
|
+
"""
|
|
238
|
+
if value is None:
|
|
239
|
+
return "NA"
|
|
240
|
+
try:
|
|
241
|
+
return f"{round(float(value) * 100, decimals)}%"
|
|
242
|
+
except Exception:
|
|
243
|
+
return "NA"
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def average(array) -> float:
    """Return the arithmetic mean of ``array``, or NaN when it is empty."""
    count = len(array)
    return sum(array) / count if count != 0 else math.nan
|
|
55
252
|
|
|
56
253
|
|
|
57
254
|
def safe_divide(nom, denom):
    """Return ``nom / denom``, or 0 when ``denom`` equals zero."""
    return 0 if denom == 0 else nom / denom
|
|
259
|
+
|
|
62
260
|
|
|
63
261
|
def is_saas_url(service_url: str) -> bool:
|
|
64
262
|
hostname = urlparse(service_url).hostname
|
|
@@ -70,22 +268,132 @@ def is_ibm_cloud_url(service_url: str) -> bool:
|
|
|
70
268
|
return ".cloud.ibm.com" in hostname
|
|
71
269
|
|
|
72
270
|
|
|
73
|
-
def add_line_seperator(
|
|
74
|
-
|
|
271
|
+
def add_line_seperator(
    style_config: Optional[Union[str, Style]] = None,
    print=True,
):
    """
    Emit or build a horizontal rule in the given style.

    When `style_config` is falsy the rule uses the "grey42" style.
    `print` decides whether the rule goes to stdout immediately (True) or is
    returned as an object (False), e.g. to be rendered later as part of a
    pager view. The returned rule is drawn with "==" characters; the printed
    one uses the default rule characters.
    """
    style = style_config if style_config else "grey42"

    if not print:
        return Rule(style=style, characters="==")

    console.print(Rule(style=style))
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def get_reference_column(base_name: str) -> str:
    """Return ``base_name`` suffixed with ``_<REFERENCE_FILE_NAME>``."""
    return "{}_{}".format(base_name, REFERENCE_FILE_NAME)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def get_experiment_column(base_name: str) -> str:
    """Return ``base_name`` suffixed with ``_<EXPERIMENT_FILE_NAME>``."""
    return "{}_{}".format(base_name, EXPERIMENT_FILE_NAME)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def get_diff_column(base_name: str) -> str:
    """Return the name of the diff column for ``base_name``."""
    return "{}_diff".format(base_name)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def get_column_value(
    row: Dict[str, Any], base_name: str, file_type: str
) -> Any:
    """Look up the reference- or experiment-suffixed column in a row.

    Args:
        row: The data row
        base_name: The base column name
        file_type: Either 'reference' or 'experiment' (case-insensitive)

    Returns:
        The value from the column, or None if not found

    Raises:
        ValueError: If ``file_type`` is neither 'reference' nor 'experiment'
    """
    column_builders = {
        "reference": get_reference_column,
        "experiment": get_experiment_column,
    }
    build_column = column_builders.get(file_type.lower())
    if build_column is None:
        raise ValueError(f"Invalid file_type: {file_type}")

    return row.get(build_column(base_name))
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def has_column_in_both(row: Dict[str, Any], base_name: str) -> bool:
    """Return True when the row has both suffixed variants of ``base_name``."""
    required_columns = (
        get_reference_column(base_name),
        get_experiment_column(base_name),
    )
    return all(column in row for column in required_columns)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def format_ratio(ratio: Optional[float]) -> str:
    """Render a ratio as a one-decimal percentage, or "N/A" when it is None."""
    if ratio is not None:
        return "{:.1f}%".format(ratio * 100)
    return "N/A"
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def read_file(path: str, type: str = "csv") -> List[Dict[str, Any]]:
    """Read a file of the given type and return its rows.

    Only "csv" is supported: "json" raises NotImplementedError and any other
    type raises ValueError.
    """
    if type == "json":
        # Add JSON reading logic if needed
        raise NotImplementedError("JSON reading not yet implemented")

    if type != "csv":
        raise ValueError(f"Unsupported file type: {type}")

    return read_csv_file(path)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def read_csv_file(file_path: str) -> List[Dict[str, Any]]:
    """Read a CSV file and return a list of dictionaries.

    Column handling:
        - ``dataset_name`` and ``text_match`` are kept as strings.
        - ``is_success`` becomes a bool (True only for a case-insensitive "true").
        - Every other column is converted to ``float`` when it parses as one,
          and left as the original string otherwise.

    Non-string cells are left untouched. ``csv.DictReader`` produces these for
    short rows (missing cells become ``None``) and for rows with surplus cells
    (collected in a list under the ``None`` key). Previously such rows raised
    ``AttributeError`` or ``TypeError``.

    Args:
        file_path: Path of the CSV file; the first line is the header.

    Returns:
        One dictionary per data row, in file order.
    """
    text_columns = {"dataset_name", "text_match"}
    data = []
    # newline="" lets the csv module handle embedded newlines and line endings.
    with open(file_path, "r", newline="") as f:
        for row in csv.DictReader(f):
            # Only existing keys are reassigned, so iterating items() is safe.
            for key, value in row.items():
                if key in text_columns or not isinstance(value, str):
                    continue
                if key == "is_success":
                    row[key] = value.lower() == "true"
                    continue
                try:
                    row[key] = float(value)
                except ValueError:
                    # Not numeric: keep the raw string.
                    pass
            data.append(row)
    return data
|
|
75
379
|
|
|
76
380
|
|
|
77
381
|
class FaithfulnessTable:
|
|
78
382
|
def __init__(
|
|
79
383
|
self, faithfulness_metrics: List[Faithfulness], tool_call_ids: List[str]
|
|
80
384
|
):
|
|
81
|
-
self.table = Table(
|
|
385
|
+
self.table = Table(
|
|
386
|
+
title="Faithfulness", box=box.ROUNDED, show_lines=True
|
|
387
|
+
)
|
|
82
388
|
|
|
83
389
|
self.table.add_column("Tool Call Id", style="blue")
|
|
84
390
|
self.table.add_column("Faithfulness Score", style="blue3")
|
|
85
391
|
self.table.add_column("Evidence", style="cyan")
|
|
86
392
|
self.table.add_column("Reasoning", style="yellow3")
|
|
87
393
|
|
|
88
|
-
for tool_call_id, faithfulness in zip(
|
|
394
|
+
for tool_call_id, faithfulness in zip(
|
|
395
|
+
tool_call_ids, faithfulness_metrics
|
|
396
|
+
):
|
|
89
397
|
faithfulness = faithfulness.table()
|
|
90
398
|
self.table.add_row(
|
|
91
399
|
tool_call_id,
|
|
@@ -139,7 +447,9 @@ class KnowledgePanel:
|
|
|
139
447
|
self.confidence_scores = ConversationalSearchTable(
|
|
140
448
|
confidence_scores, tool_call_id
|
|
141
449
|
)
|
|
142
|
-
self.group = Group(
|
|
450
|
+
self.group = Group(
|
|
451
|
+
self.faithfulness.table, self.confidence_scores.table
|
|
452
|
+
)
|
|
143
453
|
|
|
144
454
|
# Panel acts as a section
|
|
145
455
|
self.section = Panel(
|
|
@@ -183,3 +493,190 @@ class SummaryPanel:
|
|
|
183
493
|
|
|
184
494
|
def print(self):
|
|
185
495
|
console.print(self.table)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
class Tokenizer:
    """Regex-based word tokenizer that splits off contractions and punctuation."""

    PATTERN = r"""
        \w+(?=n't)|        # Words before n't contractions (e.g., "do" in "don't")
        n't|               # n't contractions themselves
        \w+(?=')|          # Words before apostrophes (e.g., "I" in "I'm")
        '|                 # Apostrophes as separate tokens
        \w+|               # Regular words (letters, numbers, underscores)
        [^\w\s]            # Punctuation marks (anything that's not word chars or whitespace)
        """

    def __init__(self):
        flags = re.VERBOSE | re.IGNORECASE
        self.compiled_pattern = re.compile(self.PATTERN, flags)

    def __call__(self, text: str) -> List[str]:
        """
        Split text into lowercase tokens, dropping single-character punctuation.

        Args:
            text: Input text to tokenize.

        Returns:
            List of lowercase tokens. Single non-alphanumeric characters are
            removed; the "n't" contraction token is kept.

        Examples:
            - "I'm fine" -> ['i', 'm', 'fine']
            - "don't go" -> ['do', "n't", 'go']
            - "Hello, world!" -> ['hello', 'world']
        """
        return self._clean_tokens(self.compiled_pattern.findall(text))

    def _clean_tokens(self, raw_tokens: List[str]) -> List[str]:
        """
        Lowercase tokens and discard blanks and lone punctuation characters.

        Args:
            raw_tokens: list of tokens extracted from a message.
        """
        kept = []
        for token in raw_tokens:
            if not token.strip():
                continue
            # A one-character token survives only if it is alphanumeric.
            if len(token) != 1 or token.isalnum():
                kept.append(token.lower())
        return kept
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
class ReferencelessEvalPanel:
    """Rich table with one row of quick-evaluation tool-call counts per dataset."""

    def __init__(self, referenceless_metrics: List[ReferenceLessEvalMetrics]):
        """Build the table, adding one row per entry of ``referenceless_metrics``."""
        # (column header, column style, metric attribute shown in that column)
        layout = (
            ("Dataset", "yellow", "dataset_name"),
            ("Tool Calls", "deep_sky_blue1", "number_of_tool_calls"),
            (
                "Successful Tool Calls",
                "magenta",
                "number_of_successful_tool_calls",
            ),
            (
                "Tool Calls Failed due to Schema Mismatch",
                "deep_sky_blue1",
                "number_of_static_failed_tool_calls",
            ),
            (
                "Tool Calls Failed due to Hallucination",
                "magenta",
                "number_of_semantic_failed_tool_calls",
            ),
        )

        self.table = Table(
            title="Quick Evaluation Summary Metrics",
            box=box.ROUNDED,
            show_lines=True,
        )
        for header, style, _ in layout:
            self.table.add_column(header, style=style, justify="center")

        for metric in referenceless_metrics:
            cells = [str(getattr(metric, attribute)) for _, _, attribute in layout]
            self.table.add_row(*cells)

    def print(self):
        """Render the table on the module-level console."""
        console.print(self.table)
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
# Function to load messages from JSON file
|
|
590
|
+
def load_messages(file_path):
    """TODO: replace in favor of TestCaseResources.get_messages(...)

    Load a JSON array from ``file_path`` and validate each element as a
    ``Message``.

    The file is read as UTF-8, like the other JSON loaders in this module,
    rather than with the platform default encoding.

    Args:
        file_path: Path of the JSON messages file.

    Returns:
        The list of validated messages, or ``None`` when the content cannot be
        parsed or validated. In that case the path and the error are printed.
        A missing file still raises from ``open``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            return [Message.model_validate(msg) for msg in json.load(f)]
        except Exception as e:
            # Deliberately best-effort: report the failure and signal it with None.
            print(file_path)
            print(e)
            return None
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def load_agents_from_disk(agents_path: str):
    """Load every ``*.json`` and ``*.yaml`` agent definition in a directory.

    Files are read as UTF-8 and processed in sorted path order. ``glob``
    returns matches in arbitrary order, so sorting makes the result
    deterministic. JSON files come first, then YAML files.

    Args:
        agents_path: Directory to scan (not recursive).

    Returns:
        List of parsed file contents, one entry per file.
    """
    agents = []

    for agent_path in sorted(glob.glob(os.path.join(agents_path, "*.json"))):
        with open(agent_path, "r", encoding="utf-8") as f:
            agents.append(json.load(f))

    for agent_path in sorted(glob.glob(os.path.join(agents_path, "*.yaml"))):
        with open(agent_path, "r", encoding="utf-8") as f:
            agents.append(yaml.safe_load(f))

    return agents
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def list_run_files(messages_dir: str, dataset_base: str, filter_run: int = -1):
    """
    Collect the per-run artifact files of one dataset.

    Returns: dict[run_id] -> {"analyze": path|None, "metrics": path|None,
    "messages": path|None}

    `filter_run` restricts the result to the files of that run. If it is -1,
    the files of all runs are retrieved.
    For example, if there is `data3.run1.messages.json`, `data3.run2.messages.json`, and filter_run is 2, then
    only the files related to the second run are retrieved.
    """
    # Maps the regex "kind" group to the slot it fills in the per-run record.
    slot_for_kind = {
        "messages.analyze": "analyze",
        "metrics": "metrics",
        "messages": "messages",
    }
    runs = defaultdict(
        lambda: {"analyze": None, "metrics": None, "messages": None}
    )
    wants_all_runs = filter_run == -1

    for file_name in os.listdir(messages_dir):
        match = RUN_FILE_RE.match(file_name)
        if match is None or match.group("base") != dataset_base:
            continue

        run_number = int(match.group("run"))
        if not wants_all_runs and run_number != filter_run:
            continue

        slot = slot_for_kind.get(match.group("kind"))
        if slot is not None:
            runs[run_number][slot] = os.path.join(messages_dir, file_name)

    return runs
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def load_run_metrics(metrics_path: str) -> ToolCallAndRoutingMetrics:
    """Todo remove in a later PR

    Parse the UTF-8 JSON file at ``metrics_path`` into a
    ``ToolCallAndRoutingMetrics`` instance.
    """
    with open(metrics_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return ToolCallAndRoutingMetrics(**payload)
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def csv_dump(file_path: Union[str, Path], rows: List[Dict[str, Any]]) -> None:
    """
    Write rows to a CSV file.

    The header is the union of the keys of all rows, in first-seen order. A
    row that lacks a column gets an empty cell. Previously only the first
    row's keys were used, so a later row with an extra key raised
    ``ValueError``. Nothing is written when ``rows`` is empty.

    Args:
        file_path: Path to the output CSV file; parent directories are created
        rows: List of dictionaries representing CSV rows
    """
    if not rows:
        return

    # Ensure the parent directory exists
    target = Path(file_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    # Write to CSV
    with open(target, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
import urllib3
|
|
6
|
+
from urllib3.exceptions import InsecureRequestWarning
|
|
7
|
+
|
|
8
|
+
from wxo_agentic_evaluation.service_instance import tenant_setup
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class WXOClient:
    """Thin HTTP client that sends bearer-authenticated requests to a service URL."""

    def __init__(
        self, service_url, api_key, env: Optional[Dict[str, Any]] = None
    ):
        """Store the connection details and decide whether to verify TLS.

        Args:
            service_url: Base URL that request paths are appended to.
            api_key: Token sent as a bearer credential; falsy means no header.
            env: Optional settings mapping read for ``verify`` and
                ``bypass_ssl``.
        """
        self.service_url = service_url
        self.api_key = api_key
        self._verify_ssl = self._resolve_verify_ssl(env)

        if not self._verify_ssl:
            urllib3.disable_warnings(InsecureRequestWarning)

    @staticmethod
    def _resolve_verify_ssl(env):
        """Return the ``verify`` flag to pass to ``requests``.

        Precedence:
            1. ``WO_SSL_VERIFY`` environment variable, when it is "true" or
               "false" (case-insensitive, surrounding whitespace ignored).
            2. ``env["bypass_ssl"]`` being True or the string "true" disables
               verification.
            3. ``env["verify"]`` being absent/None or the string "none"/"null"
               disables verification.
            4. A boolean ``env["verify"]`` is used as-is; any other value
               enables verification.
        """
        override = os.getenv("WO_SSL_VERIFY")
        if override:
            normalized = override.strip().lower()
            if normalized in ("true", "false"):
                return normalized == "true"

        settings = env if env else {}
        verify = settings.get("verify")
        bypass = settings.get("bypass_ssl")

        if bypass is True:
            return False
        if isinstance(bypass, str) and bypass.strip().lower() == "true":
            return False
        if verify is None:
            return False
        if isinstance(verify, str) and verify.strip().lower() in {"none", "null"}:
            return False
        if isinstance(verify, bool):
            return verify
        # NOTE(review): any other value, e.g. a CA-bundle path string,
        # collapses to True rather than being passed through to requests.
        return True

    def _get_headers(self) -> dict:
        """Return the request headers; includes Authorization when a key is set."""
        if not self.api_key:
            return {}
        return {"Authorization": f"Bearer {self.api_key}"}

    def post(self, payload: dict, path: str, stream=False):
        """POST ``payload`` as JSON to ``<service_url>/<path>`` and return the response."""
        endpoint = f"{self.service_url}/{path}"
        return requests.post(
            url=endpoint,
            headers=self._get_headers(),
            json=payload,
            stream=stream,
            verify=self._verify_ssl,
        )

    def get(self, path: str, params: dict = None):
        """GET ``<service_url>/<path>`` with optional query params and return the response."""
        endpoint = f"{self.service_url}/{path}"
        return requests.get(
            endpoint,
            params=params,
            headers=self._get_headers(),
            verify=self._verify_ssl,
        )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_wxo_client(
    service_url: Optional[str], tenant_name: str, token: Optional[str] = None
) -> WXOClient:
    """Create a ``WXOClient`` for a tenant.

    Args:
        service_url: Explicit service URL; when falsy, the URL returned by
            ``tenant_setup`` is used instead.
        tenant_name: Tenant passed to ``tenant_setup``.
        token: Currently unused, see the note below.

    Returns:
        A client configured with the resolved URL, token and env settings.

    Raises:
        ValueError: If no non-blank service URL can be resolved.
    """
    # NOTE(review): the ``token`` argument is overwritten here, so a
    # caller-supplied token is never used. Confirm this is intended.
    token, tenant_url, env = tenant_setup(service_url, tenant_name)

    if not service_url:
        service_url = tenant_url

    if not service_url or not str(service_url).strip():
        raise ValueError(
            f"service_url not provided and not found in config for tenant '{tenant_name}'"
        )

    return WXOClient(service_url=service_url, api_key=token, env=env)
|