ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package as they appear in a supported public registry. It is provided for informational purposes only.
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
- ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
- wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
- wxo_agentic_evaluation/analyze_run.py +1025 -220
- wxo_agentic_evaluation/annotate.py +2 -2
- wxo_agentic_evaluation/arg_configs.py +60 -2
- wxo_agentic_evaluation/base_user.py +25 -0
- wxo_agentic_evaluation/batch_annotate.py +19 -2
- wxo_agentic_evaluation/clients.py +103 -0
- wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
- wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
- wxo_agentic_evaluation/compare_runs/diff.py +554 -0
- wxo_agentic_evaluation/compare_runs/model.py +193 -0
- wxo_agentic_evaluation/data_annotator.py +25 -7
- wxo_agentic_evaluation/description_quality_checker.py +29 -6
- wxo_agentic_evaluation/evaluation.py +16 -8
- wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
- wxo_agentic_evaluation/evaluation_package.py +414 -69
- wxo_agentic_evaluation/external_agent/__init__.py +1 -1
- wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
- wxo_agentic_evaluation/external_agent/types.py +3 -9
- wxo_agentic_evaluation/extractors/__init__.py +3 -0
- wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
- wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
- wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
- wxo_agentic_evaluation/langfuse_collection.py +60 -0
- wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
- wxo_agentic_evaluation/llm_matching.py +104 -2
- wxo_agentic_evaluation/llm_safety_eval.py +64 -0
- wxo_agentic_evaluation/llm_user.py +5 -4
- wxo_agentic_evaluation/llm_user_v2.py +114 -0
- wxo_agentic_evaluation/main.py +112 -343
- wxo_agentic_evaluation/metrics/__init__.py +15 -0
- wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
- wxo_agentic_evaluation/metrics/evaluations.py +107 -0
- wxo_agentic_evaluation/metrics/journey_success.py +137 -0
- wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
- wxo_agentic_evaluation/metrics/metrics.py +276 -8
- wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
- wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
- wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
- wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
- wxo_agentic_evaluation/otel_parser/parser.py +163 -0
- wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
- wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
- wxo_agentic_evaluation/otel_parser/utils.py +15 -0
- wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
- wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
- wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
- wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
- wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
- wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
- wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
- wxo_agentic_evaluation/prompt/template_render.py +103 -4
- wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
- wxo_agentic_evaluation/quick_eval.py +33 -17
- wxo_agentic_evaluation/record_chat.py +38 -32
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
- wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
- wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
- wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
- wxo_agentic_evaluation/resource_map.py +3 -1
- wxo_agentic_evaluation/runner.py +329 -0
- wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
- wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
- wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
- wxo_agentic_evaluation/scheduler.py +247 -0
- wxo_agentic_evaluation/service_instance.py +26 -17
- wxo_agentic_evaluation/service_provider/__init__.py +145 -9
- wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
- wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
- wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
- wxo_agentic_evaluation/service_provider/provider.py +130 -10
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
- wxo_agentic_evaluation/simluation_runner.py +125 -0
- wxo_agentic_evaluation/test_prompt.py +4 -4
- wxo_agentic_evaluation/type.py +185 -16
- wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
- wxo_agentic_evaluation/utils/__init__.py +44 -3
- wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
- wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
- wxo_agentic_evaluation/utils/messages_parser.py +30 -0
- wxo_agentic_evaluation/utils/parsers.py +71 -0
- wxo_agentic_evaluation/utils/utils.py +313 -9
- wxo_agentic_evaluation/wxo_client.py +81 -0
- ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
- wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/compare_runs/diff.py (new file, all 554 lines added)
@@ -0,0 +1,554 @@
import csv
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
from wxo_agentic_evaluation.utils.utils import (
    EXPERIMENT_FILE_NAME,
    REFERENCE_FILE_NAME,
    create_table,
    get_column_value,
    get_diff_column,
    get_experiment_column,
    get_reference_column,
    has_column_in_both,
    safe_divide,
)

# Status constants
STATUS_ONLY_IN_REFERENCE = f"Only in {REFERENCE_FILE_NAME.title()}"
STATUS_ONLY_IN_EXPERIMENT = f"Only in {EXPERIMENT_FILE_NAME.title()}"
STATUS_IN_BOTH = f"In both {REFERENCE_FILE_NAME} and {EXPERIMENT_FILE_NAME}"


@dataclass
class DiffResults:
    """Class for comparing two evaluation results."""

    result1: EvaluationResult
    result2: EvaluationResult

    # Private cache for filtered data - not included in constructor
    _filtered_data: Dict[str, List] = field(default_factory=dict, init=False)

    def _filter_joined_data(self) -> Dict[str, List[Dict[str, Any]]]:
        """Filter joined data into categories and cache the results.

        This method caches the results to avoid repeated filtering operations.
        """
        # Return cached results if already computed
        if self._filtered_data:
            return self._filtered_data

        # Get the joined data
        joined_data = self.join_datasets()

        # Filter data into categories
        only_in_reference = []
        only_in_experiment = []
        in_both = []

        for row in joined_data:
            if row["status"] == STATUS_ONLY_IN_REFERENCE:
                only_in_reference.append(row)
            elif row["status"] == STATUS_ONLY_IN_EXPERIMENT:
                only_in_experiment.append(row)
            elif row["status"] == STATUS_IN_BOTH:
                in_both.append(row)

        # Cache the filtered data
        self._filtered_data = {
            "all": joined_data,
            "only_in_reference": only_in_reference,
            "only_in_experiment": only_in_experiment,
            "in_both": in_both,
        }

        return self._filtered_data

    def get_overlapping_metrics(
        self, result: EvaluationResult, other: EvaluationResult
    ) -> Dict[str, Any]:
        """Calculate metrics for tests that overlap between two result sets."""
        overlapping_names = set(result.test_case_results.keys()) & set(
            other.test_case_results.keys()
        )

        overlapping_count = len(overlapping_names)

        overlapping_matched = sum(
            result.test_case_results[name].matches_count()
            for name in overlapping_names
            if name in result.test_case_results
        )

        overlapping_success = sum(
            1
            for name in overlapping_names
            if name in result.test_case_results
            and result.test_case_results[name].is_success
        )

        return {
            "count": overlapping_count,
            "matched": overlapping_matched,
            "success": overlapping_success,
            "matched_ratio": safe_divide(
                overlapping_matched, overlapping_count
            ),
            "success_ratio": safe_divide(
                overlapping_success, overlapping_count
            ),
        }

    def _join_datasets_on_name(
        self, data1: List[Dict[str, Any]], data2: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Join two datasets on dataset_name and compute differences."""
        # Create dictionaries for quick lookup
        dict1 = {row["dataset_name"]: row for row in data1}
        dict2 = {row["dataset_name"]: row for row in data2}

        # Get all unique dataset names
        all_names = set(dict1.keys()) | set(dict2.keys())

        # Initialize result
        joined_data = []

        for name in all_names:
            result_row = {"dataset_name": name}

            # Handle datasets that exist in only one file
            if name not in dict1:
                result_row["status"] = STATUS_ONLY_IN_EXPERIMENT
                result_row.update(
                    {
                        get_experiment_column(k): v
                        for k, v in dict2[name].items()
                        if k != "dataset_name"
                    }
                )
                joined_data.append(result_row)
                continue

            if name not in dict2:
                result_row["status"] = STATUS_ONLY_IN_REFERENCE
                result_row.update(
                    {
                        get_reference_column(k): v
                        for k, v in dict1[name].items()
                        if k != "dataset_name"
                    }
                )
                joined_data.append(result_row)
                continue

            # Dataset exists in both files
            result_row["status"] = STATUS_IN_BOTH

            # Add values from both files and compute differences
            row1 = dict1[name]
            row2 = dict2[name]

            for key in set(row1.keys()) | set(row2.keys()):
                if key == "dataset_name":
                    continue

                # Handle keys that exist in only one file
                if key not in row1:
                    result_row[get_experiment_column(key)] = row2[key]
                    result_row[get_diff_column(key)] = (
                        f"{STATUS_ONLY_IN_EXPERIMENT}: {row2[key]}"
                    )
                    continue

                if key not in row2:
                    result_row[get_reference_column(key)] = row1[key]
                    result_row[get_diff_column(key)] = (
                        f"{STATUS_ONLY_IN_REFERENCE}: {row1[key]}"
                    )
                    continue

                # Add values from both files
                result_row[get_reference_column(key)] = row1[key]
                result_row[get_experiment_column(key)] = row2[key]

                # Compute difference
                if isinstance(row1[key], (int, float)) and isinstance(
                    row2[key], (int, float)
                ):
                    diff = row2[key] - row1[key]
                    result_row[get_diff_column(key)] = diff
                elif isinstance(row1[key], bool) and isinstance(
                    row2[key], bool
                ):
                    result_row[get_diff_column(key)] = (
                        "No change" if row1[key] == row2[key] else "Changed"
                    )
                else:
                    result_row[get_diff_column(key)] = (
                        "Same" if row1[key] == row2[key] else "Different"
                    )

            joined_data.append(result_row)
        return joined_data

    def join_datasets(self) -> List[Dict[str, Any]]:
        """Join datasets from both evaluation results and compute differences."""
        # Convert test case results to the format expected by _join_datasets_on_name
        data1 = [
            result.to_dict()
            for result in self.result1.test_case_results.values()
        ]
        data2 = [
            result.to_dict()
            for result in self.result2.test_case_results.values()
        ]
        return self._join_datasets_on_name(data1, data2)

    def summary_statistics(self) -> List[Dict[str, Any]]:
        """Generate summary statistics comparing the two evaluation results."""
        # Calculate overlapping metrics
        overlap1 = self.get_overlapping_metrics(self.result1, self.result2)
        overlap2 = self.get_overlapping_metrics(self.result2, self.result1)

        overlapping_tests = overlap1["count"]  # Same for both

        return [
            {
                "Metric": "Total Tests",
                "Reference": self.result1.test_count,
                "Experiment": self.result2.test_count,
                "Experiment - Reference": self.result2.test_count
                - self.result1.test_count,
            },
            {
                "Metric": "Overlapping Tests",
                "Reference": overlapping_tests,
                "Experiment": overlapping_tests,
                "Experiment - Reference": 0,
            },
            {
                "Metric": "Summary Matches",
                "Reference": overlap1["matched"],
                "Experiment": overlap2["matched"],
                "Experiment - Reference": overlap2["matched"]
                - overlap1["matched"],
            },
            {
                "Metric": "Is Success",
                "Reference": overlap1["success"],
                "Experiment": overlap2["success"],
                "Experiment - Reference": overlap2["success"]
                - overlap1["success"],
            },
            # {"Metric": "Summary Match / Overlapping Tests",
            #  "Reference": format_ratio(overlap1['matched_ratio']),
            #  "Experiment": format_ratio(overlap2['matched_ratio']),
            #  "Difference": f"{(overlap2['matched_ratio'] - overlap1['matched_ratio']) * 100:.1f}%" if overlapping_tests > 0 else "N/A"},
            # {"Metric": "Is Success / Overlapping Tests",
            #  "Reference": format_ratio(overlap1['success_ratio']),
            #  "Experiment": format_ratio(overlap2['success_ratio']),
            #  "Difference": f"{(overlap2['success_ratio'] - overlap1['success_ratio']) * 100:.1f}%" if overlapping_tests > 0 else "N/A"},
        ]

    def compute_tabular_diff(
        self, do_display: bool = True, verbose: bool = True
    ) -> List[Dict[str, Any]]:
        """Display the differences in a tabular format with one row per dataset."""
        # Get filtered data using the caching mechanism
        filtered_data = self._filter_joined_data()
        joined_data = filtered_data["all"]
        in_both = filtered_data["in_both"]

        # Collect all possible column names (excluding dataset_name, status, and non-diff columns)
        # Only include columns with numeric values
        all_columns = set()
        for row in joined_data:
            for key in row.keys():
                if key.endswith("_diff") and not key.endswith("_percent_diff"):
                    # Extract the base column name without the _diff suffix
                    base_column = key[
                        :-5
                    ]  # Still need this for backward compatibility
                    # Check if the diff value is numeric
                    if isinstance(row[key], (int, float)):
                        all_columns.add(base_column)

        # Define preferred column order based on main.py
        preferred_columns = [
            "total_steps",
            "llm_step",
            "total_tool_calls",
            "tool_call_precision",
            "tool_call_recall",
            "agent_routing_accuracy",
            "text_match",
            "summary_matched_count",
            "is_success",
            "avg_resp_time",
        ]

        # Sort columns with preferred columns first, then alphabetically for the rest
        sorted_columns = [
            col for col in preferred_columns if col in all_columns
        ]
        sorted_columns += sorted(
            [col for col in all_columns if col not in preferred_columns]
        )

        # Prepare data for table formatting
        table_rows = []

        # Add data rows - ONLY for datasets that are in both files
        for row in in_both:
            dataset_name = row["dataset_name"]
            table_row = {"Dataset": dataset_name}

            for col in sorted_columns:
                diff_key = get_diff_column(col)
                if diff_key in row:
                    value = row[diff_key]
                    # Format the value based on its type
                    if isinstance(value, float):
                        table_row[col] = round(value, 1)
                    else:
                        table_row[col] = value
                else:
                    # If the column doesn't exist for this dataset
                    table_row[col] = "N/A"

            table_rows.append(table_row)

        # Calculate average values for experiment, reference, and delta
        if table_rows:
            # Calculate average differences (as before)
            summary_row = {"Dataset": "Average Difference"}
            for col in sorted_columns:
                values = []
                for row in table_rows:
                    val = row.get(col)
                    if isinstance(val, (int, float)):
                        values.append(val)

                if values:
                    avg_value = sum(values) / len(values)
                    summary_row[col] = str(round(avg_value, 1))
                else:
                    summary_row[col] = "N/A"

            table_rows.append(summary_row)

            # Calculate average experiment and reference values
            experiment_avgs = {}
            reference_avgs = {}

            for col in sorted_columns:
                exp_values = []
                ref_values = []

                for row in in_both:
                    exp_key = get_experiment_column(col)
                    ref_key = get_reference_column(col)

                    if exp_key in row and isinstance(
                        row[exp_key], (int, float)
                    ):
                        exp_values.append(row[exp_key])

                    if ref_key in row and isinstance(
                        row[ref_key], (int, float)
                    ):
                        ref_values.append(row[ref_key])

                # Calculate averages
                if exp_values:
                    experiment_avgs[col] = round(
                        sum(exp_values) / len(exp_values), 1
                    )
                else:
                    experiment_avgs[col] = "N/A"

                if ref_values:
                    reference_avgs[col] = round(
                        sum(ref_values) / len(ref_values), 1
                    )
                else:
                    reference_avgs[col] = "N/A"

            # Create the new table format with experiment, reference, and delta values
            avg_table_rows = []
            for col in sorted_columns:
                delta_value = summary_row.get(col, "N/A")
                avg_table_rows.append(
                    {
                        "Metric": col,
                        "Reference": reference_avgs.get(col, "N/A"),
                        "Experiment": experiment_avgs.get(col, "N/A"),
                        "Experiment - Reference": delta_value,
                    }
                )

            # Create and print the new average values table
            if do_display:
                print("\n")
                avg_table = create_table(avg_table_rows, title="Average Values")
                avg_table.print()
                print("\n")

        if verbose:
            table = create_table(
                table_rows,
                title="DIFFERENCES TABLE (Experiment value - Reference value)",
            )
            table.print()

        return table_rows

    def _write_diff_table_to_csv(
        self,
        table_rows: List[Dict[str, Any]],
        output_path: str,
        summary_matched_count1: int,
        summary_matched_count2: int,
    ) -> None:
        """Write the diff table to a CSV file."""
        if not table_rows:
            print("No data to write to CSV.")
            return

        # Ensure we have all column names
        fieldnames = ["Dataset"]
        for row in table_rows:
            for key in row.keys():
                if key != "Dataset" and key not in fieldnames:
                    fieldnames.append(key)

        try:
            # Create a new fieldnames list with "_delta" appended to each column except "Dataset"
            delta_fieldnames = ["Dataset"]
            for field in fieldnames:
                if field != "Dataset":
                    delta_fieldnames.append(f"{field}_delta")

            with open(output_path, "w", newline="") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=delta_fieldnames)
                writer.writeheader()

                # Write all rows except the summary row, with renamed columns
                for row in table_rows:
                    if row["Dataset"] != "Average Difference":
                        # Create a new row with renamed columns
                        new_row = {"Dataset": row["Dataset"]}
                        for key, value in row.items():
                            if key != "Dataset":
                                new_row[f"{key}_delta"] = value
                        writer.writerow(new_row)

                # Add the summary row with the summary matched counts
                summary_row = next(
                    (
                        row
                        for row in table_rows
                        if row["Dataset"] == "Average Difference"
                    ),
                    {},
                )
                if summary_row:
                    # Create a new summary row with renamed columns
                    new_summary_row = {
                        "Dataset": summary_row.get(
                            "Dataset", "Average Difference"
                        )
                    }
                    for key, value in summary_row.items():
                        if key != "Dataset":
                            new_summary_row[f"{key}_delta"] = value

                    writer.writerow(new_summary_row)

        except Exception as e:
            print(f"Error writing to CSV: {e}")

    def to_csv(self, path: str) -> None:
        """Write the diff results to a CSV file."""
        table_rows = self.compute_tabular_diff(do_display=False, verbose=False)

        self._write_diff_table_to_csv(
            table_rows,
            path,
            self.result1.summary_matched_count,
            self.result2.summary_matched_count,
        )

    def display_exclusive_tests(self) -> Tuple[List[str], List[str]]:
        """Display lists of tests that are exclusive to each file."""
        joined_data = self.join_datasets()

        # Filter tests only in reference file
        only_in_reference = [
            row["dataset_name"]
            for row in joined_data
            if row["status"] == STATUS_ONLY_IN_REFERENCE
        ]

        # Filter tests only in experiment file
        only_in_experiment = [
            row["dataset_name"]
            for row in joined_data
            if row["status"] == STATUS_ONLY_IN_EXPERIMENT
        ]

        # Display the results
        print(f"\nTests {STATUS_ONLY_IN_REFERENCE}:")
        print(sorted(only_in_reference), "\n")

        print(f"\nTests {STATUS_ONLY_IN_EXPERIMENT}:")
        print(sorted(only_in_experiment))

        return only_in_reference, only_in_experiment

    def display_differing_summary_matches(self) -> List[Dict[str, str]]:
        """Display test cases where summary match status differs between reference and experiment."""
        # Get filtered data using the caching mechanism
        filtered_data = self._filter_joined_data()
        in_both = filtered_data["in_both"]

        # Find test cases where summary match status differs
        differing_cases = []
        for row in in_both:
            # Initialize result row with dataset name
            result_row = {"Dataset": row["dataset_name"]}

            # Check if text_match is available in both files
            if has_column_in_both(row, "text_match"):
                ref_match = (
                    get_column_value(row, "text_match", "reference")
                    == "Summary Matched"
                )
                exp_match = (
                    get_column_value(row, "text_match", "experiment")
                    == "Summary Matched"
                )

                # If the match status differs, add to our result
                if ref_match != exp_match:
                    result_row["Reference Summary Match"] = (
                        "Yes" if ref_match else "No"
                    )
                    result_row["Experiment Summary Match"] = (
                        "Yes" if exp_match else "No"
                    )
                    differing_cases.append(result_row)

        # Display the results
        title = "Differing Summary Matches"

        if differing_cases:
            print("\nTest cases with differing summary match status:")
            table = create_table(differing_cases, title=title)
            table.print()
        else:
            print("\nNo test cases with differing summary match status found.")

        return differing_cases