ibm-watsonx-orchestrate-evaluation-framework 1.1.3__py3-none-any.whl → 1.1.8b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/METADATA +19 -1
  2. ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info/RECORD +146 -0
  3. wxo_agentic_evaluation/analytics/tools/analyzer.py +4 -2
  4. wxo_agentic_evaluation/analyze_run.py +1025 -220
  5. wxo_agentic_evaluation/annotate.py +2 -2
  6. wxo_agentic_evaluation/arg_configs.py +60 -2
  7. wxo_agentic_evaluation/base_user.py +25 -0
  8. wxo_agentic_evaluation/batch_annotate.py +19 -2
  9. wxo_agentic_evaluation/clients.py +103 -0
  10. wxo_agentic_evaluation/compare_runs/__init__.py +0 -0
  11. wxo_agentic_evaluation/compare_runs/compare_2_runs.py +74 -0
  12. wxo_agentic_evaluation/compare_runs/diff.py +554 -0
  13. wxo_agentic_evaluation/compare_runs/model.py +193 -0
  14. wxo_agentic_evaluation/data_annotator.py +25 -7
  15. wxo_agentic_evaluation/description_quality_checker.py +29 -6
  16. wxo_agentic_evaluation/evaluation.py +16 -8
  17. wxo_agentic_evaluation/evaluation_controller/evaluation_controller.py +303 -0
  18. wxo_agentic_evaluation/evaluation_package.py +414 -69
  19. wxo_agentic_evaluation/external_agent/__init__.py +1 -1
  20. wxo_agentic_evaluation/external_agent/external_validate.py +7 -5
  21. wxo_agentic_evaluation/external_agent/types.py +3 -9
  22. wxo_agentic_evaluation/extractors/__init__.py +3 -0
  23. wxo_agentic_evaluation/extractors/extractor_base.py +21 -0
  24. wxo_agentic_evaluation/extractors/labeled_messages.py +47 -0
  25. wxo_agentic_evaluation/hr_agent_langgraph.py +68 -0
  26. wxo_agentic_evaluation/langfuse_collection.py +60 -0
  27. wxo_agentic_evaluation/langfuse_evaluation_package.py +192 -0
  28. wxo_agentic_evaluation/llm_matching.py +104 -2
  29. wxo_agentic_evaluation/llm_safety_eval.py +64 -0
  30. wxo_agentic_evaluation/llm_user.py +5 -4
  31. wxo_agentic_evaluation/llm_user_v2.py +114 -0
  32. wxo_agentic_evaluation/main.py +112 -343
  33. wxo_agentic_evaluation/metrics/__init__.py +15 -0
  34. wxo_agentic_evaluation/metrics/dummy_metric.py +16 -0
  35. wxo_agentic_evaluation/metrics/evaluations.py +107 -0
  36. wxo_agentic_evaluation/metrics/journey_success.py +137 -0
  37. wxo_agentic_evaluation/metrics/llm_as_judge.py +26 -0
  38. wxo_agentic_evaluation/metrics/metrics.py +276 -8
  39. wxo_agentic_evaluation/metrics/tool_calling.py +93 -0
  40. wxo_agentic_evaluation/otel_parser/__init__.py +1 -0
  41. wxo_agentic_evaluation/otel_parser/langflow_parser.py +86 -0
  42. wxo_agentic_evaluation/otel_parser/langgraph_parser.py +61 -0
  43. wxo_agentic_evaluation/otel_parser/parser.py +163 -0
  44. wxo_agentic_evaluation/otel_parser/parser_types.py +38 -0
  45. wxo_agentic_evaluation/otel_parser/pydantic_parser.py +50 -0
  46. wxo_agentic_evaluation/otel_parser/utils.py +15 -0
  47. wxo_agentic_evaluation/otel_parser/wxo_parser.py +39 -0
  48. wxo_agentic_evaluation/otel_support/evaluate_tau.py +44 -10
  49. wxo_agentic_evaluation/otel_support/otel_message_conversion.py +12 -4
  50. wxo_agentic_evaluation/otel_support/tasks_test.py +456 -116
  51. wxo_agentic_evaluation/prompt/derailment_prompt.jinja2 +55 -0
  52. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +50 -4
  53. wxo_agentic_evaluation/prompt/llmaaj_prompt.jinja2 +15 -0
  54. wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +1 -1
  55. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +41 -9
  56. wxo_agentic_evaluation/prompt/template_render.py +103 -4
  57. wxo_agentic_evaluation/prompt/unsafe_topic_prompt.jinja2 +65 -0
  58. wxo_agentic_evaluation/quick_eval.py +33 -17
  59. wxo_agentic_evaluation/record_chat.py +38 -32
  60. wxo_agentic_evaluation/red_teaming/attack_evaluator.py +211 -62
  61. wxo_agentic_evaluation/red_teaming/attack_generator.py +63 -40
  62. wxo_agentic_evaluation/red_teaming/attack_list.py +95 -7
  63. wxo_agentic_evaluation/red_teaming/attack_runner.py +77 -17
  64. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics.json +783 -0
  65. wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics.json +600 -0
  66. wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +10 -10
  67. wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +105 -39
  68. wxo_agentic_evaluation/resource_map.py +3 -1
  69. wxo_agentic_evaluation/runner.py +329 -0
  70. wxo_agentic_evaluation/runtime_adapter/a2a_runtime_adapter.py +0 -0
  71. wxo_agentic_evaluation/runtime_adapter/runtime_adapter.py +14 -0
  72. wxo_agentic_evaluation/{inference_backend.py → runtime_adapter/wxo_runtime_adapter.py} +24 -293
  73. wxo_agentic_evaluation/scheduler.py +247 -0
  74. wxo_agentic_evaluation/service_instance.py +26 -17
  75. wxo_agentic_evaluation/service_provider/__init__.py +145 -9
  76. wxo_agentic_evaluation/service_provider/gateway_provider.py +707 -0
  77. wxo_agentic_evaluation/service_provider/model_proxy_provider.py +417 -17
  78. wxo_agentic_evaluation/service_provider/ollama_provider.py +393 -22
  79. wxo_agentic_evaluation/service_provider/portkey_provider.py +229 -0
  80. wxo_agentic_evaluation/service_provider/provider.py +130 -10
  81. wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +52 -0
  82. wxo_agentic_evaluation/service_provider/watsonx_provider.py +481 -53
  83. wxo_agentic_evaluation/simluation_runner.py +125 -0
  84. wxo_agentic_evaluation/test_prompt.py +4 -4
  85. wxo_agentic_evaluation/type.py +185 -16
  86. wxo_agentic_evaluation/user_simulator/demo_usage_llm_user.py +100 -0
  87. wxo_agentic_evaluation/utils/__init__.py +44 -3
  88. wxo_agentic_evaluation/utils/evaluation_discovery.py +47 -0
  89. wxo_agentic_evaluation/utils/gateway_provider_utils.py +39 -0
  90. wxo_agentic_evaluation/utils/messages_parser.py +30 -0
  91. wxo_agentic_evaluation/utils/parsers.py +71 -0
  92. wxo_agentic_evaluation/utils/utils.py +313 -9
  93. wxo_agentic_evaluation/wxo_client.py +81 -0
  94. ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info/RECORD +0 -102
  95. wxo_agentic_evaluation/otel_support/evaluate_tau_traces.py +0 -176
  96. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/WHEEL +0 -0
  97. {ibm_watsonx_orchestrate_evaluation_framework-1.1.3.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.8b0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/compare_runs/diff.py (new file)
@@ -0,0 +1,554 @@
+ import csv
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from wxo_agentic_evaluation.compare_runs.model import EvaluationResult
+ from wxo_agentic_evaluation.utils.utils import (
+     EXPERIMENT_FILE_NAME,
+     REFERENCE_FILE_NAME,
+     create_table,
+     get_column_value,
+     get_diff_column,
+     get_experiment_column,
+     get_reference_column,
+     has_column_in_both,
+     safe_divide,
+ )
+
+ # Status constants
+ STATUS_ONLY_IN_REFERENCE = f"Only in {REFERENCE_FILE_NAME.title()}"
+ STATUS_ONLY_IN_EXPERIMENT = f"Only in {EXPERIMENT_FILE_NAME.title()}"
+ STATUS_IN_BOTH = f"In both {REFERENCE_FILE_NAME} and {EXPERIMENT_FILE_NAME}"
+
+
+ @dataclass
+ class DiffResults:
+     """Class for comparing two evaluation results."""
+
+     result1: EvaluationResult
+     result2: EvaluationResult
+
+     # Private cache for filtered data - not included in constructor
+     _filtered_data: Dict[str, List] = field(default_factory=dict, init=False)
+
+     def _filter_joined_data(self) -> Dict[str, List[Dict[str, Any]]]:
+         """Filter joined data into categories and cache the results.
+
+         This method caches the results to avoid repeated filtering operations.
+         """
+         # Return cached results if already computed
+         if self._filtered_data:
+             return self._filtered_data
+
+         # Get the joined data
+         joined_data = self.join_datasets()
+
+         # Filter data into categories
+         only_in_reference = []
+         only_in_experiment = []
+         in_both = []
+
+         for row in joined_data:
+             if row["status"] == STATUS_ONLY_IN_REFERENCE:
+                 only_in_reference.append(row)
+             elif row["status"] == STATUS_ONLY_IN_EXPERIMENT:
+                 only_in_experiment.append(row)
+             elif row["status"] == STATUS_IN_BOTH:
+                 in_both.append(row)
+
+         # Cache the filtered data
+         self._filtered_data = {
+             "all": joined_data,
+             "only_in_reference": only_in_reference,
+             "only_in_experiment": only_in_experiment,
+             "in_both": in_both,
+         }
+
+         return self._filtered_data
+
+     def get_overlapping_metrics(
+         self, result: EvaluationResult, other: EvaluationResult
+     ) -> Dict[str, Any]:
+         """Calculate metrics for tests that overlap between two result sets."""
+         overlapping_names = set(result.test_case_results.keys()) & set(
+             other.test_case_results.keys()
+         )
+
+         overlapping_count = len(overlapping_names)
+
+         overlapping_matched = sum(
+             result.test_case_results[name].matches_count()
+             for name in overlapping_names
+             if name in result.test_case_results
+         )
+
+         overlapping_success = sum(
+             1
+             for name in overlapping_names
+             if name in result.test_case_results
+             and result.test_case_results[name].is_success
+         )
+
+         return {
+             "count": overlapping_count,
+             "matched": overlapping_matched,
+             "success": overlapping_success,
+             "matched_ratio": safe_divide(
+                 overlapping_matched, overlapping_count
+             ),
+             "success_ratio": safe_divide(
+                 overlapping_success, overlapping_count
+             ),
+         }
+
+     def _join_datasets_on_name(
+         self, data1: List[Dict[str, Any]], data2: List[Dict[str, Any]]
+     ) -> List[Dict[str, Any]]:
+         """Join two datasets on dataset_name and compute differences."""
+         # Create dictionaries for quick lookup
+         dict1 = {row["dataset_name"]: row for row in data1}
+         dict2 = {row["dataset_name"]: row for row in data2}
+
+         # Get all unique dataset names
+         all_names = set(dict1.keys()) | set(dict2.keys())
+
+         # Initialize result
+         joined_data = []
+
+         for name in all_names:
+             result_row = {"dataset_name": name}
+
+             # Handle datasets that exist in only one file
+             if name not in dict1:
+                 result_row["status"] = STATUS_ONLY_IN_EXPERIMENT
+                 result_row.update(
+                     {
+                         get_experiment_column(k): v
+                         for k, v in dict2[name].items()
+                         if k != "dataset_name"
+                     }
+                 )
+                 joined_data.append(result_row)
+                 continue
+
+             if name not in dict2:
+                 result_row["status"] = STATUS_ONLY_IN_REFERENCE
+                 result_row.update(
+                     {
+                         get_reference_column(k): v
+                         for k, v in dict1[name].items()
+                         if k != "dataset_name"
+                     }
+                 )
+                 joined_data.append(result_row)
+                 continue
+
+             # Dataset exists in both files
+             result_row["status"] = STATUS_IN_BOTH
+
+             # Add values from both files and compute differences
+             row1 = dict1[name]
+             row2 = dict2[name]
+
+             for key in set(row1.keys()) | set(row2.keys()):
+                 if key == "dataset_name":
+                     continue
+
+                 # Handle keys that exist in only one file
+                 if key not in row1:
+                     result_row[get_experiment_column(key)] = row2[key]
+                     result_row[get_diff_column(key)] = (
+                         f"{STATUS_ONLY_IN_EXPERIMENT}: {row2[key]}"
+                     )
+                     continue
+
+                 if key not in row2:
+                     result_row[get_reference_column(key)] = row1[key]
+                     result_row[get_diff_column(key)] = (
+                         f"{STATUS_ONLY_IN_REFERENCE}: {row1[key]}"
+                     )
+                     continue
+
+                 # Add values from both files
+                 result_row[get_reference_column(key)] = row1[key]
+                 result_row[get_experiment_column(key)] = row2[key]
+
+                 # Compute difference
+                 if isinstance(row1[key], (int, float)) and isinstance(
+                     row2[key], (int, float)
+                 ):
+                     diff = row2[key] - row1[key]
+                     result_row[get_diff_column(key)] = diff
+                 elif isinstance(row1[key], bool) and isinstance(
+                     row2[key], bool
+                 ):
+                     result_row[get_diff_column(key)] = (
+                         "No change" if row1[key] == row2[key] else "Changed"
+                     )
+                 else:
+                     result_row[get_diff_column(key)] = (
+                         "Same" if row1[key] == row2[key] else "Different"
+                     )
+
+             joined_data.append(result_row)
+         return joined_data
+
+     def join_datasets(self) -> List[Dict[str, Any]]:
+         """Join datasets from both evaluation results and compute differences."""
+         # Convert test case results to the format expected by _join_datasets_on_name
+         data1 = [
+             result.to_dict()
+             for result in self.result1.test_case_results.values()
+         ]
+         data2 = [
+             result.to_dict()
+             for result in self.result2.test_case_results.values()
+         ]
+         return self._join_datasets_on_name(data1, data2)
+
+     def summary_statistics(self) -> List[Dict[str, Any]]:
+         """Generate summary statistics comparing the two evaluation results."""
+         # Calculate overlapping metrics
+         overlap1 = self.get_overlapping_metrics(self.result1, self.result2)
+         overlap2 = self.get_overlapping_metrics(self.result2, self.result1)
+
+         overlapping_tests = overlap1["count"]  # Same for both
+
+         return [
+             {
+                 "Metric": "Total Tests",
+                 "Reference": self.result1.test_count,
+                 "Experiment": self.result2.test_count,
+                 "Experiment - Reference": self.result2.test_count
+                 - self.result1.test_count,
+             },
+             {
+                 "Metric": "Overlapping Tests",
+                 "Reference": overlapping_tests,
+                 "Experiment": overlapping_tests,
+                 "Experiment - Reference": 0,
+             },
+             {
+                 "Metric": "Summary Matches",
+                 "Reference": overlap1["matched"],
+                 "Experiment": overlap2["matched"],
+                 "Experiment - Reference": overlap2["matched"]
+                 - overlap1["matched"],
+             },
+             {
+                 "Metric": "Is Success",
+                 "Reference": overlap1["success"],
+                 "Experiment": overlap2["success"],
+                 "Experiment - Reference": overlap2["success"]
+                 - overlap1["success"],
+             },
+             # {"Metric": "Summary Match / Overlapping Tests",
+             #  "Reference": format_ratio(overlap1['matched_ratio']),
+             #  "Experiment": format_ratio(overlap2['matched_ratio']),
+             #  "Difference": f"{(overlap2['matched_ratio'] - overlap1['matched_ratio']) * 100:.1f}%" if overlapping_tests > 0 else "N/A"},
+             # {"Metric": "Is Success / Overlapping Tests",
+             #  "Reference": format_ratio(overlap1['success_ratio']),
+             #  "Experiment": format_ratio(overlap2['success_ratio']),
+             #  "Difference": f"{(overlap2['success_ratio'] - overlap1['success_ratio']) * 100:.1f}%" if overlapping_tests > 0 else "N/A"},
+         ]
+
+     def compute_tabular_diff(
+         self, do_display: bool = True, verbose: bool = True
+     ) -> List[Dict[str, Any]]:
+         """Display the differences in a tabular format with one row per dataset."""
+         # Get filtered data using the caching mechanism
+         filtered_data = self._filter_joined_data()
+         joined_data = filtered_data["all"]
+         in_both = filtered_data["in_both"]
+
+         # Collect all possible column names (excluding dataset_name, status, and non-diff columns)
+         # Only include columns with numeric values
+         all_columns = set()
+         for row in joined_data:
+             for key in row.keys():
+                 if key.endswith("_diff") and not key.endswith("_percent_diff"):
+                     # Extract the base column name without the _diff suffix
+                     base_column = key[
+                         :-5
+                     ]  # Still need this for backward compatibility
+                     # Check if the diff value is numeric
+                     if isinstance(row[key], (int, float)):
+                         all_columns.add(base_column)
+
+         # Define preferred column order based on main.py
+         preferred_columns = [
+             "total_steps",
+             "llm_step",
+             "total_tool_calls",
+             "tool_call_precision",
+             "tool_call_recall",
+             "agent_routing_accuracy",
+             "text_match",
+             "summary_matched_count",
+             "is_success",
+             "avg_resp_time",
+         ]
+
+         # Sort columns with preferred columns first, then alphabetically for the rest
+         sorted_columns = [
+             col for col in preferred_columns if col in all_columns
+         ]
+         sorted_columns += sorted(
+             [col for col in all_columns if col not in preferred_columns]
+         )
+
+         # Prepare data for table formatting
+         table_rows = []
+
+         # Add data rows - ONLY for datasets that are in both files
+         for row in in_both:
+             dataset_name = row["dataset_name"]
+             table_row = {"Dataset": dataset_name}
+
+             for col in sorted_columns:
+                 diff_key = get_diff_column(col)
+                 if diff_key in row:
+                     value = row[diff_key]
+                     # Format the value based on its type
+                     if isinstance(value, float):
+                         table_row[col] = round(value, 1)
+                     else:
+                         table_row[col] = value
+                 else:
+                     # If the column doesn't exist for this dataset
+                     table_row[col] = "N/A"
+
+             table_rows.append(table_row)
+
+         # Calculate average values for experiment, reference, and delta
+         if table_rows:
+             # Calculate average differences (as before)
+             summary_row = {"Dataset": "Average Difference"}
+             for col in sorted_columns:
+                 values = []
+                 for row in table_rows:
+                     val = row.get(col)
+                     if isinstance(val, (int, float)):
+                         values.append(val)
+
+                 if values:
+                     avg_value = sum(values) / len(values)
+                     summary_row[col] = str(round(avg_value, 1))
+                 else:
+                     summary_row[col] = "N/A"
+
+             table_rows.append(summary_row)
+
+             # Calculate average experiment and reference values
+             experiment_avgs = {}
+             reference_avgs = {}
+
+             for col in sorted_columns:
+                 exp_values = []
+                 ref_values = []
+
+                 for row in in_both:
+                     exp_key = get_experiment_column(col)
+                     ref_key = get_reference_column(col)
+
+                     if exp_key in row and isinstance(
+                         row[exp_key], (int, float)
+                     ):
+                         exp_values.append(row[exp_key])
+
+                     if ref_key in row and isinstance(
+                         row[ref_key], (int, float)
+                     ):
+                         ref_values.append(row[ref_key])
+
+                 # Calculate averages
+                 if exp_values:
+                     experiment_avgs[col] = round(
+                         sum(exp_values) / len(exp_values), 1
+                     )
+                 else:
+                     experiment_avgs[col] = "N/A"
+
+                 if ref_values:
+                     reference_avgs[col] = round(
+                         sum(ref_values) / len(ref_values), 1
+                     )
+                 else:
+                     reference_avgs[col] = "N/A"
+
+             # Create the new table format with experiment, reference, and delta values
+             avg_table_rows = []
+             for col in sorted_columns:
+                 delta_value = summary_row.get(col, "N/A")
+                 avg_table_rows.append(
+                     {
+                         "Metric": col,
+                         "Reference": reference_avgs.get(col, "N/A"),
+                         "Experiment": experiment_avgs.get(col, "N/A"),
+                         "Experiment - Reference": delta_value,
+                     }
+                 )
+
+             # Create and print the new average values table
+             if do_display:
+                 print("\n")
+                 avg_table = create_table(avg_table_rows, title="Average Values")
+                 avg_table.print()
+                 print("\n")
+
+         if verbose:
+             table = create_table(
+                 table_rows,
+                 title="DIFFERENCES TABLE (Experiment value - Reference value)",
+             )
+             table.print()
+
+         return table_rows
+
+     def _write_diff_table_to_csv(
+         self,
+         table_rows: List[Dict[str, Any]],
+         output_path: str,
+         summary_matched_count1: int,
+         summary_matched_count2: int,
+     ) -> None:
+         """Write the diff table to a CSV file."""
+         if not table_rows:
+             print("No data to write to CSV.")
+             return
+
+         # Ensure we have all column names
+         fieldnames = ["Dataset"]
+         for row in table_rows:
+             for key in row.keys():
+                 if key != "Dataset" and key not in fieldnames:
+                     fieldnames.append(key)
+
+         try:
+             # Create a new fieldnames list with "_delta" appended to each column except "Dataset"
+             delta_fieldnames = ["Dataset"]
+             for field in fieldnames:
+                 if field != "Dataset":
+                     delta_fieldnames.append(f"{field}_delta")
+
+             with open(output_path, "w", newline="") as csvfile:
+                 writer = csv.DictWriter(csvfile, fieldnames=delta_fieldnames)
+                 writer.writeheader()
+
+                 # Write all rows except the summary row, with renamed columns
+                 for row in table_rows:
+                     if row["Dataset"] != "Average Difference":
+                         # Create a new row with renamed columns
+                         new_row = {"Dataset": row["Dataset"]}
+                         for key, value in row.items():
+                             if key != "Dataset":
+                                 new_row[f"{key}_delta"] = value
+                         writer.writerow(new_row)
+
+                 # Add the summary row with the summary matched counts
+                 summary_row = next(
+                     (
+                         row
+                         for row in table_rows
+                         if row["Dataset"] == "Average Difference"
+                     ),
+                     {},
+                 )
+                 if summary_row:
+                     # Create a new summary row with renamed columns
+                     new_summary_row = {
+                         "Dataset": summary_row.get(
+                             "Dataset", "Average Difference"
+                         )
+                     }
+                     for key, value in summary_row.items():
+                         if key != "Dataset":
+                             new_summary_row[f"{key}_delta"] = value
+
+                     writer.writerow(new_summary_row)
+
+         except Exception as e:
+             print(f"Error writing to CSV: {e}")
+
+     def to_csv(self, path: str) -> None:
+         """Write the diff results to a CSV file."""
+         table_rows = self.compute_tabular_diff(do_display=False, verbose=False)
+
+         self._write_diff_table_to_csv(
+             table_rows,
+             path,
+             self.result1.summary_matched_count,
+             self.result2.summary_matched_count,
+         )
+
+     def display_exclusive_tests(self) -> Tuple[List[str], List[str]]:
+         """Display lists of tests that are exclusive to each file."""
+         joined_data = self.join_datasets()
+
+         # Filter tests only in reference file
+         only_in_reference = [
+             row["dataset_name"]
+             for row in joined_data
+             if row["status"] == STATUS_ONLY_IN_REFERENCE
+         ]
+
+         # Filter tests only in experiment file
+         only_in_experiment = [
+             row["dataset_name"]
+             for row in joined_data
+             if row["status"] == STATUS_ONLY_IN_EXPERIMENT
+         ]
+
+         # Display the results
+         print(f"\nTests {STATUS_ONLY_IN_REFERENCE}:")
+         print(sorted(only_in_reference), "\n")
+
+         print(f"\nTests {STATUS_ONLY_IN_EXPERIMENT}:")
+         print(sorted(only_in_experiment))
+
+         return only_in_reference, only_in_experiment
+
+     def display_differing_summary_matches(self) -> List[Dict[str, str]]:
+         """Display test cases where summary match status differs between reference and experiment."""
+         # Get filtered data using the caching mechanism
+         filtered_data = self._filter_joined_data()
+         in_both = filtered_data["in_both"]
+
+         # Find test cases where summary match status differs
+         differing_cases = []
+         for row in in_both:
+             # Initialize result row with dataset name
+             result_row = {"Dataset": row["dataset_name"]}
+
+             # Check if text_match is available in both files
+             if has_column_in_both(row, "text_match"):
+                 ref_match = (
+                     get_column_value(row, "text_match", "reference")
+                     == "Summary Matched"
+                 )
+                 exp_match = (
+                     get_column_value(row, "text_match", "experiment")
+                     == "Summary Matched"
+                 )
+
+                 # If the match status differs, add to our result
+                 if ref_match != exp_match:
+                     result_row["Reference Summary Match"] = (
+                         "Yes" if ref_match else "No"
+                     )
+                     result_row["Experiment Summary Match"] = (
+                         "Yes" if exp_match else "No"
+                     )
+                     differing_cases.append(result_row)
+
+         # Display the results
+         title = "Differing Summary Matches"
+
+         if differing_cases:
+             print("\nTest cases with differing summary match status:")
+             table = create_table(differing_cases, title=title)
+             table.print()
+         else:
+             print("\nNo test cases with differing summary match status found.")
+
+         return differing_cases
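
The new compare_runs/diff.py above is the core of the run-comparison feature added in this release (see also compare_runs/compare_2_runs.py and compare_runs/model.py in the file list). For orientation only, the sketch below shows one way the DiffResults API could be driven; it is not code from the package. The wrapper function compare_runs_sketch, its parameters, and the csv_path default are invented for illustration, and the sketch assumes the caller has already built two EvaluationResult objects, since this diff does not show how they are loaded from saved run artifacts.

# Illustrative sketch only -- not taken from the package. Assumes two
# EvaluationResult objects (reference and experiment) are already available.
from wxo_agentic_evaluation.compare_runs.diff import DiffResults


def compare_runs_sketch(reference_result, experiment_result, csv_path="diff.csv"):
    # result1 is reported as "Reference", result2 as "Experiment"
    diff = DiffResults(result1=reference_result, result2=experiment_result)

    # Aggregate counts: total tests, overlapping tests, summary matches, successes
    for row in diff.summary_statistics():
        print(row)

    # Per-dataset deltas (Experiment value - Reference value) plus the "Average Values" table
    diff.compute_tabular_diff(do_display=True, verbose=True)

    # Tests present in only one run, and tests whose summary-match status flipped
    diff.display_exclusive_tests()
    diff.display_differing_summary_matches()

    # Persist the per-dataset delta table; columns are suffixed with "_delta"
    diff.to_csv(csv_path)

Note that result1 is always treated as the reference run and result2 as the experiment run, so the argument order determines the sign of every reported delta.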