redisbench-admin 0.11.37__py3-none-any.whl → 0.11.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ from pytablewriter import MarkdownTableWriter
 import humanize
 import datetime as dt
 import os
+import statistics
 from tqdm import tqdm
 from github import Github
 from slack_sdk.webhook import WebhookClient
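The newly imported `statistics` module backs the coefficient-of-variation (CV) computations that the new latency checks below rely on. A minimal sketch of that calculation, using made-up sample values:

```python
import statistics

# Hypothetical p50 latency samples (ms) for one time-series.
samples = [1.10, 1.15, 1.08, 1.21, 1.12]

mean = statistics.mean(samples)
# statistics.stdev() needs at least two samples; guard like the diff does.
stdev = statistics.stdev(samples) if len(samples) > 1 else 0
cv = (stdev / mean * 100) if mean > 0 else float("inf")

print(f"CV={cv:.1f}%")  # < 30% is treated as stable by the new checks
```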
@@ -270,6 +271,10 @@ def compare_command_logic(args, project_name, project_version):
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     ) = compute_regression_table(
         rts,
         tf_github_org,
@@ -303,6 +308,7 @@ def compare_command_logic(args, project_name, project_version):
         comparison_architecture,
         first_n_baseline,
         first_n_comparison,
+        grafana_link_base,
     )
     comment_body = ""
     if total_comparison_points > 0:
@@ -321,11 +327,63 @@ def compare_command_logic(args, project_name, project_version):
             )

         if total_unstable > 0:
+            unstable_details = []
+            if total_unstable_baseline > 0:
+                unstable_details.append(f"{total_unstable_baseline} baseline")
+            if total_unstable_comparison > 0:
+                unstable_details.append(f"{total_unstable_comparison} comparison")
+
+            unstable_breakdown = (
+                " (" + ", ".join(unstable_details) + ")" if unstable_details else ""
+            )
             comparison_summary += (
-                "- Detected a total of {} highly unstable benchmarks.\n".format(
-                    total_unstable
+                "- Detected a total of {} highly unstable benchmarks{}.\n".format(
+                    total_unstable, unstable_breakdown
                 )
             )
+
+            # Add latency confirmation summary if applicable
+            if total_latency_confirmed_regressions > 0:
+                comparison_summary += "- Latency analysis confirmed regressions in {} of the unstable tests:\n".format(
+                    total_latency_confirmed_regressions
+                )
+
+                # Add detailed breakdown as bullet points with test links
+                if latency_confirmed_regression_details:
+                    for detail in latency_confirmed_regression_details:
+                        test_name = detail["test_name"]
+                        commands_info = []
+                        for cmd_detail in detail["commands"]:
+                            commands_info.append(
+                                f"{cmd_detail['command']} +{cmd_detail['change_percent']:.1f}%"
+                            )
+
+                        if commands_info:
+                            # Create test link if grafana_link_base is available
+                            test_display_name = test_name
+                            if grafana_link_base is not None:
+                                grafana_test_link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
+                                if baseline_branch is not None:
+                                    grafana_test_link += (
+                                        f"&var-branch={baseline_branch}"
+                                    )
+                                if comparison_branch is not None:
+                                    grafana_test_link += (
+                                        f"&var-branch={comparison_branch}"
+                                    )
+                                grafana_test_link += "&from=now-30d&to=now"
+                                test_display_name = (
+                                    f"[{test_name}]({grafana_test_link})"
+                                )
+
+                            # Add confidence indicator if available
+                            confidence_indicator = ""
+                            if "high_confidence" in detail:
+                                confidence_indicator = (
+                                    " 🔴" if detail["high_confidence"] else " ⚠️"
+                                )
+
+                            comparison_summary += f"  - {test_display_name}: {', '.join(commands_info)}{confidence_indicator}\n"
         if total_improvements > 0:
             comparison_summary += "- Detected a total of {} improvements above the improvement water line.\n".format(
                 total_improvements
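With this change the PR comment gains an instability breakdown and, when latency analysis corroborates a slowdown, a linked sub-list with a confidence marker. An illustrative rendering of the kind of Markdown the new code emits (test name, counts, and URL are made up):

```
- Detected a total of 3 highly unstable benchmarks (2 baseline, 1 comparison).
- Latency analysis confirmed regressions in 1 of the unstable tests:
  - [memtier_benchmark-1key-zincrby](https://grafana.example/d/dash?orgId=1&var-test_case=memtier_benchmark-1key-zincrby&from=now-30d&to=now): GET +18.4% 🔴
```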
@@ -484,6 +542,9 @@ def compare_command_logic(args, project_name, project_version):
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
     )


@@ -531,6 +592,7 @@ def compute_regression_table(
     comparison_architecture=ARCH_X86,
     first_n_baseline=-1,
     first_n_comparison=-1,
+    grafana_link_base=None,
 ):
     START_TIME_NOW_UTC, _, _ = get_start_time_vars()
     START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
@@ -593,6 +655,10 @@ def compute_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     ) = from_rts_to_regression_table(
         baseline_deployment_name,
         comparison_deployment_name,
@@ -621,14 +687,97 @@ def compute_regression_table(
         comparison_architecture,
         first_n_baseline,
         first_n_comparison,
+        grafana_link_base,
+        baseline_branch,
+        baseline_tag,
+        comparison_branch,
+        comparison_tag,
+        from_date,
+        to_date,
     )
     logging.info(
         "Printing differential analysis between {} and {}".format(
             baseline_str, comparison_str
         )
     )
-    writer = MarkdownTableWriter(
-        table_name="Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
+
+    # Split table into improvements, regressions, and no-changes
+    improvements_table = []
+    regressions_table = []
+    no_changes_table = []
+
+    for row in table:
+        # Check if there's a meaningful change (not stable/unstable)
+        note = row[4].lower() if len(row) > 4 else ""
+        percentage_str = row[3] if len(row) > 3 else "0.0%"
+
+        # Extract percentage value
+        try:
+            percentage_val = float(percentage_str.replace("%", "").strip())
+        except:
+            percentage_val = 0.0
+
+        # Categorize based on change type
+        if "improvement" in note and "potential" not in note:
+            # Only actual improvements, not potential ones
+            improvements_table.append(row)
+        elif ("regression" in note and "potential" not in note) or "unstable" in note:
+            # Only actual regressions, not potential ones, plus unstable tests
+            regressions_table.append(row)
+        elif "no change" in note or "potential" in note:
+            # No changes and potential changes (below significance threshold)
+            no_changes_table.append(row)
+        elif abs(percentage_val) > 3.0:  # Significant changes based on percentage
+            if (percentage_val > 0 and metric_mode == "higher-better") or (
+                percentage_val < 0 and metric_mode == "lower-better"
+            ):
+                improvements_table.append(row)
+            else:
+                regressions_table.append(row)
+        else:
+            no_changes_table.append(row)
+
+    # Sort tables by percentage change
+    def get_percentage_value(row):
+        """Extract percentage value from row for sorting"""
+        try:
+            percentage_str = row[3] if len(row) > 3 else "0.0%"
+            return float(percentage_str.replace("%", "").strip())
+        except:
+            return 0.0
+
+    # Sort improvements by percentage change (highest first)
+    improvements_table.sort(key=get_percentage_value, reverse=True)
+
+    # Sort regressions by percentage change (most negative first for higher-better, most positive first for lower-better)
+    if metric_mode == "higher-better":
+        # For higher-better metrics, most negative changes are worst regressions
+        regressions_table.sort(key=get_percentage_value)
+    else:
+        # For lower-better metrics, most positive changes are worst regressions
+        regressions_table.sort(key=get_percentage_value, reverse=True)
+
+    # Create improvements table (visible)
+    improvements_writer = MarkdownTableWriter(
+        table_name="Performance Improvements - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
+            baseline_str,
+            comparison_str,
+            from_human_str,
+            baseline_deployment_name,
+        ),
+        headers=[
+            "Test Case",
+            "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+            "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+            "% change ({})".format(metric_mode),
+            "Note",
+        ],
+        value_matrix=improvements_table,
+    )
+
+    # Create regressions table (visible)
+    regressions_writer = MarkdownTableWriter(
+        table_name="Performance Regressions and Issues - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
             baseline_str,
             comparison_str,
             from_human_str,
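The row-splitting logic keys off the free-text Note column first and only falls back to the numeric change. A condensed, self-contained sketch of that categorization on made-up rows (row layout `[test, baseline, comparison, "% change", note]`, as in the diff, with `metric_mode` fixed to higher-better):

```python
rows = [
    ["test-a", "10 ms", "8 ms", "-20.0% ", "Regression"],
    ["test-b", "100k ops", "112k ops", "12.0% ", "Improvement"],
    ["test-c", "50k ops", "51k ops", "2.0% ", "No Change"],
    ["test-d", "40k ops", "33k ops", "-17.5% ", "UNSTABLE (baseline high variance)"],
]

improvements, regressions, no_changes = [], [], []

for row in rows:
    note = row[4].lower()
    try:
        pct = float(row[3].replace("%", "").strip())
    except ValueError:
        pct = 0.0
    if "improvement" in note and "potential" not in note:
        improvements.append(row[0])
    elif ("regression" in note and "potential" not in note) or "unstable" in note:
        regressions.append(row[0])
    elif "no change" in note or "potential" in note:
        no_changes.append(row[0])
    elif abs(pct) > 3.0:  # note column inconclusive: fall back to the number
        (improvements if pct > 0 else regressions).append(row[0])
    else:
        no_changes.append(row[0])

print(regressions)  # ['test-a', 'test-d'] -- unstable rows surface with regressions
```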
@@ -641,8 +790,22 @@ def compute_regression_table(
             "% change ({})".format(metric_mode),
             "Note",
         ],
-        value_matrix=table,
+        value_matrix=regressions_table,
     )
+
+    # Create no-changes table (hidden in markdown)
+    no_changes_writer = MarkdownTableWriter(
+        table_name="Tests with No Significant Changes",
+        headers=[
+            "Test Case",
+            "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+            "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+            "% change ({})".format(metric_mode),
+            "Note",
+        ],
+        value_matrix=no_changes_table,
+    )
+
     table_output = ""

     from io import StringIO
@@ -651,7 +814,25 @@ def compute_regression_table(
     old_stdout = sys.stdout
     sys.stdout = mystdout = StringIO()

-    writer.dump(mystdout, False)
+    # Output improvements table first (if any)
+    if improvements_table:
+        improvements_writer.dump(mystdout, False)
+        mystdout.write("\n\n")
+
+    # Output regressions table (if any)
+    if regressions_table:
+        regressions_writer.dump(mystdout, False)
+        mystdout.write("\n\n")
+
+    # Add hidden no-changes table
+    if no_changes_table:
+        mystdout.write(
+            "<details>\n<summary>Tests with No Significant Changes ({} tests)</summary>\n\n".format(
+                len(no_changes_table)
+            )
+        )
+        no_changes_writer.dump(mystdout, False)
+        mystdout.write("\n</details>\n")

     sys.stdout = old_stdout

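The no-changes table is wrapped in an HTML `<details>` element so it renders collapsed in GitHub comments. A minimal standalone sketch of the same buffer-capture pattern with pytablewriter (table contents are illustrative):

```python
from io import StringIO

from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter(
    table_name="Tests with No Significant Changes",
    headers=["Test Case", "% change"],
    value_matrix=[["test-c", "2.0%"]],
)

buf = StringIO()
buf.write("<details>\n<summary>Tests with No Significant Changes (1 tests)</summary>\n\n")
writer.dump(buf, False)  # second argument: do not close the stream after writing
buf.write("\n</details>\n")
print(buf.getvalue())
```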
@@ -665,6 +846,10 @@ def compute_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     )


@@ -752,6 +937,13 @@ def from_rts_to_regression_table(
     comparison_architecture=ARCH_X86,
     first_n_baseline=-1,
     first_n_comparison=-1,
+    grafana_link_base=None,
+    baseline_branch=None,
+    baseline_tag=None,
+    comparison_branch=None,
+    comparison_tag=None,
+    from_date=None,
+    to_date=None,
 ):
     print_all = print_regressions_only is False and print_improvements_only is False
     table = []
@@ -759,8 +951,12 @@ def from_rts_to_regression_table(
     total_improvements = 0
     total_stable = 0
     total_unstable = 0
+    total_unstable_baseline = 0
+    total_unstable_comparison = 0
     total_regressions = 0
     total_comparison_points = 0
+    total_latency_confirmed_regressions = 0
+    latency_confirmed_regression_details = []  # Track specific test details
     noise_waterline = 3
     progress = tqdm(unit="benchmark time-series", total=len(test_names))
     for test_name in test_names:
@@ -898,10 +1094,243 @@ def from_rts_to_regression_table(
             logging.error("Detected a ZeroDivisionError. {}".format(e.__str__()))
             pass
         unstable = False
+        unstable_baseline = False
+        unstable_comparison = False
+        latency_confirms_regression = False
+
         if baseline_v != "N/A" and comparison_v != "N/A":
             if comparison_pct_change > 10.0 or baseline_pct_change > 10.0:
-                note = "UNSTABLE (very high variance)"
                 unstable = True
+                unstable_baseline = baseline_pct_change > 10.0
+                unstable_comparison = comparison_pct_change > 10.0
+
+                # Build detailed unstable note
+                unstable_parts = []
+                if unstable_baseline and unstable_comparison:
+                    unstable_parts.append(
+                        "UNSTABLE (baseline & comparison high variance)"
+                    )
+                elif unstable_baseline:
+                    unstable_parts.append("UNSTABLE (baseline high variance)")
+                elif unstable_comparison:
+                    unstable_parts.append("UNSTABLE (comparison high variance)")
+
+                note = unstable_parts[0]
+
+                # Log detailed warning about unstable data detection
+                logging.warning(
+                    f"UNSTABLE DATA DETECTED for test '{test_name}': "
+                    f"baseline variance={baseline_pct_change:.1f}%, "
+                    f"comparison variance={comparison_pct_change:.1f}% "
+                    f"(threshold=10.0%)"
+                )
+
+                # For throughput metrics (higher-better), check both server-side and client-side latency
+                if metric_mode == "higher-better":
+                    logging.info(
+                        f"Performing 2nd-level latency validation for unstable throughput metric '{test_name}' "
+                        f"(metric_mode={metric_mode})"
+                    )
+
+                    # Check server-side p50 latency
+                    (
+                        server_latency_note,
+                        server_confirms_regression,
+                        server_regression_details,
+                    ) = check_latency_for_unstable_throughput(
+                        rts,
+                        test_name,
+                        baseline_str,
+                        comparison_str,
+                        by_str_baseline,
+                        by_str_comparison,
+                        baseline_deployment_name,
+                        comparison_deployment_name,
+                        tf_triggering_env,
+                        from_ts_ms,
+                        to_ts_ms,
+                        last_n_baseline,
+                        last_n_comparison,
+                        first_n_baseline,
+                        first_n_comparison,
+                        running_platform,
+                        baseline_architecture,
+                        comparison_architecture,
+                        verbose,
+                    )
+
+                    # Check client-side latency metrics
+                    (
+                        client_latency_note,
+                        client_confirms_regression,
+                        client_regression_details,
+                    ) = check_client_side_latency(
+                        rts,
+                        test_name,
+                        baseline_str,
+                        comparison_str,
+                        by_str_baseline,
+                        by_str_comparison,
+                        baseline_deployment_name,
+                        comparison_deployment_name,
+                        tf_triggering_env,
+                        from_ts_ms,
+                        to_ts_ms,
+                        last_n_baseline,
+                        last_n_comparison,
+                        first_n_baseline,
+                        first_n_comparison,
+                        running_platform,
+                        baseline_architecture,
+                        comparison_architecture,
+                        verbose,
+                    )
+
+                    # Combine results from both server and client side
+                    combined_latency_notes = []
+                    if server_latency_note:
+                        combined_latency_notes.append(f"server: {server_latency_note}")
+                    if client_latency_note:
+                        combined_latency_notes.append(f"client: {client_latency_note}")
+
+                    # Only confirm regression if BOTH server and client side show evidence AND data is stable enough
+                    # Check if either server or client data contains unstable indicators
+                    server_has_unstable = (
+                        server_latency_note and "UNSTABLE" in server_latency_note
+                    )
+                    client_has_unstable = (
+                        client_latency_note and "UNSTABLE" in client_latency_note
+                    )
+
+                    # Don't confirm regression if either side has unstable data
+                    if server_has_unstable or client_has_unstable:
+                        both_confirm_regression = False
+                        unstable_sides = []
+                        if server_has_unstable:
+                            unstable_sides.append("server")
+                        if client_has_unstable:
+                            unstable_sides.append("client")
+                        blocked_note = f"regression blocked due to unstable {' and '.join(unstable_sides)} latency data"
+                        note += f"; {blocked_note}"
+                        logging.info(
+                            f"Blocking regression confirmation for '{test_name}' due to unstable latency data"
+                        )
+                        if server_has_unstable:
+                            logging.info(f" Server-side latency data is unstable")
+                        if client_has_unstable:
+                            logging.info(f" Client-side latency data is unstable")
+                    else:
+                        both_confirm_regression = (
+                            server_confirms_regression and client_confirms_regression
+                        )
+
+                    if combined_latency_notes:
+                        combined_note = "; ".join(combined_latency_notes)
+                        note += f"; {combined_note}"
+                        logging.info(
+                            f"Combined latency check result for '{test_name}': {combined_note}"
+                        )
+
+                        if both_confirm_regression:
+                            logging.info(
+                                f"BOTH server and client latency analysis CONFIRM regression for '{test_name}'"
+                            )
+
+                            # Set the flag for counting confirmed regressions
+                            latency_confirms_regression = True
+
+                            # Combine regression details from both server and client
+                            combined_regression_details = (
+                                server_regression_details or client_regression_details
+                            )
+                            if combined_regression_details:
+                                combined_regression_details[
+                                    "server_side"
+                                ] = server_confirms_regression
+                                combined_regression_details[
+                                    "client_side"
+                                ] = client_confirms_regression
+
+                                # 2nd level confirmation is sufficient - always add to confirmed regressions
+                                logging.info(
+                                    f"Adding '{test_name}' to confirmed regressions based on 2nd level validation"
+                                )
+
+                                # Perform 3rd-level analysis: variance + p99 check for additional confidence scoring
+                                logging.info(
+                                    f"Performing 3rd-level analysis (variance + p99) for confidence scoring on '{test_name}'"
+                                )
+                                (
+                                    confidence_note,
+                                    high_confidence,
+                                ) = perform_variance_and_p99_analysis(
+                                    rts,
+                                    test_name,
+                                    baseline_str,
+                                    comparison_str,
+                                    by_str_baseline,
+                                    by_str_comparison,
+                                    baseline_deployment_name,
+                                    comparison_deployment_name,
+                                    tf_triggering_env,
+                                    from_ts_ms,
+                                    to_ts_ms,
+                                    last_n_baseline,
+                                    last_n_comparison,
+                                    first_n_baseline,
+                                    first_n_comparison,
+                                    running_platform,
+                                    baseline_architecture,
+                                    comparison_architecture,
+                                    verbose,
+                                )
+
+                                if confidence_note:
+                                    note += f"; {confidence_note}"
+                                    logging.info(
+                                        f"Confidence analysis for '{test_name}': {confidence_note}"
+                                    )
+                                    # Use 3rd level confidence if available
+                                    combined_regression_details[
+                                        "high_confidence"
+                                    ] = high_confidence
+                                else:
+                                    # No 3rd level data available - default to moderate confidence since 2nd level confirmed
+                                    logging.info(
+                                        f"No 3rd level data available for '{test_name}' - using 2nd level confirmation"
+                                    )
+                                    combined_regression_details[
+                                        "high_confidence"
+                                    ] = True  # 2nd level confirmation is reliable
+
+                                # Always add to confirmed regressions when 2nd level confirms
+                                latency_confirmed_regression_details.append(
+                                    combined_regression_details
+                                )
+                        elif server_confirms_regression or client_confirms_regression:
+                            side_confirmed = (
+                                "server" if server_confirms_regression else "client"
+                            )
+                            side_not_confirmed = (
+                                "client" if server_confirms_regression else "server"
+                            )
+                            insufficient_evidence_note = f"only {side_confirmed} side confirms regression ({side_not_confirmed} side stable) - insufficient evidence"
+                            note += f"; {insufficient_evidence_note}"
+                            logging.info(
+                                f"Only {side_confirmed} side confirms regression for '{test_name}' - insufficient evidence"
+                            )
+                        else:
+                            no_regression_note = (
+                                "neither server nor client side confirms regression"
+                            )
+                            note += f"; {no_regression_note}"
+                            logging.info(
+                                f"Neither server nor client side confirms regression for '{test_name}'"
+                            )
+                    else:
+                        logging.info(
+                            f"No latency data available for secondary check on '{test_name}'"
+                        )

         baseline_v_str = prepare_value_str(
             baseline_pct_change, baseline_v, baseline_values, simplify_table
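The control flow above implements a two-out-of-two rule: an unstable throughput result is only escalated to a confirmed regression when the server-side p50 check and the client-side latency check independently agree, and neither side is itself flagged unstable. A condensed sketch of just that decision (the helper name is a placeholder, not part of the package):

```python
def should_confirm_regression(
    server_note, server_confirms, client_note, client_confirms
):
    """Mirror of the diff's gating logic, reduced to its inputs."""
    server_unstable = bool(server_note) and "UNSTABLE" in server_note
    client_unstable = bool(client_note) and "UNSTABLE" in client_note
    if server_unstable or client_unstable:
        return False  # noisy latency data blocks confirmation outright
    return server_confirms and client_confirms  # both sides must agree


# One side confirming is "insufficient evidence" and does not count:
assert should_confirm_regression("p50 increased 20%", True, None, False) is False
assert should_confirm_regression("p50 increased 20%", True, "GET increased 15%", True)
```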
@@ -956,6 +1385,12 @@ def from_rts_to_regression_table(

         if unstable:
             total_unstable += 1
+            if unstable_baseline:
+                total_unstable_baseline += 1
+            if unstable_comparison:
+                total_unstable_comparison += 1
+            if latency_confirms_regression:
+                total_latency_confirmed_regressions += 1

         should_add_line = False
         if print_regressions_only and detected_regression:
@@ -976,6 +1411,13 @@ def from_rts_to_regression_table(
                 percentage_change,
                 table,
                 test_name,
+                grafana_link_base,
+                baseline_branch,
+                baseline_tag,
+                comparison_branch,
+                comparison_tag,
+                from_date,
+                to_date,
             )
     return (
         detected_regressions,
@@ -985,9 +1427,995 @@ def from_rts_to_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     )


+def check_client_side_latency(
+    rts,
+    test_name,
+    baseline_str,
+    comparison_str,
+    by_str_baseline,
+    by_str_comparison,
+    baseline_deployment_name,
+    comparison_deployment_name,
+    tf_triggering_env,
+    from_ts_ms,
+    to_ts_ms,
+    last_n_baseline,
+    last_n_comparison,
+    first_n_baseline,
+    first_n_comparison,
+    running_platform,
+    baseline_architecture,
+    comparison_architecture,
+    verbose=False,
+):
+    """
+    Check client-side latency metrics to provide additional validation for regression detection.
+
+    Returns:
+        tuple: (note_string, confirms_regression_bool, regression_details_dict)
+    """
+    logging.info(f"Starting client-side latency check for test: {test_name}")
+    try:
+        # Client-side latency metrics to check
+        client_metrics = [
+            "p50_latency_ms",
+            "Latency",
+            "OverallQuantiles.allCommands.q50",
+            "Tests.INSERT.AverageLatency_us_",
+            "Tests.READ.AverageLatency_us_",
+            "Tests.SEARCH.AverageLatency_us_",
+            "Tests.UPDATE.AverageLatency_us_",
+        ]
+
+        client_latency_notes = []
+        significant_client_latency_increases = 0
+        regression_details = {"test_name": test_name, "commands": []}
+
+        for metric in client_metrics:
+            # Build filters for client-side latency metric
+            filters_baseline = [
+                f"{by_str_baseline}={baseline_str}",
+                f"metric={metric}",
+                f"test_name={test_name}",
+                f"deployment_name={baseline_deployment_name}",
+                f"triggering_env={tf_triggering_env}",
+            ]
+            filters_comparison = [
+                f"{by_str_comparison}={comparison_str}",
+                f"metric={metric}",
+                f"test_name={test_name}",
+                f"deployment_name={comparison_deployment_name}",
+                f"triggering_env={tf_triggering_env}",
+            ]
+
+            # Add optional filters
+            if running_platform is not None:
+                filters_baseline.append(f"running_platform={running_platform}")
+                filters_comparison.append(f"running_platform={running_platform}")
+            if baseline_architecture != ARCH_X86:
+                filters_baseline.append(f"arch={baseline_architecture}")
+            if comparison_architecture != ARCH_X86:
+                filters_comparison.append(f"arch={comparison_architecture}")
+
+            # Query for client-side latency time-series
+            baseline_client_ts = rts.ts().queryindex(filters_baseline)
+            comparison_client_ts = rts.ts().queryindex(filters_comparison)
+
+            if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
+                if verbose:
+                    logging.info(
+                        f" No client-side data found for metric '{metric}' in {test_name}"
+                    )
+                continue
+
+            logging.info(
+                f" Found client-side metric '{metric}': {len(baseline_client_ts)} baseline, {len(comparison_client_ts)} comparison time-series"
+            )
+
+            # Filter out target time-series
+            baseline_client_ts = [ts for ts in baseline_client_ts if "target" not in ts]
+            comparison_client_ts = [
+                ts for ts in comparison_client_ts if "target" not in ts
+            ]
+
+            if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
+                continue
+
+            # Use the first available time-series for each side
+            baseline_ts = baseline_client_ts[0]
+            comparison_ts = comparison_client_ts[0]
+
+            # Get client-side latency data
+            baseline_client_data = rts.ts().revrange(baseline_ts, from_ts_ms, to_ts_ms)
+            comparison_client_data = rts.ts().revrange(
+                comparison_ts, from_ts_ms, to_ts_ms
+            )
+
+            if len(baseline_client_data) == 0 or len(comparison_client_data) == 0:
+                if verbose:
+                    logging.info(
+                        f" No data points for metric '{metric}': baseline={len(baseline_client_data)}, comparison={len(comparison_client_data)}"
+                    )
+                continue
+
+            # Calculate client-side latency statistics
+            baseline_client_values = []
+            comparison_client_values = []
+
+            (_, baseline_client_median, _) = get_v_pct_change_and_largest_var(
+                baseline_client_data,
+                0,
+                0,
+                baseline_client_values,
+                0,
+                last_n_baseline,
+                verbose,
+                first_n_baseline,
+            )
+
+            (_, comparison_client_median, _) = get_v_pct_change_and_largest_var(
+                comparison_client_data,
+                0,
+                0,
+                comparison_client_values,
+                0,
+                last_n_comparison,
+                verbose,
+                first_n_comparison,
+            )
+
+            if baseline_client_median == "N/A" or comparison_client_median == "N/A":
+                if verbose:
+                    logging.info(
+                        f" Could not calculate median for metric '{metric}': baseline={baseline_client_median}, comparison={comparison_client_median}"
+                    )
+                continue
+
+            # Calculate variance (coefficient of variation) for both baseline and comparison
+            baseline_client_mean = (
+                statistics.mean(baseline_client_values) if baseline_client_values else 0
+            )
+            baseline_client_stdev = (
+                statistics.stdev(baseline_client_values)
+                if len(baseline_client_values) > 1
+                else 0
+            )
+            baseline_client_cv = (
+                (baseline_client_stdev / baseline_client_mean * 100)
+                if baseline_client_mean > 0
+                else float("inf")
+            )
+
+            comparison_client_mean = (
+                statistics.mean(comparison_client_values)
+                if comparison_client_values
+                else 0
+            )
+            comparison_client_stdev = (
+                statistics.stdev(comparison_client_values)
+                if len(comparison_client_values) > 1
+                else 0
+            )
+            comparison_client_cv = (
+                (comparison_client_stdev / comparison_client_mean * 100)
+                if comparison_client_mean > 0
+                else float("inf")
+            )
+
+            # Calculate client-side latency change (for latency, higher is worse)
+            client_latency_change = (
+                float(comparison_client_median) / float(baseline_client_median) - 1
+            ) * 100.0
+
+            logging.info(
+                f" Client metric '{metric}': baseline={baseline_client_median:.2f} (CV={baseline_client_cv:.1f}%), comparison={comparison_client_median:.2f} (CV={comparison_client_cv:.1f}%), change={client_latency_change:.1f}%"
+            )
+
+            # Check if client latency data is too unstable to be reliable
+            client_data_unstable = (
+                baseline_client_cv > 50.0 or comparison_client_cv > 50.0
+            )
+
+            if client_data_unstable:
+                # Mark as unstable client latency data
+                unstable_reason = []
+                if baseline_client_cv > 50.0:
+                    unstable_reason.append(f"baseline CV={baseline_client_cv:.1f}%")
+                if comparison_client_cv > 50.0:
+                    unstable_reason.append(f"comparison CV={comparison_client_cv:.1f}%")
+
+                client_latency_notes.append(
+                    f"{metric} UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
+                )
+                logging.warning(
+                    f" Client metric '{metric}': UNSTABLE latency data detected - {', '.join(unstable_reason)}"
+                )
+            elif (
+                abs(client_latency_change) > 5.0
+            ):  # Only report significant client latency changes for stable data
+                direction = "increased" if client_latency_change > 0 else "decreased"
+
+                # Adjust significance threshold based on baseline variance
+                if baseline_client_cv < 30.0:
+                    # Low variance - use standard threshold
+                    significance_threshold = 10.0
+                elif baseline_client_cv < 50.0:
+                    # Moderate variance - require larger change
+                    significance_threshold = 15.0
+                else:
+                    # High variance - require much larger change
+                    significance_threshold = 25.0
+
+                client_latency_notes.append(
+                    f"{metric} {direction} {abs(client_latency_change):.1f}% (baseline CV={baseline_client_cv:.1f}%)"
+                )
+                logging.info(
+                    f" Client metric '{metric}': SIGNIFICANT latency change detected ({direction} {abs(client_latency_change):.1f}%, baseline CV={baseline_client_cv:.1f}%)"
+                )
+
+                # Track significant client latency increases (potential regression confirmation)
+                if client_latency_change > significance_threshold:
+                    significant_client_latency_increases += 1
+                    regression_details["commands"].append(
+                        {
+                            "command": metric,
+                            "change_percent": client_latency_change,
+                            "direction": direction,
+                            "baseline_cv": baseline_client_cv,
+                            "comparison_cv": comparison_client_cv,
+                        }
+                    )
+                    logging.info(
+                        f" Client metric '{metric}': CONFIRMS regression (change={client_latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
+                    )
+                else:
+                    logging.info(
+                        f" Client metric '{metric}': Change below significance threshold (change={client_latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
+                    )
+            elif verbose:
+                client_latency_notes.append(
+                    f"{metric} stable (CV={baseline_client_cv:.1f}%)"
+                )
+                logging.info(
+                    f" Client metric '{metric}': latency stable (change={client_latency_change:.1f}%, baseline CV={baseline_client_cv:.1f}%)"
+                )
+
+        # Determine if client-side latency confirms regression
+        confirms_regression = significant_client_latency_increases > 0
+
+        # Return combined client latency notes
+        if client_latency_notes:
+            result = "; ".join(client_latency_notes)
+            logging.info(
+                f"Client-side latency check completed for {test_name}: {result}"
+            )
+            return (
+                result,
+                confirms_regression,
+                regression_details if confirms_regression else None,
+            )
+        else:
+            result = "client latency stable" if len(client_metrics) > 0 else None
+            logging.info(
+                f"Client-side latency check completed for {test_name}: {result or 'no data'}"
+            )
+            return result, False, None
+
+    except Exception as e:
+        logging.error(f"Error checking client-side latency for {test_name}: {e}")
+        return None, False, None
+
+
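check_client_side_latency discovers time-series through RedisTimeSeries secondary indexing rather than by key name. A minimal sketch of that query pattern with the redis client (host and label values are illustrative):

```python
import redis

r = redis.Redis(host="localhost", port=6379)

# Label filters in the same shape the diff builds them.
filters = [
    "branch=master",
    "metric=p50_latency_ms",
    "test_name=memtier_benchmark-1key-zincrby",
    "deployment_name=oss-standalone",
    "triggering_env=ci",
]

# TS.QUERYINDEX returns the key names of matching time-series.
for key in r.ts().queryindex(filters):
    # TS.REVRANGE yields (timestamp, value) pairs, newest first.
    datapoints = r.ts().revrange(key, "-", "+")
    print(key, len(datapoints))
```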
+def perform_variance_and_p99_analysis(
+    rts,
+    test_name,
+    baseline_str,
+    comparison_str,
+    by_str_baseline,
+    by_str_comparison,
+    baseline_deployment_name,
+    comparison_deployment_name,
+    tf_triggering_env,
+    from_ts_ms,
+    to_ts_ms,
+    last_n_baseline,
+    last_n_comparison,
+    first_n_baseline,
+    first_n_comparison,
+    running_platform,
+    baseline_architecture,
+    comparison_architecture,
+    verbose=False,
+):
+    """
+    Perform 3rd-level analysis using variance and p99 metrics to assess confidence in regression detection.
+
+    Returns:
+        tuple: (confidence_note, high_confidence_bool)
+    """
+    try:
+        logging.info(f"Starting variance and p99 analysis for {test_name}")
+
+        # Build filters for p99 latency metric using both metric=p99 and metric-type=(latencystats)
+        filters_baseline = [
+            f"{by_str_baseline}={baseline_str}",
+            "metric=p99",
+            "metric-type=(latencystats)",
+            f"test_name={test_name}",
+            f"deployment_name={baseline_deployment_name}",
+            f"triggering_env={tf_triggering_env}",
+        ]
+        filters_comparison = [
+            f"{by_str_comparison}={comparison_str}",
+            "metric=p99",
+            "metric-type=(latencystats)",
+            f"test_name={test_name}",
+            f"deployment_name={comparison_deployment_name}",
+            f"triggering_env={tf_triggering_env}",
+        ]
+
+        # Add optional filters
+        if running_platform is not None:
+            filters_baseline.append(f"running_platform={running_platform}")
+            filters_comparison.append(f"running_platform={running_platform}")
+        if baseline_architecture != ARCH_X86:
+            filters_baseline.append(f"arch={baseline_architecture}")
+        if comparison_architecture != ARCH_X86:
+            filters_comparison.append(f"arch={comparison_architecture}")
+
+        # Query for p99 latency time-series
+        logging.info(f"Querying p99 latencystats time-series for {test_name}")
+        baseline_p99_ts = rts.ts().queryindex(filters_baseline)
+        comparison_p99_ts = rts.ts().queryindex(filters_comparison)
+
+        logging.info(f"Found {len(baseline_p99_ts)} baseline p99 latency time-series")
+        logging.info(
+            f"Found {len(comparison_p99_ts)} comparison p99 latency time-series"
+        )
+
+        # Filter out target time-series and unwanted commands (reuse existing function)
+        def should_exclude_timeseries(ts_name):
+            """Check if time-series should be excluded based on command"""
+            if "target" in ts_name:
+                return True
+            ts_name_lower = ts_name.lower()
+            excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
+            return any(cmd in ts_name_lower for cmd in excluded_commands)
+
+        baseline_p99_ts = [
+            ts for ts in baseline_p99_ts if not should_exclude_timeseries(ts)
+        ]
+        comparison_p99_ts = [
+            ts for ts in comparison_p99_ts if not should_exclude_timeseries(ts)
+        ]
+
+        if len(baseline_p99_ts) == 0 or len(comparison_p99_ts) == 0:
+            logging.warning(
+                f"No p99 latency data found for {test_name} after filtering"
+            )
+            return None, False
+
+        # Extract command names from time-series (reuse existing function)
+        def extract_command_from_ts(ts_name):
+            """Extract meaningful command name from time-series name"""
+            # Look for latencystats_latency_percentiles_usec_<COMMAND>_p99 pattern
+            match = re.search(
+                r"latencystats_latency_percentiles_usec_([^_/]+)_p99", ts_name
+            )
+            if match:
+                return match.group(1)
+            # Look for command= pattern in the time-series name
+            match = re.search(r"command=([^/]+)", ts_name)
+            if match:
+                return match.group(1)
+            # If no specific pattern found, try to extract from the end of the path
+            parts = ts_name.split("/")
+            if len(parts) > 0:
+                return parts[-1]
+            return "unknown"
+
+        # Group time-series by command
+        baseline_by_command = {}
+        comparison_by_command = {}
+
+        for ts in baseline_p99_ts:
+            cmd = extract_command_from_ts(ts)
+            if cmd not in baseline_by_command:
+                baseline_by_command[cmd] = []
+            baseline_by_command[cmd].append(ts)
+
+        for ts in comparison_p99_ts:
+            cmd = extract_command_from_ts(ts)
+            if cmd not in comparison_by_command:
+                comparison_by_command[cmd] = []
+            comparison_by_command[cmd].append(ts)
+
+        # Find common commands between baseline and comparison
+        common_commands = set(baseline_by_command.keys()) & set(
+            comparison_by_command.keys()
+        )
+
+        if not common_commands:
+            logging.warning(
+                f"No common commands found for p99 variance analysis in {test_name}"
+            )
+            return None, False
+
+        variance_notes = []
+        p99_notes = []
+        high_confidence_indicators = 0
+        total_indicators = 0
+
+        # Analyze variance and p99 for each command
+        for command in sorted(common_commands):
+            total_indicators += 1
+            logging.info(f"Analyzing p99 variance for command: {command}")
+
+            baseline_ts_list = baseline_by_command[command]
+            comparison_ts_list = comparison_by_command[command]
+
+            # If multiple time-series for the same command, try to get the best one
+            if len(baseline_ts_list) > 1:
+                baseline_ts_list = get_only_Totals(baseline_ts_list)
+            if len(comparison_ts_list) > 1:
+                comparison_ts_list = get_only_Totals(comparison_ts_list)
+
+            if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
+                logging.warning(
+                    f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
+                )
+                continue
+
+            # Get p99 latency data for this command
+            baseline_p99_data = []
+            comparison_p99_data = []
+
+            for ts_name in baseline_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                baseline_p99_data.extend(datapoints)
+
+            for ts_name in comparison_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                comparison_p99_data.extend(datapoints)
+
+            if len(baseline_p99_data) < 3 or len(comparison_p99_data) < 3:
+                logging.warning(
+                    f" Insufficient p99 data for {command}: baseline={len(baseline_p99_data)}, comparison={len(comparison_p99_data)} datapoints"
+                )
+                continue
+
+            # Extract values for variance calculation
+            baseline_values = [dp[1] for dp in baseline_p99_data]
+            comparison_values = [dp[1] for dp in comparison_p99_data]
+
+            # Calculate variance (coefficient of variation)
+            baseline_mean = statistics.mean(baseline_values)
+            baseline_stdev = (
+                statistics.stdev(baseline_values) if len(baseline_values) > 1 else 0
+            )
+            baseline_cv = (
+                (baseline_stdev / baseline_mean * 100)
+                if baseline_mean > 0
+                else float("inf")
+            )
+
+            comparison_mean = statistics.mean(comparison_values)
+            comparison_stdev = (
+                statistics.stdev(comparison_values) if len(comparison_values) > 1 else 0
+            )
+            comparison_cv = (
+                (comparison_stdev / comparison_mean * 100)
+                if comparison_mean > 0
+                else float("inf")
+            )
+
+            # Calculate p99 change
+            p99_change = (
+                ((comparison_mean - baseline_mean) / baseline_mean * 100)
+                if baseline_mean > 0
+                else 0
+            )
+
+            # Assess confidence based on variance and p99 change
+            if baseline_cv < 30:  # Low variance in baseline (< 30% CV)
+                if abs(p99_change) > 15:  # Significant p99 change
+                    high_confidence_indicators += 1
+                    p99_notes.append(
+                        f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline)"
+                    )
+                else:
+                    p99_notes.append(
+                        f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline, minor change)"
+                    )
+            elif baseline_cv < 50:  # Moderate variance
+                if abs(p99_change) > 25:  # Need larger change for confidence
+                    high_confidence_indicators += 1
+                    p99_notes.append(
+                        f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance)"
+                    )
+                else:
+                    p99_notes.append(
+                        f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance, uncertain)"
+                    )
+            else:  # High variance
+                if abs(p99_change) > 40:  # Need very large change for confidence
+                    high_confidence_indicators += 1
+                    p99_notes.append(
+                        f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, large change)"
+                    )
+                else:
+                    p99_notes.append(
+                        f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, low confidence)"
+                    )
+
+            variance_notes.append(f"{command} baseline CV={baseline_cv:.1f}%")
+
+            if verbose:
+                logging.info(
+                    f" Command {command}: baseline CV={baseline_cv:.1f}%, comparison CV={comparison_cv:.1f}%, p99 change={p99_change:.1f}%"
+                )
+
+        # Determine overall confidence
+        confidence_ratio = (
+            high_confidence_indicators / total_indicators if total_indicators > 0 else 0
+        )
+        high_confidence = (
+            confidence_ratio >= 0.5
+        )  # At least 50% of indicators show high confidence
+
+        # Create confidence note
+        confidence_parts = []
+        if variance_notes:
+            confidence_parts.extend(variance_notes)
+        if p99_notes:
+            confidence_parts.extend(p99_notes)
+
+        confidence_note = "; ".join(confidence_parts) if confidence_parts else None
+
+        if confidence_note:
+            confidence_level = "HIGH" if high_confidence else "LOW"
+            cv_explanation = "CV=coefficient of variation (data stability: <30% stable, 30-50% moderate, >50% unstable)"
+            confidence_note = (
+                f"confidence={confidence_level} ({confidence_note}; {cv_explanation})"
+            )
+
+        logging.info(
+            f"Variance and p99 analysis completed for {test_name}: confidence={confidence_ratio:.2f}, high_confidence={high_confidence}"
+        )
+        return confidence_note, high_confidence
+
+    except Exception as e:
+        logging.error(f"Error in variance and p99 analysis for {test_name}: {e}")
+        return None, False
+
+
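perform_variance_and_p99_analysis scores each command by how large the p99 shift is relative to how noisy the baseline was, demanding a bigger shift as the coefficient of variation grows. The tiering reduces to a small pure function (a sketch of the thresholds above, not the packaged code):

```python
def p99_shift_is_high_confidence(baseline_cv: float, p99_change_pct: float) -> bool:
    """Noisier baselines require larger p99 shifts before we trust them."""
    if baseline_cv < 30:  # stable baseline
        return abs(p99_change_pct) > 15
    if baseline_cv < 50:  # moderately noisy baseline
        return abs(p99_change_pct) > 25
    return abs(p99_change_pct) > 40  # very noisy baseline


# Overall confidence is HIGH when at least half the analyzed commands qualify:
checks = [p99_shift_is_high_confidence(cv, chg) for cv, chg in [(12, 18), (45, 20)]]
high_confidence = sum(checks) / len(checks) >= 0.5  # here: 1/2 -> True
```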
+def check_latency_for_unstable_throughput(
+    rts,
+    test_name,
+    baseline_str,
+    comparison_str,
+    by_str_baseline,
+    by_str_comparison,
+    baseline_deployment_name,
+    comparison_deployment_name,
+    tf_triggering_env,
+    from_ts_ms,
+    to_ts_ms,
+    last_n_baseline,
+    last_n_comparison,
+    first_n_baseline,
+    first_n_comparison,
+    running_platform,
+    baseline_architecture,
+    comparison_architecture,
+    verbose,
+):
+    """
+    Check latency (p50) for unstable throughput metrics to provide additional context.
+    Returns a tuple: (note_string, confirms_regression_bool, regression_details_dict)
+    """
+    logging.info(f"Starting latency check for unstable throughput test: {test_name}")
+    try:
+        # Build filters for p50 latency metric using both metric=p50 and metric-type=(latencystats)
+        filters_baseline = [
+            f"{by_str_baseline}={baseline_str}",
+            "metric=p50",
+            "metric-type=(latencystats)",
+            f"test_name={test_name}",
+            f"deployment_name={baseline_deployment_name}",
+            f"triggering_env={tf_triggering_env}",
+        ]
+        filters_comparison = [
+            f"{by_str_comparison}={comparison_str}",
+            "metric=p50",
+            "metric-type=(latencystats)",
+            f"test_name={test_name}",
+            f"deployment_name={comparison_deployment_name}",
+            f"triggering_env={tf_triggering_env}",
+        ]
+
+        # Add optional filters
+        if running_platform is not None:
+            filters_baseline.append(f"running_platform={running_platform}")
+            filters_comparison.append(f"running_platform={running_platform}")
+        if baseline_architecture != ARCH_X86:
+            filters_baseline.append(f"arch={baseline_architecture}")
+        if comparison_architecture != ARCH_X86:
+            filters_comparison.append(f"arch={comparison_architecture}")
+
+        # Query for p50 latency time-series
+        logging.info(f"Querying p50 latencystats time-series for {test_name}")
+        logging.info(f"Baseline filters: {filters_baseline}")
+        logging.info(f"Comparison filters: {filters_comparison}")
+
+        baseline_latency_ts = rts.ts().queryindex(filters_baseline)
+        comparison_latency_ts = rts.ts().queryindex(filters_comparison)
+
+        logging.info(
+            f"Found {len(baseline_latency_ts)} baseline p50 latency time-series"
+        )
+        logging.info(
+            f"Found {len(comparison_latency_ts)} comparison p50 latency time-series"
+        )
+
+        if verbose and baseline_latency_ts:
+            logging.info(f"Baseline latency time-series: {baseline_latency_ts}")
+        if verbose and comparison_latency_ts:
+            logging.info(f"Comparison latency time-series: {comparison_latency_ts}")
+
+        # Filter out target time-series and unwanted commands
+        def should_exclude_timeseries(ts_name):
+            """Check if time-series should be excluded based on command"""
+            # Exclude target time-series
+            if "target" in ts_name:
+                return True
+
+            # Convert to lowercase for case-insensitive matching
+            ts_name_lower = ts_name.lower()
+
+            # Exclude administrative commands (case-insensitive)
+            excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
+            return any(cmd in ts_name_lower for cmd in excluded_commands)
+
+        baseline_latency_ts_before = len(baseline_latency_ts)
+        comparison_latency_ts_before = len(comparison_latency_ts)
+
+        # Apply filtering and log what gets excluded
+        baseline_excluded = [
+            ts for ts in baseline_latency_ts if should_exclude_timeseries(ts)
+        ]
+        comparison_excluded = [
+            ts for ts in comparison_latency_ts if should_exclude_timeseries(ts)
+        ]
+
+        baseline_latency_ts = [
+            ts for ts in baseline_latency_ts if not should_exclude_timeseries(ts)
+        ]
+        comparison_latency_ts = [
+            ts for ts in comparison_latency_ts if not should_exclude_timeseries(ts)
+        ]
+
+        logging.info(
+            f"After filtering: baseline {baseline_latency_ts_before} -> {len(baseline_latency_ts)}, "
+            f"comparison {comparison_latency_ts_before} -> {len(comparison_latency_ts)}"
+        )
+
+        if baseline_excluded:
+            logging.info(
+                f"Excluded {len(baseline_excluded)} baseline administrative command time-series"
+            )
+            if verbose:
+                for ts in baseline_excluded:
+                    logging.info(f" Excluded baseline: {ts}")
+        if comparison_excluded:
+            logging.info(
+                f"Excluded {len(comparison_excluded)} comparison administrative command time-series"
+            )
+            if verbose:
+                for ts in comparison_excluded:
+                    logging.info(f" Excluded comparison: {ts}")
+
+        if len(baseline_latency_ts) == 0 or len(comparison_latency_ts) == 0:
+            logging.warning(
+                f"No p50 latency data found for {test_name} after filtering"
+            )
+            return None, False, None
+
+        # Extract command names from time-series to match baseline and comparison
+        def extract_command_from_ts(ts_name):
+            """Extract meaningful command name from time-series name"""
+            import re
+
+            # Look for latencystats_latency_percentiles_usec_<COMMAND>_p50 pattern
+            match = re.search(
+                r"latencystats_latency_percentiles_usec_([^_/]+)_p50", ts_name
+            )
+            if match:
+                return match.group(1)
+
+            # Look for command= pattern in the time-series name
+            match = re.search(r"command=([^/]+)", ts_name)
+            if match:
+                return match.group(1)
+
+            # If no specific pattern found, try to extract from the end of the path
+            # e.g., .../Ops/sec/GET -> GET
+            parts = ts_name.split("/")
+            if len(parts) > 0:
+                return parts[-1]
+            return "unknown"
+
+        # Group time-series by command
+        baseline_by_command = {}
+        comparison_by_command = {}
+
+        for ts in baseline_latency_ts:
+            cmd = extract_command_from_ts(ts)
+            if verbose:
+                logging.info(f"Baseline time-series '{ts}' -> command '{cmd}'")
+            if cmd not in baseline_by_command:
+                baseline_by_command[cmd] = []
+            baseline_by_command[cmd].append(ts)
+
+        for ts in comparison_latency_ts:
+            cmd = extract_command_from_ts(ts)
+            if verbose:
+                logging.info(f"Comparison time-series '{ts}' -> command '{cmd}'")
+            if cmd not in comparison_by_command:
+                comparison_by_command[cmd] = []
+            comparison_by_command[cmd].append(ts)
+
+        # Find common commands between baseline and comparison
+        common_commands = set(baseline_by_command.keys()) & set(
+            comparison_by_command.keys()
+        )
+
+        logging.info(f"Baseline commands found: {sorted(baseline_by_command.keys())}")
+        logging.info(
+            f"Comparison commands found: {sorted(comparison_by_command.keys())}"
+        )
+        logging.info(
+            f"Common commands for latency comparison: {sorted(common_commands)}"
+        )
+
+        if not common_commands:
+            logging.warning(
+                f"No common commands found for latency comparison in {test_name}"
+            )
+            return None, False, None
+
+        latency_notes = []
+        significant_latency_increases = (
+            0  # Track commands with significant latency increases
+        )
+        regression_details = {"test_name": test_name, "commands": []}
+
+        # Compare latency for each command individually
+        for command in sorted(common_commands):
+            logging.info(f"Analyzing latency for command: {command}")
+            baseline_ts_list = baseline_by_command[command]
+            comparison_ts_list = comparison_by_command[command]
+
+            logging.info(
+                f" Command {command}: {len(baseline_ts_list)} baseline, {len(comparison_ts_list)} comparison time-series"
+            )
+
+            # If multiple time-series for the same command, try to get the best one
+            if len(baseline_ts_list) > 1:
+                logging.info(
+                    f" Multiple baseline time-series for {command}, filtering..."
+                )
+                baseline_ts_list = get_only_Totals(baseline_ts_list)
+            if len(comparison_ts_list) > 1:
+                logging.info(
+                    f" Multiple comparison time-series for {command}, filtering..."
+                )
+                comparison_ts_list = get_only_Totals(comparison_ts_list)
+
+            if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
+                logging.warning(
+                    f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
+                )
+                continue
+
+            # Get latency data for this command
+            baseline_latency_data = []
+            comparison_latency_data = []
+
+            for ts_name in baseline_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                baseline_latency_data.extend(datapoints)
+
+            for ts_name in comparison_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                comparison_latency_data.extend(datapoints)
+
+            if len(baseline_latency_data) == 0 or len(comparison_latency_data) == 0:
+                logging.warning(
+                    f" No latency data for {command}: baseline={len(baseline_latency_data)}, comparison={len(comparison_latency_data)} datapoints"
+                )
+                continue
+
+            logging.info(
+                f" Command {command}: {len(baseline_latency_data)} baseline, {len(comparison_latency_data)} comparison datapoints"
+            )
+
+            # Calculate latency statistics for this command
+            baseline_latency_values = []
+            comparison_latency_values = []
+
+            (_, baseline_latency_median, _) = get_v_pct_change_and_largest_var(
+                baseline_latency_data,
+                0,
+                0,
+                baseline_latency_values,
+                0,
+                last_n_baseline,
+                verbose,
+                first_n_baseline,
+            )
+
+            (_, comparison_latency_median, _) = get_v_pct_change_and_largest_var(
+                comparison_latency_data,
+                0,
+                0,
+                comparison_latency_values,
+                0,
+                last_n_comparison,
+                verbose,
+                first_n_comparison,
+            )
+
+            if baseline_latency_median == "N/A" or comparison_latency_median == "N/A":
+                logging.warning(
+                    f" Could not calculate median for {command}: baseline={baseline_latency_median}, comparison={comparison_latency_median}"
+                )
+                continue
+
+            # Calculate variance (coefficient of variation) for both baseline and comparison
+            baseline_latency_mean = (
+                statistics.mean(baseline_latency_values)
+                if baseline_latency_values
+                else 0
+            )
+            baseline_latency_stdev = (
+                statistics.stdev(baseline_latency_values)
+                if len(baseline_latency_values) > 1
+                else 0
+            )
+            baseline_latency_cv = (
+                (baseline_latency_stdev / baseline_latency_mean * 100)
+                if baseline_latency_mean > 0
+                else float("inf")
+            )
+
+            comparison_latency_mean = (
+                statistics.mean(comparison_latency_values)
+                if comparison_latency_values
+                else 0
+            )
+            comparison_latency_stdev = (
+                statistics.stdev(comparison_latency_values)
+                if len(comparison_latency_values) > 1
+                else 0
+            )
+            comparison_latency_cv = (
+                (comparison_latency_stdev / comparison_latency_mean * 100)
+                if comparison_latency_mean > 0
+                else float("inf")
+            )
+
+            # Calculate latency change (for latency, lower is better)
+            latency_change = (
+                float(comparison_latency_median) / float(baseline_latency_median) - 1
+            ) * 100.0
+
+            logging.info(
+                f" Command {command}: baseline p50={baseline_latency_median:.2f} (CV={baseline_latency_cv:.1f}%), comparison p50={comparison_latency_median:.2f} (CV={comparison_latency_cv:.1f}%), change={latency_change:.1f}%"
+            )
+
+            # Check if latency data is too unstable to be reliable
+            latency_data_unstable = (
+                baseline_latency_cv > 50.0 or comparison_latency_cv > 50.0
+            )
+
+            if latency_data_unstable:
+                # Mark as unstable latency data
+                unstable_reason = []
+                if baseline_latency_cv > 50.0:
+                    unstable_reason.append(f"baseline CV={baseline_latency_cv:.1f}%")
+                if comparison_latency_cv > 50.0:
+                    unstable_reason.append(
+                        f"comparison CV={comparison_latency_cv:.1f}%"
+                    )
+
+                latency_notes.append(
+                    f"{command} p50 UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
+                )
+                logging.warning(
+                    f" Command {command}: UNSTABLE latency data detected - {', '.join(unstable_reason)}"
+                )
+            elif (
+                abs(latency_change) > 5.0
+            ):  # Only report significant latency changes for stable data
+                direction = "increased" if latency_change > 0 else "decreased"
+
+                # Adjust significance threshold based on baseline variance
+                if baseline_latency_cv < 30.0:
+                    # Low variance - use standard threshold
+                    significance_threshold = 10.0
+                elif baseline_latency_cv < 50.0:
+                    # Moderate variance - require larger change
+                    significance_threshold = 15.0
+                else:
+                    # High variance - require much larger change
+                    significance_threshold = 25.0
+
+                latency_notes.append(
+                    f"{command} p50 {direction} {abs(latency_change):.1f}% (baseline CV={baseline_latency_cv:.1f}%)"
+                )
+                logging.info(
+                    f" Command {command}: SIGNIFICANT latency change detected ({direction} {abs(latency_change):.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+                )
+
+                # Track significant latency increases (potential regression confirmation)
+                if latency_change > significance_threshold:
+                    significant_latency_increases += 1
+                    regression_details["commands"].append(
+                        {
+                            "command": command,
+                            "change_percent": latency_change,
+                            "direction": direction,
+                            "baseline_cv": baseline_latency_cv,
+                            "comparison_cv": comparison_latency_cv,
+                        }
+                    )
+                    logging.info(
+                        f" Command {command}: CONFIRMS regression (change={latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
+                    )
+                else:
+                    logging.info(
+                        f" Command {command}: Change below significance threshold (change={latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
+                    )
+            elif verbose:
+                latency_notes.append(
+                    f"{command} p50 stable (CV={baseline_latency_cv:.1f}%)"
+                )
+                logging.info(
+                    f" Command {command}: latency stable (change={latency_change:.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+                )
+
+        # Determine if latency confirms regression
+        confirms_regression = significant_latency_increases > 0
+
+        # Return combined latency notes
+        if latency_notes:
+            result = "; ".join(latency_notes)
+            logging.info(f"Latency check completed for {test_name}: {result}")
+            return (
+                result,
+                confirms_regression,
+                regression_details if confirms_regression else None,
+            )
+        else:
+            result = "p50 latency stable" if common_commands else None
+            logging.info(
+                f"Latency check completed for {test_name}: {result or 'no data'}"
+            )
+            return result, False, None
+
+    except Exception as e:
+        logging.error(f"Error checking latency for {test_name}: {e}")
+        return None, False, None
+
+
 def get_only_Totals(baseline_timeseries):
     logging.warning("\t\tTime-series: {}".format(", ".join(baseline_timeseries)))
     logging.info("Checking if Totals will reduce timeseries.")
@@ -995,6 +2423,37 @@ def get_only_Totals(baseline_timeseries):
     for ts_name in baseline_timeseries:
         if "Totals" in ts_name:
             new_base.append(ts_name)
+
+    # If no "Totals" time-series found, try to pick the best alternative
+    if len(new_base) == 0:
+        logging.warning(
+            "No 'Totals' time-series found, trying to pick best alternative."
+        )
+        # Prefer time-series without quotes in metric names
+        unquoted_series = [ts for ts in baseline_timeseries if "'" not in ts]
+        if unquoted_series:
+            new_base = unquoted_series
+        else:
+            # Fall back to original list
+            new_base = baseline_timeseries
+
+    # If we still have multiple time-series after filtering for "Totals",
+    # prefer the one without quotes in the metric name
+    if len(new_base) > 1:
+        logging.info("Multiple time-series found, preferring unquoted metric names.")
+        unquoted_series = [ts for ts in new_base if "'" not in ts]
+        if unquoted_series:
+            new_base = unquoted_series
+
+    # If we still have multiple, take the first one
+    if len(new_base) > 1:
+        logging.warning(
+            "Still multiple time-series after filtering, taking the first one: {}".format(
+                new_base[0]
+            )
+        )
+        new_base = [new_base[0]]
+
     baseline_timeseries = new_base
     return baseline_timeseries

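The new fallback chain in get_only_Totals guarantees the caller ends up with a usable single time-series even when no "Totals" series exists. A quick illustration of the selection order on hypothetical key names (condensed; the packaged function mutates `new_base` step by step):

```python
candidates = [
    "bench/GET/'p50'/values",  # quoted metric name, least preferred
    "bench/GET/p50/values",    # unquoted, preferred fallback
]

# 1. Keep only "Totals" series ......... none here, so fall back.
# 2. Prefer series without quotes ...... picks "bench/GET/p50/values".
# 3. Still ambiguous? take the first ... guarantees a single winner.
totals = [ts for ts in candidates if "Totals" in ts]
pool = totals or [ts for ts in candidates if "'" not in ts] or candidates
selected = pool[0]
print(selected)  # bench/GET/p50/values
```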
@@ -1061,11 +2520,38 @@ def add_line(
     percentage_change,
     table,
     test_name,
+    grafana_link_base=None,
+    baseline_branch=None,
+    baseline_version=None,
+    comparison_branch=None,
+    comparison_version=None,
+    from_date=None,
+    to_date=None,
 ):
+    grafana_link = None
+    if grafana_link_base is not None:
+        grafana_link = "{}?orgId=1".format(grafana_link_base)
+        grafana_link += f"&var-test_case={test_name}"
+
+        if baseline_branch is not None:
+            grafana_link += f"&var-branch={baseline_branch}"
+        if baseline_version is not None:
+            grafana_link += f"&var-version={baseline_version}"
+        if comparison_branch is not None:
+            grafana_link += f"&var-branch={comparison_branch}"
+        if comparison_version is not None:
+            grafana_link += f"&var-version={comparison_version}"
+        grafana_link += "&from=now-30d&to=now"
+
+    # Create test name with optional Grafana link
+    test_name_display = test_name
+    if grafana_link is not None:
+        test_name_display = f"[{test_name}]({grafana_link})"
+
     percentage_change_str = "{:.1f}% ".format(percentage_change)
     table.append(
         [
-            test_name,
+            test_name_display,
             baseline_v_str,
             comparison_v_str,
             percentage_change_str,
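add_line can now turn every test-case cell into a Markdown link to a pre-filtered Grafana dashboard. For a hypothetical base URL, the constructed link looks like this:

```python
grafana_link_base = "https://benchmarksredisio.grafana.net/d/example-dash"
test_name = "memtier_benchmark-1key-zincrby"

link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
link += "&var-branch=master"      # baseline_branch, when provided
link += "&var-branch=my-feature"  # comparison_branch appends a second value
link += "&from=now-30d&to=now"

cell = f"[{test_name}]({link})"  # Markdown cell rendered in the PR comment
```

Note that both branches append the same `var-branch` parameter, which Grafana can treat as a multi-value template variable, selecting both series on the dashboard.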
@@ -1102,9 +2588,9 @@ def get_v_pct_change_and_largest_var(
             comparison_values.append(tuple[1])

         comparison_df = pd.DataFrame(comparison_values)
-        comparison_median = float(comparison_df.median())
+        comparison_median = float(comparison_df.median().iloc[0])
         comparison_v = comparison_median
-        comparison_std = float(comparison_df.std())
+        comparison_std = float(comparison_df.std().iloc[0])
         if verbose:
             logging.info(
                 "comparison_datapoints: {} value: {}; std-dev: {}; median: {}".format(