redisbench-admin 0.11.38__py3-none-any.whl → 0.11.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.
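Taken together, the changes in this diff rework the `compare` command's regression reporting: unstable results are broken down into baseline vs. comparison instability; unstable throughput tests get a second-level validation against server-side (p50 latencystats) and client-side latency metrics, plus a third-level variance/p99 pass that scores confidence; the comparison output is split into improvements, regressions, and a collapsed no-changes table; test names can link out to a Grafana dashboard; and a pandas scalar-extraction deprecation in the median/std computation is fixed.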
@@ -13,6 +13,7 @@ from pytablewriter import MarkdownTableWriter
  import humanize
  import datetime as dt
  import os
+ import statistics
  from tqdm import tqdm
  from github import Github
  from slack_sdk.webhook import WebhookClient
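The new `statistics` import backs the coefficient-of-variation (CV) calculations performed by the latency-validation code added further down. A minimal sketch of the recurring pattern (guarding `stdev`, which needs at least two samples, and treating an empty or zero-mean series as infinitely noisy):

```python
import statistics

def coefficient_of_variation(values):
    # CV = stddev / mean as a percentage; the diff's thresholds read
    # <30% as stable, 30-50% as moderate, >50% as unstable.
    mean = statistics.mean(values) if values else 0
    stdev = statistics.stdev(values) if len(values) > 1 else 0
    return (stdev / mean * 100) if mean > 0 else float("inf")

print(coefficient_of_variation([100.0, 105.0, 98.0]))  # ~3.6 -> stable
```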
@@ -273,6 +274,10 @@ def compare_command_logic(args, project_name, project_version):
  total_stable,
  total_unstable,
  total_comparison_points,
+ total_unstable_baseline,
+ total_unstable_comparison,
+ total_latency_confirmed_regressions,
+ latency_confirmed_regression_details,
  ) = compute_regression_table(
  rts,
  tf_github_org,
@@ -306,6 +311,7 @@ def compare_command_logic(args, project_name, project_version):
  comparison_architecture,
  first_n_baseline,
  first_n_comparison,
+ grafana_link_base,
  )
  comment_body = ""
  if total_comparison_points > 0:
@@ -324,11 +330,63 @@ def compare_command_logic(args, project_name, project_version):
  )

  if total_unstable > 0:
+ unstable_details = []
+ if total_unstable_baseline > 0:
+ unstable_details.append(f"{total_unstable_baseline} baseline")
+ if total_unstable_comparison > 0:
+ unstable_details.append(f"{total_unstable_comparison} comparison")
+
+ unstable_breakdown = (
+ " (" + ", ".join(unstable_details) + ")" if unstable_details else ""
+ )
  comparison_summary += (
- "- Detected a total of {} highly unstable benchmarks.\n".format(
- total_unstable
+ "- Detected a total of {} highly unstable benchmarks{}.\n".format(
+ total_unstable, unstable_breakdown
  )
  )
+
+ # Add latency confirmation summary if applicable
+ if total_latency_confirmed_regressions > 0:
+ comparison_summary += "- Latency analysis confirmed regressions in {} of the unstable tests:\n".format(
+ total_latency_confirmed_regressions
+ )
+
+ # Add detailed breakdown as bullet points with test links
+ if latency_confirmed_regression_details:
+ for detail in latency_confirmed_regression_details:
+ test_name = detail["test_name"]
+ commands_info = []
+ for cmd_detail in detail["commands"]:
+ commands_info.append(
+ f"{cmd_detail['command']} +{cmd_detail['change_percent']:.1f}%"
+ )
+
+ if commands_info:
+ # Create test link if grafana_link_base is available
+ test_display_name = test_name
+ if grafana_link_base is not None:
+ grafana_test_link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
+ if baseline_branch is not None:
+ grafana_test_link += (
+ f"&var-branch={baseline_branch}"
+ )
+ if comparison_branch is not None:
+ grafana_test_link += (
+ f"&var-branch={comparison_branch}"
+ )
+ grafana_test_link += "&from=now-30d&to=now"
+ test_display_name = (
+ f"[{test_name}]({grafana_test_link})"
+ )
+
+ # Add confidence indicator if available
+ confidence_indicator = ""
+ if "high_confidence" in detail:
+ confidence_indicator = (
+ " 🔴" if detail["high_confidence"] else " ⚠️"
+ )
+
+ comparison_summary += f" - {test_display_name}: {', '.join(commands_info)}{confidence_indicator}\n"
  if total_improvements > 0:
  comparison_summary += "- Detected a total of {} improvements above the improvement water line.\n".format(
  total_improvements
@@ -487,6 +545,9 @@ def compare_command_logic(args, project_name, project_version):
  total_stable,
  total_unstable,
  total_comparison_points,
+ total_unstable_baseline,
+ total_unstable_comparison,
+ total_latency_confirmed_regressions,
  )


@@ -534,6 +595,7 @@ def compute_regression_table(
  comparison_architecture=ARCH_X86,
  first_n_baseline=-1,
  first_n_comparison=-1,
+ grafana_link_base=None,
  ):
  START_TIME_NOW_UTC, _, _ = get_start_time_vars()
  START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
@@ -596,6 +658,10 @@ def compute_regression_table(
  total_stable,
  total_unstable,
  total_comparison_points,
+ total_unstable_baseline,
+ total_unstable_comparison,
+ total_latency_confirmed_regressions,
+ latency_confirmed_regression_details,
  ) = from_rts_to_regression_table(
  baseline_deployment_name,
  comparison_deployment_name,
@@ -624,14 +690,97 @@ def compute_regression_table(
  comparison_architecture,
  first_n_baseline,
  first_n_comparison,
+ grafana_link_base,
+ baseline_branch,
+ baseline_tag,
+ comparison_branch,
+ comparison_tag,
+ from_date,
+ to_date,
  )
  logging.info(
  "Printing differential analysis between {} and {}".format(
  baseline_str, comparison_str
  )
  )
- writer = MarkdownTableWriter(
- table_name="Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
+
+ # Split table into improvements, regressions, and no-changes
+ improvements_table = []
+ regressions_table = []
+ no_changes_table = []
+
+ for row in table:
+ # Check if there's a meaningful change (not stable/unstable)
+ note = row[4].lower() if len(row) > 4 else ""
+ percentage_str = row[3] if len(row) > 3 else "0.0%"
+
+ # Extract percentage value
+ try:
+ percentage_val = float(percentage_str.replace("%", "").strip())
+ except:
+ percentage_val = 0.0
+
+ # Categorize based on change type
+ if "improvement" in note and "potential" not in note:
+ # Only actual improvements, not potential ones
+ improvements_table.append(row)
+ elif ("regression" in note and "potential" not in note) or "unstable" in note:
+ # Only actual regressions, not potential ones, plus unstable tests
+ regressions_table.append(row)
+ elif "no change" in note or "potential" in note:
+ # No changes and potential changes (below significance threshold)
+ no_changes_table.append(row)
+ elif abs(percentage_val) > 3.0:  # Significant changes based on percentage
+ if (percentage_val > 0 and metric_mode == "higher-better") or (
+ percentage_val < 0 and metric_mode == "lower-better"
+ ):
+ improvements_table.append(row)
+ else:
+ regressions_table.append(row)
+ else:
+ no_changes_table.append(row)
+
+ # Sort tables by percentage change
+ def get_percentage_value(row):
+ """Extract percentage value from row for sorting"""
+ try:
+ percentage_str = row[3] if len(row) > 3 else "0.0%"
+ return float(percentage_str.replace("%", "").strip())
+ except:
+ return 0.0
+
+ # Sort improvements by percentage change (highest first)
+ improvements_table.sort(key=get_percentage_value, reverse=True)
+
+ # Sort regressions by percentage change (most negative first for higher-better, most positive first for lower-better)
+ if metric_mode == "higher-better":
+ # For higher-better metrics, most negative changes are worst regressions
+ regressions_table.sort(key=get_percentage_value)
+ else:
+ # For lower-better metrics, most positive changes are worst regressions
+ regressions_table.sort(key=get_percentage_value, reverse=True)
+
+ # Create improvements table (visible)
+ improvements_writer = MarkdownTableWriter(
+ table_name="Performance Improvements - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
+ baseline_str,
+ comparison_str,
+ from_human_str,
+ baseline_deployment_name,
+ ),
+ headers=[
+ "Test Case",
+ "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+ "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+ "% change ({})".format(metric_mode),
+ "Note",
+ ],
+ value_matrix=improvements_table,
+ )
+
+ # Create regressions table (visible)
+ regressions_writer = MarkdownTableWriter(
+ table_name="Performance Regressions and Issues - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
  baseline_str,
  comparison_str,
  from_human_str,
@@ -644,8 +793,22 @@ def compute_regression_table(
  "% change ({})".format(metric_mode),
  "Note",
  ],
- value_matrix=table,
+ value_matrix=regressions_table,
  )
+
+ # Create no-changes table (hidden in markdown)
+ no_changes_writer = MarkdownTableWriter(
+ table_name="Tests with No Significant Changes",
+ headers=[
+ "Test Case",
+ "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+ "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+ "% change ({})".format(metric_mode),
+ "Note",
+ ],
+ value_matrix=no_changes_table,
+ )
+
  table_output = ""

  from io import StringIO
@@ -654,7 +817,25 @@ def compute_regression_table(
  old_stdout = sys.stdout
  sys.stdout = mystdout = StringIO()

- writer.dump(mystdout, False)
+ # Output improvements table first (if any)
+ if improvements_table:
+ improvements_writer.dump(mystdout, False)
+ mystdout.write("\n\n")
+
+ # Output regressions table (if any)
+ if regressions_table:
+ regressions_writer.dump(mystdout, False)
+ mystdout.write("\n\n")
+
+ # Add hidden no-changes table
+ if no_changes_table:
+ mystdout.write(
+ "<details>\n<summary>Tests with No Significant Changes ({} tests)</summary>\n\n".format(
+ len(no_changes_table)
+ )
+ )
+ no_changes_writer.dump(mystdout, False)
+ mystdout.write("\n</details>\n")

  sys.stdout = old_stdout

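The rewritten output path renders the three `MarkdownTableWriter` tables into one string by redirecting `sys.stdout` into a `StringIO`, hiding the no-changes table inside a collapsible `<details>` block. A self-contained sketch of the same capture pattern, with a hypothetical one-row table (here the stream is passed to `dump` directly rather than via redirected stdout):

```python
from io import StringIO
from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter(
    table_name="Tests with No Significant Changes",
    headers=["Test Case", "% change", "Note"],
    value_matrix=[["test-a", "0.4%", "No Change"]],  # hypothetical row
)

buf = StringIO()
buf.write("<details>\n<summary>Tests with No Significant Changes (1 tests)</summary>\n\n")
writer.dump(buf, False)  # same call the diff uses; False leaves the stream open
buf.write("\n</details>\n")
print(buf.getvalue())
```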
@@ -668,6 +849,10 @@ def compute_regression_table(
  total_stable,
  total_unstable,
  total_comparison_points,
+ total_unstable_baseline,
+ total_unstable_comparison,
+ total_latency_confirmed_regressions,
+ latency_confirmed_regression_details,
  )


@@ -755,6 +940,13 @@ def from_rts_to_regression_table(
  comparison_architecture=ARCH_X86,
  first_n_baseline=-1,
  first_n_comparison=-1,
+ grafana_link_base=None,
+ baseline_branch=None,
+ baseline_tag=None,
+ comparison_branch=None,
+ comparison_tag=None,
+ from_date=None,
+ to_date=None,
  ):
  print_all = print_regressions_only is False and print_improvements_only is False
  table = []
@@ -762,8 +954,12 @@ def from_rts_to_regression_table(
  total_improvements = 0
  total_stable = 0
  total_unstable = 0
+ total_unstable_baseline = 0
+ total_unstable_comparison = 0
  total_regressions = 0
  total_comparison_points = 0
+ total_latency_confirmed_regressions = 0
+ latency_confirmed_regression_details = []  # Track specific test details
  noise_waterline = 3
  progress = tqdm(unit="benchmark time-series", total=len(test_names))
  for test_name in test_names:
@@ -901,10 +1097,243 @@ def from_rts_to_regression_table(
  logging.error("Detected a ZeroDivisionError. {}".format(e.__str__()))
  pass
  unstable = False
+ unstable_baseline = False
+ unstable_comparison = False
+ latency_confirms_regression = False
+
  if baseline_v != "N/A" and comparison_v != "N/A":
  if comparison_pct_change > 10.0 or baseline_pct_change > 10.0:
- note = "UNSTABLE (very high variance)"
  unstable = True
+ unstable_baseline = baseline_pct_change > 10.0
+ unstable_comparison = comparison_pct_change > 10.0
+
+ # Build detailed unstable note
+ unstable_parts = []
+ if unstable_baseline and unstable_comparison:
+ unstable_parts.append(
+ "UNSTABLE (baseline & comparison high variance)"
+ )
+ elif unstable_baseline:
+ unstable_parts.append("UNSTABLE (baseline high variance)")
+ elif unstable_comparison:
+ unstable_parts.append("UNSTABLE (comparison high variance)")
+
+ note = unstable_parts[0]
+
+ # Log detailed warning about unstable data detection
+ logging.warning(
+ f"UNSTABLE DATA DETECTED for test '{test_name}': "
+ f"baseline variance={baseline_pct_change:.1f}%, "
+ f"comparison variance={comparison_pct_change:.1f}% "
+ f"(threshold=10.0%)"
+ )
+
+ # For throughput metrics (higher-better), check both server-side and client-side latency
+ if metric_mode == "higher-better":
+ logging.info(
+ f"Performing 2nd-level latency validation for unstable throughput metric '{test_name}' "
+ f"(metric_mode={metric_mode})"
+ )
+
+ # Check server-side p50 latency
+ (
+ server_latency_note,
+ server_confirms_regression,
+ server_regression_details,
+ ) = check_latency_for_unstable_throughput(
+ rts,
+ test_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ tf_triggering_env,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ first_n_baseline,
+ first_n_comparison,
+ running_platform,
+ baseline_architecture,
+ comparison_architecture,
+ verbose,
+ )
+
+ # Check client-side latency metrics
+ (
+ client_latency_note,
+ client_confirms_regression,
+ client_regression_details,
+ ) = check_client_side_latency(
+ rts,
+ test_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ tf_triggering_env,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ first_n_baseline,
+ first_n_comparison,
+ running_platform,
+ baseline_architecture,
+ comparison_architecture,
+ verbose,
+ )
+
+ # Combine results from both server and client side
+ combined_latency_notes = []
+ if server_latency_note:
+ combined_latency_notes.append(f"server: {server_latency_note}")
+ if client_latency_note:
+ combined_latency_notes.append(f"client: {client_latency_note}")
+
+ # Only confirm regression if BOTH server and client side show evidence AND data is stable enough
+ # Check if either server or client data contains unstable indicators
+ server_has_unstable = (
+ server_latency_note and "UNSTABLE" in server_latency_note
+ )
+ client_has_unstable = (
+ client_latency_note and "UNSTABLE" in client_latency_note
+ )
+
+ # Don't confirm regression if either side has unstable data
+ if server_has_unstable or client_has_unstable:
+ both_confirm_regression = False
+ unstable_sides = []
+ if server_has_unstable:
+ unstable_sides.append("server")
+ if client_has_unstable:
+ unstable_sides.append("client")
+ blocked_note = f"regression blocked due to unstable {' and '.join(unstable_sides)} latency data"
+ note += f"; {blocked_note}"
+ logging.info(
+ f"Blocking regression confirmation for '{test_name}' due to unstable latency data"
+ )
+ if server_has_unstable:
+ logging.info(" Server-side latency data is unstable")
+ if client_has_unstable:
+ logging.info(" Client-side latency data is unstable")
+ else:
+ both_confirm_regression = (
+ server_confirms_regression and client_confirms_regression
+ )
+
+ if combined_latency_notes:
+ combined_note = "; ".join(combined_latency_notes)
+ note += f"; {combined_note}"
+ logging.info(
+ f"Combined latency check result for '{test_name}': {combined_note}"
+ )
+
+ if both_confirm_regression:
+ logging.info(
+ f"BOTH server and client latency analysis CONFIRM regression for '{test_name}'"
+ )
+
+ # Set the flag for counting confirmed regressions
+ latency_confirms_regression = True
+
+ # Combine regression details from both server and client
+ combined_regression_details = (
+ server_regression_details or client_regression_details
+ )
+ if combined_regression_details:
+ combined_regression_details["server_side"] = (
+ server_confirms_regression
+ )
+ combined_regression_details["client_side"] = (
+ client_confirms_regression
+ )
+
+ # 2nd level confirmation is sufficient - always add to confirmed regressions
+ logging.info(
+ f"Adding '{test_name}' to confirmed regressions based on 2nd level validation"
+ )
+
+ # Perform 3rd-level analysis: variance + p99 check for additional confidence scoring
+ logging.info(
+ f"Performing 3rd-level analysis (variance + p99) for confidence scoring on '{test_name}'"
+ )
+ (
+ confidence_note,
+ high_confidence,
+ ) = perform_variance_and_p99_analysis(
+ rts,
+ test_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ tf_triggering_env,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ first_n_baseline,
+ first_n_comparison,
+ running_platform,
+ baseline_architecture,
+ comparison_architecture,
+ verbose,
+ )
+
+ if confidence_note:
+ note += f"; {confidence_note}"
+ logging.info(
+ f"Confidence analysis for '{test_name}': {confidence_note}"
+ )
+ # Use 3rd level confidence if available
+ combined_regression_details["high_confidence"] = (
+ high_confidence
+ )
+ else:
+ # No 3rd level data available - default to moderate confidence since 2nd level confirmed
+ logging.info(
+ f"No 3rd level data available for '{test_name}' - using 2nd level confirmation"
+ )
+ combined_regression_details["high_confidence"] = (
+ True  # 2nd level confirmation is reliable
+ )
+
+ # Always add to confirmed regressions when 2nd level confirms
+ latency_confirmed_regression_details.append(
+ combined_regression_details
+ )
+ elif server_confirms_regression or client_confirms_regression:
+ side_confirmed = (
+ "server" if server_confirms_regression else "client"
+ )
+ side_not_confirmed = (
+ "client" if server_confirms_regression else "server"
+ )
+ insufficient_evidence_note = f"only {side_confirmed} side confirms regression ({side_not_confirmed} side stable) - insufficient evidence"
+ note += f"; {insufficient_evidence_note}"
+ logging.info(
+ f"Only {side_confirmed} side confirms regression for '{test_name}' - insufficient evidence"
+ )
+ else:
+ no_regression_note = (
+ "neither server nor client side confirms regression"
+ )
+ note += f"; {no_regression_note}"
+ logging.info(
+ f"Neither server nor client side confirms regression for '{test_name}'"
+ )
+ else:
+ logging.info(
+ f"No latency data available for secondary check on '{test_name}'"
+ )

  baseline_v_str = prepare_value_str(
  baseline_pct_change, baseline_v, baseline_values, simplify_table
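The nesting above runs deep, so it helps to distill the rule it implements: an unstable throughput result is only counted as a confirmed regression when the server-side and client-side latency checks both agree, and neither side's latency data is itself flagged unstable. A simplified restatement of that gate (omitting the 3rd-level confidence scoring):

```python
def regression_confirmed(server_note, server_confirms, client_note, client_confirms):
    # Unstable latency data on either side blocks confirmation outright.
    if (server_note and "UNSTABLE" in server_note) or (
        client_note and "UNSTABLE" in client_note
    ):
        return False
    # Otherwise both the server-side and client-side checks must agree;
    # one side alone is treated as insufficient evidence.
    return server_confirms and client_confirms

print(regression_confirmed("p50 increased 20.0%", True, "latency stable", False))  # False
```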
@@ -959,6 +1388,12 @@ def from_rts_to_regression_table(

  if unstable:
  total_unstable += 1
+ if unstable_baseline:
+ total_unstable_baseline += 1
+ if unstable_comparison:
+ total_unstable_comparison += 1
+ if latency_confirms_regression:
+ total_latency_confirmed_regressions += 1

  should_add_line = False
  if print_regressions_only and detected_regression:
@@ -979,6 +1414,13 @@ def from_rts_to_regression_table(
  percentage_change,
  table,
  test_name,
+ grafana_link_base,
+ baseline_branch,
+ baseline_tag,
+ comparison_branch,
+ comparison_tag,
+ from_date,
+ to_date,
  )
  return (
  detected_regressions,
@@ -988,9 +1430,995 @@ def from_rts_to_regression_table(
  total_stable,
  total_unstable,
  total_comparison_points,
+ total_unstable_baseline,
+ total_unstable_comparison,
+ total_latency_confirmed_regressions,
+ latency_confirmed_regression_details,
  )


+ def check_client_side_latency(
+ rts,
+ test_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ tf_triggering_env,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ first_n_baseline,
+ first_n_comparison,
+ running_platform,
+ baseline_architecture,
+ comparison_architecture,
+ verbose=False,
+ ):
+ """
+ Check client-side latency metrics to provide additional validation for regression detection.
+
+ Returns:
+ tuple: (note_string, confirms_regression_bool, regression_details_dict)
+ """
+ logging.info(f"Starting client-side latency check for test: {test_name}")
+ try:
+ # Client-side latency metrics to check
+ client_metrics = [
+ "p50_latency_ms",
+ "Latency",
+ "OverallQuantiles.allCommands.q50",
+ "Tests.INSERT.AverageLatency_us_",
+ "Tests.READ.AverageLatency_us_",
+ "Tests.SEARCH.AverageLatency_us_",
+ "Tests.UPDATE.AverageLatency_us_",
+ ]
+
+ client_latency_notes = []
+ significant_client_latency_increases = 0
+ regression_details = {"test_name": test_name, "commands": []}
+
+ for metric in client_metrics:
+ # Build filters for client-side latency metric
+ filters_baseline = [
+ f"{by_str_baseline}={baseline_str}",
+ f"metric={metric}",
+ f"test_name={test_name}",
+ f"deployment_name={baseline_deployment_name}",
+ f"triggering_env={tf_triggering_env}",
+ ]
+ filters_comparison = [
+ f"{by_str_comparison}={comparison_str}",
+ f"metric={metric}",
+ f"test_name={test_name}",
+ f"deployment_name={comparison_deployment_name}",
+ f"triggering_env={tf_triggering_env}",
+ ]
+
+ # Add optional filters
+ if running_platform is not None:
+ filters_baseline.append(f"running_platform={running_platform}")
+ filters_comparison.append(f"running_platform={running_platform}")
+ if baseline_architecture != ARCH_X86:
+ filters_baseline.append(f"arch={baseline_architecture}")
+ if comparison_architecture != ARCH_X86:
+ filters_comparison.append(f"arch={comparison_architecture}")
+
+ # Query for client-side latency time-series
+ baseline_client_ts = rts.ts().queryindex(filters_baseline)
+ comparison_client_ts = rts.ts().queryindex(filters_comparison)
+
+ if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
+ if verbose:
+ logging.info(
+ f" No client-side data found for metric '{metric}' in {test_name}"
+ )
+ continue
+
+ logging.info(
+ f" Found client-side metric '{metric}': {len(baseline_client_ts)} baseline, {len(comparison_client_ts)} comparison time-series"
+ )
+
+ # Filter out target time-series
+ baseline_client_ts = [ts for ts in baseline_client_ts if "target" not in ts]
+ comparison_client_ts = [
+ ts for ts in comparison_client_ts if "target" not in ts
+ ]
+
+ if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
+ continue
+
+ # Use the first available time-series for each side
+ baseline_ts = baseline_client_ts[0]
+ comparison_ts = comparison_client_ts[0]
+
+ # Get client-side latency data
+ baseline_client_data = rts.ts().revrange(baseline_ts, from_ts_ms, to_ts_ms)
+ comparison_client_data = rts.ts().revrange(
+ comparison_ts, from_ts_ms, to_ts_ms
+ )
+
+ if len(baseline_client_data) == 0 or len(comparison_client_data) == 0:
+ if verbose:
+ logging.info(
+ f" No data points for metric '{metric}': baseline={len(baseline_client_data)}, comparison={len(comparison_client_data)}"
+ )
+ continue
+
+ # Calculate client-side latency statistics
+ baseline_client_values = []
+ comparison_client_values = []
+
+ (_, baseline_client_median, _) = get_v_pct_change_and_largest_var(
+ baseline_client_data,
+ 0,
+ 0,
+ baseline_client_values,
+ 0,
+ last_n_baseline,
+ verbose,
+ first_n_baseline,
+ )
+
+ (_, comparison_client_median, _) = get_v_pct_change_and_largest_var(
+ comparison_client_data,
+ 0,
+ 0,
+ comparison_client_values,
+ 0,
+ last_n_comparison,
+ verbose,
+ first_n_comparison,
+ )
+
+ if baseline_client_median == "N/A" or comparison_client_median == "N/A":
+ if verbose:
+ logging.info(
+ f" Could not calculate median for metric '{metric}': baseline={baseline_client_median}, comparison={comparison_client_median}"
+ )
+ continue
+
+ # Calculate variance (coefficient of variation) for both baseline and comparison
+ baseline_client_mean = (
+ statistics.mean(baseline_client_values) if baseline_client_values else 0
+ )
+ baseline_client_stdev = (
+ statistics.stdev(baseline_client_values)
+ if len(baseline_client_values) > 1
+ else 0
+ )
+ baseline_client_cv = (
+ (baseline_client_stdev / baseline_client_mean * 100)
+ if baseline_client_mean > 0
+ else float("inf")
+ )
+
+ comparison_client_mean = (
+ statistics.mean(comparison_client_values)
+ if comparison_client_values
+ else 0
+ )
+ comparison_client_stdev = (
+ statistics.stdev(comparison_client_values)
+ if len(comparison_client_values) > 1
+ else 0
+ )
+ comparison_client_cv = (
+ (comparison_client_stdev / comparison_client_mean * 100)
+ if comparison_client_mean > 0
+ else float("inf")
+ )
+
+ # Calculate client-side latency change (for latency, higher is worse)
+ client_latency_change = (
+ float(comparison_client_median) / float(baseline_client_median) - 1
+ ) * 100.0
+
+ logging.info(
+ f" Client metric '{metric}': baseline={baseline_client_median:.2f} (CV={baseline_client_cv:.1f}%), comparison={comparison_client_median:.2f} (CV={comparison_client_cv:.1f}%), change={client_latency_change:.1f}%"
+ )
+
+ # Check if client latency data is too unstable to be reliable
+ client_data_unstable = (
+ baseline_client_cv > 50.0 or comparison_client_cv > 50.0
+ )
+
+ if client_data_unstable:
+ # Mark as unstable client latency data
+ unstable_reason = []
+ if baseline_client_cv > 50.0:
+ unstable_reason.append(f"baseline CV={baseline_client_cv:.1f}%")
+ if comparison_client_cv > 50.0:
+ unstable_reason.append(f"comparison CV={comparison_client_cv:.1f}%")
+
+ client_latency_notes.append(
+ f"{metric} UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
+ )
+ logging.warning(
+ f" Client metric '{metric}': UNSTABLE latency data detected - {', '.join(unstable_reason)}"
+ )
+ elif (
+ abs(client_latency_change) > 5.0
+ ): # Only report significant client latency changes for stable data
+ direction = "increased" if client_latency_change > 0 else "decreased"
+
+ # Adjust significance threshold based on baseline variance
+ if baseline_client_cv < 30.0:
+ # Low variance - use standard threshold
+ significance_threshold = 10.0
+ elif baseline_client_cv < 50.0:
+ # Moderate variance - require larger change
+ significance_threshold = 15.0
+ else:
+ # High variance - require much larger change
+ significance_threshold = 25.0
+
+ client_latency_notes.append(
+ f"{metric} {direction} {abs(client_latency_change):.1f}% (baseline CV={baseline_client_cv:.1f}%)"
+ )
+ logging.info(
+ f" Client metric '{metric}': SIGNIFICANT latency change detected ({direction} {abs(client_latency_change):.1f}%, baseline CV={baseline_client_cv:.1f}%)"
+ )
+
+ # Track significant client latency increases (potential regression confirmation)
+ if client_latency_change > significance_threshold:
+ significant_client_latency_increases += 1
+ regression_details["commands"].append(
+ {
+ "command": metric,
+ "change_percent": client_latency_change,
+ "direction": direction,
+ "baseline_cv": baseline_client_cv,
+ "comparison_cv": comparison_client_cv,
+ }
+ )
+ logging.info(
+ f" Client metric '{metric}': CONFIRMS regression (change={client_latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
+ )
+ else:
+ logging.info(
+ f" Client metric '{metric}': Change below significance threshold (change={client_latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
+ )
+ elif verbose:
+ client_latency_notes.append(
+ f"{metric} stable (CV={baseline_client_cv:.1f}%)"
+ )
+ logging.info(
+ f" Client metric '{metric}': latency stable (change={client_latency_change:.1f}%, baseline CV={baseline_client_cv:.1f}%)"
+ )
+
+ # Determine if client-side latency confirms regression
+ confirms_regression = significant_client_latency_increases > 0
+
+ # Return combined client latency notes
+ if client_latency_notes:
+ result = "; ".join(client_latency_notes)
+ logging.info(
+ f"Client-side latency check completed for {test_name}: {result}"
+ )
+ return (
+ result,
+ confirms_regression,
+ regression_details if confirms_regression else None,
+ )
+ else:
+ result = "client latency stable" if len(client_metrics) > 0 else None
+ logging.info(
+ f"Client-side latency check completed for {test_name}: {result or 'no data'}"
+ )
+ return result, False, None
+
+ except Exception as e:
+ logging.error(f"Error checking client-side latency for {test_name}: {e}")
+ return None, False, None
+
+
+ def perform_variance_and_p99_analysis(
+ rts,
+ test_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ tf_triggering_env,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ first_n_baseline,
+ first_n_comparison,
+ running_platform,
+ baseline_architecture,
+ comparison_architecture,
+ verbose=False,
+ ):
+ """
+ Perform 3rd-level analysis using variance and p99 metrics to assess confidence in regression detection.
+
+ Returns:
+ tuple: (confidence_note, high_confidence_bool)
+ """
+ try:
+ logging.info(f"Starting variance and p99 analysis for {test_name}")
+
+ # Build filters for p99 latency metric using both metric=p99 and metric-type=(latencystats)
+ filters_baseline = [
+ f"{by_str_baseline}={baseline_str}",
+ "metric=p99",
+ "metric-type=(latencystats)",
+ f"test_name={test_name}",
+ f"deployment_name={baseline_deployment_name}",
+ f"triggering_env={tf_triggering_env}",
+ ]
+ filters_comparison = [
+ f"{by_str_comparison}={comparison_str}",
+ "metric=p99",
+ "metric-type=(latencystats)",
+ f"test_name={test_name}",
+ f"deployment_name={comparison_deployment_name}",
+ f"triggering_env={tf_triggering_env}",
+ ]
+
+ # Add optional filters
+ if running_platform is not None:
+ filters_baseline.append(f"running_platform={running_platform}")
+ filters_comparison.append(f"running_platform={running_platform}")
+ if baseline_architecture != ARCH_X86:
+ filters_baseline.append(f"arch={baseline_architecture}")
+ if comparison_architecture != ARCH_X86:
+ filters_comparison.append(f"arch={comparison_architecture}")
+
+ # Query for p99 latency time-series
+ logging.info(f"Querying p99 latencystats time-series for {test_name}")
+ baseline_p99_ts = rts.ts().queryindex(filters_baseline)
+ comparison_p99_ts = rts.ts().queryindex(filters_comparison)
+
+ logging.info(f"Found {len(baseline_p99_ts)} baseline p99 latency time-series")
+ logging.info(
+ f"Found {len(comparison_p99_ts)} comparison p99 latency time-series"
+ )
+
+ # Filter out target time-series and unwanted commands (reuse existing function)
+ def should_exclude_timeseries(ts_name):
+ """Check if time-series should be excluded based on command"""
+ if "target" in ts_name:
+ return True
+ ts_name_lower = ts_name.lower()
+ excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
+ return any(cmd in ts_name_lower for cmd in excluded_commands)
+
+ baseline_p99_ts = [
+ ts for ts in baseline_p99_ts if not should_exclude_timeseries(ts)
+ ]
+ comparison_p99_ts = [
+ ts for ts in comparison_p99_ts if not should_exclude_timeseries(ts)
+ ]
+
+ if len(baseline_p99_ts) == 0 or len(comparison_p99_ts) == 0:
+ logging.warning(
+ f"No p99 latency data found for {test_name} after filtering"
+ )
+ return None, False
+
+ # Extract command names from time-series (reuse existing function)
+ def extract_command_from_ts(ts_name):
+ """Extract meaningful command name from time-series name"""
+ # Look for latencystats_latency_percentiles_usec_<COMMAND>_p99 pattern
+ match = re.search(
+ r"latencystats_latency_percentiles_usec_([^_/]+)_p99", ts_name
+ )
+ if match:
+ return match.group(1)
+ # Look for command= pattern in the time-series name
+ match = re.search(r"command=([^/]+)", ts_name)
+ if match:
+ return match.group(1)
+ # If no specific pattern found, try to extract from the end of the path
+ parts = ts_name.split("/")
+ if len(parts) > 0:
+ return parts[-1]
+ return "unknown"
+
+ # Group time-series by command
+ baseline_by_command = {}
+ comparison_by_command = {}
+
+ for ts in baseline_p99_ts:
+ cmd = extract_command_from_ts(ts)
+ if cmd not in baseline_by_command:
+ baseline_by_command[cmd] = []
+ baseline_by_command[cmd].append(ts)
+
+ for ts in comparison_p99_ts:
+ cmd = extract_command_from_ts(ts)
+ if cmd not in comparison_by_command:
+ comparison_by_command[cmd] = []
+ comparison_by_command[cmd].append(ts)
+
+ # Find common commands between baseline and comparison
+ common_commands = set(baseline_by_command.keys()) & set(
+ comparison_by_command.keys()
+ )
+
+ if not common_commands:
+ logging.warning(
+ f"No common commands found for p99 variance analysis in {test_name}"
+ )
+ return None, False
+
+ variance_notes = []
+ p99_notes = []
+ high_confidence_indicators = 0
+ total_indicators = 0
+
+ # Analyze variance and p99 for each command
+ for command in sorted(common_commands):
+ total_indicators += 1
+ logging.info(f"Analyzing p99 variance for command: {command}")
+
+ baseline_ts_list = baseline_by_command[command]
+ comparison_ts_list = comparison_by_command[command]
+
+ # If multiple time-series for the same command, try to get the best one
+ if len(baseline_ts_list) > 1:
+ baseline_ts_list = get_only_Totals(baseline_ts_list)
+ if len(comparison_ts_list) > 1:
+ comparison_ts_list = get_only_Totals(comparison_ts_list)
+
+ if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
+ logging.warning(
+ f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
+ )
+ continue
+
+ # Get p99 latency data for this command
+ baseline_p99_data = []
+ comparison_p99_data = []
+
+ for ts_name in baseline_ts_list:
+ datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+ baseline_p99_data.extend(datapoints)
+
+ for ts_name in comparison_ts_list:
+ datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+ comparison_p99_data.extend(datapoints)
+
+ if len(baseline_p99_data) < 3 or len(comparison_p99_data) < 3:
+ logging.warning(
+ f" Insufficient p99 data for {command}: baseline={len(baseline_p99_data)}, comparison={len(comparison_p99_data)} datapoints"
+ )
+ continue
+
+ # Extract values for variance calculation
+ baseline_values = [dp[1] for dp in baseline_p99_data]
+ comparison_values = [dp[1] for dp in comparison_p99_data]
+
+ # Calculate variance (coefficient of variation)
+ baseline_mean = statistics.mean(baseline_values)
+ baseline_stdev = (
+ statistics.stdev(baseline_values) if len(baseline_values) > 1 else 0
+ )
+ baseline_cv = (
+ (baseline_stdev / baseline_mean * 100)
+ if baseline_mean > 0
+ else float("inf")
+ )
+
+ comparison_mean = statistics.mean(comparison_values)
+ comparison_stdev = (
+ statistics.stdev(comparison_values) if len(comparison_values) > 1 else 0
+ )
+ comparison_cv = (
+ (comparison_stdev / comparison_mean * 100)
+ if comparison_mean > 0
+ else float("inf")
+ )
+
+ # Calculate p99 change
+ p99_change = (
+ ((comparison_mean - baseline_mean) / baseline_mean * 100)
+ if baseline_mean > 0
+ else 0
+ )
+
+ # Assess confidence based on variance and p99 change
+ if baseline_cv < 30: # Low variance in baseline (< 30% CV)
+ if abs(p99_change) > 15: # Significant p99 change
+ high_confidence_indicators += 1
+ p99_notes.append(
+ f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline)"
+ )
+ else:
+ p99_notes.append(
+ f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline, minor change)"
+ )
+ elif baseline_cv < 50: # Moderate variance
+ if abs(p99_change) > 25: # Need larger change for confidence
+ high_confidence_indicators += 1
+ p99_notes.append(
+ f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance)"
+ )
+ else:
+ p99_notes.append(
+ f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance, uncertain)"
+ )
+ else: # High variance
+ if abs(p99_change) > 40: # Need very large change for confidence
+ high_confidence_indicators += 1
+ p99_notes.append(
+ f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, large change)"
+ )
+ else:
+ p99_notes.append(
+ f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, low confidence)"
+ )
+
+ variance_notes.append(f"{command} baseline CV={baseline_cv:.1f}%")
+
+ if verbose:
+ logging.info(
+ f" Command {command}: baseline CV={baseline_cv:.1f}%, comparison CV={comparison_cv:.1f}%, p99 change={p99_change:.1f}%"
+ )
+
+ # Determine overall confidence
+ confidence_ratio = (
+ high_confidence_indicators / total_indicators if total_indicators > 0 else 0
+ )
+ high_confidence = (
+ confidence_ratio >= 0.5
+ ) # At least 50% of indicators show high confidence
+
+ # Create confidence note
+ confidence_parts = []
+ if variance_notes:
+ confidence_parts.extend(variance_notes)
+ if p99_notes:
+ confidence_parts.extend(p99_notes)
+
+ confidence_note = "; ".join(confidence_parts) if confidence_parts else None
+
+ if confidence_note:
+ confidence_level = "HIGH" if high_confidence else "LOW"
+ cv_explanation = "CV=coefficient of variation (data stability: <30% stable, 30-50% moderate, >50% unstable)"
+ confidence_note = (
+ f"confidence={confidence_level} ({confidence_note}; {cv_explanation})"
+ )
+
+ logging.info(
+ f"Variance and p99 analysis completed for {test_name}: confidence={confidence_ratio:.2f}, high_confidence={high_confidence}"
+ )
+ return confidence_note, high_confidence
+
+ except Exception as e:
+ logging.error(f"Error in variance and p99 analysis for {test_name}: {e}")
+ return None, False
+
+
+ def check_latency_for_unstable_throughput(
+ rts,
+ test_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ tf_triggering_env,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ first_n_baseline,
+ first_n_comparison,
+ running_platform,
+ baseline_architecture,
+ comparison_architecture,
+ verbose,
+ ):
+ """
+ Check latency (p50) for unstable throughput metrics to provide additional context.
+ Returns a tuple: (note_string, confirms_regression_bool, regression_details_dict)
+ """
+ logging.info(f"Starting latency check for unstable throughput test: {test_name}")
+ try:
+ # Build filters for p50 latency metric using both metric=p50 and metric-type=(latencystats)
+ filters_baseline = [
+ f"{by_str_baseline}={baseline_str}",
+ "metric=p50",
+ "metric-type=(latencystats)",
+ f"test_name={test_name}",
+ f"deployment_name={baseline_deployment_name}",
+ f"triggering_env={tf_triggering_env}",
+ ]
+ filters_comparison = [
+ f"{by_str_comparison}={comparison_str}",
+ "metric=p50",
+ "metric-type=(latencystats)",
+ f"test_name={test_name}",
+ f"deployment_name={comparison_deployment_name}",
+ f"triggering_env={tf_triggering_env}",
+ ]
+
+ # Add optional filters
+ if running_platform is not None:
+ filters_baseline.append(f"running_platform={running_platform}")
+ filters_comparison.append(f"running_platform={running_platform}")
+ if baseline_architecture != ARCH_X86:
+ filters_baseline.append(f"arch={baseline_architecture}")
+ if comparison_architecture != ARCH_X86:
+ filters_comparison.append(f"arch={comparison_architecture}")
+
+ # Query for p50 latency time-series
+ logging.info(f"Querying p50 latencystats time-series for {test_name}")
+ logging.info(f"Baseline filters: {filters_baseline}")
+ logging.info(f"Comparison filters: {filters_comparison}")
+
+ baseline_latency_ts = rts.ts().queryindex(filters_baseline)
+ comparison_latency_ts = rts.ts().queryindex(filters_comparison)
+
+ logging.info(
+ f"Found {len(baseline_latency_ts)} baseline p50 latency time-series"
+ )
+ logging.info(
+ f"Found {len(comparison_latency_ts)} comparison p50 latency time-series"
+ )
+
+ if verbose and baseline_latency_ts:
+ logging.info(f"Baseline latency time-series: {baseline_latency_ts}")
+ if verbose and comparison_latency_ts:
+ logging.info(f"Comparison latency time-series: {comparison_latency_ts}")
+
+ # Filter out target time-series and unwanted commands
+ def should_exclude_timeseries(ts_name):
+ """Check if time-series should be excluded based on command"""
+ # Exclude target time-series
+ if "target" in ts_name:
+ return True
+
+ # Convert to lowercase for case-insensitive matching
+ ts_name_lower = ts_name.lower()
+
+ # Exclude administrative commands (case-insensitive)
+ excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
+ return any(cmd in ts_name_lower for cmd in excluded_commands)
+
+ baseline_latency_ts_before = len(baseline_latency_ts)
+ comparison_latency_ts_before = len(comparison_latency_ts)
+
+ # Apply filtering and log what gets excluded
+ baseline_excluded = [
+ ts for ts in baseline_latency_ts if should_exclude_timeseries(ts)
+ ]
+ comparison_excluded = [
+ ts for ts in comparison_latency_ts if should_exclude_timeseries(ts)
+ ]
+
+ baseline_latency_ts = [
+ ts for ts in baseline_latency_ts if not should_exclude_timeseries(ts)
+ ]
+ comparison_latency_ts = [
+ ts for ts in comparison_latency_ts if not should_exclude_timeseries(ts)
+ ]
+
+ logging.info(
+ f"After filtering: baseline {baseline_latency_ts_before} -> {len(baseline_latency_ts)}, "
+ f"comparison {comparison_latency_ts_before} -> {len(comparison_latency_ts)}"
+ )
+
+ if baseline_excluded:
+ logging.info(
+ f"Excluded {len(baseline_excluded)} baseline administrative command time-series"
+ )
+ if verbose:
+ for ts in baseline_excluded:
+ logging.info(f" Excluded baseline: {ts}")
+ if comparison_excluded:
+ logging.info(
+ f"Excluded {len(comparison_excluded)} comparison administrative command time-series"
+ )
+ if verbose:
+ for ts in comparison_excluded:
+ logging.info(f" Excluded comparison: {ts}")
+
+ if len(baseline_latency_ts) == 0 or len(comparison_latency_ts) == 0:
+ logging.warning(
+ f"No p50 latency data found for {test_name} after filtering"
+ )
+ return None, False, None
+
+ # Extract command names from time-series to match baseline and comparison
+ def extract_command_from_ts(ts_name):
+ """Extract meaningful command name from time-series name"""
+ import re
+
+ # Look for latencystats_latency_percentiles_usec_<COMMAND>_p50 pattern
+ match = re.search(
+ r"latencystats_latency_percentiles_usec_([^_/]+)_p50", ts_name
+ )
+ if match:
+ return match.group(1)
+
+ # Look for command= pattern in the time-series name
+ match = re.search(r"command=([^/]+)", ts_name)
+ if match:
+ return match.group(1)
+
+ # If no specific pattern found, try to extract from the end of the path
+ # e.g., .../Ops/sec/GET -> GET
+ parts = ts_name.split("/")
+ if len(parts) > 0:
+ return parts[-1]
+ return "unknown"
+
+ # Group time-series by command
+ baseline_by_command = {}
+ comparison_by_command = {}
+
+ for ts in baseline_latency_ts:
+ cmd = extract_command_from_ts(ts)
+ if verbose:
+ logging.info(f"Baseline time-series '{ts}' -> command '{cmd}'")
+ if cmd not in baseline_by_command:
+ baseline_by_command[cmd] = []
+ baseline_by_command[cmd].append(ts)
+
+ for ts in comparison_latency_ts:
+ cmd = extract_command_from_ts(ts)
+ if verbose:
+ logging.info(f"Comparison time-series '{ts}' -> command '{cmd}'")
+ if cmd not in comparison_by_command:
+ comparison_by_command[cmd] = []
+ comparison_by_command[cmd].append(ts)
+
+ # Find common commands between baseline and comparison
+ common_commands = set(baseline_by_command.keys()) & set(
+ comparison_by_command.keys()
+ )
+
+ logging.info(f"Baseline commands found: {sorted(baseline_by_command.keys())}")
+ logging.info(
+ f"Comparison commands found: {sorted(comparison_by_command.keys())}"
+ )
+ logging.info(
+ f"Common commands for latency comparison: {sorted(common_commands)}"
+ )
+
+ if not common_commands:
+ logging.warning(
+ f"No common commands found for latency comparison in {test_name}"
+ )
+ return None, False, None
+
+ latency_notes = []
+ significant_latency_increases = (
+ 0 # Track commands with significant latency increases
+ )
+ regression_details = {"test_name": test_name, "commands": []}
+
+ # Compare latency for each command individually
+ for command in sorted(common_commands):
+ logging.info(f"Analyzing latency for command: {command}")
+ baseline_ts_list = baseline_by_command[command]
+ comparison_ts_list = comparison_by_command[command]
+
+ logging.info(
+ f" Command {command}: {len(baseline_ts_list)} baseline, {len(comparison_ts_list)} comparison time-series"
+ )
+
+ # If multiple time-series for the same command, try to get the best one
+ if len(baseline_ts_list) > 1:
+ logging.info(
+ f" Multiple baseline time-series for {command}, filtering..."
+ )
+ baseline_ts_list = get_only_Totals(baseline_ts_list)
+ if len(comparison_ts_list) > 1:
+ logging.info(
+ f" Multiple comparison time-series for {command}, filtering..."
+ )
+ comparison_ts_list = get_only_Totals(comparison_ts_list)
+
+ if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
+ logging.warning(
+ f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
+ )
+ continue
+
+ # Get latency data for this command
+ baseline_latency_data = []
+ comparison_latency_data = []
+
+ for ts_name in baseline_ts_list:
+ datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+ baseline_latency_data.extend(datapoints)
+
+ for ts_name in comparison_ts_list:
+ datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+ comparison_latency_data.extend(datapoints)
+
+ if len(baseline_latency_data) == 0 or len(comparison_latency_data) == 0:
+ logging.warning(
+ f" No latency data for {command}: baseline={len(baseline_latency_data)}, comparison={len(comparison_latency_data)} datapoints"
+ )
+ continue
+
+ logging.info(
+ f" Command {command}: {len(baseline_latency_data)} baseline, {len(comparison_latency_data)} comparison datapoints"
+ )
+
+ # Calculate latency statistics for this command
+ baseline_latency_values = []
+ comparison_latency_values = []
+
+ (_, baseline_latency_median, _) = get_v_pct_change_and_largest_var(
+ baseline_latency_data,
+ 0,
+ 0,
+ baseline_latency_values,
+ 0,
+ last_n_baseline,
+ verbose,
+ first_n_baseline,
+ )
+
+ (_, comparison_latency_median, _) = get_v_pct_change_and_largest_var(
+ comparison_latency_data,
+ 0,
+ 0,
+ comparison_latency_values,
+ 0,
+ last_n_comparison,
+ verbose,
+ first_n_comparison,
+ )
+
+ if baseline_latency_median == "N/A" or comparison_latency_median == "N/A":
+ logging.warning(
+ f" Could not calculate median for {command}: baseline={baseline_latency_median}, comparison={comparison_latency_median}"
+ )
+ continue
+
+ # Calculate variance (coefficient of variation) for both baseline and comparison
+ baseline_latency_mean = (
+ statistics.mean(baseline_latency_values)
+ if baseline_latency_values
+ else 0
+ )
+ baseline_latency_stdev = (
+ statistics.stdev(baseline_latency_values)
+ if len(baseline_latency_values) > 1
+ else 0
+ )
+ baseline_latency_cv = (
+ (baseline_latency_stdev / baseline_latency_mean * 100)
+ if baseline_latency_mean > 0
+ else float("inf")
+ )
+
+ comparison_latency_mean = (
+ statistics.mean(comparison_latency_values)
+ if comparison_latency_values
+ else 0
+ )
+ comparison_latency_stdev = (
+ statistics.stdev(comparison_latency_values)
+ if len(comparison_latency_values) > 1
+ else 0
+ )
+ comparison_latency_cv = (
+ (comparison_latency_stdev / comparison_latency_mean * 100)
+ if comparison_latency_mean > 0
+ else float("inf")
+ )
+
+ # Calculate latency change (for latency, lower is better)
+ latency_change = (
+ float(comparison_latency_median) / float(baseline_latency_median) - 1
+ ) * 100.0
+
+ logging.info(
+ f" Command {command}: baseline p50={baseline_latency_median:.2f} (CV={baseline_latency_cv:.1f}%), comparison p50={comparison_latency_median:.2f} (CV={comparison_latency_cv:.1f}%), change={latency_change:.1f}%"
+ )
+
+ # Check if latency data is too unstable to be reliable
+ latency_data_unstable = (
+ baseline_latency_cv > 50.0 or comparison_latency_cv > 50.0
+ )
+
+ if latency_data_unstable:
+ # Mark as unstable latency data
+ unstable_reason = []
+ if baseline_latency_cv > 50.0:
+ unstable_reason.append(f"baseline CV={baseline_latency_cv:.1f}%")
+ if comparison_latency_cv > 50.0:
+ unstable_reason.append(
+ f"comparison CV={comparison_latency_cv:.1f}%"
+ )
+
+ latency_notes.append(
+ f"{command} p50 UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
+ )
+ logging.warning(
+ f" Command {command}: UNSTABLE latency data detected - {', '.join(unstable_reason)}"
+ )
+ elif (
+ abs(latency_change) > 5.0
+ ): # Only report significant latency changes for stable data
+ direction = "increased" if latency_change > 0 else "decreased"
+
+ # Adjust significance threshold based on baseline variance
+ if baseline_latency_cv < 30.0:
+ # Low variance - use standard threshold
+ significance_threshold = 10.0
+ elif baseline_latency_cv < 50.0:
+ # Moderate variance - require larger change
+ significance_threshold = 15.0
+ else:
+ # High variance - require much larger change
+ significance_threshold = 25.0
+
+ latency_notes.append(
+ f"{command} p50 {direction} {abs(latency_change):.1f}% (baseline CV={baseline_latency_cv:.1f}%)"
+ )
+ logging.info(
+ f" Command {command}: SIGNIFICANT latency change detected ({direction} {abs(latency_change):.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+ )
+
+ # Track significant latency increases (potential regression confirmation)
+ if latency_change > significance_threshold:
+ significant_latency_increases += 1
+ regression_details["commands"].append(
+ {
+ "command": command,
+ "change_percent": latency_change,
+ "direction": direction,
+ "baseline_cv": baseline_latency_cv,
+ "comparison_cv": comparison_latency_cv,
+ }
+ )
+ logging.info(
+ f" Command {command}: CONFIRMS regression (change={latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
+ )
+ else:
+ logging.info(
+ f" Command {command}: Change below significance threshold (change={latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
+ )
+ elif verbose:
+ latency_notes.append(
+ f"{command} p50 stable (CV={baseline_latency_cv:.1f}%)"
+ )
+ logging.info(
+ f" Command {command}: latency stable (change={latency_change:.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+ )
+
+ # Determine if latency confirms regression
+ confirms_regression = significant_latency_increases > 0
+
+ # Return combined latency notes
+ if latency_notes:
+ result = "; ".join(latency_notes)
+ logging.info(f"Latency check completed for {test_name}: {result}")
+ return (
+ result,
+ confirms_regression,
+ regression_details if confirms_regression else None,
+ )
+ else:
+ result = "p50 latency stable" if common_commands else None
+ logging.info(
+ f"Latency check completed for {test_name}: {result or 'no data'}"
+ )
+ return result, False, None
+
+ except Exception as e:
+ logging.error(f"Error checking latency for {test_name}: {e}")
+ return None, False, None
+
+
  def get_only_Totals(baseline_timeseries):
  logging.warning("\t\tTime-series: {}".format(", ".join(baseline_timeseries)))
  logging.info("Checking if Totals will reduce timeseries.")
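All three helpers above share the same variance-adaptive significance rule: changes within 5% are ignored, and the threshold for counting a latency increase as regression evidence rises with baseline noise (10% when baseline CV < 30%, 15% when CV < 50%, 25% beyond that — though the 25% tier is defensive, since CV > 50% on either side already marks the data unstable before the comparison is made). A compact sketch of that rule as the functions apply it:

```python
def significance_threshold(baseline_cv):
    # Noisier baselines must move further before a change counts.
    if baseline_cv < 30.0:
        return 10.0
    if baseline_cv < 50.0:
        return 15.0
    return 25.0

def latency_confirms_regression(change_pct, baseline_cv, comparison_cv):
    if baseline_cv > 50.0 or comparison_cv > 50.0:
        return False  # data too noisy for reliable analysis
    if abs(change_pct) <= 5.0:
        return False  # below the reporting floor
    return change_pct > significance_threshold(baseline_cv)

print(latency_confirms_regression(18.0, 35.0, 20.0))  # True: 18% > 15% threshold
```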
@@ -998,6 +2426,37 @@ def get_only_Totals(baseline_timeseries):
  for ts_name in baseline_timeseries:
  if "Totals" in ts_name:
  new_base.append(ts_name)
+
+ # If no "Totals" time-series found, try to pick the best alternative
+ if len(new_base) == 0:
+ logging.warning(
+ "No 'Totals' time-series found, trying to pick best alternative."
+ )
+ # Prefer time-series without quotes in metric names
+ unquoted_series = [ts for ts in baseline_timeseries if "'" not in ts]
+ if unquoted_series:
+ new_base = unquoted_series
+ else:
+ # Fall back to original list
+ new_base = baseline_timeseries
+
+ # If we still have multiple time-series after filtering for "Totals",
+ # prefer the one without quotes in the metric name
+ if len(new_base) > 1:
+ logging.info("Multiple time-series found, preferring unquoted metric names.")
+ unquoted_series = [ts for ts in new_base if "'" not in ts]
+ if unquoted_series:
+ new_base = unquoted_series
+
+ # If we still have multiple, take the first one
+ if len(new_base) > 1:
+ logging.warning(
+ "Still multiple time-series after filtering, taking the first one: {}".format(
+ new_base[0]
+ )
+ )
+ new_base = [new_base[0]]
+
  baseline_timeseries = new_base
  return baseline_timeseries

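The next hunk threads the Grafana parameters through to `add_line`, which turns each test name into a markdown link. With hypothetical inputs, the constructed link would look like the sketch below (note that when both a branch and a version, or both baseline and comparison branches, are set, the same `var-` parameter is appended more than once; behavior then depends on how the dashboard handles repeated query parameters):

```python
# Hypothetical inputs; the real values come from the compare-command arguments.
grafana_link = "https://grafana.example/d/perf" + "?orgId=1"
grafana_link += "&var-test_case=memtier-1Mkeys-string-get"  # test_name
grafana_link += "&var-branch=master"                        # baseline_branch
grafana_link += "&var-version=7.4.0"                        # comparison_version
grafana_link += "&from=now-30d&to=now"
test_name_display = f"[memtier-1Mkeys-string-get]({grafana_link})"
```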
@@ -1064,11 +2523,38 @@ def add_line(
  percentage_change,
  table,
  test_name,
+ grafana_link_base=None,
+ baseline_branch=None,
+ baseline_version=None,
+ comparison_branch=None,
+ comparison_version=None,
+ from_date=None,
+ to_date=None,
  ):
+ grafana_link = None
+ if grafana_link_base is not None:
+ grafana_link = "{}?orgId=1".format(grafana_link_base)
+ grafana_link += f"&var-test_case={test_name}"
+
+ if baseline_branch is not None:
+ grafana_link += f"&var-branch={baseline_branch}"
+ if baseline_version is not None:
+ grafana_link += f"&var-version={baseline_version}"
+ if comparison_branch is not None:
+ grafana_link += f"&var-branch={comparison_branch}"
+ if comparison_version is not None:
+ grafana_link += f"&var-version={comparison_version}"
+ grafana_link += "&from=now-30d&to=now"
+
+ # Create test name with optional Grafana link
+ test_name_display = test_name
+ if grafana_link is not None:
+ test_name_display = f"[{test_name}]({grafana_link})"
+
  percentage_change_str = "{:.1f}% ".format(percentage_change)
  table.append(
  [
- test_name,
+ test_name_display,
  baseline_v_str,
  comparison_v_str,
  percentage_change_str,
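The final hunk below is a small pandas compatibility fix: `median()` and `std()` on a single-column `DataFrame` return a one-element `Series`, and calling `float()` directly on a `Series` is deprecated in recent pandas releases, so the scalar is now extracted explicitly with `.iloc[0]`. A minimal illustration:

```python
import pandas as pd

df = pd.DataFrame([10.0, 12.0, 11.0])
median = float(df.median().iloc[0])  # df.median() is a Series; .iloc[0] yields the scalar
std = float(df.std().iloc[0])
print(median, round(std, 2))  # 11.0 1.0
```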
@@ -1105,9 +2591,9 @@ def get_v_pct_change_and_largest_var(
  comparison_values.append(tuple[1])

  comparison_df = pd.DataFrame(comparison_values)
- comparison_median = float(comparison_df.median())
+ comparison_median = float(comparison_df.median().iloc[0])
  comparison_v = comparison_median
- comparison_std = float(comparison_df.std())
+ comparison_std = float(comparison_df.std().iloc[0])
  if verbose:
  logging.info(
  "comparison_datapoints: {} value: {}; std-dev: {}; median: {}".format(