redisbench-admin 0.11.38__py3-none-any.whl → 0.11.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- redisbench_admin/compare/args.py +1 -1
- redisbench_admin/compare/compare.py +1496 -10
- redisbench_admin/environments/oss_cluster.py +37 -0
- redisbench_admin/run/cluster.py +6 -0
- redisbench_admin/run/metrics.py +0 -2
- redisbench_admin/run_local/args.py +12 -0
- redisbench_admin/run_local/run_local.py +108 -51
- redisbench_admin/run_remote/args.py +12 -0
- redisbench_admin/run_remote/remote_db.py +62 -23
- redisbench_admin/run_remote/remote_helpers.py +17 -0
- redisbench_admin/run_remote/run_remote.py +79 -1
- redisbench_admin/run_remote/standalone.py +136 -0
- redisbench_admin/utils/remote.py +28 -0
- redisbench_admin/utils/utils.py +42 -24
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.40.dist-info}/METADATA +8 -2
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.40.dist-info}/RECORD +19 -19
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.40.dist-info}/LICENSE +0 -0
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.40.dist-info}/WHEEL +0 -0
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.40.dist-info}/entry_points.txt +0 -0
@@ -13,6 +13,7 @@ from pytablewriter import MarkdownTableWriter
 import humanize
 import datetime as dt
 import os
+import statistics
 from tqdm import tqdm
 from github import Github
 from slack_sdk.webhook import WebhookClient
@@ -273,6 +274,10 @@ def compare_command_logic(args, project_name, project_version):
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     ) = compute_regression_table(
         rts,
         tf_github_org,
@@ -306,6 +311,7 @@ def compare_command_logic(args, project_name, project_version):
         comparison_architecture,
         first_n_baseline,
         first_n_comparison,
+        grafana_link_base,
     )
     comment_body = ""
     if total_comparison_points > 0:
@@ -324,11 +330,63 @@ def compare_command_logic(args, project_name, project_version):
         )

         if total_unstable > 0:
+            unstable_details = []
+            if total_unstable_baseline > 0:
+                unstable_details.append(f"{total_unstable_baseline} baseline")
+            if total_unstable_comparison > 0:
+                unstable_details.append(f"{total_unstable_comparison} comparison")
+
+            unstable_breakdown = (
+                " (" + ", ".join(unstable_details) + ")" if unstable_details else ""
+            )
             comparison_summary += (
-                "- Detected a total of {} highly unstable benchmarks.\n".format(
-                    total_unstable
+                "- Detected a total of {} highly unstable benchmarks{}.\n".format(
+                    total_unstable, unstable_breakdown
                 )
             )
+
+            # Add latency confirmation summary if applicable
+            if total_latency_confirmed_regressions > 0:
+                comparison_summary += "- Latency analysis confirmed regressions in {} of the unstable tests:\n".format(
+                    total_latency_confirmed_regressions
+                )
+
+                # Add detailed breakdown as bullet points with test links
+                if latency_confirmed_regression_details:
+                    for detail in latency_confirmed_regression_details:
+                        test_name = detail["test_name"]
+                        commands_info = []
+                        for cmd_detail in detail["commands"]:
+                            commands_info.append(
+                                f"{cmd_detail['command']} +{cmd_detail['change_percent']:.1f}%"
+                            )
+
+                        if commands_info:
+                            # Create test link if grafana_link_base is available
+                            test_display_name = test_name
+                            if grafana_link_base is not None:
+                                grafana_test_link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
+                                if baseline_branch is not None:
+                                    grafana_test_link += (
+                                        f"&var-branch={baseline_branch}"
+                                    )
+                                if comparison_branch is not None:
+                                    grafana_test_link += (
+                                        f"&var-branch={comparison_branch}"
+                                    )
+                                grafana_test_link += "&from=now-30d&to=now"
+                                test_display_name = (
+                                    f"[{test_name}]({grafana_test_link})"
+                                )
+
+                            # Add confidence indicator if available
+                            confidence_indicator = ""
+                            if "high_confidence" in detail:
+                                confidence_indicator = (
+                                    " 🔴" if detail["high_confidence"] else " ⚠️"
+                                )
+
+                            comparison_summary += f" - {test_display_name}: {', '.join(commands_info)}{confidence_indicator}\n"
         if total_improvements > 0:
             comparison_summary += "- Detected a total of {} improvements above the improvement water line.\n".format(
                 total_improvements
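Editor's note: the summary bullet added above links each confirmed regression to its dashboard by appending query parameters to grafana_link_base. A minimal, self-contained sketch of that URL construction follows; the helper name and the sample base URL, test name, and branches are illustrative and not part of the package.

# Illustrative sketch only; mirrors the f-string link construction in the hunk above.
def build_grafana_test_link(grafana_link_base, test_name, baseline_branch=None, comparison_branch=None):
    link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
    if baseline_branch is not None:
        link += f"&var-branch={baseline_branch}"
    if comparison_branch is not None:
        link += f"&var-branch={comparison_branch}"
    return link + "&from=now-30d&to=now"

# Hypothetical sample values for demonstration.
print(build_grafana_test_link("https://benchmarks.example/d/abc", "memtier-1Mkeys-load", "unstable", "feature-x"))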
@@ -487,6 +545,9 @@ def compare_command_logic(args, project_name, project_version):
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
     )


@@ -534,6 +595,7 @@ def compute_regression_table(
     comparison_architecture=ARCH_X86,
     first_n_baseline=-1,
     first_n_comparison=-1,
+    grafana_link_base=None,
 ):
     START_TIME_NOW_UTC, _, _ = get_start_time_vars()
     START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
@@ -596,6 +658,10 @@ def compute_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     ) = from_rts_to_regression_table(
         baseline_deployment_name,
         comparison_deployment_name,
@@ -624,14 +690,97 @@ def compute_regression_table(
         comparison_architecture,
         first_n_baseline,
         first_n_comparison,
+        grafana_link_base,
+        baseline_branch,
+        baseline_tag,
+        comparison_branch,
+        comparison_tag,
+        from_date,
+        to_date,
     )
     logging.info(
         "Printing differential analysis between {} and {}".format(
             baseline_str, comparison_str
         )
     )
-
-
+
+    # Split table into improvements, regressions, and no-changes
+    improvements_table = []
+    regressions_table = []
+    no_changes_table = []
+
+    for row in table:
+        # Check if there's a meaningful change (not stable/unstable)
+        note = row[4].lower() if len(row) > 4 else ""
+        percentage_str = row[3] if len(row) > 3 else "0.0%"
+
+        # Extract percentage value
+        try:
+            percentage_val = float(percentage_str.replace("%", "").strip())
+        except:
+            percentage_val = 0.0
+
+        # Categorize based on change type
+        if "improvement" in note and "potential" not in note:
+            # Only actual improvements, not potential ones
+            improvements_table.append(row)
+        elif ("regression" in note and "potential" not in note) or "unstable" in note:
+            # Only actual regressions, not potential ones, plus unstable tests
+            regressions_table.append(row)
+        elif "no change" in note or "potential" in note:
+            # No changes and potential changes (below significance threshold)
+            no_changes_table.append(row)
+        elif abs(percentage_val) > 3.0:  # Significant changes based on percentage
+            if (percentage_val > 0 and metric_mode == "higher-better") or (
+                percentage_val < 0 and metric_mode == "lower-better"
+            ):
+                improvements_table.append(row)
+            else:
+                regressions_table.append(row)
+        else:
+            no_changes_table.append(row)
+
+    # Sort tables by percentage change
+    def get_percentage_value(row):
+        """Extract percentage value from row for sorting"""
+        try:
+            percentage_str = row[3] if len(row) > 3 else "0.0%"
+            return float(percentage_str.replace("%", "").strip())
+        except:
+            return 0.0
+
+    # Sort improvements by percentage change (highest first)
+    improvements_table.sort(key=get_percentage_value, reverse=True)
+
+    # Sort regressions by percentage change (most negative first for higher-better, most positive first for lower-better)
+    if metric_mode == "higher-better":
+        # For higher-better metrics, most negative changes are worst regressions
+        regressions_table.sort(key=get_percentage_value)
+    else:
+        # For lower-better metrics, most positive changes are worst regressions
+        regressions_table.sort(key=get_percentage_value, reverse=True)
+
+    # Create improvements table (visible)
+    improvements_writer = MarkdownTableWriter(
+        table_name="Performance Improvements - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
+            baseline_str,
+            comparison_str,
+            from_human_str,
+            baseline_deployment_name,
+        ),
+        headers=[
+            "Test Case",
+            "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+            "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+            "% change ({})".format(metric_mode),
+            "Note",
+        ],
+        value_matrix=improvements_table,
+    )
+
+    # Create regressions table (visible)
+    regressions_writer = MarkdownTableWriter(
+        table_name="Performance Regressions and Issues - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
             baseline_str,
             comparison_str,
             from_human_str,
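Editor's note: the new bucketing logic above keys off the row's Note column first and only falls back to the percentage value. A condensed, runnable sketch of that rule is shown below; the sample rows are invented, and the column order follows the table headers used in the diff (test case, baseline, comparison, % change, note).

# Condensed sketch of the categorization rule above; sample rows are invented.
def categorize(row, metric_mode="higher-better"):
    note = row[4].lower() if len(row) > 4 else ""
    try:
        pct = float(row[3].replace("%", "").strip())
    except ValueError:
        pct = 0.0
    if "improvement" in note and "potential" not in note:
        return "improvements"
    if ("regression" in note and "potential" not in note) or "unstable" in note:
        return "regressions"
    if "no change" in note or "potential" in note:
        return "no_changes"
    if abs(pct) > 3.0:
        if (pct > 0 and metric_mode == "higher-better") or (
            pct < 0 and metric_mode == "lower-better"
        ):
            return "improvements"
        return "regressions"
    return "no_changes"

rows = [
    ["test-a", "100", "110", "10.0%", "IMPROVEMENT"],
    ["test-b", "100", "80", "-20.0%", "UNSTABLE (baseline high variance)"],
    ["test-c", "100", "101", "1.0%", "No Change"],
]
print([categorize(r) for r in rows])  # ['improvements', 'regressions', 'no_changes']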
@@ -644,8 +793,22 @@ def compute_regression_table(
             "% change ({})".format(metric_mode),
             "Note",
         ],
-        value_matrix=
+        value_matrix=regressions_table,
     )
+
+    # Create no-changes table (hidden in markdown)
+    no_changes_writer = MarkdownTableWriter(
+        table_name="Tests with No Significant Changes",
+        headers=[
+            "Test Case",
+            "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+            "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+            "% change ({})".format(metric_mode),
+            "Note",
+        ],
+        value_matrix=no_changes_table,
+    )
+
     table_output = ""

     from io import StringIO
@@ -654,7 +817,25 @@ def compute_regression_table(
     old_stdout = sys.stdout
     sys.stdout = mystdout = StringIO()

-
+    # Output improvements table first (if any)
+    if improvements_table:
+        improvements_writer.dump(mystdout, False)
+        mystdout.write("\n\n")
+
+    # Output regressions table (if any)
+    if regressions_table:
+        regressions_writer.dump(mystdout, False)
+        mystdout.write("\n\n")
+
+    # Add hidden no-changes table
+    if no_changes_table:
+        mystdout.write(
+            "<details>\n<summary>Tests with No Significant Changes ({} tests)</summary>\n\n".format(
+                len(no_changes_table)
+            )
+        )
+        no_changes_writer.dump(mystdout, False)
+        mystdout.write("\n</details>\n")

     sys.stdout = old_stdout

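Editor's note: the block above keeps the no-change rows out of the visible comment by wrapping them in an HTML <details> element, which renders collapsed. A tiny sketch of the wrapper that gets written is shown below; the count and table body are placeholders.

# Sketch of the collapsed wrapper; the count and table body are placeholders.
no_changes_count = 12
table_markdown = "|Test Case|Baseline|Comparison|% change|Note|\n|---|---|---|---|---|\n"
hidden_section = (
    "<details>\n<summary>Tests with No Significant Changes ({} tests)</summary>\n\n".format(
        no_changes_count
    )
    + table_markdown
    + "\n</details>\n"
)
print(hidden_section)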
@@ -668,6 +849,10 @@ def compute_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     )


@@ -755,6 +940,13 @@ def from_rts_to_regression_table(
     comparison_architecture=ARCH_X86,
     first_n_baseline=-1,
     first_n_comparison=-1,
+    grafana_link_base=None,
+    baseline_branch=None,
+    baseline_tag=None,
+    comparison_branch=None,
+    comparison_tag=None,
+    from_date=None,
+    to_date=None,
 ):
     print_all = print_regressions_only is False and print_improvements_only is False
     table = []
@@ -762,8 +954,12 @@ def from_rts_to_regression_table(
     total_improvements = 0
     total_stable = 0
     total_unstable = 0
+    total_unstable_baseline = 0
+    total_unstable_comparison = 0
     total_regressions = 0
     total_comparison_points = 0
+    total_latency_confirmed_regressions = 0
+    latency_confirmed_regression_details = []  # Track specific test details
     noise_waterline = 3
     progress = tqdm(unit="benchmark time-series", total=len(test_names))
     for test_name in test_names:
@@ -901,10 +1097,243 @@ from_rts_to_regression_table(
             logging.error("Detected a ZeroDivisionError. {}".format(e.__str__()))
             pass
         unstable = False
+        unstable_baseline = False
+        unstable_comparison = False
+        latency_confirms_regression = False
+
         if baseline_v != "N/A" and comparison_v != "N/A":
             if comparison_pct_change > 10.0 or baseline_pct_change > 10.0:
-                note = "UNSTABLE (very high variance)"
                 unstable = True
+                unstable_baseline = baseline_pct_change > 10.0
+                unstable_comparison = comparison_pct_change > 10.0
+
+                # Build detailed unstable note
+                unstable_parts = []
+                if unstable_baseline and unstable_comparison:
+                    unstable_parts.append(
+                        "UNSTABLE (baseline & comparison high variance)"
+                    )
+                elif unstable_baseline:
+                    unstable_parts.append("UNSTABLE (baseline high variance)")
+                elif unstable_comparison:
+                    unstable_parts.append("UNSTABLE (comparison high variance)")
+
+                note = unstable_parts[0]
+
+                # Log detailed warning about unstable data detection
+                logging.warning(
+                    f"UNSTABLE DATA DETECTED for test '{test_name}': "
+                    f"baseline variance={baseline_pct_change:.1f}%, "
+                    f"comparison variance={comparison_pct_change:.1f}% "
+                    f"(threshold=10.0%)"
+                )
+
+                # For throughput metrics (higher-better), check both server-side and client-side latency
+                if metric_mode == "higher-better":
+                    logging.info(
+                        f"Performing 2nd-level latency validation for unstable throughput metric '{test_name}' "
+                        f"(metric_mode={metric_mode})"
+                    )
+
+                    # Check server-side p50 latency
+                    (
+                        server_latency_note,
+                        server_confirms_regression,
+                        server_regression_details,
+                    ) = check_latency_for_unstable_throughput(
+                        rts,
+                        test_name,
+                        baseline_str,
+                        comparison_str,
+                        by_str_baseline,
+                        by_str_comparison,
+                        baseline_deployment_name,
+                        comparison_deployment_name,
+                        tf_triggering_env,
+                        from_ts_ms,
+                        to_ts_ms,
+                        last_n_baseline,
+                        last_n_comparison,
+                        first_n_baseline,
+                        first_n_comparison,
+                        running_platform,
+                        baseline_architecture,
+                        comparison_architecture,
+                        verbose,
+                    )
+
+                    # Check client-side latency metrics
+                    (
+                        client_latency_note,
+                        client_confirms_regression,
+                        client_regression_details,
+                    ) = check_client_side_latency(
+                        rts,
+                        test_name,
+                        baseline_str,
+                        comparison_str,
+                        by_str_baseline,
+                        by_str_comparison,
+                        baseline_deployment_name,
+                        comparison_deployment_name,
+                        tf_triggering_env,
+                        from_ts_ms,
+                        to_ts_ms,
+                        last_n_baseline,
+                        last_n_comparison,
+                        first_n_baseline,
+                        first_n_comparison,
+                        running_platform,
+                        baseline_architecture,
+                        comparison_architecture,
+                        verbose,
+                    )
+
+                    # Combine results from both server and client side
+                    combined_latency_notes = []
+                    if server_latency_note:
+                        combined_latency_notes.append(f"server: {server_latency_note}")
+                    if client_latency_note:
+                        combined_latency_notes.append(f"client: {client_latency_note}")
+
+                    # Only confirm regression if BOTH server and client side show evidence AND data is stable enough
+                    # Check if either server or client data contains unstable indicators
+                    server_has_unstable = (
+                        server_latency_note and "UNSTABLE" in server_latency_note
+                    )
+                    client_has_unstable = (
+                        client_latency_note and "UNSTABLE" in client_latency_note
+                    )
+
+                    # Don't confirm regression if either side has unstable data
+                    if server_has_unstable or client_has_unstable:
+                        both_confirm_regression = False
+                        unstable_sides = []
+                        if server_has_unstable:
+                            unstable_sides.append("server")
+                        if client_has_unstable:
+                            unstable_sides.append("client")
+                        blocked_note = f"regression blocked due to unstable {' and '.join(unstable_sides)} latency data"
+                        note += f"; {blocked_note}"
+                        logging.info(
+                            f"Blocking regression confirmation for '{test_name}' due to unstable latency data"
+                        )
+                        if server_has_unstable:
+                            logging.info(" Server-side latency data is unstable")
+                        if client_has_unstable:
+                            logging.info(" Client-side latency data is unstable")
+                    else:
+                        both_confirm_regression = (
+                            server_confirms_regression and client_confirms_regression
+                        )
+
+                    if combined_latency_notes:
+                        combined_note = "; ".join(combined_latency_notes)
+                        note += f"; {combined_note}"
+                        logging.info(
+                            f"Combined latency check result for '{test_name}': {combined_note}"
+                        )
+
+                        if both_confirm_regression:
+                            logging.info(
+                                f"BOTH server and client latency analysis CONFIRM regression for '{test_name}'"
+                            )
+
+                            # Set the flag for counting confirmed regressions
+                            latency_confirms_regression = True
+
+                            # Combine regression details from both server and client
+                            combined_regression_details = (
+                                server_regression_details or client_regression_details
+                            )
+                            if combined_regression_details:
+                                combined_regression_details["server_side"] = (
+                                    server_confirms_regression
+                                )
+                                combined_regression_details["client_side"] = (
+                                    client_confirms_regression
+                                )
+
+                            # 2nd level confirmation is sufficient - always add to confirmed regressions
+                            logging.info(
+                                f"Adding '{test_name}' to confirmed regressions based on 2nd level validation"
+                            )
+
+                            # Perform 3rd-level analysis: variance + p99 check for additional confidence scoring
+                            logging.info(
+                                f"Performing 3rd-level analysis (variance + p99) for confidence scoring on '{test_name}'"
+                            )
+                            (
+                                confidence_note,
+                                high_confidence,
+                            ) = perform_variance_and_p99_analysis(
+                                rts,
+                                test_name,
+                                baseline_str,
+                                comparison_str,
+                                by_str_baseline,
+                                by_str_comparison,
+                                baseline_deployment_name,
+                                comparison_deployment_name,
+                                tf_triggering_env,
+                                from_ts_ms,
+                                to_ts_ms,
+                                last_n_baseline,
+                                last_n_comparison,
+                                first_n_baseline,
+                                first_n_comparison,
+                                running_platform,
+                                baseline_architecture,
+                                comparison_architecture,
+                                verbose,
+                            )
+
+                            if confidence_note:
+                                note += f"; {confidence_note}"
+                                logging.info(
+                                    f"Confidence analysis for '{test_name}': {confidence_note}"
+                                )
+                                # Use 3rd level confidence if available
+                                combined_regression_details["high_confidence"] = (
+                                    high_confidence
+                                )
+                            else:
+                                # No 3rd level data available - default to moderate confidence since 2nd level confirmed
+                                logging.info(
+                                    f"No 3rd level data available for '{test_name}' - using 2nd level confirmation"
+                                )
+                                combined_regression_details["high_confidence"] = (
+                                    True  # 2nd level confirmation is reliable
+                                )
+
+                            # Always add to confirmed regressions when 2nd level confirms
+                            latency_confirmed_regression_details.append(
+                                combined_regression_details
+                            )
+                        elif server_confirms_regression or client_confirms_regression:
+                            side_confirmed = (
+                                "server" if server_confirms_regression else "client"
+                            )
+                            side_not_confirmed = (
+                                "client" if server_confirms_regression else "server"
+                            )
+                            insufficient_evidence_note = f"only {side_confirmed} side confirms regression ({side_not_confirmed} side stable) - insufficient evidence"
+                            note += f"; {insufficient_evidence_note}"
+                            logging.info(
+                                f"Only {side_confirmed} side confirms regression for '{test_name}' - insufficient evidence"
+                            )
+                        else:
+                            no_regression_note = (
+                                "neither server nor client side confirms regression"
+                            )
+                            note += f"; {no_regression_note}"
+                            logging.info(
+                                f"Neither server nor client side confirms regression for '{test_name}'"
+                            )
+                    else:
+                        logging.info(
+                            f"No latency data available for secondary check on '{test_name}'"
+                        )

         baseline_v_str = prepare_value_str(
             baseline_pct_change, baseline_v, baseline_values, simplify_table
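Editor's note: the net effect of the branch above is a two-sided confirmation rule: a noisy throughput result is only counted as a latency-confirmed regression when the server-side p50 check and the client-side check both agree, and neither side's latency data is itself flagged UNSTABLE. A compact sketch of that decision rule follows; the function name and inputs are illustrative, not part of the package API.

# Compact sketch of the confirmation rule; not part of the package API.
def latency_confirms_regression(server_note, server_confirms, client_note, client_confirms):
    server_unstable = bool(server_note) and "UNSTABLE" in server_note
    client_unstable = bool(client_note) and "UNSTABLE" in client_note
    if server_unstable or client_unstable:
        return False  # unstable latency data blocks confirmation
    return server_confirms and client_confirms

assert latency_confirms_regression("p50 increased 20.0%", True, "p50_latency_ms increased 15.0%", True)
assert not latency_confirms_regression("p50 UNSTABLE (CV=80.0%)", True, "p50_latency_ms increased 15.0%", True)
assert not latency_confirms_regression("p50 increased 20.0%", True, "client latency stable", False)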
@@ -959,6 +1388,12 @@ def from_rts_to_regression_table(

         if unstable:
             total_unstable += 1
+            if unstable_baseline:
+                total_unstable_baseline += 1
+            if unstable_comparison:
+                total_unstable_comparison += 1
+            if latency_confirms_regression:
+                total_latency_confirmed_regressions += 1

         should_add_line = False
         if print_regressions_only and detected_regression:
@@ -979,6 +1414,13 @@ def from_rts_to_regression_table(
                 percentage_change,
                 table,
                 test_name,
+                grafana_link_base,
+                baseline_branch,
+                baseline_tag,
+                comparison_branch,
+                comparison_tag,
+                from_date,
+                to_date,
             )
     return (
         detected_regressions,
@@ -988,9 +1430,995 @@ def from_rts_to_regression_table(
|
|
|
988
1430
|
total_stable,
|
|
989
1431
|
total_unstable,
|
|
990
1432
|
total_comparison_points,
|
|
1433
|
+
total_unstable_baseline,
|
|
1434
|
+
total_unstable_comparison,
|
|
1435
|
+
total_latency_confirmed_regressions,
|
|
1436
|
+
latency_confirmed_regression_details,
|
|
991
1437
|
)
|
|
992
1438
|
|
|
993
1439
|
|
|
1440
|
+
def check_client_side_latency(
|
|
1441
|
+
rts,
|
|
1442
|
+
test_name,
|
|
1443
|
+
baseline_str,
|
|
1444
|
+
comparison_str,
|
|
1445
|
+
by_str_baseline,
|
|
1446
|
+
by_str_comparison,
|
|
1447
|
+
baseline_deployment_name,
|
|
1448
|
+
comparison_deployment_name,
|
|
1449
|
+
tf_triggering_env,
|
|
1450
|
+
from_ts_ms,
|
|
1451
|
+
to_ts_ms,
|
|
1452
|
+
last_n_baseline,
|
|
1453
|
+
last_n_comparison,
|
|
1454
|
+
first_n_baseline,
|
|
1455
|
+
first_n_comparison,
|
|
1456
|
+
running_platform,
|
|
1457
|
+
baseline_architecture,
|
|
1458
|
+
comparison_architecture,
|
|
1459
|
+
verbose=False,
|
|
1460
|
+
):
|
|
1461
|
+
"""
|
|
1462
|
+
Check client-side latency metrics to provide additional validation for regression detection.
|
|
1463
|
+
|
|
1464
|
+
Returns:
|
|
1465
|
+
tuple: (note_string, confirms_regression_bool, regression_details_dict)
|
|
1466
|
+
"""
|
|
1467
|
+
logging.info(f"Starting client-side latency check for test: {test_name}")
|
|
1468
|
+
try:
|
|
1469
|
+
# Client-side latency metrics to check
|
|
1470
|
+
client_metrics = [
|
|
1471
|
+
"p50_latency_ms",
|
|
1472
|
+
"Latency",
|
|
1473
|
+
"OverallQuantiles.allCommands.q50",
|
|
1474
|
+
"Tests.INSERT.AverageLatency_us_",
|
|
1475
|
+
"Tests.READ.AverageLatency_us_",
|
|
1476
|
+
"Tests.SEARCH.AverageLatency_us_",
|
|
1477
|
+
"Tests.UPDATE.AverageLatency_us_",
|
|
1478
|
+
]
|
|
1479
|
+
|
|
1480
|
+
client_latency_notes = []
|
|
1481
|
+
significant_client_latency_increases = 0
|
|
1482
|
+
regression_details = {"test_name": test_name, "commands": []}
|
|
1483
|
+
|
|
1484
|
+
for metric in client_metrics:
|
|
1485
|
+
# Build filters for client-side latency metric
|
|
1486
|
+
filters_baseline = [
|
|
1487
|
+
f"{by_str_baseline}={baseline_str}",
|
|
1488
|
+
f"metric={metric}",
|
|
1489
|
+
f"test_name={test_name}",
|
|
1490
|
+
f"deployment_name={baseline_deployment_name}",
|
|
1491
|
+
f"triggering_env={tf_triggering_env}",
|
|
1492
|
+
]
|
|
1493
|
+
filters_comparison = [
|
|
1494
|
+
f"{by_str_comparison}={comparison_str}",
|
|
1495
|
+
f"metric={metric}",
|
|
1496
|
+
f"test_name={test_name}",
|
|
1497
|
+
f"deployment_name={comparison_deployment_name}",
|
|
1498
|
+
f"triggering_env={tf_triggering_env}",
|
|
1499
|
+
]
|
|
1500
|
+
|
|
1501
|
+
# Add optional filters
|
|
1502
|
+
if running_platform is not None:
|
|
1503
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
1504
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
1505
|
+
if baseline_architecture != ARCH_X86:
|
|
1506
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
1507
|
+
if comparison_architecture != ARCH_X86:
|
|
1508
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
1509
|
+
|
|
1510
|
+
# Query for client-side latency time-series
|
|
1511
|
+
baseline_client_ts = rts.ts().queryindex(filters_baseline)
|
|
1512
|
+
comparison_client_ts = rts.ts().queryindex(filters_comparison)
|
|
1513
|
+
|
|
1514
|
+
if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
|
|
1515
|
+
if verbose:
|
|
1516
|
+
logging.info(
|
|
1517
|
+
f" No client-side data found for metric '{metric}' in {test_name}"
|
|
1518
|
+
)
|
|
1519
|
+
continue
|
|
1520
|
+
|
|
1521
|
+
logging.info(
|
|
1522
|
+
f" Found client-side metric '{metric}': {len(baseline_client_ts)} baseline, {len(comparison_client_ts)} comparison time-series"
|
|
1523
|
+
)
|
|
1524
|
+
|
|
1525
|
+
# Filter out target time-series
|
|
1526
|
+
baseline_client_ts = [ts for ts in baseline_client_ts if "target" not in ts]
|
|
1527
|
+
comparison_client_ts = [
|
|
1528
|
+
ts for ts in comparison_client_ts if "target" not in ts
|
|
1529
|
+
]
|
|
1530
|
+
|
|
1531
|
+
if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
|
|
1532
|
+
continue
|
|
1533
|
+
|
|
1534
|
+
# Use the first available time-series for each side
|
|
1535
|
+
baseline_ts = baseline_client_ts[0]
|
|
1536
|
+
comparison_ts = comparison_client_ts[0]
|
|
1537
|
+
|
|
1538
|
+
# Get client-side latency data
|
|
1539
|
+
baseline_client_data = rts.ts().revrange(baseline_ts, from_ts_ms, to_ts_ms)
|
|
1540
|
+
comparison_client_data = rts.ts().revrange(
|
|
1541
|
+
comparison_ts, from_ts_ms, to_ts_ms
|
|
1542
|
+
)
|
|
1543
|
+
|
|
1544
|
+
if len(baseline_client_data) == 0 or len(comparison_client_data) == 0:
|
|
1545
|
+
if verbose:
|
|
1546
|
+
logging.info(
|
|
1547
|
+
f" No data points for metric '{metric}': baseline={len(baseline_client_data)}, comparison={len(comparison_client_data)}"
|
|
1548
|
+
)
|
|
1549
|
+
continue
|
|
1550
|
+
|
|
1551
|
+
# Calculate client-side latency statistics
|
|
1552
|
+
baseline_client_values = []
|
|
1553
|
+
comparison_client_values = []
|
|
1554
|
+
|
|
1555
|
+
(_, baseline_client_median, _) = get_v_pct_change_and_largest_var(
|
|
1556
|
+
baseline_client_data,
|
|
1557
|
+
0,
|
|
1558
|
+
0,
|
|
1559
|
+
baseline_client_values,
|
|
1560
|
+
0,
|
|
1561
|
+
last_n_baseline,
|
|
1562
|
+
verbose,
|
|
1563
|
+
first_n_baseline,
|
|
1564
|
+
)
|
|
1565
|
+
|
|
1566
|
+
(_, comparison_client_median, _) = get_v_pct_change_and_largest_var(
|
|
1567
|
+
comparison_client_data,
|
|
1568
|
+
0,
|
|
1569
|
+
0,
|
|
1570
|
+
comparison_client_values,
|
|
1571
|
+
0,
|
|
1572
|
+
last_n_comparison,
|
|
1573
|
+
verbose,
|
|
1574
|
+
first_n_comparison,
|
|
1575
|
+
)
|
|
1576
|
+
|
|
1577
|
+
if baseline_client_median == "N/A" or comparison_client_median == "N/A":
|
|
1578
|
+
if verbose:
|
|
1579
|
+
logging.info(
|
|
1580
|
+
f" Could not calculate median for metric '{metric}': baseline={baseline_client_median}, comparison={comparison_client_median}"
|
|
1581
|
+
)
|
|
1582
|
+
continue
|
|
1583
|
+
|
|
1584
|
+
# Calculate variance (coefficient of variation) for both baseline and comparison
|
|
1585
|
+
baseline_client_mean = (
|
|
1586
|
+
statistics.mean(baseline_client_values) if baseline_client_values else 0
|
|
1587
|
+
)
|
|
1588
|
+
baseline_client_stdev = (
|
|
1589
|
+
statistics.stdev(baseline_client_values)
|
|
1590
|
+
if len(baseline_client_values) > 1
|
|
1591
|
+
else 0
|
|
1592
|
+
)
|
|
1593
|
+
baseline_client_cv = (
|
|
1594
|
+
(baseline_client_stdev / baseline_client_mean * 100)
|
|
1595
|
+
if baseline_client_mean > 0
|
|
1596
|
+
else float("inf")
|
|
1597
|
+
)
|
|
1598
|
+
|
|
1599
|
+
comparison_client_mean = (
|
|
1600
|
+
statistics.mean(comparison_client_values)
|
|
1601
|
+
if comparison_client_values
|
|
1602
|
+
else 0
|
|
1603
|
+
)
|
|
1604
|
+
comparison_client_stdev = (
|
|
1605
|
+
statistics.stdev(comparison_client_values)
|
|
1606
|
+
if len(comparison_client_values) > 1
|
|
1607
|
+
else 0
|
|
1608
|
+
)
|
|
1609
|
+
comparison_client_cv = (
|
|
1610
|
+
(comparison_client_stdev / comparison_client_mean * 100)
|
|
1611
|
+
if comparison_client_mean > 0
|
|
1612
|
+
else float("inf")
|
|
1613
|
+
)
|
|
1614
|
+
|
|
1615
|
+
# Calculate client-side latency change (for latency, higher is worse)
|
|
1616
|
+
client_latency_change = (
|
|
1617
|
+
float(comparison_client_median) / float(baseline_client_median) - 1
|
|
1618
|
+
) * 100.0
|
|
1619
|
+
|
|
1620
|
+
logging.info(
|
|
1621
|
+
f" Client metric '{metric}': baseline={baseline_client_median:.2f} (CV={baseline_client_cv:.1f}%), comparison={comparison_client_median:.2f} (CV={comparison_client_cv:.1f}%), change={client_latency_change:.1f}%"
|
|
1622
|
+
)
|
|
1623
|
+
|
|
1624
|
+
# Check if client latency data is too unstable to be reliable
|
|
1625
|
+
client_data_unstable = (
|
|
1626
|
+
baseline_client_cv > 50.0 or comparison_client_cv > 50.0
|
|
1627
|
+
)
|
|
1628
|
+
|
|
1629
|
+
if client_data_unstable:
|
|
1630
|
+
# Mark as unstable client latency data
|
|
1631
|
+
unstable_reason = []
|
|
1632
|
+
if baseline_client_cv > 50.0:
|
|
1633
|
+
unstable_reason.append(f"baseline CV={baseline_client_cv:.1f}%")
|
|
1634
|
+
if comparison_client_cv > 50.0:
|
|
1635
|
+
unstable_reason.append(f"comparison CV={comparison_client_cv:.1f}%")
|
|
1636
|
+
|
|
1637
|
+
client_latency_notes.append(
|
|
1638
|
+
f"{metric} UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
|
|
1639
|
+
)
|
|
1640
|
+
logging.warning(
|
|
1641
|
+
f" Client metric '{metric}': UNSTABLE latency data detected - {', '.join(unstable_reason)}"
|
|
1642
|
+
)
|
|
1643
|
+
elif (
|
|
1644
|
+
abs(client_latency_change) > 5.0
|
|
1645
|
+
): # Only report significant client latency changes for stable data
|
|
1646
|
+
direction = "increased" if client_latency_change > 0 else "decreased"
|
|
1647
|
+
|
|
1648
|
+
# Adjust significance threshold based on baseline variance
|
|
1649
|
+
if baseline_client_cv < 30.0:
|
|
1650
|
+
# Low variance - use standard threshold
|
|
1651
|
+
significance_threshold = 10.0
|
|
1652
|
+
elif baseline_client_cv < 50.0:
|
|
1653
|
+
# Moderate variance - require larger change
|
|
1654
|
+
significance_threshold = 15.0
|
|
1655
|
+
else:
|
|
1656
|
+
# High variance - require much larger change
|
|
1657
|
+
significance_threshold = 25.0
|
|
1658
|
+
|
|
1659
|
+
client_latency_notes.append(
|
|
1660
|
+
f"{metric} {direction} {abs(client_latency_change):.1f}% (baseline CV={baseline_client_cv:.1f}%)"
|
|
1661
|
+
)
|
|
1662
|
+
logging.info(
|
|
1663
|
+
f" Client metric '{metric}': SIGNIFICANT latency change detected ({direction} {abs(client_latency_change):.1f}%, baseline CV={baseline_client_cv:.1f}%)"
|
|
1664
|
+
)
|
|
1665
|
+
|
|
1666
|
+
# Track significant client latency increases (potential regression confirmation)
|
|
1667
|
+
if client_latency_change > significance_threshold:
|
|
1668
|
+
significant_client_latency_increases += 1
|
|
1669
|
+
regression_details["commands"].append(
|
|
1670
|
+
{
|
|
1671
|
+
"command": metric,
|
|
1672
|
+
"change_percent": client_latency_change,
|
|
1673
|
+
"direction": direction,
|
|
1674
|
+
"baseline_cv": baseline_client_cv,
|
|
1675
|
+
"comparison_cv": comparison_client_cv,
|
|
1676
|
+
}
|
|
1677
|
+
)
|
|
1678
|
+
logging.info(
|
|
1679
|
+
f" Client metric '{metric}': CONFIRMS regression (change={client_latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
|
|
1680
|
+
)
|
|
1681
|
+
else:
|
|
1682
|
+
logging.info(
|
|
1683
|
+
f" Client metric '{metric}': Change below significance threshold (change={client_latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
|
|
1684
|
+
)
|
|
1685
|
+
elif verbose:
|
|
1686
|
+
client_latency_notes.append(
|
|
1687
|
+
f"{metric} stable (CV={baseline_client_cv:.1f}%)"
|
|
1688
|
+
)
|
|
1689
|
+
logging.info(
|
|
1690
|
+
f" Client metric '{metric}': latency stable (change={client_latency_change:.1f}%, baseline CV={baseline_client_cv:.1f}%)"
|
|
1691
|
+
)
|
|
1692
|
+
|
|
1693
|
+
# Determine if client-side latency confirms regression
|
|
1694
|
+
confirms_regression = significant_client_latency_increases > 0
|
|
1695
|
+
|
|
1696
|
+
# Return combined client latency notes
|
|
1697
|
+
if client_latency_notes:
|
|
1698
|
+
result = "; ".join(client_latency_notes)
|
|
1699
|
+
logging.info(
|
|
1700
|
+
f"Client-side latency check completed for {test_name}: {result}"
|
|
1701
|
+
)
|
|
1702
|
+
return (
|
|
1703
|
+
result,
|
|
1704
|
+
confirms_regression,
|
|
1705
|
+
regression_details if confirms_regression else None,
|
|
1706
|
+
)
|
|
1707
|
+
else:
|
|
1708
|
+
result = "client latency stable" if len(client_metrics) > 0 else None
|
|
1709
|
+
logging.info(
|
|
1710
|
+
f"Client-side latency check completed for {test_name}: {result or 'no data'}"
|
|
1711
|
+
)
|
|
1712
|
+
return result, False, None
|
|
1713
|
+
|
|
1714
|
+
except Exception as e:
|
|
1715
|
+
logging.error(f"Error checking client-side latency for {test_name}: {e}")
|
|
1716
|
+
return None, False, None
|
|
1717
|
+
|
|
1718
|
+
|
|
1719
|
+
def perform_variance_and_p99_analysis(
|
|
1720
|
+
rts,
|
|
1721
|
+
test_name,
|
|
1722
|
+
baseline_str,
|
|
1723
|
+
comparison_str,
|
|
1724
|
+
by_str_baseline,
|
|
1725
|
+
by_str_comparison,
|
|
1726
|
+
baseline_deployment_name,
|
|
1727
|
+
comparison_deployment_name,
|
|
1728
|
+
tf_triggering_env,
|
|
1729
|
+
from_ts_ms,
|
|
1730
|
+
to_ts_ms,
|
|
1731
|
+
last_n_baseline,
|
|
1732
|
+
last_n_comparison,
|
|
1733
|
+
first_n_baseline,
|
|
1734
|
+
first_n_comparison,
|
|
1735
|
+
running_platform,
|
|
1736
|
+
baseline_architecture,
|
|
1737
|
+
comparison_architecture,
|
|
1738
|
+
verbose=False,
|
|
1739
|
+
):
|
|
1740
|
+
"""
|
|
1741
|
+
Perform 3rd-level analysis using variance and p99 metrics to assess confidence in regression detection.
|
|
1742
|
+
|
|
1743
|
+
Returns:
|
|
1744
|
+
tuple: (confidence_note, high_confidence_bool)
|
|
1745
|
+
"""
|
|
1746
|
+
try:
|
|
1747
|
+
logging.info(f"Starting variance and p99 analysis for {test_name}")
|
|
1748
|
+
|
|
1749
|
+
# Build filters for p99 latency metric using both metric=p99 and metric-type=(latencystats)
|
|
1750
|
+
filters_baseline = [
|
|
1751
|
+
f"{by_str_baseline}={baseline_str}",
|
|
1752
|
+
"metric=p99",
|
|
1753
|
+
"metric-type=(latencystats)",
|
|
1754
|
+
f"test_name={test_name}",
|
|
1755
|
+
f"deployment_name={baseline_deployment_name}",
|
|
1756
|
+
f"triggering_env={tf_triggering_env}",
|
|
1757
|
+
]
|
|
1758
|
+
filters_comparison = [
|
|
1759
|
+
f"{by_str_comparison}={comparison_str}",
|
|
1760
|
+
"metric=p99",
|
|
1761
|
+
"metric-type=(latencystats)",
|
|
1762
|
+
f"test_name={test_name}",
|
|
1763
|
+
f"deployment_name={comparison_deployment_name}",
|
|
1764
|
+
f"triggering_env={tf_triggering_env}",
|
|
1765
|
+
]
|
|
1766
|
+
|
|
1767
|
+
# Add optional filters
|
|
1768
|
+
if running_platform is not None:
|
|
1769
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
1770
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
1771
|
+
if baseline_architecture != ARCH_X86:
|
|
1772
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
1773
|
+
if comparison_architecture != ARCH_X86:
|
|
1774
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
1775
|
+
|
|
1776
|
+
# Query for p99 latency time-series
|
|
1777
|
+
logging.info(f"Querying p99 latencystats time-series for {test_name}")
|
|
1778
|
+
baseline_p99_ts = rts.ts().queryindex(filters_baseline)
|
|
1779
|
+
comparison_p99_ts = rts.ts().queryindex(filters_comparison)
|
|
1780
|
+
|
|
1781
|
+
logging.info(f"Found {len(baseline_p99_ts)} baseline p99 latency time-series")
|
|
1782
|
+
logging.info(
|
|
1783
|
+
f"Found {len(comparison_p99_ts)} comparison p99 latency time-series"
|
|
1784
|
+
)
|
|
1785
|
+
|
|
1786
|
+
# Filter out target time-series and unwanted commands (reuse existing function)
|
|
1787
|
+
def should_exclude_timeseries(ts_name):
|
|
1788
|
+
"""Check if time-series should be excluded based on command"""
|
|
1789
|
+
if "target" in ts_name:
|
|
1790
|
+
return True
|
|
1791
|
+
ts_name_lower = ts_name.lower()
|
|
1792
|
+
excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
|
|
1793
|
+
return any(cmd in ts_name_lower for cmd in excluded_commands)
|
|
1794
|
+
|
|
1795
|
+
baseline_p99_ts = [
|
|
1796
|
+
ts for ts in baseline_p99_ts if not should_exclude_timeseries(ts)
|
|
1797
|
+
]
|
|
1798
|
+
comparison_p99_ts = [
|
|
1799
|
+
ts for ts in comparison_p99_ts if not should_exclude_timeseries(ts)
|
|
1800
|
+
]
|
|
1801
|
+
|
|
1802
|
+
if len(baseline_p99_ts) == 0 or len(comparison_p99_ts) == 0:
|
|
1803
|
+
logging.warning(
|
|
1804
|
+
f"No p99 latency data found for {test_name} after filtering"
|
|
1805
|
+
)
|
|
1806
|
+
return None, False
|
|
1807
|
+
|
|
1808
|
+
# Extract command names from time-series (reuse existing function)
|
|
1809
|
+
def extract_command_from_ts(ts_name):
|
|
1810
|
+
"""Extract meaningful command name from time-series name"""
|
|
1811
|
+
# Look for latencystats_latency_percentiles_usec_<COMMAND>_p99 pattern
|
|
1812
|
+
match = re.search(
|
|
1813
|
+
r"latencystats_latency_percentiles_usec_([^_/]+)_p99", ts_name
|
|
1814
|
+
)
|
|
1815
|
+
if match:
|
|
1816
|
+
return match.group(1)
|
|
1817
|
+
# Look for command= pattern in the time-series name
|
|
1818
|
+
match = re.search(r"command=([^/]+)", ts_name)
|
|
1819
|
+
if match:
|
|
1820
|
+
return match.group(1)
|
|
1821
|
+
# If no specific pattern found, try to extract from the end of the path
|
|
1822
|
+
parts = ts_name.split("/")
|
|
1823
|
+
if len(parts) > 0:
|
|
1824
|
+
return parts[-1]
|
|
1825
|
+
return "unknown"
|
|
1826
|
+
|
|
1827
|
+
# Group time-series by command
|
|
1828
|
+
baseline_by_command = {}
|
|
1829
|
+
comparison_by_command = {}
|
|
1830
|
+
|
|
1831
|
+
for ts in baseline_p99_ts:
|
|
1832
|
+
cmd = extract_command_from_ts(ts)
|
|
1833
|
+
if cmd not in baseline_by_command:
|
|
1834
|
+
baseline_by_command[cmd] = []
|
|
1835
|
+
baseline_by_command[cmd].append(ts)
|
|
1836
|
+
|
|
1837
|
+
for ts in comparison_p99_ts:
|
|
1838
|
+
cmd = extract_command_from_ts(ts)
|
|
1839
|
+
if cmd not in comparison_by_command:
|
|
1840
|
+
comparison_by_command[cmd] = []
|
|
1841
|
+
comparison_by_command[cmd].append(ts)
|
|
1842
|
+
|
|
1843
|
+
# Find common commands between baseline and comparison
|
|
1844
|
+
common_commands = set(baseline_by_command.keys()) & set(
|
|
1845
|
+
comparison_by_command.keys()
|
|
1846
|
+
)
|
|
1847
|
+
|
|
1848
|
+
if not common_commands:
|
|
1849
|
+
logging.warning(
|
|
1850
|
+
f"No common commands found for p99 variance analysis in {test_name}"
|
|
1851
|
+
)
|
|
1852
|
+
return None, False
|
|
1853
|
+
|
|
1854
|
+
variance_notes = []
|
|
1855
|
+
p99_notes = []
|
|
1856
|
+
high_confidence_indicators = 0
|
|
1857
|
+
total_indicators = 0
|
|
1858
|
+
|
|
1859
|
+
# Analyze variance and p99 for each command
|
|
1860
|
+
for command in sorted(common_commands):
|
|
1861
|
+
total_indicators += 1
|
|
1862
|
+
logging.info(f"Analyzing p99 variance for command: {command}")
|
|
1863
|
+
|
|
1864
|
+
baseline_ts_list = baseline_by_command[command]
|
|
1865
|
+
comparison_ts_list = comparison_by_command[command]
|
|
1866
|
+
|
|
1867
|
+
# If multiple time-series for the same command, try to get the best one
|
|
1868
|
+
if len(baseline_ts_list) > 1:
|
|
1869
|
+
baseline_ts_list = get_only_Totals(baseline_ts_list)
|
|
1870
|
+
if len(comparison_ts_list) > 1:
|
|
1871
|
+
comparison_ts_list = get_only_Totals(comparison_ts_list)
|
|
1872
|
+
|
|
1873
|
+
if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
|
|
1874
|
+
logging.warning(
|
|
1875
|
+
f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
|
|
1876
|
+
)
|
|
1877
|
+
continue
|
|
1878
|
+
|
|
1879
|
+
# Get p99 latency data for this command
|
|
1880
|
+
baseline_p99_data = []
|
|
1881
|
+
comparison_p99_data = []
|
|
1882
|
+
|
|
1883
|
+
for ts_name in baseline_ts_list:
|
|
1884
|
+
datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
|
|
1885
|
+
baseline_p99_data.extend(datapoints)
|
|
1886
|
+
|
|
1887
|
+
for ts_name in comparison_ts_list:
|
|
1888
|
+
datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
|
|
1889
|
+
comparison_p99_data.extend(datapoints)
|
|
1890
|
+
|
|
1891
|
+
if len(baseline_p99_data) < 3 or len(comparison_p99_data) < 3:
|
|
1892
|
+
logging.warning(
|
|
1893
|
+
f" Insufficient p99 data for {command}: baseline={len(baseline_p99_data)}, comparison={len(comparison_p99_data)} datapoints"
|
|
1894
|
+
)
|
|
1895
|
+
continue
|
|
1896
|
+
|
|
1897
|
+
# Extract values for variance calculation
|
|
1898
|
+
baseline_values = [dp[1] for dp in baseline_p99_data]
|
|
1899
|
+
comparison_values = [dp[1] for dp in comparison_p99_data]
|
|
1900
|
+
|
|
1901
|
+
# Calculate variance (coefficient of variation)
|
|
1902
|
+
baseline_mean = statistics.mean(baseline_values)
|
|
1903
|
+
baseline_stdev = (
|
|
1904
|
+
statistics.stdev(baseline_values) if len(baseline_values) > 1 else 0
|
|
1905
|
+
)
|
|
1906
|
+
baseline_cv = (
|
|
1907
|
+
(baseline_stdev / baseline_mean * 100)
|
|
1908
|
+
if baseline_mean > 0
|
|
1909
|
+
else float("inf")
|
|
1910
|
+
)
|
|
1911
|
+
|
|
1912
|
+
comparison_mean = statistics.mean(comparison_values)
|
|
1913
|
+
comparison_stdev = (
|
|
1914
|
+
statistics.stdev(comparison_values) if len(comparison_values) > 1 else 0
|
|
1915
|
+
)
|
|
1916
|
+
comparison_cv = (
|
|
1917
|
+
(comparison_stdev / comparison_mean * 100)
|
|
1918
|
+
if comparison_mean > 0
|
|
1919
|
+
else float("inf")
|
|
1920
|
+
)
|
|
1921
|
+
|
|
1922
|
+
# Calculate p99 change
|
|
1923
|
+
p99_change = (
|
|
1924
|
+
((comparison_mean - baseline_mean) / baseline_mean * 100)
|
|
1925
|
+
if baseline_mean > 0
|
|
1926
|
+
else 0
|
|
1927
|
+
)
|
|
1928
|
+
|
|
1929
|
+
# Assess confidence based on variance and p99 change
|
|
1930
|
+
if baseline_cv < 30: # Low variance in baseline (< 30% CV)
|
|
1931
|
+
if abs(p99_change) > 15: # Significant p99 change
|
|
1932
|
+
high_confidence_indicators += 1
|
|
1933
|
+
p99_notes.append(
|
|
1934
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline)"
|
|
1935
|
+
)
|
|
1936
|
+
else:
|
|
1937
|
+
p99_notes.append(
|
|
1938
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline, minor change)"
|
|
1939
|
+
)
|
|
1940
|
+
elif baseline_cv < 50: # Moderate variance
|
|
1941
|
+
if abs(p99_change) > 25: # Need larger change for confidence
|
|
1942
|
+
high_confidence_indicators += 1
|
|
1943
|
+
p99_notes.append(
|
|
1944
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance)"
|
|
1945
|
+
)
|
|
1946
|
+
else:
|
|
1947
|
+
p99_notes.append(
|
|
1948
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance, uncertain)"
|
|
1949
|
+
)
|
|
1950
|
+
else: # High variance
|
|
1951
|
+
if abs(p99_change) > 40: # Need very large change for confidence
|
|
1952
|
+
high_confidence_indicators += 1
|
|
1953
|
+
p99_notes.append(
|
|
1954
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, large change)"
|
|
1955
|
+
)
|
|
1956
|
+
else:
|
|
1957
|
+
p99_notes.append(
|
|
1958
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, low confidence)"
|
|
1959
|
+
)
|
|
1960
|
+
|
|
1961
|
+
variance_notes.append(f"{command} baseline CV={baseline_cv:.1f}%")
|
|
1962
|
+
|
|
1963
|
+
if verbose:
|
|
1964
|
+
logging.info(
|
|
1965
|
+
f" Command {command}: baseline CV={baseline_cv:.1f}%, comparison CV={comparison_cv:.1f}%, p99 change={p99_change:.1f}%"
|
|
1966
|
+
)
|
|
1967
|
+
|
|
1968
|
+
# Determine overall confidence
|
|
1969
|
+
confidence_ratio = (
|
|
1970
|
+
high_confidence_indicators / total_indicators if total_indicators > 0 else 0
|
|
1971
|
+
)
|
|
1972
|
+
high_confidence = (
|
|
1973
|
+
confidence_ratio >= 0.5
|
|
1974
|
+
) # At least 50% of indicators show high confidence
|
|
1975
|
+
|
|
1976
|
+
# Create confidence note
|
|
1977
|
+
confidence_parts = []
|
|
1978
|
+
if variance_notes:
|
|
1979
|
+
confidence_parts.extend(variance_notes)
|
|
1980
|
+
if p99_notes:
|
|
1981
|
+
confidence_parts.extend(p99_notes)
|
|
1982
|
+
|
|
1983
|
+
confidence_note = "; ".join(confidence_parts) if confidence_parts else None
|
|
1984
|
+
|
|
1985
|
+
if confidence_note:
|
|
1986
|
+
confidence_level = "HIGH" if high_confidence else "LOW"
|
|
1987
|
+
cv_explanation = "CV=coefficient of variation (data stability: <30% stable, 30-50% moderate, >50% unstable)"
|
|
1988
|
+
confidence_note = (
|
|
1989
|
+
f"confidence={confidence_level} ({confidence_note}; {cv_explanation})"
|
|
1990
|
+
)
|
|
1991
|
+
|
|
1992
|
+
logging.info(
|
|
1993
|
+
f"Variance and p99 analysis completed for {test_name}: confidence={confidence_ratio:.2f}, high_confidence={high_confidence}"
|
|
1994
|
+
)
|
|
1995
|
+
return confidence_note, high_confidence
|
|
1996
|
+
|
|
1997
|
+
except Exception as e:
|
|
1998
|
+
logging.error(f"Error in variance and p99 analysis for {test_name}: {e}")
|
|
1999
|
+
return None, False
|
|
2000
|
+
|
|
2001
|
+
|
|
2002
|
+
def check_latency_for_unstable_throughput(
|
|
2003
|
+
rts,
|
|
2004
|
+
test_name,
|
|
2005
|
+
baseline_str,
|
|
2006
|
+
comparison_str,
|
|
2007
|
+
by_str_baseline,
|
|
2008
|
+
by_str_comparison,
|
|
2009
|
+
baseline_deployment_name,
|
|
2010
|
+
comparison_deployment_name,
|
|
2011
|
+
tf_triggering_env,
|
|
2012
|
+
from_ts_ms,
|
|
2013
|
+
to_ts_ms,
|
|
2014
|
+
last_n_baseline,
|
|
2015
|
+
last_n_comparison,
|
|
2016
|
+
first_n_baseline,
|
|
2017
|
+
first_n_comparison,
|
|
2018
|
+
running_platform,
|
|
2019
|
+
baseline_architecture,
|
|
2020
|
+
comparison_architecture,
|
|
2021
|
+
verbose,
|
|
2022
|
+
):
|
|
2023
|
+
"""
|
|
2024
|
+
Check latency (p50) for unstable throughput metrics to provide additional context.
|
|
2025
|
+
Returns a tuple: (note_string, confirms_regression_bool, regression_details_dict)
|
|
2026
|
+
"""
|
|
2027
|
+
logging.info(f"Starting latency check for unstable throughput test: {test_name}")
|
|
2028
|
+
try:
|
|
2029
|
+
# Build filters for p50 latency metric using both metric=p50 and metric-type=(latencystats)
|
|
2030
|
+
filters_baseline = [
|
|
2031
|
+
f"{by_str_baseline}={baseline_str}",
|
|
2032
|
+
"metric=p50",
|
|
2033
|
+
"metric-type=(latencystats)",
|
|
2034
|
+
f"test_name={test_name}",
|
|
2035
|
+
f"deployment_name={baseline_deployment_name}",
|
|
2036
|
+
f"triggering_env={tf_triggering_env}",
|
|
2037
|
+
]
|
|
2038
|
+
filters_comparison = [
|
|
2039
|
+
f"{by_str_comparison}={comparison_str}",
|
|
2040
|
+
"metric=p50",
|
|
2041
|
+
"metric-type=(latencystats)",
|
|
2042
|
+
f"test_name={test_name}",
|
|
2043
|
+
f"deployment_name={comparison_deployment_name}",
|
|
2044
|
+
f"triggering_env={tf_triggering_env}",
|
|
2045
|
+
]
|
|
2046
|
+
|
|
2047
|
+
# Add optional filters
|
|
2048
|
+
if running_platform is not None:
|
|
2049
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
2050
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
2051
|
+
if baseline_architecture != ARCH_X86:
|
|
2052
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
2053
|
+
if comparison_architecture != ARCH_X86:
|
|
2054
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
2055
|
+
|
|
2056
|
+
# Query for p50 latency time-series
|
|
2057
|
+
logging.info(f"Querying p50 latencystats time-series for {test_name}")
|
|
2058
|
+
logging.info(f"Baseline filters: {filters_baseline}")
|
|
2059
|
+
logging.info(f"Comparison filters: {filters_comparison}")
|
|
2060
|
+
|
|
2061
|
+
baseline_latency_ts = rts.ts().queryindex(filters_baseline)
|
|
2062
|
+
comparison_latency_ts = rts.ts().queryindex(filters_comparison)
|
|
2063
|
+
|
|
2064
|
+
logging.info(
|
|
2065
|
+
f"Found {len(baseline_latency_ts)} baseline p50 latency time-series"
|
|
2066
|
+
)
|
|
2067
|
+
logging.info(
|
|
2068
|
+
f"Found {len(comparison_latency_ts)} comparison p50 latency time-series"
|
|
2069
|
+
)
|
|
2070
|
+
|
|
2071
|
+
if verbose and baseline_latency_ts:
|
|
2072
|
+
logging.info(f"Baseline latency time-series: {baseline_latency_ts}")
|
|
2073
|
+
if verbose and comparison_latency_ts:
|
|
2074
|
+
logging.info(f"Comparison latency time-series: {comparison_latency_ts}")
|
|
2075
|
+
|
|
2076
|
+
# Filter out target time-series and unwanted commands
|
|
2077
|
+
def should_exclude_timeseries(ts_name):
|
|
2078
|
+
"""Check if time-series should be excluded based on command"""
|
|
2079
|
+
# Exclude target time-series
|
|
2080
|
+
if "target" in ts_name:
|
|
2081
|
+
return True
|
|
2082
|
+
|
|
2083
|
+
# Convert to lowercase for case-insensitive matching
|
|
2084
|
+
ts_name_lower = ts_name.lower()
|
|
2085
|
+
|
|
2086
|
+
# Exclude administrative commands (case-insensitive)
|
|
2087
|
+
excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
|
|
2088
|
+
return any(cmd in ts_name_lower for cmd in excluded_commands)
|
|
2089
|
+
|
|
2090
|
+
baseline_latency_ts_before = len(baseline_latency_ts)
|
|
2091
|
+
comparison_latency_ts_before = len(comparison_latency_ts)
|
|
2092
|
+
|
|
2093
|
+
# Apply filtering and log what gets excluded
|
|
2094
|
+
baseline_excluded = [
|
|
2095
|
+
ts for ts in baseline_latency_ts if should_exclude_timeseries(ts)
|
|
2096
|
+
]
|
|
2097
|
+
comparison_excluded = [
|
|
2098
|
+
ts for ts in comparison_latency_ts if should_exclude_timeseries(ts)
|
|
2099
|
+
]
|
|
2100
|
+
|
|
2101
|
+
baseline_latency_ts = [
|
|
2102
|
+
ts for ts in baseline_latency_ts if not should_exclude_timeseries(ts)
|
|
2103
|
+
]
|
|
2104
|
+
comparison_latency_ts = [
|
|
2105
|
+
ts for ts in comparison_latency_ts if not should_exclude_timeseries(ts)
|
|
2106
|
+
]
|
|
2107
|
+
|
|
2108
|
+
logging.info(
|
|
2109
|
+
f"After filtering: baseline {baseline_latency_ts_before} -> {len(baseline_latency_ts)}, "
|
|
2110
|
+
f"comparison {comparison_latency_ts_before} -> {len(comparison_latency_ts)}"
|
|
2111
|
+
)
|
|
2112
|
+
|
|
2113
|
+
if baseline_excluded:
|
|
2114
|
+
logging.info(
|
|
2115
|
+
f"Excluded {len(baseline_excluded)} baseline administrative command time-series"
|
|
2116
|
+
)
|
|
2117
|
+
if verbose:
|
|
2118
|
+
for ts in baseline_excluded:
|
|
2119
|
+
logging.info(f" Excluded baseline: {ts}")
|
|
2120
|
+
if comparison_excluded:
|
|
2121
|
+
logging.info(
|
|
2122
|
+
f"Excluded {len(comparison_excluded)} comparison administrative command time-series"
|
|
2123
|
+
)
|
|
2124
|
+
if verbose:
|
|
2125
|
+
for ts in comparison_excluded:
|
|
2126
|
+
logging.info(f" Excluded comparison: {ts}")
|
|
2127
|
+
|
|
2128
|
+
if len(baseline_latency_ts) == 0 or len(comparison_latency_ts) == 0:
|
|
2129
|
+
logging.warning(
|
|
2130
|
+
f"No p50 latency data found for {test_name} after filtering"
|
|
2131
|
+
)
|
|
2132
|
+
return None, False, None
|
|
2133
|
+
|
|
2134
|
+
# Extract command names from time-series to match baseline and comparison
|
|
2135
|
+
def extract_command_from_ts(ts_name):
|
|
2136
|
+
"""Extract meaningful command name from time-series name"""
|
|
2137
|
+
import re
|
|
2138
|
+
|
|
2139
|
+
# Look for latencystats_latency_percentiles_usec_<COMMAND>_p50 pattern
|
|
2140
|
+
match = re.search(
|
|
2141
|
+
r"latencystats_latency_percentiles_usec_([^_/]+)_p50", ts_name
|
|
2142
|
+
)
|
|
2143
|
+
if match:
|
|
2144
|
+
return match.group(1)
|
|
2145
|
+
|
|
2146
|
+
# Look for command= pattern in the time-series name
|
|
2147
|
+
match = re.search(r"command=([^/]+)", ts_name)
|
|
2148
|
+
if match:
|
|
2149
|
+
return match.group(1)
|
|
2150
|
+
|
|
2151
|
+
# If no specific pattern found, try to extract from the end of the path
|
|
2152
|
+
# e.g., .../Ops/sec/GET -> GET
|
|
2153
|
+
parts = ts_name.split("/")
|
|
2154
|
+
if len(parts) > 0:
|
|
2155
|
+
return parts[-1]
|
|
2156
|
+
return "unknown"
+
+        # Group time-series by command
+        baseline_by_command = {}
+        comparison_by_command = {}
+
+        for ts in baseline_latency_ts:
+            cmd = extract_command_from_ts(ts)
+            if verbose:
+                logging.info(f"Baseline time-series '{ts}' -> command '{cmd}'")
+            if cmd not in baseline_by_command:
+                baseline_by_command[cmd] = []
+            baseline_by_command[cmd].append(ts)
+
+        for ts in comparison_latency_ts:
+            cmd = extract_command_from_ts(ts)
+            if verbose:
+                logging.info(f"Comparison time-series '{ts}' -> command '{cmd}'")
+            if cmd not in comparison_by_command:
+                comparison_by_command[cmd] = []
+            comparison_by_command[cmd].append(ts)
+
+        # Find common commands between baseline and comparison
+        common_commands = set(baseline_by_command.keys()) & set(
+            comparison_by_command.keys()
+        )
+
+        logging.info(f"Baseline commands found: {sorted(baseline_by_command.keys())}")
+        logging.info(
+            f"Comparison commands found: {sorted(comparison_by_command.keys())}"
+        )
+        logging.info(
+            f"Common commands for latency comparison: {sorted(common_commands)}"
+        )
+
+        if not common_commands:
+            logging.warning(
+                f"No common commands found for latency comparison in {test_name}"
+            )
+            return None, False, None
+
+        latency_notes = []
+        significant_latency_increases = (
+            0  # Track commands with significant latency increases
+        )
+        regression_details = {"test_name": test_name, "commands": []}
+
+        # Compare latency for each command individually
+        for command in sorted(common_commands):
+            logging.info(f"Analyzing latency for command: {command}")
+            baseline_ts_list = baseline_by_command[command]
+            comparison_ts_list = comparison_by_command[command]
+
+            logging.info(
+                f"  Command {command}: {len(baseline_ts_list)} baseline, {len(comparison_ts_list)} comparison time-series"
+            )
+
+            # If multiple time-series for the same command, try to get the best one
+            if len(baseline_ts_list) > 1:
+                logging.info(
+                    f"  Multiple baseline time-series for {command}, filtering..."
+                )
+                baseline_ts_list = get_only_Totals(baseline_ts_list)
+            if len(comparison_ts_list) > 1:
+                logging.info(
+                    f"  Multiple comparison time-series for {command}, filtering..."
+                )
+                comparison_ts_list = get_only_Totals(comparison_ts_list)
+
+            if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
+                logging.warning(
+                    f"  Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
+                )
+                continue
+
+            # Get latency data for this command
+            baseline_latency_data = []
+            comparison_latency_data = []
+
+            for ts_name in baseline_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                baseline_latency_data.extend(datapoints)
+
+            for ts_name in comparison_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                comparison_latency_data.extend(datapoints)
+
+            if len(baseline_latency_data) == 0 or len(comparison_latency_data) == 0:
+                logging.warning(
+                    f"  No latency data for {command}: baseline={len(baseline_latency_data)}, comparison={len(comparison_latency_data)} datapoints"
+                )
+                continue
+
+            logging.info(
+                f"  Command {command}: {len(baseline_latency_data)} baseline, {len(comparison_latency_data)} comparison datapoints"
+            )
+
+            # Calculate latency statistics for this command
+            baseline_latency_values = []
+            comparison_latency_values = []
+
+            (_, baseline_latency_median, _) = get_v_pct_change_and_largest_var(
+                baseline_latency_data,
+                0,
+                0,
+                baseline_latency_values,
+                0,
+                last_n_baseline,
+                verbose,
+                first_n_baseline,
+            )
+
+            (_, comparison_latency_median, _) = get_v_pct_change_and_largest_var(
+                comparison_latency_data,
+                0,
+                0,
+                comparison_latency_values,
+                0,
+                last_n_comparison,
+                verbose,
+                first_n_comparison,
+            )
+
+            if baseline_latency_median == "N/A" or comparison_latency_median == "N/A":
+                logging.warning(
+                    f"  Could not calculate median for {command}: baseline={baseline_latency_median}, comparison={comparison_latency_median}"
+                )
+                continue
+
+            # Calculate variance (coefficient of variation) for both baseline and comparison
+            baseline_latency_mean = (
+                statistics.mean(baseline_latency_values)
+                if baseline_latency_values
+                else 0
+            )
+            baseline_latency_stdev = (
+                statistics.stdev(baseline_latency_values)
+                if len(baseline_latency_values) > 1
+                else 0
+            )
+            baseline_latency_cv = (
+                (baseline_latency_stdev / baseline_latency_mean * 100)
+                if baseline_latency_mean > 0
+                else float("inf")
+            )
+
+            comparison_latency_mean = (
+                statistics.mean(comparison_latency_values)
+                if comparison_latency_values
+                else 0
+            )
+            comparison_latency_stdev = (
+                statistics.stdev(comparison_latency_values)
+                if len(comparison_latency_values) > 1
+                else 0
+            )
+            comparison_latency_cv = (
+                (comparison_latency_stdev / comparison_latency_mean * 100)
+                if comparison_latency_mean > 0
+                else float("inf")
+            )
+
+            # Calculate latency change (for latency, lower is better)
+            latency_change = (
+                float(comparison_latency_median) / float(baseline_latency_median) - 1
+            ) * 100.0
+
+            logging.info(
+                f"  Command {command}: baseline p50={baseline_latency_median:.2f} (CV={baseline_latency_cv:.1f}%), comparison p50={comparison_latency_median:.2f} (CV={comparison_latency_cv:.1f}%), change={latency_change:.1f}%"
+            )
+
+            # Check if latency data is too unstable to be reliable
+            latency_data_unstable = (
+                baseline_latency_cv > 50.0 or comparison_latency_cv > 50.0
+            )
+
+            if latency_data_unstable:
+                # Mark as unstable latency data
+                unstable_reason = []
+                if baseline_latency_cv > 50.0:
+                    unstable_reason.append(f"baseline CV={baseline_latency_cv:.1f}%")
+                if comparison_latency_cv > 50.0:
+                    unstable_reason.append(
+                        f"comparison CV={comparison_latency_cv:.1f}%"
+                    )
+
+                latency_notes.append(
+                    f"{command} p50 UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
+                )
+                logging.warning(
+                    f"  Command {command}: UNSTABLE latency data detected - {', '.join(unstable_reason)}"
+                )
+            elif (
+                abs(latency_change) > 5.0
+            ):  # Only report significant latency changes for stable data
+                direction = "increased" if latency_change > 0 else "decreased"
+
+                # Adjust significance threshold based on baseline variance
+                if baseline_latency_cv < 30.0:
+                    # Low variance - use standard threshold
+                    significance_threshold = 10.0
+                elif baseline_latency_cv < 50.0:
+                    # Moderate variance - require larger change
+                    significance_threshold = 15.0
+                else:
+                    # High variance - require much larger change
+                    significance_threshold = 25.0
+
+                latency_notes.append(
+                    f"{command} p50 {direction} {abs(latency_change):.1f}% (baseline CV={baseline_latency_cv:.1f}%)"
+                )
+                logging.info(
+                    f"  Command {command}: SIGNIFICANT latency change detected ({direction} {abs(latency_change):.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+                )
+
+                # Track significant latency increases (potential regression confirmation)
+                if latency_change > significance_threshold:
+                    significant_latency_increases += 1
+                    regression_details["commands"].append(
+                        {
+                            "command": command,
+                            "change_percent": latency_change,
+                            "direction": direction,
+                            "baseline_cv": baseline_latency_cv,
+                            "comparison_cv": comparison_latency_cv,
+                        }
+                    )
+                    logging.info(
+                        f"  Command {command}: CONFIRMS regression (change={latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
+                    )
+                else:
+                    logging.info(
+                        f"  Command {command}: Change below significance threshold (change={latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
+                    )
+            elif verbose:
+                latency_notes.append(
+                    f"{command} p50 stable (CV={baseline_latency_cv:.1f}%)"
+                )
+                logging.info(
+                    f"  Command {command}: latency stable (change={latency_change:.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+                )
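Stripped of the logging, the per-command gate in the loop above is a coefficient-of-variation check: a CV above 50% on either side marks the command unstable, changes under 5% are ignored, and the threshold for confirming a regression widens with baseline noise (10% / 15% / 25%). A minimal sketch under those same thresholds, using statistics.median in place of the pandas-based helper and purely illustrative numbers:

import statistics

def classify_latency_change(baseline_values, comparison_values):
    """Sketch of the CV/threshold gating above; thresholds mirror the diff."""
    def cv(values):
        mean = statistics.mean(values) if values else 0
        stdev = statistics.stdev(values) if len(values) > 1 else 0
        return (stdev / mean * 100) if mean > 0 else float("inf")

    baseline_cv, comparison_cv = cv(baseline_values), cv(comparison_values)
    change = (
        statistics.median(comparison_values) / statistics.median(baseline_values) - 1
    ) * 100.0

    if baseline_cv > 50.0 or comparison_cv > 50.0:
        return "unstable", change
    if abs(change) <= 5.0:
        return "stable", change
    threshold = 10.0 if baseline_cv < 30.0 else 15.0 if baseline_cv < 50.0 else 25.0
    return ("confirms regression" if change > threshold else "below threshold"), change

# Illustrative p50 samples in usec, not real benchmark data:
print(classify_latency_change([100, 102, 98, 101], [125, 130, 127, 128]))
# -> ('confirms regression', ~26.9): baseline CV is low, so the 10% threshold applies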
+
+        # Determine if latency confirms regression
+        confirms_regression = significant_latency_increases > 0
+
+        # Return combined latency notes
+        if latency_notes:
+            result = "; ".join(latency_notes)
+            logging.info(f"Latency check completed for {test_name}: {result}")
+            return (
+                result,
+                confirms_regression,
+                regression_details if confirms_regression else None,
+            )
+        else:
+            result = "p50 latency stable" if common_commands else None
+            logging.info(
+                f"Latency check completed for {test_name}: {result or 'no data'}"
+            )
+            return result, False, None
+
+    except Exception as e:
+        logging.error(f"Error checking latency for {test_name}: {e}")
+        return None, False, None
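Every exit path rebuilt above returns a (notes, confirms_regression, regression_details) triple. How the caller folds these into overall totals is outside this hunk, so the aggregation below is an assumption-labelled sketch, exercised on made-up triples:

def aggregate_latency_confirmations(results):
    # results: iterable of (notes, confirms_regression, details) triples shaped
    # like the return values above; this aggregation logic is assumed, not from the diff.
    total_confirmed = 0
    confirmed_details = []
    for _notes, confirms_regression, details in results:
        if confirms_regression and details is not None:
            total_confirmed += 1
            confirmed_details.append(details)
    return total_confirmed, confirmed_details

# Made-up triples for illustration only:
example_results = [
    (
        "GET p50 increased 12.3% (baseline CV=4.1%)",
        True,
        {"test_name": "example-get-test", "commands": [{"command": "GET", "change_percent": 12.3}]},
    ),
    ("SET p50 stable (CV=3.0%)", False, None),
]
print(aggregate_latency_confirmations(example_results))  # -> (1, [<GET details dict>])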
+
+
 def get_only_Totals(baseline_timeseries):
     logging.warning("\t\tTime-series: {}".format(", ".join(baseline_timeseries)))
     logging.info("Checking if Totals will reduce timeseries.")
@@ -998,6 +2426,37 @@ def get_only_Totals(baseline_timeseries):
     for ts_name in baseline_timeseries:
         if "Totals" in ts_name:
             new_base.append(ts_name)
+
+    # If no "Totals" time-series found, try to pick the best alternative
+    if len(new_base) == 0:
+        logging.warning(
+            "No 'Totals' time-series found, trying to pick best alternative."
+        )
+        # Prefer time-series without quotes in metric names
+        unquoted_series = [ts for ts in baseline_timeseries if "'" not in ts]
+        if unquoted_series:
+            new_base = unquoted_series
+        else:
+            # Fall back to original list
+            new_base = baseline_timeseries
+
+    # If we still have multiple time-series after filtering for "Totals",
+    # prefer the one without quotes in the metric name
+    if len(new_base) > 1:
+        logging.info("Multiple time-series found, preferring unquoted metric names.")
+        unquoted_series = [ts for ts in new_base if "'" not in ts]
+        if unquoted_series:
+            new_base = unquoted_series
+
+    # If we still have multiple, take the first one
+    if len(new_base) > 1:
+        logging.warning(
+            "Still multiple time-series after filtering, taking the first one: {}".format(
+                new_base[0]
+            )
+        )
+        new_base = [new_base[0]]
+
     baseline_timeseries = new_base
     return baseline_timeseries
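A quick usage sketch of the tie-breaking added above: prefer "Totals" series, then unquoted metric names, then fall back to the first remaining entry. The import path matches this package, but the candidate keys are hypothetical:

from redisbench_admin.compare.compare import get_only_Totals

# Hypothetical time-series keys:
candidates = [
    "ci.benchmarks/by.branch/main/GET/'p50'",
    "ci.benchmarks/by.branch/main/Totals/p50",
    "ci.benchmarks/by.branch/main/GET/p50",
]
print(get_only_Totals(candidates))
# Expected under the logic above: ['ci.benchmarks/by.branch/main/Totals/p50'],
# because a single 'Totals' match makes the unquoted/first-entry fallbacks unnecessary.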

@@ -1064,11 +2523,38 @@ def add_line(
     percentage_change,
     table,
     test_name,
+    grafana_link_base=None,
+    baseline_branch=None,
+    baseline_version=None,
+    comparison_branch=None,
+    comparison_version=None,
+    from_date=None,
+    to_date=None,
 ):
+    grafana_link = None
+    if grafana_link_base is not None:
+        grafana_link = "{}?orgId=1".format(grafana_link_base)
+        grafana_link += f"&var-test_case={test_name}"
+
+        if baseline_branch is not None:
+            grafana_link += f"&var-branch={baseline_branch}"
+        if baseline_version is not None:
+            grafana_link += f"&var-version={baseline_version}"
+        if comparison_branch is not None:
+            grafana_link += f"&var-branch={comparison_branch}"
+        if comparison_version is not None:
+            grafana_link += f"&var-version={comparison_version}"
+        grafana_link += "&from=now-30d&to=now"
+
+    # Create test name with optional Grafana link
+    test_name_display = test_name
+    if grafana_link is not None:
+        test_name_display = f"[{test_name}]({grafana_link})"
+
     percentage_change_str = "{:.1f}% ".format(percentage_change)
     table.append(
         [
-            test_name,
+            test_name_display,
             baseline_v_str,
             comparison_v_str,
             percentage_change_str,
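The new optional parameters above let add_line wrap the test name in a Markdown link to a Grafana dashboard. The query parameters (orgId, var-test_case, var-branch, var-version, from/to) are the ones concatenated in the diff; the base URL, branch names, and test name below are placeholders:

# Placeholder dashboard URL and test name; only the query-string shape mirrors the diff.
grafana_link_base = "https://example.grafana.net/d/abc123/per-test-dashboard"
test_name = "example-memtier-get-test"

grafana_link = "{}?orgId=1".format(grafana_link_base)
grafana_link += f"&var-test_case={test_name}"
grafana_link += "&var-branch=master"       # baseline_branch, when provided
grafana_link += "&var-branch=my-feature"   # comparison_branch, when provided
grafana_link += "&from=now-30d&to=now"

test_name_display = f"[{test_name}]({grafana_link})"
print(test_name_display)
# The comparison table row then shows the test name as a clickable Markdown link.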
@@ -1105,9 +2591,9 @@ def get_v_pct_change_and_largest_var(
             comparison_values.append(tuple[1])

         comparison_df = pd.DataFrame(comparison_values)
-        comparison_median = float(comparison_df.median())
+        comparison_median = float(comparison_df.median().iloc[0])
         comparison_v = comparison_median
-        comparison_std = float(comparison_df.std())
+        comparison_std = float(comparison_df.std().iloc[0])
         if verbose:
             logging.info(
                 "comparison_datapoints: {} value: {}; std-dev: {}; median: {}".format(