redisbench-admin 0.11.37__py3-none-any.whl → 0.11.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- redisbench_admin/compare/args.py +1 -1
- redisbench_admin/compare/compare.py +1496 -10
- redisbench_admin/deploy/deploy.py +1 -9
- redisbench_admin/export/export.py +1 -7
- redisbench_admin/profilers/perf.py +24 -24
- redisbench_admin/run/cluster.py +6 -0
- redisbench_admin/run/common.py +6 -24
- redisbench_admin/run_async/async_terraform.py +2 -10
- redisbench_admin/run_async/render_files.py +3 -3
- redisbench_admin/run_local/run_local.py +12 -12
- redisbench_admin/run_remote/remote_db.py +62 -23
- redisbench_admin/run_remote/remote_helpers.py +18 -5
- redisbench_admin/run_remote/run_remote.py +34 -13
- redisbench_admin/run_remote/standalone.py +136 -0
- redisbench_admin/run_remote/terraform.py +1 -5
- redisbench_admin/utils/remote.py +4 -7
- redisbench_admin/utils/utils.py +42 -24
- {redisbench_admin-0.11.37.dist-info → redisbench_admin-0.11.39.dist-info}/METADATA +1 -1
- {redisbench_admin-0.11.37.dist-info → redisbench_admin-0.11.39.dist-info}/RECORD +22 -22
- {redisbench_admin-0.11.37.dist-info → redisbench_admin-0.11.39.dist-info}/LICENSE +0 -0
- {redisbench_admin-0.11.37.dist-info → redisbench_admin-0.11.39.dist-info}/WHEEL +0 -0
- {redisbench_admin-0.11.37.dist-info → redisbench_admin-0.11.39.dist-info}/entry_points.txt +0 -0
|
@@ -13,6 +13,7 @@ from pytablewriter import MarkdownTableWriter
|
|
|
13
13
|
import humanize
|
|
14
14
|
import datetime as dt
|
|
15
15
|
import os
|
|
16
|
+
import statistics
|
|
16
17
|
from tqdm import tqdm
|
|
17
18
|
from github import Github
|
|
18
19
|
from slack_sdk.webhook import WebhookClient
|
|
@@ -270,6 +271,10 @@ def compare_command_logic(args, project_name, project_version):
|
|
|
270
271
|
total_stable,
|
|
271
272
|
total_unstable,
|
|
272
273
|
total_comparison_points,
|
|
274
|
+
total_unstable_baseline,
|
|
275
|
+
total_unstable_comparison,
|
|
276
|
+
total_latency_confirmed_regressions,
|
|
277
|
+
latency_confirmed_regression_details,
|
|
273
278
|
) = compute_regression_table(
|
|
274
279
|
rts,
|
|
275
280
|
tf_github_org,
|
|
@@ -303,6 +308,7 @@ def compare_command_logic(args, project_name, project_version):
|
|
|
303
308
|
comparison_architecture,
|
|
304
309
|
first_n_baseline,
|
|
305
310
|
first_n_comparison,
|
|
311
|
+
grafana_link_base,
|
|
306
312
|
)
|
|
307
313
|
comment_body = ""
|
|
308
314
|
if total_comparison_points > 0:
|
|
@@ -321,11 +327,63 @@ def compare_command_logic(args, project_name, project_version):
|
|
|
321
327
|
)
|
|
322
328
|
|
|
323
329
|
if total_unstable > 0:
|
|
330
|
+
unstable_details = []
|
|
331
|
+
if total_unstable_baseline > 0:
|
|
332
|
+
unstable_details.append(f"{total_unstable_baseline} baseline")
|
|
333
|
+
if total_unstable_comparison > 0:
|
|
334
|
+
unstable_details.append(f"{total_unstable_comparison} comparison")
|
|
335
|
+
|
|
336
|
+
unstable_breakdown = (
|
|
337
|
+
" (" + ", ".join(unstable_details) + ")" if unstable_details else ""
|
|
338
|
+
)
|
|
324
339
|
comparison_summary += (
|
|
325
|
-
"- Detected a total of {} highly unstable benchmarks.\n".format(
|
|
326
|
-
total_unstable
|
|
340
|
+
"- Detected a total of {} highly unstable benchmarks{}.\n".format(
|
|
341
|
+
total_unstable, unstable_breakdown
|
|
327
342
|
)
|
|
328
343
|
)
|
|
344
|
+
|
|
345
|
+
# Add latency confirmation summary if applicable
|
|
346
|
+
if total_latency_confirmed_regressions > 0:
|
|
347
|
+
comparison_summary += "- Latency analysis confirmed regressions in {} of the unstable tests:\n".format(
|
|
348
|
+
total_latency_confirmed_regressions
|
|
349
|
+
)
|
|
350
|
+
|
|
351
|
+
# Add detailed breakdown as bullet points with test links
|
|
352
|
+
if latency_confirmed_regression_details:
|
|
353
|
+
for detail in latency_confirmed_regression_details:
|
|
354
|
+
test_name = detail["test_name"]
|
|
355
|
+
commands_info = []
|
|
356
|
+
for cmd_detail in detail["commands"]:
|
|
357
|
+
commands_info.append(
|
|
358
|
+
f"{cmd_detail['command']} +{cmd_detail['change_percent']:.1f}%"
|
|
359
|
+
)
|
|
360
|
+
|
|
361
|
+
if commands_info:
|
|
362
|
+
# Create test link if grafana_link_base is available
|
|
363
|
+
test_display_name = test_name
|
|
364
|
+
if grafana_link_base is not None:
|
|
365
|
+
grafana_test_link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
|
|
366
|
+
if baseline_branch is not None:
|
|
367
|
+
grafana_test_link += (
|
|
368
|
+
f"&var-branch={baseline_branch}"
|
|
369
|
+
)
|
|
370
|
+
if comparison_branch is not None:
|
|
371
|
+
grafana_test_link += (
|
|
372
|
+
f"&var-branch={comparison_branch}"
|
|
373
|
+
)
|
|
374
|
+
grafana_test_link += "&from=now-30d&to=now"
|
|
375
|
+
test_display_name = (
|
|
376
|
+
f"[{test_name}]({grafana_test_link})"
|
|
377
|
+
)
|
|
378
|
+
|
|
379
|
+
# Add confidence indicator if available
|
|
380
|
+
confidence_indicator = ""
|
|
381
|
+
if "high_confidence" in detail:
|
|
382
|
+
confidence_indicator = (
|
|
383
|
+
" 🔴" if detail["high_confidence"] else " ⚠️"
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
comparison_summary += f" - {test_display_name}: {', '.join(commands_info)}{confidence_indicator}\n"
|
|
329
387
|
if total_improvements > 0:
|
|
330
388
|
comparison_summary += "- Detected a total of {} improvements above the improvement water line.\n".format(
|
|
331
389
|
total_improvements
|
|
@@ -484,6 +542,9 @@ def compare_command_logic(args, project_name, project_version):
|
|
|
484
542
|
total_stable,
|
|
485
543
|
total_unstable,
|
|
486
544
|
total_comparison_points,
|
|
545
|
+
total_unstable_baseline,
|
|
546
|
+
total_unstable_comparison,
|
|
547
|
+
total_latency_confirmed_regressions,
|
|
487
548
|
)
|
|
488
549
|
|
|
489
550
|
|
|
@@ -531,6 +592,7 @@ def compute_regression_table(
|
|
|
531
592
|
comparison_architecture=ARCH_X86,
|
|
532
593
|
first_n_baseline=-1,
|
|
533
594
|
first_n_comparison=-1,
|
|
595
|
+
grafana_link_base=None,
|
|
534
596
|
):
|
|
535
597
|
START_TIME_NOW_UTC, _, _ = get_start_time_vars()
|
|
536
598
|
START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
|
|
@@ -593,6 +655,10 @@ def compute_regression_table(
|
|
|
593
655
|
total_stable,
|
|
594
656
|
total_unstable,
|
|
595
657
|
total_comparison_points,
|
|
658
|
+
total_unstable_baseline,
|
|
659
|
+
total_unstable_comparison,
|
|
660
|
+
total_latency_confirmed_regressions,
|
|
661
|
+
latency_confirmed_regression_details,
|
|
596
662
|
) = from_rts_to_regression_table(
|
|
597
663
|
baseline_deployment_name,
|
|
598
664
|
comparison_deployment_name,
|
|
@@ -621,14 +687,97 @@ def compute_regression_table(
|
|
|
621
687
|
comparison_architecture,
|
|
622
688
|
first_n_baseline,
|
|
623
689
|
first_n_comparison,
|
|
690
|
+
grafana_link_base,
|
|
691
|
+
baseline_branch,
|
|
692
|
+
baseline_tag,
|
|
693
|
+
comparison_branch,
|
|
694
|
+
comparison_tag,
|
|
695
|
+
from_date,
|
|
696
|
+
to_date,
|
|
624
697
|
)
|
|
625
698
|
logging.info(
|
|
626
699
|
"Printing differential analysis between {} and {}".format(
|
|
627
700
|
baseline_str, comparison_str
|
|
628
701
|
)
|
|
629
702
|
)
|
|
630
|
-
|
|
631
|
-
|
|
703
|
+
|
|
704
|
+
# Split table into improvements, regressions, and no-changes
|
|
705
|
+
improvements_table = []
|
|
706
|
+
regressions_table = []
|
|
707
|
+
no_changes_table = []
|
|
708
|
+
|
|
709
|
+
for row in table:
|
|
710
|
+
# Check if there's a meaningful change (not stable/unstable)
|
|
711
|
+
note = row[4].lower() if len(row) > 4 else ""
|
|
712
|
+
percentage_str = row[3] if len(row) > 3 else "0.0%"
|
|
713
|
+
|
|
714
|
+
# Extract percentage value
|
|
715
|
+
try:
|
|
716
|
+
percentage_val = float(percentage_str.replace("%", "").strip())
|
|
717
|
+
except:
|
|
718
|
+
percentage_val = 0.0
|
|
719
|
+
|
|
720
|
+
# Categorize based on change type
|
|
721
|
+
if "improvement" in note and "potential" not in note:
|
|
722
|
+
# Only actual improvements, not potential ones
|
|
723
|
+
improvements_table.append(row)
|
|
724
|
+
elif ("regression" in note and "potential" not in note) or "unstable" in note:
|
|
725
|
+
# Only actual regressions, not potential ones, plus unstable tests
|
|
726
|
+
regressions_table.append(row)
|
|
727
|
+
elif "no change" in note or "potential" in note:
|
|
728
|
+
# No changes and potential changes (below significance threshold)
|
|
729
|
+
no_changes_table.append(row)
|
|
730
|
+
elif abs(percentage_val) > 3.0: # Significant changes based on percentage
|
|
731
|
+
if (percentage_val > 0 and metric_mode == "higher-better") or (
|
|
732
|
+
percentage_val < 0 and metric_mode == "lower-better"
|
|
733
|
+
):
|
|
734
|
+
improvements_table.append(row)
|
|
735
|
+
else:
|
|
736
|
+
regressions_table.append(row)
|
|
737
|
+
else:
|
|
738
|
+
no_changes_table.append(row)
|
|
739
|
+
|
|
740
|
+
# Sort tables by percentage change
|
|
741
|
+
def get_percentage_value(row):
|
|
742
|
+
"""Extract percentage value from row for sorting"""
|
|
743
|
+
try:
|
|
744
|
+
percentage_str = row[3] if len(row) > 3 else "0.0%"
|
|
745
|
+
return float(percentage_str.replace("%", "").strip())
|
|
746
|
+
except:
|
|
747
|
+
return 0.0
|
|
748
|
+
|
|
749
|
+
# Sort improvements by percentage change (highest first)
|
|
750
|
+
improvements_table.sort(key=get_percentage_value, reverse=True)
|
|
751
|
+
|
|
752
|
+
# Sort regressions by percentage change (most negative first for higher-better, most positive first for lower-better)
|
|
753
|
+
if metric_mode == "higher-better":
|
|
754
|
+
# For higher-better metrics, most negative changes are worst regressions
|
|
755
|
+
regressions_table.sort(key=get_percentage_value)
|
|
756
|
+
else:
|
|
757
|
+
# For lower-better metrics, most positive changes are worst regressions
|
|
758
|
+
regressions_table.sort(key=get_percentage_value, reverse=True)
|
|
759
|
+
|
|
760
|
+
# Create improvements table (visible)
|
|
761
|
+
improvements_writer = MarkdownTableWriter(
|
|
762
|
+
table_name="Performance Improvements - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
|
|
763
|
+
baseline_str,
|
|
764
|
+
comparison_str,
|
|
765
|
+
from_human_str,
|
|
766
|
+
baseline_deployment_name,
|
|
767
|
+
),
|
|
768
|
+
headers=[
|
|
769
|
+
"Test Case",
|
|
770
|
+
"Baseline {} (median obs. +- std.dev)".format(baseline_str),
|
|
771
|
+
"Comparison {} (median obs. +- std.dev)".format(comparison_str),
|
|
772
|
+
"% change ({})".format(metric_mode),
|
|
773
|
+
"Note",
|
|
774
|
+
],
|
|
775
|
+
value_matrix=improvements_table,
|
|
776
|
+
)
|
|
777
|
+
|
|
778
|
+
# Create regressions table (visible)
|
|
779
|
+
regressions_writer = MarkdownTableWriter(
|
|
780
|
+
table_name="Performance Regressions and Issues - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
|
|
632
781
|
baseline_str,
|
|
633
782
|
comparison_str,
|
|
634
783
|
from_human_str,
|
|
@@ -641,8 +790,22 @@ def compute_regression_table(
|
|
|
641
790
|
"% change ({})".format(metric_mode),
|
|
642
791
|
"Note",
|
|
643
792
|
],
|
|
644
|
-
value_matrix=
|
|
793
|
+
value_matrix=regressions_table,
|
|
645
794
|
)
|
|
795
|
+
|
|
796
|
+
# Create no-changes table (hidden in markdown)
|
|
797
|
+
no_changes_writer = MarkdownTableWriter(
|
|
798
|
+
table_name="Tests with No Significant Changes",
|
|
799
|
+
headers=[
|
|
800
|
+
"Test Case",
|
|
801
|
+
"Baseline {} (median obs. +- std.dev)".format(baseline_str),
|
|
802
|
+
"Comparison {} (median obs. +- std.dev)".format(comparison_str),
|
|
803
|
+
"% change ({})".format(metric_mode),
|
|
804
|
+
"Note",
|
|
805
|
+
],
|
|
806
|
+
value_matrix=no_changes_table,
|
|
807
|
+
)
|
|
808
|
+
|
|
646
809
|
table_output = ""
|
|
647
810
|
|
|
648
811
|
from io import StringIO
|
|
@@ -651,7 +814,25 @@ def compute_regression_table(
|
|
|
651
814
|
old_stdout = sys.stdout
|
|
652
815
|
sys.stdout = mystdout = StringIO()
|
|
653
816
|
|
|
654
|
-
|
|
817
|
+
# Output improvements table first (if any)
|
|
818
|
+
if improvements_table:
|
|
819
|
+
improvements_writer.dump(mystdout, False)
|
|
820
|
+
mystdout.write("\n\n")
|
|
821
|
+
|
|
822
|
+
# Output regressions table (if any)
|
|
823
|
+
if regressions_table:
|
|
824
|
+
regressions_writer.dump(mystdout, False)
|
|
825
|
+
mystdout.write("\n\n")
|
|
826
|
+
|
|
827
|
+
# Add hidden no-changes table
|
|
828
|
+
if no_changes_table:
|
|
829
|
+
mystdout.write(
|
|
830
|
+
"<details>\n<summary>Tests with No Significant Changes ({} tests)</summary>\n\n".format(
|
|
831
|
+
len(no_changes_table)
|
|
832
|
+
)
|
|
833
|
+
)
|
|
834
|
+
no_changes_writer.dump(mystdout, False)
|
|
835
|
+
mystdout.write("\n</details>\n")
|
|
655
836
|
|
|
656
837
|
sys.stdout = old_stdout
|
|
657
838
|
|
|
@@ -665,6 +846,10 @@ def compute_regression_table(
|
|
|
665
846
|
total_stable,
|
|
666
847
|
total_unstable,
|
|
667
848
|
total_comparison_points,
|
|
849
|
+
total_unstable_baseline,
|
|
850
|
+
total_unstable_comparison,
|
|
851
|
+
total_latency_confirmed_regressions,
|
|
852
|
+
latency_confirmed_regression_details,
|
|
668
853
|
)
|
|
669
854
|
|
|
670
855
|
|
|
@@ -752,6 +937,13 @@ def from_rts_to_regression_table(
|
|
|
752
937
|
comparison_architecture=ARCH_X86,
|
|
753
938
|
first_n_baseline=-1,
|
|
754
939
|
first_n_comparison=-1,
|
|
940
|
+
grafana_link_base=None,
|
|
941
|
+
baseline_branch=None,
|
|
942
|
+
baseline_tag=None,
|
|
943
|
+
comparison_branch=None,
|
|
944
|
+
comparison_tag=None,
|
|
945
|
+
from_date=None,
|
|
946
|
+
to_date=None,
|
|
755
947
|
):
|
|
756
948
|
print_all = print_regressions_only is False and print_improvements_only is False
|
|
757
949
|
table = []
|
|
@@ -759,8 +951,12 @@ def from_rts_to_regression_table(
|
|
|
759
951
|
total_improvements = 0
|
|
760
952
|
total_stable = 0
|
|
761
953
|
total_unstable = 0
|
|
954
|
+
total_unstable_baseline = 0
|
|
955
|
+
total_unstable_comparison = 0
|
|
762
956
|
total_regressions = 0
|
|
763
957
|
total_comparison_points = 0
|
|
958
|
+
total_latency_confirmed_regressions = 0
|
|
959
|
+
latency_confirmed_regression_details = [] # Track specific test details
|
|
764
960
|
noise_waterline = 3
|
|
765
961
|
progress = tqdm(unit="benchmark time-series", total=len(test_names))
|
|
766
962
|
for test_name in test_names:
|
|
@@ -898,10 +1094,243 @@ def from_rts_to_regression_table(
|
|
|
898
1094
|
logging.error("Detected a ZeroDivisionError. {}".format(e.__str__()))
|
|
899
1095
|
pass
|
|
900
1096
|
unstable = False
|
|
1097
|
+
unstable_baseline = False
|
|
1098
|
+
unstable_comparison = False
|
|
1099
|
+
latency_confirms_regression = False
|
|
1100
|
+
|
|
901
1101
|
if baseline_v != "N/A" and comparison_v != "N/A":
|
|
902
1102
|
if comparison_pct_change > 10.0 or baseline_pct_change > 10.0:
|
|
903
|
-
note = "UNSTABLE (very high variance)"
|
|
904
1103
|
unstable = True
|
|
1104
|
+
unstable_baseline = baseline_pct_change > 10.0
|
|
1105
|
+
unstable_comparison = comparison_pct_change > 10.0
|
|
1106
|
+
|
|
1107
|
+
# Build detailed unstable note
|
|
1108
|
+
unstable_parts = []
|
|
1109
|
+
if unstable_baseline and unstable_comparison:
|
|
1110
|
+
unstable_parts.append(
|
|
1111
|
+
"UNSTABLE (baseline & comparison high variance)"
|
|
1112
|
+
)
|
|
1113
|
+
elif unstable_baseline:
|
|
1114
|
+
unstable_parts.append("UNSTABLE (baseline high variance)")
|
|
1115
|
+
elif unstable_comparison:
|
|
1116
|
+
unstable_parts.append("UNSTABLE (comparison high variance)")
|
|
1117
|
+
|
|
1118
|
+
note = unstable_parts[0]
|
|
1119
|
+
|
|
1120
|
+
# Log detailed warning about unstable data detection
|
|
1121
|
+
logging.warning(
|
|
1122
|
+
f"UNSTABLE DATA DETECTED for test '{test_name}': "
|
|
1123
|
+
f"baseline variance={baseline_pct_change:.1f}%, "
|
|
1124
|
+
f"comparison variance={comparison_pct_change:.1f}% "
|
|
1125
|
+
f"(threshold=10.0%)"
|
|
1126
|
+
)
|
|
1127
|
+
|
|
1128
|
+
# For throughput metrics (higher-better), check both server-side and client-side latency
|
|
1129
|
+
if metric_mode == "higher-better":
|
|
1130
|
+
logging.info(
|
|
1131
|
+
f"Performing 2nd-level latency validation for unstable throughput metric '{test_name}' "
|
|
1132
|
+
f"(metric_mode={metric_mode})"
|
|
1133
|
+
)
|
|
1134
|
+
|
|
1135
|
+
# Check server-side p50 latency
|
|
1136
|
+
(
|
|
1137
|
+
server_latency_note,
|
|
1138
|
+
server_confirms_regression,
|
|
1139
|
+
server_regression_details,
|
|
1140
|
+
) = check_latency_for_unstable_throughput(
|
|
1141
|
+
rts,
|
|
1142
|
+
test_name,
|
|
1143
|
+
baseline_str,
|
|
1144
|
+
comparison_str,
|
|
1145
|
+
by_str_baseline,
|
|
1146
|
+
by_str_comparison,
|
|
1147
|
+
baseline_deployment_name,
|
|
1148
|
+
comparison_deployment_name,
|
|
1149
|
+
tf_triggering_env,
|
|
1150
|
+
from_ts_ms,
|
|
1151
|
+
to_ts_ms,
|
|
1152
|
+
last_n_baseline,
|
|
1153
|
+
last_n_comparison,
|
|
1154
|
+
first_n_baseline,
|
|
1155
|
+
first_n_comparison,
|
|
1156
|
+
running_platform,
|
|
1157
|
+
baseline_architecture,
|
|
1158
|
+
comparison_architecture,
|
|
1159
|
+
verbose,
|
|
1160
|
+
)
|
|
1161
|
+
|
|
1162
|
+
# Check client-side latency metrics
|
|
1163
|
+
(
|
|
1164
|
+
client_latency_note,
|
|
1165
|
+
client_confirms_regression,
|
|
1166
|
+
client_regression_details,
|
|
1167
|
+
) = check_client_side_latency(
|
|
1168
|
+
rts,
|
|
1169
|
+
test_name,
|
|
1170
|
+
baseline_str,
|
|
1171
|
+
comparison_str,
|
|
1172
|
+
by_str_baseline,
|
|
1173
|
+
by_str_comparison,
|
|
1174
|
+
baseline_deployment_name,
|
|
1175
|
+
comparison_deployment_name,
|
|
1176
|
+
tf_triggering_env,
|
|
1177
|
+
from_ts_ms,
|
|
1178
|
+
to_ts_ms,
|
|
1179
|
+
last_n_baseline,
|
|
1180
|
+
last_n_comparison,
|
|
1181
|
+
first_n_baseline,
|
|
1182
|
+
first_n_comparison,
|
|
1183
|
+
running_platform,
|
|
1184
|
+
baseline_architecture,
|
|
1185
|
+
comparison_architecture,
|
|
1186
|
+
verbose,
|
|
1187
|
+
)
|
|
1188
|
+
|
|
1189
|
+
# Combine results from both server and client side
|
|
1190
|
+
combined_latency_notes = []
|
|
1191
|
+
if server_latency_note:
|
|
1192
|
+
combined_latency_notes.append(f"server: {server_latency_note}")
|
|
1193
|
+
if client_latency_note:
|
|
1194
|
+
combined_latency_notes.append(f"client: {client_latency_note}")
|
|
1195
|
+
|
|
1196
|
+
# Only confirm regression if BOTH server and client side show evidence AND data is stable enough
|
|
1197
|
+
# Check if either server or client data contains unstable indicators
|
|
1198
|
+
server_has_unstable = (
|
|
1199
|
+
server_latency_note and "UNSTABLE" in server_latency_note
|
|
1200
|
+
)
|
|
1201
|
+
client_has_unstable = (
|
|
1202
|
+
client_latency_note and "UNSTABLE" in client_latency_note
|
|
1203
|
+
)
|
|
1204
|
+
|
|
1205
|
+
# Don't confirm regression if either side has unstable data
|
|
1206
|
+
if server_has_unstable or client_has_unstable:
|
|
1207
|
+
both_confirm_regression = False
|
|
1208
|
+
unstable_sides = []
|
|
1209
|
+
if server_has_unstable:
|
|
1210
|
+
unstable_sides.append("server")
|
|
1211
|
+
if client_has_unstable:
|
|
1212
|
+
unstable_sides.append("client")
|
|
1213
|
+
blocked_note = f"regression blocked due to unstable {' and '.join(unstable_sides)} latency data"
|
|
1214
|
+
note += f"; {blocked_note}"
|
|
1215
|
+
logging.info(
|
|
1216
|
+
f"Blocking regression confirmation for '{test_name}' due to unstable latency data"
|
|
1217
|
+
)
|
|
1218
|
+
if server_has_unstable:
|
|
1219
|
+
logging.info(f" Server-side latency data is unstable")
|
|
1220
|
+
if client_has_unstable:
|
|
1221
|
+
logging.info(f" Client-side latency data is unstable")
|
|
1222
|
+
else:
|
|
1223
|
+
both_confirm_regression = (
|
|
1224
|
+
server_confirms_regression and client_confirms_regression
|
|
1225
|
+
)
|
|
1226
|
+
|
|
1227
|
+
if combined_latency_notes:
|
|
1228
|
+
combined_note = "; ".join(combined_latency_notes)
|
|
1229
|
+
note += f"; {combined_note}"
|
|
1230
|
+
logging.info(
|
|
1231
|
+
f"Combined latency check result for '{test_name}': {combined_note}"
|
|
1232
|
+
)
|
|
1233
|
+
|
|
1234
|
+
if both_confirm_regression:
|
|
1235
|
+
logging.info(
|
|
1236
|
+
f"BOTH server and client latency analysis CONFIRM regression for '{test_name}'"
|
|
1237
|
+
)
|
|
1238
|
+
|
|
1239
|
+
# Set the flag for counting confirmed regressions
|
|
1240
|
+
latency_confirms_regression = True
|
|
1241
|
+
|
|
1242
|
+
# Combine regression details from both server and client
|
|
1243
|
+
combined_regression_details = (
|
|
1244
|
+
server_regression_details or client_regression_details
|
|
1245
|
+
)
|
|
1246
|
+
if combined_regression_details:
|
|
1247
|
+
combined_regression_details[
|
|
1248
|
+
"server_side"
|
|
1249
|
+
] = server_confirms_regression
|
|
1250
|
+
combined_regression_details[
|
|
1251
|
+
"client_side"
|
|
1252
|
+
] = client_confirms_regression
|
|
1253
|
+
|
|
1254
|
+
# 2nd level confirmation is sufficient - always add to confirmed regressions
|
|
1255
|
+
logging.info(
|
|
1256
|
+
f"Adding '{test_name}' to confirmed regressions based on 2nd level validation"
|
|
1257
|
+
)
|
|
1258
|
+
|
|
1259
|
+
# Perform 3rd-level analysis: variance + p99 check for additional confidence scoring
|
|
1260
|
+
logging.info(
|
|
1261
|
+
f"Performing 3rd-level analysis (variance + p99) for confidence scoring on '{test_name}'"
|
|
1262
|
+
)
|
|
1263
|
+
(
|
|
1264
|
+
confidence_note,
|
|
1265
|
+
high_confidence,
|
|
1266
|
+
) = perform_variance_and_p99_analysis(
|
|
1267
|
+
rts,
|
|
1268
|
+
test_name,
|
|
1269
|
+
baseline_str,
|
|
1270
|
+
comparison_str,
|
|
1271
|
+
by_str_baseline,
|
|
1272
|
+
by_str_comparison,
|
|
1273
|
+
baseline_deployment_name,
|
|
1274
|
+
comparison_deployment_name,
|
|
1275
|
+
tf_triggering_env,
|
|
1276
|
+
from_ts_ms,
|
|
1277
|
+
to_ts_ms,
|
|
1278
|
+
last_n_baseline,
|
|
1279
|
+
last_n_comparison,
|
|
1280
|
+
first_n_baseline,
|
|
1281
|
+
first_n_comparison,
|
|
1282
|
+
running_platform,
|
|
1283
|
+
baseline_architecture,
|
|
1284
|
+
comparison_architecture,
|
|
1285
|
+
verbose,
|
|
1286
|
+
)
|
|
1287
|
+
|
|
1288
|
+
if confidence_note:
|
|
1289
|
+
note += f"; {confidence_note}"
|
|
1290
|
+
logging.info(
|
|
1291
|
+
f"Confidence analysis for '{test_name}': {confidence_note}"
|
|
1292
|
+
)
|
|
1293
|
+
# Use 3rd level confidence if available
|
|
1294
|
+
combined_regression_details[
|
|
1295
|
+
"high_confidence"
|
|
1296
|
+
] = high_confidence
|
|
1297
|
+
else:
|
|
1298
|
+
# No 3rd level data available - default to moderate confidence since 2nd level confirmed
|
|
1299
|
+
logging.info(
|
|
1300
|
+
f"No 3rd level data available for '{test_name}' - using 2nd level confirmation"
|
|
1301
|
+
)
|
|
1302
|
+
combined_regression_details[
|
|
1303
|
+
"high_confidence"
|
|
1304
|
+
] = True # 2nd level confirmation is reliable
|
|
1305
|
+
|
|
1306
|
+
# Always add to confirmed regressions when 2nd level confirms
|
|
1307
|
+
latency_confirmed_regression_details.append(
|
|
1308
|
+
combined_regression_details
|
|
1309
|
+
)
|
|
1310
|
+
elif server_confirms_regression or client_confirms_regression:
|
|
1311
|
+
side_confirmed = (
|
|
1312
|
+
"server" if server_confirms_regression else "client"
|
|
1313
|
+
)
|
|
1314
|
+
side_not_confirmed = (
|
|
1315
|
+
"client" if server_confirms_regression else "server"
|
|
1316
|
+
)
|
|
1317
|
+
insufficient_evidence_note = f"only {side_confirmed} side confirms regression ({side_not_confirmed} side stable) - insufficient evidence"
|
|
1318
|
+
note += f"; {insufficient_evidence_note}"
|
|
1319
|
+
logging.info(
|
|
1320
|
+
f"Only {side_confirmed} side confirms regression for '{test_name}' - insufficient evidence"
|
|
1321
|
+
)
|
|
1322
|
+
else:
|
|
1323
|
+
no_regression_note = (
|
|
1324
|
+
"neither server nor client side confirms regression"
|
|
1325
|
+
)
|
|
1326
|
+
note += f"; {no_regression_note}"
|
|
1327
|
+
logging.info(
|
|
1328
|
+
f"Neither server nor client side confirms regression for '{test_name}'"
|
|
1329
|
+
)
|
|
1330
|
+
else:
|
|
1331
|
+
logging.info(
|
|
1332
|
+
f"No latency data available for secondary check on '{test_name}'"
|
|
1333
|
+
)
|
|
905
1334
|
|
|
906
1335
|
baseline_v_str = prepare_value_str(
|
|
907
1336
|
baseline_pct_change, baseline_v, baseline_values, simplify_table
|
|
@@ -956,6 +1385,12 @@ def from_rts_to_regression_table(
|
|
|
956
1385
|
|
|
957
1386
|
if unstable:
|
|
958
1387
|
total_unstable += 1
|
|
1388
|
+
if unstable_baseline:
|
|
1389
|
+
total_unstable_baseline += 1
|
|
1390
|
+
if unstable_comparison:
|
|
1391
|
+
total_unstable_comparison += 1
|
|
1392
|
+
if latency_confirms_regression:
|
|
1393
|
+
total_latency_confirmed_regressions += 1
|
|
959
1394
|
|
|
960
1395
|
should_add_line = False
|
|
961
1396
|
if print_regressions_only and detected_regression:
|
|
@@ -976,6 +1411,13 @@ def from_rts_to_regression_table(
|
|
|
976
1411
|
percentage_change,
|
|
977
1412
|
table,
|
|
978
1413
|
test_name,
|
|
1414
|
+
grafana_link_base,
|
|
1415
|
+
baseline_branch,
|
|
1416
|
+
baseline_tag,
|
|
1417
|
+
comparison_branch,
|
|
1418
|
+
comparison_tag,
|
|
1419
|
+
from_date,
|
|
1420
|
+
to_date,
|
|
979
1421
|
)
|
|
980
1422
|
return (
|
|
981
1423
|
detected_regressions,
|
|
@@ -985,9 +1427,995 @@ def from_rts_to_regression_table(
|
|
|
985
1427
|
total_stable,
|
|
986
1428
|
total_unstable,
|
|
987
1429
|
total_comparison_points,
|
|
1430
|
+
total_unstable_baseline,
|
|
1431
|
+
total_unstable_comparison,
|
|
1432
|
+
total_latency_confirmed_regressions,
|
|
1433
|
+
latency_confirmed_regression_details,
|
|
988
1434
|
)
|
|
989
1435
|
|
|
990
1436
|
|
|
1437
|
+
def check_client_side_latency(
|
|
1438
|
+
rts,
|
|
1439
|
+
test_name,
|
|
1440
|
+
baseline_str,
|
|
1441
|
+
comparison_str,
|
|
1442
|
+
by_str_baseline,
|
|
1443
|
+
by_str_comparison,
|
|
1444
|
+
baseline_deployment_name,
|
|
1445
|
+
comparison_deployment_name,
|
|
1446
|
+
tf_triggering_env,
|
|
1447
|
+
from_ts_ms,
|
|
1448
|
+
to_ts_ms,
|
|
1449
|
+
last_n_baseline,
|
|
1450
|
+
last_n_comparison,
|
|
1451
|
+
first_n_baseline,
|
|
1452
|
+
first_n_comparison,
|
|
1453
|
+
running_platform,
|
|
1454
|
+
baseline_architecture,
|
|
1455
|
+
comparison_architecture,
|
|
1456
|
+
verbose=False,
|
|
1457
|
+
):
|
|
1458
|
+
"""
|
|
1459
|
+
Check client-side latency metrics to provide additional validation for regression detection.
|
|
1460
|
+
|
|
1461
|
+
Returns:
|
|
1462
|
+
tuple: (note_string, confirms_regression_bool, regression_details_dict)
|
|
1463
|
+
"""
|
|
1464
|
+
logging.info(f"Starting client-side latency check for test: {test_name}")
|
|
1465
|
+
try:
|
|
1466
|
+
# Client-side latency metrics to check
|
|
1467
|
+
client_metrics = [
|
|
1468
|
+
"p50_latency_ms",
|
|
1469
|
+
"Latency",
|
|
1470
|
+
"OverallQuantiles.allCommands.q50",
|
|
1471
|
+
"Tests.INSERT.AverageLatency_us_",
|
|
1472
|
+
"Tests.READ.AverageLatency_us_",
|
|
1473
|
+
"Tests.SEARCH.AverageLatency_us_",
|
|
1474
|
+
"Tests.UPDATE.AverageLatency_us_",
|
|
1475
|
+
]
|
|
1476
|
+
|
|
1477
|
+
client_latency_notes = []
|
|
1478
|
+
significant_client_latency_increases = 0
|
|
1479
|
+
regression_details = {"test_name": test_name, "commands": []}
|
|
1480
|
+
|
|
1481
|
+
for metric in client_metrics:
|
|
1482
|
+
# Build filters for client-side latency metric
|
|
1483
|
+
filters_baseline = [
|
|
1484
|
+
f"{by_str_baseline}={baseline_str}",
|
|
1485
|
+
f"metric={metric}",
|
|
1486
|
+
f"test_name={test_name}",
|
|
1487
|
+
f"deployment_name={baseline_deployment_name}",
|
|
1488
|
+
f"triggering_env={tf_triggering_env}",
|
|
1489
|
+
]
|
|
1490
|
+
filters_comparison = [
|
|
1491
|
+
f"{by_str_comparison}={comparison_str}",
|
|
1492
|
+
f"metric={metric}",
|
|
1493
|
+
f"test_name={test_name}",
|
|
1494
|
+
f"deployment_name={comparison_deployment_name}",
|
|
1495
|
+
f"triggering_env={tf_triggering_env}",
|
|
1496
|
+
]
|
|
1497
|
+
|
|
1498
|
+
# Add optional filters
|
|
1499
|
+
if running_platform is not None:
|
|
1500
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
1501
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
1502
|
+
if baseline_architecture != ARCH_X86:
|
|
1503
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
1504
|
+
if comparison_architecture != ARCH_X86:
|
|
1505
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
1506
|
+
|
|
1507
|
+
# Query for client-side latency time-series
|
|
1508
|
+
baseline_client_ts = rts.ts().queryindex(filters_baseline)
|
|
1509
|
+
comparison_client_ts = rts.ts().queryindex(filters_comparison)
|
|
1510
|
+
|
|
1511
|
+
if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
|
|
1512
|
+
if verbose:
|
|
1513
|
+
logging.info(
|
|
1514
|
+
f" No client-side data found for metric '{metric}' in {test_name}"
|
|
1515
|
+
)
|
|
1516
|
+
continue
|
|
1517
|
+
|
|
1518
|
+
logging.info(
|
|
1519
|
+
f" Found client-side metric '{metric}': {len(baseline_client_ts)} baseline, {len(comparison_client_ts)} comparison time-series"
|
|
1520
|
+
)
|
|
1521
|
+
|
|
1522
|
+
# Filter out target time-series
|
|
1523
|
+
baseline_client_ts = [ts for ts in baseline_client_ts if "target" not in ts]
|
|
1524
|
+
comparison_client_ts = [
|
|
1525
|
+
ts for ts in comparison_client_ts if "target" not in ts
|
|
1526
|
+
]
|
|
1527
|
+
|
|
1528
|
+
if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
|
|
1529
|
+
continue
|
|
1530
|
+
|
|
1531
|
+
# Use the first available time-series for each side
|
|
1532
|
+
baseline_ts = baseline_client_ts[0]
|
|
1533
|
+
comparison_ts = comparison_client_ts[0]
|
|
1534
|
+
|
|
1535
|
+
# Get client-side latency data
|
|
1536
|
+
baseline_client_data = rts.ts().revrange(baseline_ts, from_ts_ms, to_ts_ms)
|
|
1537
|
+
comparison_client_data = rts.ts().revrange(
|
|
1538
|
+
comparison_ts, from_ts_ms, to_ts_ms
|
|
1539
|
+
)
|
|
1540
|
+
|
|
1541
|
+
if len(baseline_client_data) == 0 or len(comparison_client_data) == 0:
|
|
1542
|
+
if verbose:
|
|
1543
|
+
logging.info(
|
|
1544
|
+
f" No data points for metric '{metric}': baseline={len(baseline_client_data)}, comparison={len(comparison_client_data)}"
|
|
1545
|
+
)
|
|
1546
|
+
continue
|
|
1547
|
+
|
|
1548
|
+
# Calculate client-side latency statistics
|
|
1549
|
+
baseline_client_values = []
|
|
1550
|
+
comparison_client_values = []
|
|
1551
|
+
|
|
1552
|
+
(_, baseline_client_median, _) = get_v_pct_change_and_largest_var(
|
|
1553
|
+
baseline_client_data,
|
|
1554
|
+
0,
|
|
1555
|
+
0,
|
|
1556
|
+
baseline_client_values,
|
|
1557
|
+
0,
|
|
1558
|
+
last_n_baseline,
|
|
1559
|
+
verbose,
|
|
1560
|
+
first_n_baseline,
|
|
1561
|
+
)
|
|
1562
|
+
|
|
1563
|
+
(_, comparison_client_median, _) = get_v_pct_change_and_largest_var(
|
|
1564
|
+
comparison_client_data,
|
|
1565
|
+
0,
|
|
1566
|
+
0,
|
|
1567
|
+
comparison_client_values,
|
|
1568
|
+
0,
|
|
1569
|
+
last_n_comparison,
|
|
1570
|
+
verbose,
|
|
1571
|
+
first_n_comparison,
|
|
1572
|
+
)
|
|
1573
|
+
|
|
1574
|
+
if baseline_client_median == "N/A" or comparison_client_median == "N/A":
|
|
1575
|
+
if verbose:
|
|
1576
|
+
logging.info(
|
|
1577
|
+
f" Could not calculate median for metric '{metric}': baseline={baseline_client_median}, comparison={comparison_client_median}"
|
|
1578
|
+
)
|
|
1579
|
+
continue
|
|
1580
|
+
|
|
1581
|
+
# Calculate variance (coefficient of variation) for both baseline and comparison
|
|
1582
|
+
baseline_client_mean = (
|
|
1583
|
+
statistics.mean(baseline_client_values) if baseline_client_values else 0
|
|
1584
|
+
)
|
|
1585
|
+
baseline_client_stdev = (
|
|
1586
|
+
statistics.stdev(baseline_client_values)
|
|
1587
|
+
if len(baseline_client_values) > 1
|
|
1588
|
+
else 0
|
|
1589
|
+
)
|
|
1590
|
+
baseline_client_cv = (
|
|
1591
|
+
(baseline_client_stdev / baseline_client_mean * 100)
|
|
1592
|
+
if baseline_client_mean > 0
|
|
1593
|
+
else float("inf")
|
|
1594
|
+
)
|
|
1595
|
+
|
|
1596
|
+
comparison_client_mean = (
|
|
1597
|
+
statistics.mean(comparison_client_values)
|
|
1598
|
+
if comparison_client_values
|
|
1599
|
+
else 0
|
|
1600
|
+
)
|
|
1601
|
+
comparison_client_stdev = (
|
|
1602
|
+
statistics.stdev(comparison_client_values)
|
|
1603
|
+
if len(comparison_client_values) > 1
|
|
1604
|
+
else 0
|
|
1605
|
+
)
|
|
1606
|
+
comparison_client_cv = (
|
|
1607
|
+
(comparison_client_stdev / comparison_client_mean * 100)
|
|
1608
|
+
if comparison_client_mean > 0
|
|
1609
|
+
else float("inf")
|
|
1610
|
+
)
|
|
1611
|
+
|
|
1612
|
+
# Calculate client-side latency change (for latency, higher is worse)
|
|
1613
|
+
client_latency_change = (
|
|
1614
|
+
float(comparison_client_median) / float(baseline_client_median) - 1
|
|
1615
|
+
) * 100.0
|
|
1616
|
+
|
|
1617
|
+
logging.info(
|
|
1618
|
+
f" Client metric '{metric}': baseline={baseline_client_median:.2f} (CV={baseline_client_cv:.1f}%), comparison={comparison_client_median:.2f} (CV={comparison_client_cv:.1f}%), change={client_latency_change:.1f}%"
|
|
1619
|
+
)
|
|
1620
|
+
|
|
1621
|
+
# Check if client latency data is too unstable to be reliable
|
|
1622
|
+
client_data_unstable = (
|
|
1623
|
+
baseline_client_cv > 50.0 or comparison_client_cv > 50.0
|
|
1624
|
+
)
|
|
1625
|
+
|
|
1626
|
+
if client_data_unstable:
|
|
1627
|
+
# Mark as unstable client latency data
|
|
1628
|
+
unstable_reason = []
|
|
1629
|
+
if baseline_client_cv > 50.0:
|
|
1630
|
+
unstable_reason.append(f"baseline CV={baseline_client_cv:.1f}%")
|
|
1631
|
+
if comparison_client_cv > 50.0:
|
|
1632
|
+
unstable_reason.append(f"comparison CV={comparison_client_cv:.1f}%")
|
|
1633
|
+
|
|
1634
|
+
client_latency_notes.append(
|
|
1635
|
+
f"{metric} UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
|
|
1636
|
+
)
|
|
1637
|
+
logging.warning(
|
|
1638
|
+
f" Client metric '{metric}': UNSTABLE latency data detected - {', '.join(unstable_reason)}"
|
|
1639
|
+
)
|
|
1640
|
+
elif (
|
|
1641
|
+
abs(client_latency_change) > 5.0
|
|
1642
|
+
): # Only report significant client latency changes for stable data
|
|
1643
|
+
direction = "increased" if client_latency_change > 0 else "decreased"
|
|
1644
|
+
|
|
1645
|
+
# Adjust significance threshold based on baseline variance
|
|
1646
|
+
if baseline_client_cv < 30.0:
|
|
1647
|
+
# Low variance - use standard threshold
|
|
1648
|
+
significance_threshold = 10.0
|
|
1649
|
+
elif baseline_client_cv < 50.0:
|
|
1650
|
+
# Moderate variance - require larger change
|
|
1651
|
+
significance_threshold = 15.0
|
|
1652
|
+
else:
|
|
1653
|
+
# High variance - require much larger change
|
|
1654
|
+
significance_threshold = 25.0
|
|
1655
|
+
|
|
1656
|
+
client_latency_notes.append(
|
|
1657
|
+
f"{metric} {direction} {abs(client_latency_change):.1f}% (baseline CV={baseline_client_cv:.1f}%)"
|
|
1658
|
+
)
|
|
1659
|
+
logging.info(
|
|
1660
|
+
f" Client metric '{metric}': SIGNIFICANT latency change detected ({direction} {abs(client_latency_change):.1f}%, baseline CV={baseline_client_cv:.1f}%)"
|
|
1661
|
+
)
|
|
1662
|
+
|
|
1663
|
+
# Track significant client latency increases (potential regression confirmation)
|
|
1664
|
+
if client_latency_change > significance_threshold:
|
|
1665
|
+
significant_client_latency_increases += 1
|
|
1666
|
+
regression_details["commands"].append(
|
|
1667
|
+
{
|
|
1668
|
+
"command": metric,
|
|
1669
|
+
"change_percent": client_latency_change,
|
|
1670
|
+
"direction": direction,
|
|
1671
|
+
"baseline_cv": baseline_client_cv,
|
|
1672
|
+
"comparison_cv": comparison_client_cv,
|
|
1673
|
+
}
|
|
1674
|
+
)
|
|
1675
|
+
logging.info(
|
|
1676
|
+
f" Client metric '{metric}': CONFIRMS regression (change={client_latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
|
|
1677
|
+
)
|
|
1678
|
+
else:
|
|
1679
|
+
logging.info(
|
|
1680
|
+
f" Client metric '{metric}': Change below significance threshold (change={client_latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
|
|
1681
|
+
)
|
|
1682
|
+
elif verbose:
|
|
1683
|
+
client_latency_notes.append(
|
|
1684
|
+
f"{metric} stable (CV={baseline_client_cv:.1f}%)"
|
|
1685
|
+
)
|
|
1686
|
+
logging.info(
|
|
1687
|
+
f" Client metric '{metric}': latency stable (change={client_latency_change:.1f}%, baseline CV={baseline_client_cv:.1f}%)"
|
|
1688
|
+
)
|
|
1689
|
+
|
|
1690
|
+
# Determine if client-side latency confirms regression
|
|
1691
|
+
confirms_regression = significant_client_latency_increases > 0
|
|
1692
|
+
|
|
1693
|
+
# Return combined client latency notes
|
|
1694
|
+
if client_latency_notes:
|
|
1695
|
+
result = "; ".join(client_latency_notes)
|
|
1696
|
+
logging.info(
|
|
1697
|
+
f"Client-side latency check completed for {test_name}: {result}"
|
|
1698
|
+
)
|
|
1699
|
+
return (
|
|
1700
|
+
result,
|
|
1701
|
+
confirms_regression,
|
|
1702
|
+
regression_details if confirms_regression else None,
|
|
1703
|
+
)
|
|
1704
|
+
else:
|
|
1705
|
+
result = "client latency stable" if len(client_metrics) > 0 else None
|
|
1706
|
+
logging.info(
|
|
1707
|
+
f"Client-side latency check completed for {test_name}: {result or 'no data'}"
|
|
1708
|
+
)
|
|
1709
|
+
return result, False, None
|
|
1710
|
+
|
|
1711
|
+
except Exception as e:
|
|
1712
|
+
logging.error(f"Error checking client-side latency for {test_name}: {e}")
|
|
1713
|
+
return None, False, None
|
|
1714
|
+
|
|
1715
|
+
|
|
1716
|
+
def perform_variance_and_p99_analysis(
|
|
1717
|
+
rts,
|
|
1718
|
+
test_name,
|
|
1719
|
+
baseline_str,
|
|
1720
|
+
comparison_str,
|
|
1721
|
+
by_str_baseline,
|
|
1722
|
+
by_str_comparison,
|
|
1723
|
+
baseline_deployment_name,
|
|
1724
|
+
comparison_deployment_name,
|
|
1725
|
+
tf_triggering_env,
|
|
1726
|
+
from_ts_ms,
|
|
1727
|
+
to_ts_ms,
|
|
1728
|
+
last_n_baseline,
|
|
1729
|
+
last_n_comparison,
|
|
1730
|
+
first_n_baseline,
|
|
1731
|
+
first_n_comparison,
|
|
1732
|
+
running_platform,
|
|
1733
|
+
baseline_architecture,
|
|
1734
|
+
comparison_architecture,
|
|
1735
|
+
verbose=False,
|
|
1736
|
+
):
|
|
1737
|
+
"""
|
|
1738
|
+
Perform 3rd-level analysis using variance and p99 metrics to assess confidence in regression detection.
|
|
1739
|
+
|
|
1740
|
+
Returns:
|
|
1741
|
+
tuple: (confidence_note, high_confidence_bool)
|
|
1742
|
+
"""
|
|
1743
|
+
try:
|
|
1744
|
+
logging.info(f"Starting variance and p99 analysis for {test_name}")
|
|
1745
|
+
|
|
1746
|
+
# Build filters for p99 latency metric using both metric=p99 and metric-type=(latencystats)
|
|
1747
|
+
filters_baseline = [
|
|
1748
|
+
f"{by_str_baseline}={baseline_str}",
|
|
1749
|
+
"metric=p99",
|
|
1750
|
+
"metric-type=(latencystats)",
|
|
1751
|
+
f"test_name={test_name}",
|
|
1752
|
+
f"deployment_name={baseline_deployment_name}",
|
|
1753
|
+
f"triggering_env={tf_triggering_env}",
|
|
1754
|
+
]
|
|
1755
|
+
filters_comparison = [
|
|
1756
|
+
f"{by_str_comparison}={comparison_str}",
|
|
1757
|
+
"metric=p99",
|
|
1758
|
+
"metric-type=(latencystats)",
|
|
1759
|
+
f"test_name={test_name}",
|
|
1760
|
+
f"deployment_name={comparison_deployment_name}",
|
|
1761
|
+
f"triggering_env={tf_triggering_env}",
|
|
1762
|
+
]
|
|
1763
|
+
|
|
1764
|
+
# Add optional filters
|
|
1765
|
+
if running_platform is not None:
|
|
1766
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
1767
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
1768
|
+
if baseline_architecture != ARCH_X86:
|
|
1769
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
1770
|
+
if comparison_architecture != ARCH_X86:
|
|
1771
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
1772
|
+
|
|
1773
|
+
# Query for p99 latency time-series
|
|
1774
|
+
logging.info(f"Querying p99 latencystats time-series for {test_name}")
|
|
1775
|
+
baseline_p99_ts = rts.ts().queryindex(filters_baseline)
|
|
1776
|
+
comparison_p99_ts = rts.ts().queryindex(filters_comparison)
|
|
1777
|
+
|
|
1778
|
+
logging.info(f"Found {len(baseline_p99_ts)} baseline p99 latency time-series")
|
|
1779
|
+
logging.info(
|
|
1780
|
+
f"Found {len(comparison_p99_ts)} comparison p99 latency time-series"
|
|
1781
|
+
)
|
|
1782
|
+
|
|
1783
|
+
# Filter out target time-series and unwanted commands (reuse existing function)
|
|
1784
|
+
def should_exclude_timeseries(ts_name):
|
|
1785
|
+
"""Check if time-series should be excluded based on command"""
|
|
1786
|
+
if "target" in ts_name:
|
|
1787
|
+
return True
|
|
1788
|
+
ts_name_lower = ts_name.lower()
|
|
1789
|
+
excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
|
|
1790
|
+
return any(cmd in ts_name_lower for cmd in excluded_commands)
|
|
1791
|
+
|
|
1792
|
+
baseline_p99_ts = [
|
|
1793
|
+
ts for ts in baseline_p99_ts if not should_exclude_timeseries(ts)
|
|
1794
|
+
]
|
|
1795
|
+
comparison_p99_ts = [
|
|
1796
|
+
ts for ts in comparison_p99_ts if not should_exclude_timeseries(ts)
|
|
1797
|
+
]
|
|
1798
|
+
|
|
1799
|
+
if len(baseline_p99_ts) == 0 or len(comparison_p99_ts) == 0:
|
|
1800
|
+
logging.warning(
|
|
1801
|
+
f"No p99 latency data found for {test_name} after filtering"
|
|
1802
|
+
)
|
|
1803
|
+
return None, False
|
|
1804
|
+
|
|
1805
|
+
# Extract command names from time-series (reuse existing function)
|
|
1806
|
+
def extract_command_from_ts(ts_name):
|
|
1807
|
+
"""Extract meaningful command name from time-series name"""
|
|
1808
|
+
# Look for latencystats_latency_percentiles_usec_<COMMAND>_p99 pattern
|
|
1809
|
+
match = re.search(
|
|
1810
|
+
r"latencystats_latency_percentiles_usec_([^_/]+)_p99", ts_name
|
|
1811
|
+
)
|
|
1812
|
+
if match:
|
|
1813
|
+
return match.group(1)
|
|
1814
|
+
# Look for command= pattern in the time-series name
|
|
1815
|
+
match = re.search(r"command=([^/]+)", ts_name)
|
|
1816
|
+
if match:
|
|
1817
|
+
return match.group(1)
|
|
1818
|
+
# If no specific pattern found, try to extract from the end of the path
|
|
1819
|
+
parts = ts_name.split("/")
|
|
1820
|
+
if len(parts) > 0:
|
|
1821
|
+
return parts[-1]
|
|
1822
|
+
return "unknown"
|
|
1823
|
+
|
|
1824
|
+
# Group time-series by command
|
|
1825
|
+
baseline_by_command = {}
|
|
1826
|
+
comparison_by_command = {}
|
|
1827
|
+
|
|
1828
|
+
for ts in baseline_p99_ts:
|
|
1829
|
+
cmd = extract_command_from_ts(ts)
|
|
1830
|
+
if cmd not in baseline_by_command:
|
|
1831
|
+
baseline_by_command[cmd] = []
|
|
1832
|
+
baseline_by_command[cmd].append(ts)
|
|
1833
|
+
|
|
1834
|
+
for ts in comparison_p99_ts:
|
|
1835
|
+
cmd = extract_command_from_ts(ts)
|
|
1836
|
+
if cmd not in comparison_by_command:
|
|
1837
|
+
comparison_by_command[cmd] = []
|
|
1838
|
+
comparison_by_command[cmd].append(ts)
|
|
1839
|
+
|
|
1840
|
+
# Find common commands between baseline and comparison
|
|
1841
|
+
common_commands = set(baseline_by_command.keys()) & set(
|
|
1842
|
+
comparison_by_command.keys()
|
|
1843
|
+
)
|
|
1844
|
+
|
|
1845
|
+
if not common_commands:
|
|
1846
|
+
logging.warning(
|
|
1847
|
+
f"No common commands found for p99 variance analysis in {test_name}"
|
|
1848
|
+
)
|
|
1849
|
+
return None, False
|
|
1850
|
+
|
|
1851
|
+
variance_notes = []
|
|
1852
|
+
p99_notes = []
|
|
1853
|
+
high_confidence_indicators = 0
|
|
1854
|
+
total_indicators = 0
|
|
1855
|
+
|
|
1856
|
+
# Analyze variance and p99 for each command
|
|
1857
|
+
for command in sorted(common_commands):
|
|
1858
|
+
total_indicators += 1
|
|
1859
|
+
logging.info(f"Analyzing p99 variance for command: {command}")
|
|
1860
|
+
|
|
1861
|
+
baseline_ts_list = baseline_by_command[command]
|
|
1862
|
+
comparison_ts_list = comparison_by_command[command]
|
|
1863
|
+
|
|
1864
|
+
# If multiple time-series for the same command, try to get the best one
|
|
1865
|
+
if len(baseline_ts_list) > 1:
|
|
1866
|
+
baseline_ts_list = get_only_Totals(baseline_ts_list)
|
|
1867
|
+
if len(comparison_ts_list) > 1:
|
|
1868
|
+
comparison_ts_list = get_only_Totals(comparison_ts_list)
|
|
1869
|
+
|
|
1870
|
+
if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
|
|
1871
|
+
logging.warning(
|
|
1872
|
+
f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
|
|
1873
|
+
)
|
|
1874
|
+
continue
|
|
1875
|
+
|
|
1876
|
+
# Get p99 latency data for this command
|
|
1877
|
+
baseline_p99_data = []
|
|
1878
|
+
comparison_p99_data = []
|
|
1879
|
+
|
|
1880
|
+
for ts_name in baseline_ts_list:
|
|
1881
|
+
datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
|
|
1882
|
+
baseline_p99_data.extend(datapoints)
|
|
1883
|
+
|
|
1884
|
+
for ts_name in comparison_ts_list:
|
|
1885
|
+
datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
|
|
1886
|
+
comparison_p99_data.extend(datapoints)
|
|
1887
|
+
|
|
1888
|
+
if len(baseline_p99_data) < 3 or len(comparison_p99_data) < 3:
|
|
1889
|
+
logging.warning(
|
|
1890
|
+
f" Insufficient p99 data for {command}: baseline={len(baseline_p99_data)}, comparison={len(comparison_p99_data)} datapoints"
|
|
1891
|
+
)
|
|
1892
|
+
continue
|
|
1893
|
+
|
|
1894
|
+
# Extract values for variance calculation
|
|
1895
|
+
baseline_values = [dp[1] for dp in baseline_p99_data]
|
|
1896
|
+
comparison_values = [dp[1] for dp in comparison_p99_data]
|
|
1897
|
+
|
|
1898
|
+
# Calculate variance (coefficient of variation)
|
|
1899
|
+
baseline_mean = statistics.mean(baseline_values)
|
|
1900
|
+
baseline_stdev = (
|
|
1901
|
+
statistics.stdev(baseline_values) if len(baseline_values) > 1 else 0
|
|
1902
|
+
)
|
|
1903
|
+
baseline_cv = (
|
|
1904
|
+
(baseline_stdev / baseline_mean * 100)
|
|
1905
|
+
if baseline_mean > 0
|
|
1906
|
+
else float("inf")
|
|
1907
|
+
)
|
|
1908
|
+
|
|
1909
|
+
comparison_mean = statistics.mean(comparison_values)
|
|
1910
|
+
comparison_stdev = (
|
|
1911
|
+
statistics.stdev(comparison_values) if len(comparison_values) > 1 else 0
|
|
1912
|
+
)
|
|
1913
|
+
comparison_cv = (
|
|
1914
|
+
(comparison_stdev / comparison_mean * 100)
|
|
1915
|
+
if comparison_mean > 0
|
|
1916
|
+
else float("inf")
|
|
1917
|
+
)
|
|
1918
|
+
|
|
1919
|
+
# Calculate p99 change
|
|
1920
|
+
p99_change = (
|
|
1921
|
+
((comparison_mean - baseline_mean) / baseline_mean * 100)
|
|
1922
|
+
if baseline_mean > 0
|
|
1923
|
+
else 0
|
|
1924
|
+
)
|
|
1925
|
+
|
|
1926
|
+
# Assess confidence based on variance and p99 change
|
|
1927
|
+
if baseline_cv < 30: # Low variance in baseline (< 30% CV)
|
|
1928
|
+
if abs(p99_change) > 15: # Significant p99 change
|
|
1929
|
+
high_confidence_indicators += 1
|
|
1930
|
+
p99_notes.append(
|
|
1931
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline)"
|
|
1932
|
+
)
|
|
1933
|
+
else:
|
|
1934
|
+
p99_notes.append(
|
|
1935
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline, minor change)"
|
|
1936
|
+
)
|
|
1937
|
+
elif baseline_cv < 50: # Moderate variance
|
|
1938
|
+
if abs(p99_change) > 25: # Need larger change for confidence
|
|
1939
|
+
high_confidence_indicators += 1
|
|
1940
|
+
p99_notes.append(
|
|
1941
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance)"
|
|
1942
|
+
)
|
|
1943
|
+
else:
|
|
1944
|
+
p99_notes.append(
|
|
1945
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance, uncertain)"
|
|
1946
|
+
)
|
|
1947
|
+
else: # High variance
|
|
1948
|
+
if abs(p99_change) > 40: # Need very large change for confidence
|
|
1949
|
+
high_confidence_indicators += 1
|
|
1950
|
+
p99_notes.append(
|
|
1951
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, large change)"
|
|
1952
|
+
)
|
|
1953
|
+
else:
|
|
1954
|
+
p99_notes.append(
|
|
1955
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, low confidence)"
|
|
1956
|
+
)
|
|
1957
|
+
|
|
1958
|
+
variance_notes.append(f"{command} baseline CV={baseline_cv:.1f}%")
|
|
1959
|
+
|
|
1960
|
+
if verbose:
|
|
1961
|
+
logging.info(
|
|
1962
|
+
f" Command {command}: baseline CV={baseline_cv:.1f}%, comparison CV={comparison_cv:.1f}%, p99 change={p99_change:.1f}%"
|
|
1963
|
+
)
|
|
1964
|
+
|
|
1965
|
+
# Determine overall confidence
|
|
1966
|
+
confidence_ratio = (
|
|
1967
|
+
high_confidence_indicators / total_indicators if total_indicators > 0 else 0
|
|
1968
|
+
)
|
|
1969
|
+
high_confidence = (
|
|
1970
|
+
confidence_ratio >= 0.5
|
|
1971
|
+
) # At least 50% of indicators show high confidence
|
|
1972
|
+
|
|
1973
|
+
# Create confidence note
|
|
1974
|
+
confidence_parts = []
|
|
1975
|
+
if variance_notes:
|
|
1976
|
+
confidence_parts.extend(variance_notes)
|
|
1977
|
+
if p99_notes:
|
|
1978
|
+
confidence_parts.extend(p99_notes)
|
|
1979
|
+
|
|
1980
|
+
confidence_note = "; ".join(confidence_parts) if confidence_parts else None
|
|
1981
|
+
|
|
1982
|
+
if confidence_note:
|
|
1983
|
+
confidence_level = "HIGH" if high_confidence else "LOW"
|
|
1984
|
+
cv_explanation = "CV=coefficient of variation (data stability: <30% stable, 30-50% moderate, >50% unstable)"
|
|
1985
|
+
confidence_note = (
|
|
1986
|
+
f"confidence={confidence_level} ({confidence_note}; {cv_explanation})"
|
|
1987
|
+
)
|
|
1988
|
+
|
|
1989
|
+
logging.info(
|
|
1990
|
+
f"Variance and p99 analysis completed for {test_name}: confidence={confidence_ratio:.2f}, high_confidence={high_confidence}"
|
|
1991
|
+
)
|
|
1992
|
+
return confidence_note, high_confidence
|
|
1993
|
+
|
|
1994
|
+
except Exception as e:
|
|
1995
|
+
logging.error(f"Error in variance and p99 analysis for {test_name}: {e}")
|
|
1996
|
+
return None, False
|
|
1997
|
+
|
|
1998
|
+
|
|
1999
|
+
def check_latency_for_unstable_throughput(
|
|
2000
|
+
rts,
|
|
2001
|
+
test_name,
|
|
2002
|
+
baseline_str,
|
|
2003
|
+
comparison_str,
|
|
2004
|
+
by_str_baseline,
|
|
2005
|
+
by_str_comparison,
|
|
2006
|
+
baseline_deployment_name,
|
|
2007
|
+
comparison_deployment_name,
|
|
2008
|
+
tf_triggering_env,
|
|
2009
|
+
from_ts_ms,
|
|
2010
|
+
to_ts_ms,
|
|
2011
|
+
last_n_baseline,
|
|
2012
|
+
last_n_comparison,
|
|
2013
|
+
first_n_baseline,
|
|
2014
|
+
first_n_comparison,
|
|
2015
|
+
running_platform,
|
|
2016
|
+
baseline_architecture,
|
|
2017
|
+
comparison_architecture,
|
|
2018
|
+
verbose,
|
|
2019
|
+
):
|
|
2020
|
+
"""
|
|
2021
|
+
Check latency (p50) for unstable throughput metrics to provide additional context.
|
|
2022
|
+
Returns a tuple: (note_string, confirms_regression_bool, regression_details_dict)
|
|
2023
|
+
"""
|
|
2024
|
+
logging.info(f"Starting latency check for unstable throughput test: {test_name}")
|
|
2025
|
+
try:
|
|
2026
|
+
# Build filters for p50 latency metric using both metric=p50 and metric-type=(latencystats)
|
|
2027
|
+
filters_baseline = [
|
|
2028
|
+
f"{by_str_baseline}={baseline_str}",
|
|
2029
|
+
"metric=p50",
|
|
2030
|
+
"metric-type=(latencystats)",
|
|
2031
|
+
f"test_name={test_name}",
|
|
2032
|
+
f"deployment_name={baseline_deployment_name}",
|
|
2033
|
+
f"triggering_env={tf_triggering_env}",
|
|
2034
|
+
]
|
|
2035
|
+
filters_comparison = [
|
|
2036
|
+
f"{by_str_comparison}={comparison_str}",
|
|
2037
|
+
"metric=p50",
|
|
2038
|
+
"metric-type=(latencystats)",
|
|
2039
|
+
f"test_name={test_name}",
|
|
2040
|
+
f"deployment_name={comparison_deployment_name}",
|
|
2041
|
+
f"triggering_env={tf_triggering_env}",
|
|
2042
|
+
]
|
|
2043
|
+
|
|
2044
|
+
# Add optional filters
|
|
2045
|
+
if running_platform is not None:
|
|
2046
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
2047
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
2048
|
+
if baseline_architecture != ARCH_X86:
|
|
2049
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
2050
|
+
if comparison_architecture != ARCH_X86:
|
|
2051
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
2052
|
+
|
|
2053
|
+
# Query for p50 latency time-series
|
|
2054
|
+
logging.info(f"Querying p50 latencystats time-series for {test_name}")
|
|
2055
|
+
logging.info(f"Baseline filters: {filters_baseline}")
|
|
2056
|
+
logging.info(f"Comparison filters: {filters_comparison}")
|
|
2057
|
+
|
|
2058
|
+
baseline_latency_ts = rts.ts().queryindex(filters_baseline)
|
|
2059
|
+
comparison_latency_ts = rts.ts().queryindex(filters_comparison)
|
|
2060
|
+
|
|
2061
|
+
logging.info(
|
|
2062
|
+
f"Found {len(baseline_latency_ts)} baseline p50 latency time-series"
|
|
2063
|
+
)
|
|
2064
|
+
logging.info(
|
|
2065
|
+
f"Found {len(comparison_latency_ts)} comparison p50 latency time-series"
|
|
2066
|
+
)
|
|
2067
|
+
|
|
2068
|
+
if verbose and baseline_latency_ts:
|
|
2069
|
+
logging.info(f"Baseline latency time-series: {baseline_latency_ts}")
|
|
2070
|
+
if verbose and comparison_latency_ts:
|
|
2071
|
+
logging.info(f"Comparison latency time-series: {comparison_latency_ts}")
|
|
2072
|
+
|
|
2073
|
+
# Filter out target time-series and unwanted commands
|
|
2074
|
+
def should_exclude_timeseries(ts_name):
|
|
2075
|
+
"""Check if time-series should be excluded based on command"""
|
|
2076
|
+
# Exclude target time-series
|
|
2077
|
+
if "target" in ts_name:
|
|
2078
|
+
return True
|
|
2079
|
+
|
|
2080
+            # Convert to lowercase for case-insensitive matching
+            ts_name_lower = ts_name.lower()
+
+            # Exclude administrative commands (case-insensitive)
+            excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
+            return any(cmd in ts_name_lower for cmd in excluded_commands)
+
+        baseline_latency_ts_before = len(baseline_latency_ts)
+        comparison_latency_ts_before = len(comparison_latency_ts)
+
+        # Apply filtering and log what gets excluded
+        baseline_excluded = [
+            ts for ts in baseline_latency_ts if should_exclude_timeseries(ts)
+        ]
+        comparison_excluded = [
+            ts for ts in comparison_latency_ts if should_exclude_timeseries(ts)
+        ]
+
+        baseline_latency_ts = [
+            ts for ts in baseline_latency_ts if not should_exclude_timeseries(ts)
+        ]
+        comparison_latency_ts = [
+            ts for ts in comparison_latency_ts if not should_exclude_timeseries(ts)
+        ]
+
+        logging.info(
+            f"After filtering: baseline {baseline_latency_ts_before} -> {len(baseline_latency_ts)}, "
+            f"comparison {comparison_latency_ts_before} -> {len(comparison_latency_ts)}"
+        )
+
+        if baseline_excluded:
+            logging.info(
+                f"Excluded {len(baseline_excluded)} baseline administrative command time-series"
+            )
+            if verbose:
+                for ts in baseline_excluded:
+                    logging.info(f" Excluded baseline: {ts}")
+        if comparison_excluded:
+            logging.info(
+                f"Excluded {len(comparison_excluded)} comparison administrative command time-series"
+            )
+            if verbose:
+                for ts in comparison_excluded:
+                    logging.info(f" Excluded comparison: {ts}")
+
+        if len(baseline_latency_ts) == 0 or len(comparison_latency_ts) == 0:
+            logging.warning(
+                f"No p50 latency data found for {test_name} after filtering"
+            )
+            return None, False, None
+
+        # Extract command names from time-series to match baseline and comparison
+        def extract_command_from_ts(ts_name):
+            """Extract meaningful command name from time-series name"""
+            import re
+
+            # Look for latencystats_latency_percentiles_usec_<COMMAND>_p50 pattern
+            match = re.search(
+                r"latencystats_latency_percentiles_usec_([^_/]+)_p50", ts_name
+            )
+            if match:
+                return match.group(1)
+
+            # Look for command= pattern in the time-series name
+            match = re.search(r"command=([^/]+)", ts_name)
+            if match:
+                return match.group(1)
+
+            # If no specific pattern found, try to extract from the end of the path
+            # e.g., .../Ops/sec/GET -> GET
+            parts = ts_name.split("/")
+            if len(parts) > 0:
+                return parts[-1]
+            return "unknown"
+
+        # Group time-series by command
+        baseline_by_command = {}
+        comparison_by_command = {}
+
+        for ts in baseline_latency_ts:
+            cmd = extract_command_from_ts(ts)
+            if verbose:
+                logging.info(f"Baseline time-series '{ts}' -> command '{cmd}'")
+            if cmd not in baseline_by_command:
+                baseline_by_command[cmd] = []
+            baseline_by_command[cmd].append(ts)
+
+        for ts in comparison_latency_ts:
+            cmd = extract_command_from_ts(ts)
+            if verbose:
+                logging.info(f"Comparison time-series '{ts}' -> command '{cmd}'")
+            if cmd not in comparison_by_command:
+                comparison_by_command[cmd] = []
+            comparison_by_command[cmd].append(ts)
+
+        # Find common commands between baseline and comparison
+        common_commands = set(baseline_by_command.keys()) & set(
+            comparison_by_command.keys()
+        )
+
+        logging.info(f"Baseline commands found: {sorted(baseline_by_command.keys())}")
+        logging.info(
+            f"Comparison commands found: {sorted(comparison_by_command.keys())}"
+        )
+        logging.info(
+            f"Common commands for latency comparison: {sorted(common_commands)}"
+        )
+
+        if not common_commands:
+            logging.warning(
+                f"No common commands found for latency comparison in {test_name}"
+            )
+            return None, False, None
+
+        latency_notes = []
+        significant_latency_increases = (
+            0  # Track commands with significant latency increases
+        )
+        regression_details = {"test_name": test_name, "commands": []}
+
+        # Compare latency for each command individually
+        for command in sorted(common_commands):
+            logging.info(f"Analyzing latency for command: {command}")
+            baseline_ts_list = baseline_by_command[command]
+            comparison_ts_list = comparison_by_command[command]
+
+            logging.info(
+                f" Command {command}: {len(baseline_ts_list)} baseline, {len(comparison_ts_list)} comparison time-series"
+            )
+
+            # If multiple time-series for the same command, try to get the best one
+            if len(baseline_ts_list) > 1:
+                logging.info(
+                    f" Multiple baseline time-series for {command}, filtering..."
+                )
+                baseline_ts_list = get_only_Totals(baseline_ts_list)
+            if len(comparison_ts_list) > 1:
+                logging.info(
+                    f" Multiple comparison time-series for {command}, filtering..."
+                )
+                comparison_ts_list = get_only_Totals(comparison_ts_list)
+
+            if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
+                logging.warning(
+                    f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
+                )
+                continue
+
+            # Get latency data for this command
+            baseline_latency_data = []
+            comparison_latency_data = []
+
+            for ts_name in baseline_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                baseline_latency_data.extend(datapoints)
+
+            for ts_name in comparison_ts_list:
+                datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
+                comparison_latency_data.extend(datapoints)
+
+            if len(baseline_latency_data) == 0 or len(comparison_latency_data) == 0:
+                logging.warning(
+                    f" No latency data for {command}: baseline={len(baseline_latency_data)}, comparison={len(comparison_latency_data)} datapoints"
+                )
+                continue
+
+            logging.info(
+                f" Command {command}: {len(baseline_latency_data)} baseline, {len(comparison_latency_data)} comparison datapoints"
+            )
+
+            # Calculate latency statistics for this command
+            baseline_latency_values = []
+            comparison_latency_values = []
+
+            (_, baseline_latency_median, _) = get_v_pct_change_and_largest_var(
+                baseline_latency_data,
+                0,
+                0,
+                baseline_latency_values,
+                0,
+                last_n_baseline,
+                verbose,
+                first_n_baseline,
+            )
+
+            (_, comparison_latency_median, _) = get_v_pct_change_and_largest_var(
+                comparison_latency_data,
+                0,
+                0,
+                comparison_latency_values,
+                0,
+                last_n_comparison,
+                verbose,
+                first_n_comparison,
+            )
+
+            if baseline_latency_median == "N/A" or comparison_latency_median == "N/A":
+                logging.warning(
+                    f" Could not calculate median for {command}: baseline={baseline_latency_median}, comparison={comparison_latency_median}"
+                )
+                continue
+
+            # Calculate variance (coefficient of variation) for both baseline and comparison
+            baseline_latency_mean = (
+                statistics.mean(baseline_latency_values)
+                if baseline_latency_values
+                else 0
+            )
+            baseline_latency_stdev = (
+                statistics.stdev(baseline_latency_values)
+                if len(baseline_latency_values) > 1
+                else 0
+            )
+            baseline_latency_cv = (
+                (baseline_latency_stdev / baseline_latency_mean * 100)
+                if baseline_latency_mean > 0
+                else float("inf")
+            )
+
+            comparison_latency_mean = (
+                statistics.mean(comparison_latency_values)
+                if comparison_latency_values
+                else 0
+            )
+            comparison_latency_stdev = (
+                statistics.stdev(comparison_latency_values)
+                if len(comparison_latency_values) > 1
+                else 0
+            )
+            comparison_latency_cv = (
+                (comparison_latency_stdev / comparison_latency_mean * 100)
+                if comparison_latency_mean > 0
+                else float("inf")
+            )
+
+            # Calculate latency change (for latency, lower is better)
+            latency_change = (
+                float(comparison_latency_median) / float(baseline_latency_median) - 1
+            ) * 100.0
+
+            logging.info(
+                f" Command {command}: baseline p50={baseline_latency_median:.2f} (CV={baseline_latency_cv:.1f}%), comparison p50={comparison_latency_median:.2f} (CV={comparison_latency_cv:.1f}%), change={latency_change:.1f}%"
+            )
+
+            # Check if latency data is too unstable to be reliable
+            latency_data_unstable = (
+                baseline_latency_cv > 50.0 or comparison_latency_cv > 50.0
+            )
+
+            if latency_data_unstable:
+                # Mark as unstable latency data
+                unstable_reason = []
+                if baseline_latency_cv > 50.0:
+                    unstable_reason.append(f"baseline CV={baseline_latency_cv:.1f}%")
+                if comparison_latency_cv > 50.0:
+                    unstable_reason.append(
+                        f"comparison CV={comparison_latency_cv:.1f}%"
+                    )
+
+                latency_notes.append(
+                    f"{command} p50 UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
+                )
+                logging.warning(
+                    f" Command {command}: UNSTABLE latency data detected - {', '.join(unstable_reason)}"
+                )
+            elif (
+                abs(latency_change) > 5.0
+            ):  # Only report significant latency changes for stable data
+                direction = "increased" if latency_change > 0 else "decreased"
+
+                # Adjust significance threshold based on baseline variance
+                if baseline_latency_cv < 30.0:
+                    # Low variance - use standard threshold
+                    significance_threshold = 10.0
+                elif baseline_latency_cv < 50.0:
+                    # Moderate variance - require larger change
+                    significance_threshold = 15.0
+                else:
+                    # High variance - require much larger change
+                    significance_threshold = 25.0
+
+                latency_notes.append(
+                    f"{command} p50 {direction} {abs(latency_change):.1f}% (baseline CV={baseline_latency_cv:.1f}%)"
+                )
+                logging.info(
+                    f" Command {command}: SIGNIFICANT latency change detected ({direction} {abs(latency_change):.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+                )
+
+                # Track significant latency increases (potential regression confirmation)
+                if latency_change > significance_threshold:
+                    significant_latency_increases += 1
+                    regression_details["commands"].append(
+                        {
+                            "command": command,
+                            "change_percent": latency_change,
+                            "direction": direction,
+                            "baseline_cv": baseline_latency_cv,
+                            "comparison_cv": comparison_latency_cv,
+                        }
+                    )
+                    logging.info(
+                        f" Command {command}: CONFIRMS regression (change={latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
+                    )
+                else:
+                    logging.info(
+                        f" Command {command}: Change below significance threshold (change={latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
+                    )
+            elif verbose:
+                latency_notes.append(
+                    f"{command} p50 stable (CV={baseline_latency_cv:.1f}%)"
+                )
+                logging.info(
+                    f" Command {command}: latency stable (change={latency_change:.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
+                )
+
+        # Determine if latency confirms regression
+        confirms_regression = significant_latency_increases > 0
+
+        # Return combined latency notes
+        if latency_notes:
+            result = "; ".join(latency_notes)
+            logging.info(f"Latency check completed for {test_name}: {result}")
+            return (
+                result,
+                confirms_regression,
+                regression_details if confirms_regression else None,
+            )
+        else:
+            result = "p50 latency stable" if common_commands else None
+            logging.info(
+                f"Latency check completed for {test_name}: {result or 'no data'}"
+            )
+            return result, False, None
+
+    except Exception as e:
+        logging.error(f"Error checking latency for {test_name}: {e}")
+        return None, False, None
+
+
 def get_only_Totals(baseline_timeseries):
     logging.warning("\t\tTime-series: {}".format(", ".join(baseline_timeseries)))
     logging.info("Checking if Totals will reduce timeseries.")
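The per-command check above gates on the coefficient of variation (CV) of the p50 samples before judging any change: a CV above 50% on either side marks the command unstable, and the threshold for confirming a regression grows with baseline noise (10%, 15%, or 25% for baseline CV below 30%, below 50%, or higher). A minimal standalone sketch of that decision logic, using hypothetical sample values and a plain median in place of get_v_pct_change_and_largest_var:

    import statistics

    def cv_pct(values):
        # Coefficient of variation in percent, mirroring stdev/mean*100 above.
        mean = statistics.mean(values) if values else 0
        stdev = statistics.stdev(values) if len(values) > 1 else 0
        return (stdev / mean * 100) if mean > 0 else float("inf")

    # Hypothetical p50 samples (usec) for a single command.
    baseline = [105.0, 98.0, 101.0, 110.0, 99.0]
    comparison = [126.0, 131.0, 119.0, 128.0, 124.0]

    baseline_cv = cv_pct(baseline)
    change = (statistics.median(comparison) / statistics.median(baseline) - 1) * 100.0

    if baseline_cv > 50.0 or cv_pct(comparison) > 50.0:
        print("UNSTABLE: data too noisy for reliable analysis")
    elif abs(change) > 5.0:
        # Noisier baselines require a larger change before confirming a regression.
        threshold = 10.0 if baseline_cv < 30.0 else 15.0 if baseline_cv < 50.0 else 25.0
        print(f"change={change:.1f}%, threshold={threshold:.1f}%, confirms={change > threshold}")
    else:
        print("p50 stable")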
@@ -995,6 +2423,37 @@ def get_only_Totals(baseline_timeseries):
     for ts_name in baseline_timeseries:
         if "Totals" in ts_name:
             new_base.append(ts_name)
+
+    # If no "Totals" time-series found, try to pick the best alternative
+    if len(new_base) == 0:
+        logging.warning(
+            "No 'Totals' time-series found, trying to pick best alternative."
+        )
+        # Prefer time-series without quotes in metric names
+        unquoted_series = [ts for ts in baseline_timeseries if "'" not in ts]
+        if unquoted_series:
+            new_base = unquoted_series
+        else:
+            # Fall back to original list
+            new_base = baseline_timeseries
+
+    # If we still have multiple time-series after filtering for "Totals",
+    # prefer the one without quotes in the metric name
+    if len(new_base) > 1:
+        logging.info("Multiple time-series found, preferring unquoted metric names.")
+        unquoted_series = [ts for ts in new_base if "'" not in ts]
+        if unquoted_series:
+            new_base = unquoted_series
+
+        # If we still have multiple, take the first one
+        if len(new_base) > 1:
+            logging.warning(
+                "Still multiple time-series after filtering, taking the first one: {}".format(
+                    new_base[0]
+                )
+            )
+            new_base = [new_base[0]]
+
     baseline_timeseries = new_base
     return baseline_timeseries

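The fallback added to get_only_Totals narrows multiple candidate time-series in a fixed order: keep "Totals" series if any exist, otherwise prefer metric names without quotes, and if several candidates remain, keep only the first. A rough self-contained mirror of that selection order, with hypothetical series names:

    def pick_preferred(series):
        # Same order as above: "Totals" first, then unquoted names, then the first entry.
        candidates = [ts for ts in series if "Totals" in ts]
        if not candidates:
            candidates = [ts for ts in series if "'" not in ts] or list(series)
        if len(candidates) > 1:
            candidates = [ts for ts in candidates if "'" not in ts] or candidates
        return candidates[:1] if len(candidates) > 1 else candidates

    # The quoted variant is dropped in favour of the plain metric name.
    print(pick_preferred([
        "by.branch/unstable/GET/latencystats_latency_percentiles_usec_'GET'_p50",  # hypothetical
        "by.branch/unstable/GET/latencystats_latency_percentiles_usec_GET_p50",    # hypothetical
    ]))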
@@ -1061,11 +2520,38 @@ def add_line(
     percentage_change,
     table,
     test_name,
+    grafana_link_base=None,
+    baseline_branch=None,
+    baseline_version=None,
+    comparison_branch=None,
+    comparison_version=None,
+    from_date=None,
+    to_date=None,
 ):
+    grafana_link = None
+    if grafana_link_base is not None:
+        grafana_link = "{}?orgId=1".format(grafana_link_base)
+        grafana_link += f"&var-test_case={test_name}"
+
+        if baseline_branch is not None:
+            grafana_link += f"&var-branch={baseline_branch}"
+        if baseline_version is not None:
+            grafana_link += f"&var-version={baseline_version}"
+        if comparison_branch is not None:
+            grafana_link += f"&var-branch={comparison_branch}"
+        if comparison_version is not None:
+            grafana_link += f"&var-version={comparison_version}"
+        grafana_link += "&from=now-30d&to=now"
+
+    # Create test name with optional Grafana link
+    test_name_display = test_name
+    if grafana_link is not None:
+        test_name_display = f"[{test_name}]({grafana_link})"
+
     percentage_change_str = "{:.1f}% ".format(percentage_change)
     table.append(
         [
-            test_name,
+            test_name_display,
             baseline_v_str,
             comparison_v_str,
             percentage_change_str,
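With the new keyword arguments, add_line can wrap the test name in a Markdown link to a Grafana dashboard built from the supplied branch and version variables. A small sketch of the resulting URL shape; the dashboard base URL, test name, and branch values below are hypothetical, while the query parameters mirror the ones added above:

    def build_grafana_link(base, test_name, baseline_branch=None, comparison_branch=None):
        # Append one var-* query parameter per supplied value, as in the diff above.
        link = "{}?orgId=1".format(base) + f"&var-test_case={test_name}"
        if baseline_branch is not None:
            link += f"&var-branch={baseline_branch}"
        if comparison_branch is not None:
            link += f"&var-branch={comparison_branch}"
        return link + "&from=now-30d&to=now"

    link = build_grafana_link(
        "https://example.grafana.net/d/abc123",  # hypothetical dashboard URL
        "memtier_benchmark-1Mkeys-string-get-1KiB",  # hypothetical test name
        baseline_branch="unstable",
        comparison_branch="feature-branch",
    )
    # The table cell then becomes a Markdown link instead of the bare test name.
    print(f"[memtier_benchmark-1Mkeys-string-get-1KiB]({link})")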
@@ -1102,9 +2588,9 @@ def get_v_pct_change_and_largest_var(
             comparison_values.append(tuple[1])

         comparison_df = pd.DataFrame(comparison_values)
-        comparison_median = float(comparison_df.median())
+        comparison_median = float(comparison_df.median().iloc[0])
         comparison_v = comparison_median
-        comparison_std = float(comparison_df.std())
+        comparison_std = float(comparison_df.std().iloc[0])
         if verbose:
             logging.info(
                 "comparison_datapoints: {} value: {}; std-dev: {}; median: {}".format(