redisbench-admin 0.11.38__py3-none-any.whl → 0.11.39__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- redisbench_admin/compare/args.py +1 -5
- redisbench_admin/compare/compare.py +1498 -15
- redisbench_admin/deploy/deploy.py +1 -9
- redisbench_admin/export/export.py +1 -7
- redisbench_admin/profilers/perf.py +24 -24
- redisbench_admin/run/cluster.py +6 -0
- redisbench_admin/run/common.py +6 -24
- redisbench_admin/run_async/async_terraform.py +2 -10
- redisbench_admin/run_async/render_files.py +3 -3
- redisbench_admin/run_local/run_local.py +12 -12
- redisbench_admin/run_remote/remote_db.py +62 -23
- redisbench_admin/run_remote/remote_helpers.py +18 -5
- redisbench_admin/run_remote/run_remote.py +34 -13
- redisbench_admin/run_remote/standalone.py +136 -0
- redisbench_admin/run_remote/terraform.py +1 -5
- redisbench_admin/utils/remote.py +4 -7
- redisbench_admin/utils/utils.py +42 -24
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.39.dist-info}/METADATA +1 -1
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.39.dist-info}/RECORD +22 -22
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.39.dist-info}/LICENSE +0 -0
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.39.dist-info}/WHEEL +0 -0
- {redisbench_admin-0.11.38.dist-info → redisbench_admin-0.11.39.dist-info}/entry_points.txt +0 -0
redisbench_admin/compare/compare.py

@@ -13,6 +13,7 @@ from pytablewriter import MarkdownTableWriter
 import humanize
 import datetime as dt
 import os
+import statistics
 from tqdm import tqdm
 from github import Github
 from slack_sdk.webhook import WebhookClient
@@ -250,12 +251,9 @@ def compare_command_logic(args, project_name, project_version):
     }
     baseline_architecture = args.baseline_architecture
     comparison_architecture = args.comparison_architecture
-    uid =
-    if tf_github_repo.lower() in grafana_dashboards_uids
+    uid = None
+    if tf_github_repo.lower() in grafana_dashboards_uids:
         uid = grafana_dashboards_uids[tf_github_repo.lower()]
-        logging.info(f"Using uid from grafana_dashboards_uids. {grafana_dashboards_uids}. uid={uid}")
-    else:
-        logging.info(f"Using uid from args. uid={uid}")
     grafana_link_base = None
     if uid is not None:
         grafana_link_base = "{}/{}".format(grafana_base_dashboard, uid)
@@ -273,6 +271,10 @@ def compare_command_logic(args, project_name, project_version):
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     ) = compute_regression_table(
         rts,
         tf_github_org,
@@ -306,6 +308,7 @@ def compare_command_logic(args, project_name, project_version):
         comparison_architecture,
         first_n_baseline,
         first_n_comparison,
+        grafana_link_base,
     )
     comment_body = ""
     if total_comparison_points > 0:
@@ -324,11 +327,63 @@ def compare_command_logic(args, project_name, project_version):
             )
 
         if total_unstable > 0:
+            unstable_details = []
+            if total_unstable_baseline > 0:
+                unstable_details.append(f"{total_unstable_baseline} baseline")
+            if total_unstable_comparison > 0:
+                unstable_details.append(f"{total_unstable_comparison} comparison")
+
+            unstable_breakdown = (
+                " (" + ", ".join(unstable_details) + ")" if unstable_details else ""
+            )
             comparison_summary += (
-                "- Detected a total of {} highly unstable benchmarks.\n".format(
-                    total_unstable
+                "- Detected a total of {} highly unstable benchmarks{}.\n".format(
+                    total_unstable, unstable_breakdown
                 )
             )
+
+            # Add latency confirmation summary if applicable
+            if total_latency_confirmed_regressions > 0:
+                comparison_summary += "- Latency analysis confirmed regressions in {} of the unstable tests:\n".format(
+                    total_latency_confirmed_regressions
+                )
+
+                # Add detailed breakdown as bullet points with test links
+                if latency_confirmed_regression_details:
+                    for detail in latency_confirmed_regression_details:
+                        test_name = detail["test_name"]
+                        commands_info = []
+                        for cmd_detail in detail["commands"]:
+                            commands_info.append(
+                                f"{cmd_detail['command']} +{cmd_detail['change_percent']:.1f}%"
+                            )
+
+                        if commands_info:
+                            # Create test link if grafana_link_base is available
+                            test_display_name = test_name
+                            if grafana_link_base is not None:
+                                grafana_test_link = f"{grafana_link_base}?orgId=1&var-test_case={test_name}"
+                                if baseline_branch is not None:
+                                    grafana_test_link += (
+                                        f"&var-branch={baseline_branch}"
+                                    )
+                                if comparison_branch is not None:
+                                    grafana_test_link += (
+                                        f"&var-branch={comparison_branch}"
+                                    )
+                                grafana_test_link += "&from=now-30d&to=now"
+                                test_display_name = (
+                                    f"[{test_name}]({grafana_test_link})"
+                                )
+
+                            # Add confidence indicator if available
+                            confidence_indicator = ""
+                            if "high_confidence" in detail:
+                                confidence_indicator = (
+                                    " 🔴" if detail["high_confidence"] else " ⚠️"
+                                )
+
+                            comparison_summary += f" - {test_display_name}: {', '.join(commands_info)}{confidence_indicator}\n"
         if total_improvements > 0:
             comparison_summary += "- Detected a total of {} improvements above the improvement water line.\n".format(
                 total_improvements
@@ -487,6 +542,9 @@ def compare_command_logic(args, project_name, project_version):
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
     )


@@ -534,6 +592,7 @@ def compute_regression_table(
     comparison_architecture=ARCH_X86,
     first_n_baseline=-1,
     first_n_comparison=-1,
+    grafana_link_base=None,
 ):
     START_TIME_NOW_UTC, _, _ = get_start_time_vars()
     START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
@@ -596,6 +655,10 @@ def compute_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     ) = from_rts_to_regression_table(
         baseline_deployment_name,
         comparison_deployment_name,
@@ -624,14 +687,97 @@ def compute_regression_table(
         comparison_architecture,
         first_n_baseline,
         first_n_comparison,
+        grafana_link_base,
+        baseline_branch,
+        baseline_tag,
+        comparison_branch,
+        comparison_tag,
+        from_date,
+        to_date,
     )
     logging.info(
         "Printing differential analysis between {} and {}".format(
             baseline_str, comparison_str
         )
     )
-
-
+
+    # Split table into improvements, regressions, and no-changes
+    improvements_table = []
+    regressions_table = []
+    no_changes_table = []
+
+    for row in table:
+        # Check if there's a meaningful change (not stable/unstable)
+        note = row[4].lower() if len(row) > 4 else ""
+        percentage_str = row[3] if len(row) > 3 else "0.0%"
+
+        # Extract percentage value
+        try:
+            percentage_val = float(percentage_str.replace("%", "").strip())
+        except:
+            percentage_val = 0.0
+
+        # Categorize based on change type
+        if "improvement" in note and "potential" not in note:
+            # Only actual improvements, not potential ones
+            improvements_table.append(row)
+        elif ("regression" in note and "potential" not in note) or "unstable" in note:
+            # Only actual regressions, not potential ones, plus unstable tests
+            regressions_table.append(row)
+        elif "no change" in note or "potential" in note:
+            # No changes and potential changes (below significance threshold)
+            no_changes_table.append(row)
+        elif abs(percentage_val) > 3.0:  # Significant changes based on percentage
+            if (percentage_val > 0 and metric_mode == "higher-better") or (
+                percentage_val < 0 and metric_mode == "lower-better"
+            ):
+                improvements_table.append(row)
+            else:
+                regressions_table.append(row)
+        else:
+            no_changes_table.append(row)
+
+    # Sort tables by percentage change
+    def get_percentage_value(row):
+        """Extract percentage value from row for sorting"""
+        try:
+            percentage_str = row[3] if len(row) > 3 else "0.0%"
+            return float(percentage_str.replace("%", "").strip())
+        except:
+            return 0.0
+
+    # Sort improvements by percentage change (highest first)
+    improvements_table.sort(key=get_percentage_value, reverse=True)
+
+    # Sort regressions by percentage change (most negative first for higher-better, most positive first for lower-better)
+    if metric_mode == "higher-better":
+        # For higher-better metrics, most negative changes are worst regressions
+        regressions_table.sort(key=get_percentage_value)
+    else:
+        # For lower-better metrics, most positive changes are worst regressions
+        regressions_table.sort(key=get_percentage_value, reverse=True)
+
+    # Create improvements table (visible)
+    improvements_writer = MarkdownTableWriter(
+        table_name="Performance Improvements - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
+            baseline_str,
+            comparison_str,
+            from_human_str,
+            baseline_deployment_name,
+        ),
+        headers=[
+            "Test Case",
+            "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+            "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+            "% change ({})".format(metric_mode),
+            "Note",
+        ],
+        value_matrix=improvements_table,
+    )
+
+    # Create regressions table (visible)
+    regressions_writer = MarkdownTableWriter(
+        table_name="Performance Regressions and Issues - Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n".format(
             baseline_str,
             comparison_str,
             from_human_str,
@@ -644,8 +790,22 @@ def compute_regression_table(
             "% change ({})".format(metric_mode),
             "Note",
         ],
-        value_matrix=
+        value_matrix=regressions_table,
     )
+
+    # Create no-changes table (hidden in markdown)
+    no_changes_writer = MarkdownTableWriter(
+        table_name="Tests with No Significant Changes",
+        headers=[
+            "Test Case",
+            "Baseline {} (median obs. +- std.dev)".format(baseline_str),
+            "Comparison {} (median obs. +- std.dev)".format(comparison_str),
+            "% change ({})".format(metric_mode),
+            "Note",
+        ],
+        value_matrix=no_changes_table,
+    )
+
     table_output = ""

     from io import StringIO
@@ -654,7 +814,25 @@ def compute_regression_table(
     old_stdout = sys.stdout
     sys.stdout = mystdout = StringIO()

-
+    # Output improvements table first (if any)
+    if improvements_table:
+        improvements_writer.dump(mystdout, False)
+        mystdout.write("\n\n")
+
+    # Output regressions table (if any)
+    if regressions_table:
+        regressions_writer.dump(mystdout, False)
+        mystdout.write("\n\n")
+
+    # Add hidden no-changes table
+    if no_changes_table:
+        mystdout.write(
+            "<details>\n<summary>Tests with No Significant Changes ({} tests)</summary>\n\n".format(
+                len(no_changes_table)
+            )
+        )
+        no_changes_writer.dump(mystdout, False)
+        mystdout.write("\n</details>\n")

     sys.stdout = old_stdout

@@ -668,6 +846,10 @@ def compute_regression_table(
         total_stable,
         total_unstable,
         total_comparison_points,
+        total_unstable_baseline,
+        total_unstable_comparison,
+        total_latency_confirmed_regressions,
+        latency_confirmed_regression_details,
     )


@@ -755,6 +937,13 @@ def from_rts_to_regression_table(
     comparison_architecture=ARCH_X86,
     first_n_baseline=-1,
     first_n_comparison=-1,
+    grafana_link_base=None,
+    baseline_branch=None,
+    baseline_tag=None,
+    comparison_branch=None,
+    comparison_tag=None,
+    from_date=None,
+    to_date=None,
 ):
     print_all = print_regressions_only is False and print_improvements_only is False
     table = []
@@ -762,8 +951,12 @@ def from_rts_to_regression_table(
     total_improvements = 0
     total_stable = 0
     total_unstable = 0
+    total_unstable_baseline = 0
+    total_unstable_comparison = 0
     total_regressions = 0
     total_comparison_points = 0
+    total_latency_confirmed_regressions = 0
+    latency_confirmed_regression_details = []  # Track specific test details
     noise_waterline = 3
     progress = tqdm(unit="benchmark time-series", total=len(test_names))
     for test_name in test_names:
@@ -901,10 +1094,243 @@ def from_rts_to_regression_table(
             logging.error("Detected a ZeroDivisionError. {}".format(e.__str__()))
             pass
         unstable = False
+        unstable_baseline = False
+        unstable_comparison = False
+        latency_confirms_regression = False
+
         if baseline_v != "N/A" and comparison_v != "N/A":
             if comparison_pct_change > 10.0 or baseline_pct_change > 10.0:
-                note = "UNSTABLE (very high variance)"
                 unstable = True
+                unstable_baseline = baseline_pct_change > 10.0
+                unstable_comparison = comparison_pct_change > 10.0
+
+                # Build detailed unstable note
+                unstable_parts = []
+                if unstable_baseline and unstable_comparison:
+                    unstable_parts.append(
+                        "UNSTABLE (baseline & comparison high variance)"
+                    )
+                elif unstable_baseline:
+                    unstable_parts.append("UNSTABLE (baseline high variance)")
+                elif unstable_comparison:
+                    unstable_parts.append("UNSTABLE (comparison high variance)")
+
+                note = unstable_parts[0]
+
+                # Log detailed warning about unstable data detection
+                logging.warning(
+                    f"UNSTABLE DATA DETECTED for test '{test_name}': "
+                    f"baseline variance={baseline_pct_change:.1f}%, "
+                    f"comparison variance={comparison_pct_change:.1f}% "
+                    f"(threshold=10.0%)"
+                )
+
+                # For throughput metrics (higher-better), check both server-side and client-side latency
+                if metric_mode == "higher-better":
+                    logging.info(
+                        f"Performing 2nd-level latency validation for unstable throughput metric '{test_name}' "
+                        f"(metric_mode={metric_mode})"
+                    )
+
+                    # Check server-side p50 latency
+                    (
+                        server_latency_note,
+                        server_confirms_regression,
+                        server_regression_details,
+                    ) = check_latency_for_unstable_throughput(
+                        rts,
+                        test_name,
+                        baseline_str,
+                        comparison_str,
+                        by_str_baseline,
+                        by_str_comparison,
+                        baseline_deployment_name,
+                        comparison_deployment_name,
+                        tf_triggering_env,
+                        from_ts_ms,
+                        to_ts_ms,
+                        last_n_baseline,
+                        last_n_comparison,
+                        first_n_baseline,
+                        first_n_comparison,
+                        running_platform,
+                        baseline_architecture,
+                        comparison_architecture,
+                        verbose,
+                    )
+
+                    # Check client-side latency metrics
+                    (
+                        client_latency_note,
+                        client_confirms_regression,
+                        client_regression_details,
+                    ) = check_client_side_latency(
+                        rts,
+                        test_name,
+                        baseline_str,
+                        comparison_str,
+                        by_str_baseline,
+                        by_str_comparison,
+                        baseline_deployment_name,
+                        comparison_deployment_name,
+                        tf_triggering_env,
+                        from_ts_ms,
+                        to_ts_ms,
+                        last_n_baseline,
+                        last_n_comparison,
+                        first_n_baseline,
+                        first_n_comparison,
+                        running_platform,
+                        baseline_architecture,
+                        comparison_architecture,
+                        verbose,
+                    )
+
+                    # Combine results from both server and client side
+                    combined_latency_notes = []
+                    if server_latency_note:
+                        combined_latency_notes.append(f"server: {server_latency_note}")
+                    if client_latency_note:
+                        combined_latency_notes.append(f"client: {client_latency_note}")
+
+                    # Only confirm regression if BOTH server and client side show evidence AND data is stable enough
+                    # Check if either server or client data contains unstable indicators
+                    server_has_unstable = (
+                        server_latency_note and "UNSTABLE" in server_latency_note
+                    )
+                    client_has_unstable = (
+                        client_latency_note and "UNSTABLE" in client_latency_note
+                    )
+
+                    # Don't confirm regression if either side has unstable data
+                    if server_has_unstable or client_has_unstable:
+                        both_confirm_regression = False
+                        unstable_sides = []
+                        if server_has_unstable:
+                            unstable_sides.append("server")
+                        if client_has_unstable:
+                            unstable_sides.append("client")
+                        blocked_note = f"regression blocked due to unstable {' and '.join(unstable_sides)} latency data"
+                        note += f"; {blocked_note}"
+                        logging.info(
+                            f"Blocking regression confirmation for '{test_name}' due to unstable latency data"
+                        )
+                        if server_has_unstable:
+                            logging.info(f" Server-side latency data is unstable")
+                        if client_has_unstable:
+                            logging.info(f" Client-side latency data is unstable")
+                    else:
+                        both_confirm_regression = (
+                            server_confirms_regression and client_confirms_regression
+                        )
+
+                    if combined_latency_notes:
+                        combined_note = "; ".join(combined_latency_notes)
+                        note += f"; {combined_note}"
+                        logging.info(
+                            f"Combined latency check result for '{test_name}': {combined_note}"
+                        )
+
+                        if both_confirm_regression:
+                            logging.info(
+                                f"BOTH server and client latency analysis CONFIRM regression for '{test_name}'"
+                            )
+
+                            # Set the flag for counting confirmed regressions
+                            latency_confirms_regression = True
+
+                            # Combine regression details from both server and client
+                            combined_regression_details = (
+                                server_regression_details or client_regression_details
+                            )
+                            if combined_regression_details:
+                                combined_regression_details[
+                                    "server_side"
+                                ] = server_confirms_regression
+                                combined_regression_details[
+                                    "client_side"
+                                ] = client_confirms_regression
+
+                                # 2nd level confirmation is sufficient - always add to confirmed regressions
+                                logging.info(
+                                    f"Adding '{test_name}' to confirmed regressions based on 2nd level validation"
+                                )
+
+                                # Perform 3rd-level analysis: variance + p99 check for additional confidence scoring
+                                logging.info(
+                                    f"Performing 3rd-level analysis (variance + p99) for confidence scoring on '{test_name}'"
+                                )
+                                (
+                                    confidence_note,
+                                    high_confidence,
+                                ) = perform_variance_and_p99_analysis(
+                                    rts,
+                                    test_name,
+                                    baseline_str,
+                                    comparison_str,
+                                    by_str_baseline,
+                                    by_str_comparison,
+                                    baseline_deployment_name,
+                                    comparison_deployment_name,
+                                    tf_triggering_env,
+                                    from_ts_ms,
+                                    to_ts_ms,
+                                    last_n_baseline,
+                                    last_n_comparison,
+                                    first_n_baseline,
+                                    first_n_comparison,
+                                    running_platform,
+                                    baseline_architecture,
+                                    comparison_architecture,
+                                    verbose,
+                                )
+
+                                if confidence_note:
+                                    note += f"; {confidence_note}"
+                                    logging.info(
+                                        f"Confidence analysis for '{test_name}': {confidence_note}"
+                                    )
+                                    # Use 3rd level confidence if available
+                                    combined_regression_details[
+                                        "high_confidence"
+                                    ] = high_confidence
+                                else:
+                                    # No 3rd level data available - default to moderate confidence since 2nd level confirmed
+                                    logging.info(
+                                        f"No 3rd level data available for '{test_name}' - using 2nd level confirmation"
+                                    )
+                                    combined_regression_details[
+                                        "high_confidence"
+                                    ] = True  # 2nd level confirmation is reliable
+
+                                # Always add to confirmed regressions when 2nd level confirms
+                                latency_confirmed_regression_details.append(
+                                    combined_regression_details
+                                )
+                        elif server_confirms_regression or client_confirms_regression:
+                            side_confirmed = (
+                                "server" if server_confirms_regression else "client"
+                            )
+                            side_not_confirmed = (
+                                "client" if server_confirms_regression else "server"
+                            )
+                            insufficient_evidence_note = f"only {side_confirmed} side confirms regression ({side_not_confirmed} side stable) - insufficient evidence"
+                            note += f"; {insufficient_evidence_note}"
+                            logging.info(
+                                f"Only {side_confirmed} side confirms regression for '{test_name}' - insufficient evidence"
+                            )
+                        else:
+                            no_regression_note = (
+                                "neither server nor client side confirms regression"
+                            )
+                            note += f"; {no_regression_note}"
+                            logging.info(
+                                f"Neither server nor client side confirms regression for '{test_name}'"
+                            )
+                    else:
+                        logging.info(
+                            f"No latency data available for secondary check on '{test_name}'"
+                        )

         baseline_v_str = prepare_value_str(
             baseline_pct_change, baseline_v, baseline_values, simplify_table
@@ -959,6 +1385,12 @@ def from_rts_to_regression_table(

         if unstable:
             total_unstable += 1
+            if unstable_baseline:
+                total_unstable_baseline += 1
+            if unstable_comparison:
+                total_unstable_comparison += 1
+            if latency_confirms_regression:
+                total_latency_confirmed_regressions += 1

         should_add_line = False
         if print_regressions_only and detected_regression:
@@ -979,6 +1411,13 @@ def from_rts_to_regression_table(
                 percentage_change,
                 table,
                 test_name,
+                grafana_link_base,
+                baseline_branch,
+                baseline_tag,
+                comparison_branch,
+                comparison_tag,
+                from_date,
+                to_date,
             )
     return (
         detected_regressions,
@@ -988,9 +1427,995 @@ def from_rts_to_regression_table(
|
|
|
988
1427
|
total_stable,
|
|
989
1428
|
total_unstable,
|
|
990
1429
|
total_comparison_points,
|
|
1430
|
+
total_unstable_baseline,
|
|
1431
|
+
total_unstable_comparison,
|
|
1432
|
+
total_latency_confirmed_regressions,
|
|
1433
|
+
latency_confirmed_regression_details,
|
|
991
1434
|
)
|
|
992
1435
|
|
|
993
1436
|
|
|
1437
|
+
def check_client_side_latency(
|
|
1438
|
+
rts,
|
|
1439
|
+
test_name,
|
|
1440
|
+
baseline_str,
|
|
1441
|
+
comparison_str,
|
|
1442
|
+
by_str_baseline,
|
|
1443
|
+
by_str_comparison,
|
|
1444
|
+
baseline_deployment_name,
|
|
1445
|
+
comparison_deployment_name,
|
|
1446
|
+
tf_triggering_env,
|
|
1447
|
+
from_ts_ms,
|
|
1448
|
+
to_ts_ms,
|
|
1449
|
+
last_n_baseline,
|
|
1450
|
+
last_n_comparison,
|
|
1451
|
+
first_n_baseline,
|
|
1452
|
+
first_n_comparison,
|
|
1453
|
+
running_platform,
|
|
1454
|
+
baseline_architecture,
|
|
1455
|
+
comparison_architecture,
|
|
1456
|
+
verbose=False,
|
|
1457
|
+
):
|
|
1458
|
+
"""
|
|
1459
|
+
Check client-side latency metrics to provide additional validation for regression detection.
|
|
1460
|
+
|
|
1461
|
+
Returns:
|
|
1462
|
+
tuple: (note_string, confirms_regression_bool, regression_details_dict)
|
|
1463
|
+
"""
|
|
1464
|
+
logging.info(f"Starting client-side latency check for test: {test_name}")
|
|
1465
|
+
try:
|
|
1466
|
+
# Client-side latency metrics to check
|
|
1467
|
+
client_metrics = [
|
|
1468
|
+
"p50_latency_ms",
|
|
1469
|
+
"Latency",
|
|
1470
|
+
"OverallQuantiles.allCommands.q50",
|
|
1471
|
+
"Tests.INSERT.AverageLatency_us_",
|
|
1472
|
+
"Tests.READ.AverageLatency_us_",
|
|
1473
|
+
"Tests.SEARCH.AverageLatency_us_",
|
|
1474
|
+
"Tests.UPDATE.AverageLatency_us_",
|
|
1475
|
+
]
|
|
1476
|
+
|
|
1477
|
+
client_latency_notes = []
|
|
1478
|
+
significant_client_latency_increases = 0
|
|
1479
|
+
regression_details = {"test_name": test_name, "commands": []}
|
|
1480
|
+
|
|
1481
|
+
for metric in client_metrics:
|
|
1482
|
+
# Build filters for client-side latency metric
|
|
1483
|
+
filters_baseline = [
|
|
1484
|
+
f"{by_str_baseline}={baseline_str}",
|
|
1485
|
+
f"metric={metric}",
|
|
1486
|
+
f"test_name={test_name}",
|
|
1487
|
+
f"deployment_name={baseline_deployment_name}",
|
|
1488
|
+
f"triggering_env={tf_triggering_env}",
|
|
1489
|
+
]
|
|
1490
|
+
filters_comparison = [
|
|
1491
|
+
f"{by_str_comparison}={comparison_str}",
|
|
1492
|
+
f"metric={metric}",
|
|
1493
|
+
f"test_name={test_name}",
|
|
1494
|
+
f"deployment_name={comparison_deployment_name}",
|
|
1495
|
+
f"triggering_env={tf_triggering_env}",
|
|
1496
|
+
]
|
|
1497
|
+
|
|
1498
|
+
# Add optional filters
|
|
1499
|
+
if running_platform is not None:
|
|
1500
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
1501
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
1502
|
+
if baseline_architecture != ARCH_X86:
|
|
1503
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
1504
|
+
if comparison_architecture != ARCH_X86:
|
|
1505
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
1506
|
+
|
|
1507
|
+
# Query for client-side latency time-series
|
|
1508
|
+
baseline_client_ts = rts.ts().queryindex(filters_baseline)
|
|
1509
|
+
comparison_client_ts = rts.ts().queryindex(filters_comparison)
|
|
1510
|
+
|
|
1511
|
+
if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
|
|
1512
|
+
if verbose:
|
|
1513
|
+
logging.info(
|
|
1514
|
+
f" No client-side data found for metric '{metric}' in {test_name}"
|
|
1515
|
+
)
|
|
1516
|
+
continue
|
|
1517
|
+
|
|
1518
|
+
logging.info(
|
|
1519
|
+
f" Found client-side metric '{metric}': {len(baseline_client_ts)} baseline, {len(comparison_client_ts)} comparison time-series"
|
|
1520
|
+
)
|
|
1521
|
+
|
|
1522
|
+
# Filter out target time-series
|
|
1523
|
+
baseline_client_ts = [ts for ts in baseline_client_ts if "target" not in ts]
|
|
1524
|
+
comparison_client_ts = [
|
|
1525
|
+
ts for ts in comparison_client_ts if "target" not in ts
|
|
1526
|
+
]
|
|
1527
|
+
|
|
1528
|
+
if len(baseline_client_ts) == 0 or len(comparison_client_ts) == 0:
|
|
1529
|
+
continue
|
|
1530
|
+
|
|
1531
|
+
# Use the first available time-series for each side
|
|
1532
|
+
baseline_ts = baseline_client_ts[0]
|
|
1533
|
+
comparison_ts = comparison_client_ts[0]
|
|
1534
|
+
|
|
1535
|
+
# Get client-side latency data
|
|
1536
|
+
baseline_client_data = rts.ts().revrange(baseline_ts, from_ts_ms, to_ts_ms)
|
|
1537
|
+
comparison_client_data = rts.ts().revrange(
|
|
1538
|
+
comparison_ts, from_ts_ms, to_ts_ms
|
|
1539
|
+
)
|
|
1540
|
+
|
|
1541
|
+
if len(baseline_client_data) == 0 or len(comparison_client_data) == 0:
|
|
1542
|
+
if verbose:
|
|
1543
|
+
logging.info(
|
|
1544
|
+
f" No data points for metric '{metric}': baseline={len(baseline_client_data)}, comparison={len(comparison_client_data)}"
|
|
1545
|
+
)
|
|
1546
|
+
continue
|
|
1547
|
+
|
|
1548
|
+
# Calculate client-side latency statistics
|
|
1549
|
+
baseline_client_values = []
|
|
1550
|
+
comparison_client_values = []
|
|
1551
|
+
|
|
1552
|
+
(_, baseline_client_median, _) = get_v_pct_change_and_largest_var(
|
|
1553
|
+
baseline_client_data,
|
|
1554
|
+
0,
|
|
1555
|
+
0,
|
|
1556
|
+
baseline_client_values,
|
|
1557
|
+
0,
|
|
1558
|
+
last_n_baseline,
|
|
1559
|
+
verbose,
|
|
1560
|
+
first_n_baseline,
|
|
1561
|
+
)
|
|
1562
|
+
|
|
1563
|
+
(_, comparison_client_median, _) = get_v_pct_change_and_largest_var(
|
|
1564
|
+
comparison_client_data,
|
|
1565
|
+
0,
|
|
1566
|
+
0,
|
|
1567
|
+
comparison_client_values,
|
|
1568
|
+
0,
|
|
1569
|
+
last_n_comparison,
|
|
1570
|
+
verbose,
|
|
1571
|
+
first_n_comparison,
|
|
1572
|
+
)
|
|
1573
|
+
|
|
1574
|
+
if baseline_client_median == "N/A" or comparison_client_median == "N/A":
|
|
1575
|
+
if verbose:
|
|
1576
|
+
logging.info(
|
|
1577
|
+
f" Could not calculate median for metric '{metric}': baseline={baseline_client_median}, comparison={comparison_client_median}"
|
|
1578
|
+
)
|
|
1579
|
+
continue
|
|
1580
|
+
|
|
1581
|
+
# Calculate variance (coefficient of variation) for both baseline and comparison
|
|
1582
|
+
baseline_client_mean = (
|
|
1583
|
+
statistics.mean(baseline_client_values) if baseline_client_values else 0
|
|
1584
|
+
)
|
|
1585
|
+
baseline_client_stdev = (
|
|
1586
|
+
statistics.stdev(baseline_client_values)
|
|
1587
|
+
if len(baseline_client_values) > 1
|
|
1588
|
+
else 0
|
|
1589
|
+
)
|
|
1590
|
+
baseline_client_cv = (
|
|
1591
|
+
(baseline_client_stdev / baseline_client_mean * 100)
|
|
1592
|
+
if baseline_client_mean > 0
|
|
1593
|
+
else float("inf")
|
|
1594
|
+
)
|
|
1595
|
+
|
|
1596
|
+
comparison_client_mean = (
|
|
1597
|
+
statistics.mean(comparison_client_values)
|
|
1598
|
+
if comparison_client_values
|
|
1599
|
+
else 0
|
|
1600
|
+
)
|
|
1601
|
+
comparison_client_stdev = (
|
|
1602
|
+
statistics.stdev(comparison_client_values)
|
|
1603
|
+
if len(comparison_client_values) > 1
|
|
1604
|
+
else 0
|
|
1605
|
+
)
|
|
1606
|
+
comparison_client_cv = (
|
|
1607
|
+
(comparison_client_stdev / comparison_client_mean * 100)
|
|
1608
|
+
if comparison_client_mean > 0
|
|
1609
|
+
else float("inf")
|
|
1610
|
+
)
|
|
1611
|
+
|
|
1612
|
+
# Calculate client-side latency change (for latency, higher is worse)
|
|
1613
|
+
client_latency_change = (
|
|
1614
|
+
float(comparison_client_median) / float(baseline_client_median) - 1
|
|
1615
|
+
) * 100.0
|
|
1616
|
+
|
|
1617
|
+
logging.info(
|
|
1618
|
+
f" Client metric '{metric}': baseline={baseline_client_median:.2f} (CV={baseline_client_cv:.1f}%), comparison={comparison_client_median:.2f} (CV={comparison_client_cv:.1f}%), change={client_latency_change:.1f}%"
|
|
1619
|
+
)
|
|
1620
|
+
|
|
1621
|
+
# Check if client latency data is too unstable to be reliable
|
|
1622
|
+
client_data_unstable = (
|
|
1623
|
+
baseline_client_cv > 50.0 or comparison_client_cv > 50.0
|
|
1624
|
+
)
|
|
1625
|
+
|
|
1626
|
+
if client_data_unstable:
|
|
1627
|
+
# Mark as unstable client latency data
|
|
1628
|
+
unstable_reason = []
|
|
1629
|
+
if baseline_client_cv > 50.0:
|
|
1630
|
+
unstable_reason.append(f"baseline CV={baseline_client_cv:.1f}%")
|
|
1631
|
+
if comparison_client_cv > 50.0:
|
|
1632
|
+
unstable_reason.append(f"comparison CV={comparison_client_cv:.1f}%")
|
|
1633
|
+
|
|
1634
|
+
client_latency_notes.append(
|
|
1635
|
+
f"{metric} UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
|
|
1636
|
+
)
|
|
1637
|
+
logging.warning(
|
|
1638
|
+
f" Client metric '{metric}': UNSTABLE latency data detected - {', '.join(unstable_reason)}"
|
|
1639
|
+
)
|
|
1640
|
+
elif (
|
|
1641
|
+
abs(client_latency_change) > 5.0
|
|
1642
|
+
): # Only report significant client latency changes for stable data
|
|
1643
|
+
direction = "increased" if client_latency_change > 0 else "decreased"
|
|
1644
|
+
|
|
1645
|
+
# Adjust significance threshold based on baseline variance
|
|
1646
|
+
if baseline_client_cv < 30.0:
|
|
1647
|
+
# Low variance - use standard threshold
|
|
1648
|
+
significance_threshold = 10.0
|
|
1649
|
+
elif baseline_client_cv < 50.0:
|
|
1650
|
+
# Moderate variance - require larger change
|
|
1651
|
+
significance_threshold = 15.0
|
|
1652
|
+
else:
|
|
1653
|
+
# High variance - require much larger change
|
|
1654
|
+
significance_threshold = 25.0
|
|
1655
|
+
|
|
1656
|
+
client_latency_notes.append(
|
|
1657
|
+
f"{metric} {direction} {abs(client_latency_change):.1f}% (baseline CV={baseline_client_cv:.1f}%)"
|
|
1658
|
+
)
|
|
1659
|
+
logging.info(
|
|
1660
|
+
f" Client metric '{metric}': SIGNIFICANT latency change detected ({direction} {abs(client_latency_change):.1f}%, baseline CV={baseline_client_cv:.1f}%)"
|
|
1661
|
+
)
|
|
1662
|
+
|
|
1663
|
+
# Track significant client latency increases (potential regression confirmation)
|
|
1664
|
+
if client_latency_change > significance_threshold:
|
|
1665
|
+
significant_client_latency_increases += 1
|
|
1666
|
+
regression_details["commands"].append(
|
|
1667
|
+
{
|
|
1668
|
+
"command": metric,
|
|
1669
|
+
"change_percent": client_latency_change,
|
|
1670
|
+
"direction": direction,
|
|
1671
|
+
"baseline_cv": baseline_client_cv,
|
|
1672
|
+
"comparison_cv": comparison_client_cv,
|
|
1673
|
+
}
|
|
1674
|
+
)
|
|
1675
|
+
logging.info(
|
|
1676
|
+
f" Client metric '{metric}': CONFIRMS regression (change={client_latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
|
|
1677
|
+
)
|
|
1678
|
+
else:
|
|
1679
|
+
logging.info(
|
|
1680
|
+
f" Client metric '{metric}': Change below significance threshold (change={client_latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
|
|
1681
|
+
)
|
|
1682
|
+
elif verbose:
|
|
1683
|
+
client_latency_notes.append(
|
|
1684
|
+
f"{metric} stable (CV={baseline_client_cv:.1f}%)"
|
|
1685
|
+
)
|
|
1686
|
+
logging.info(
|
|
1687
|
+
f" Client metric '{metric}': latency stable (change={client_latency_change:.1f}%, baseline CV={baseline_client_cv:.1f}%)"
|
|
1688
|
+
)
|
|
1689
|
+
|
|
1690
|
+
# Determine if client-side latency confirms regression
|
|
1691
|
+
confirms_regression = significant_client_latency_increases > 0
|
|
1692
|
+
|
|
1693
|
+
# Return combined client latency notes
|
|
1694
|
+
if client_latency_notes:
|
|
1695
|
+
result = "; ".join(client_latency_notes)
|
|
1696
|
+
logging.info(
|
|
1697
|
+
f"Client-side latency check completed for {test_name}: {result}"
|
|
1698
|
+
)
|
|
1699
|
+
return (
|
|
1700
|
+
result,
|
|
1701
|
+
confirms_regression,
|
|
1702
|
+
regression_details if confirms_regression else None,
|
|
1703
|
+
)
|
|
1704
|
+
else:
|
|
1705
|
+
result = "client latency stable" if len(client_metrics) > 0 else None
|
|
1706
|
+
logging.info(
|
|
1707
|
+
f"Client-side latency check completed for {test_name}: {result or 'no data'}"
|
|
1708
|
+
)
|
|
1709
|
+
return result, False, None
|
|
1710
|
+
|
|
1711
|
+
except Exception as e:
|
|
1712
|
+
logging.error(f"Error checking client-side latency for {test_name}: {e}")
|
|
1713
|
+
return None, False, None
|
|
1714
|
+
|
|
1715
|
+
|
|
1716
|
+
def perform_variance_and_p99_analysis(
|
|
1717
|
+
rts,
|
|
1718
|
+
test_name,
|
|
1719
|
+
baseline_str,
|
|
1720
|
+
comparison_str,
|
|
1721
|
+
by_str_baseline,
|
|
1722
|
+
by_str_comparison,
|
|
1723
|
+
baseline_deployment_name,
|
|
1724
|
+
comparison_deployment_name,
|
|
1725
|
+
tf_triggering_env,
|
|
1726
|
+
from_ts_ms,
|
|
1727
|
+
to_ts_ms,
|
|
1728
|
+
last_n_baseline,
|
|
1729
|
+
last_n_comparison,
|
|
1730
|
+
first_n_baseline,
|
|
1731
|
+
first_n_comparison,
|
|
1732
|
+
running_platform,
|
|
1733
|
+
baseline_architecture,
|
|
1734
|
+
comparison_architecture,
|
|
1735
|
+
verbose=False,
|
|
1736
|
+
):
|
|
1737
|
+
"""
|
|
1738
|
+
Perform 3rd-level analysis using variance and p99 metrics to assess confidence in regression detection.
|
|
1739
|
+
|
|
1740
|
+
Returns:
|
|
1741
|
+
tuple: (confidence_note, high_confidence_bool)
|
|
1742
|
+
"""
|
|
1743
|
+
try:
|
|
1744
|
+
logging.info(f"Starting variance and p99 analysis for {test_name}")
|
|
1745
|
+
|
|
1746
|
+
# Build filters for p99 latency metric using both metric=p99 and metric-type=(latencystats)
|
|
1747
|
+
filters_baseline = [
|
|
1748
|
+
f"{by_str_baseline}={baseline_str}",
|
|
1749
|
+
"metric=p99",
|
|
1750
|
+
"metric-type=(latencystats)",
|
|
1751
|
+
f"test_name={test_name}",
|
|
1752
|
+
f"deployment_name={baseline_deployment_name}",
|
|
1753
|
+
f"triggering_env={tf_triggering_env}",
|
|
1754
|
+
]
|
|
1755
|
+
filters_comparison = [
|
|
1756
|
+
f"{by_str_comparison}={comparison_str}",
|
|
1757
|
+
"metric=p99",
|
|
1758
|
+
"metric-type=(latencystats)",
|
|
1759
|
+
f"test_name={test_name}",
|
|
1760
|
+
f"deployment_name={comparison_deployment_name}",
|
|
1761
|
+
f"triggering_env={tf_triggering_env}",
|
|
1762
|
+
]
|
|
1763
|
+
|
|
1764
|
+
# Add optional filters
|
|
1765
|
+
if running_platform is not None:
|
|
1766
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
1767
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
1768
|
+
if baseline_architecture != ARCH_X86:
|
|
1769
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
1770
|
+
if comparison_architecture != ARCH_X86:
|
|
1771
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
1772
|
+
|
|
1773
|
+
# Query for p99 latency time-series
|
|
1774
|
+
logging.info(f"Querying p99 latencystats time-series for {test_name}")
|
|
1775
|
+
baseline_p99_ts = rts.ts().queryindex(filters_baseline)
|
|
1776
|
+
comparison_p99_ts = rts.ts().queryindex(filters_comparison)
|
|
1777
|
+
|
|
1778
|
+
logging.info(f"Found {len(baseline_p99_ts)} baseline p99 latency time-series")
|
|
1779
|
+
logging.info(
|
|
1780
|
+
f"Found {len(comparison_p99_ts)} comparison p99 latency time-series"
|
|
1781
|
+
)
|
|
1782
|
+
|
|
1783
|
+
# Filter out target time-series and unwanted commands (reuse existing function)
|
|
1784
|
+
def should_exclude_timeseries(ts_name):
|
|
1785
|
+
"""Check if time-series should be excluded based on command"""
|
|
1786
|
+
if "target" in ts_name:
|
|
1787
|
+
return True
|
|
1788
|
+
ts_name_lower = ts_name.lower()
|
|
1789
|
+
excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
|
|
1790
|
+
return any(cmd in ts_name_lower for cmd in excluded_commands)
|
|
1791
|
+
|
|
1792
|
+
baseline_p99_ts = [
|
|
1793
|
+
ts for ts in baseline_p99_ts if not should_exclude_timeseries(ts)
|
|
1794
|
+
]
|
|
1795
|
+
comparison_p99_ts = [
|
|
1796
|
+
ts for ts in comparison_p99_ts if not should_exclude_timeseries(ts)
|
|
1797
|
+
]
|
|
1798
|
+
|
|
1799
|
+
if len(baseline_p99_ts) == 0 or len(comparison_p99_ts) == 0:
|
|
1800
|
+
logging.warning(
|
|
1801
|
+
f"No p99 latency data found for {test_name} after filtering"
|
|
1802
|
+
)
|
|
1803
|
+
return None, False
|
|
1804
|
+
|
|
1805
|
+
# Extract command names from time-series (reuse existing function)
|
|
1806
|
+
def extract_command_from_ts(ts_name):
|
|
1807
|
+
"""Extract meaningful command name from time-series name"""
|
|
1808
|
+
# Look for latencystats_latency_percentiles_usec_<COMMAND>_p99 pattern
|
|
1809
|
+
match = re.search(
|
|
1810
|
+
r"latencystats_latency_percentiles_usec_([^_/]+)_p99", ts_name
|
|
1811
|
+
)
|
|
1812
|
+
if match:
|
|
1813
|
+
return match.group(1)
|
|
1814
|
+
# Look for command= pattern in the time-series name
|
|
1815
|
+
match = re.search(r"command=([^/]+)", ts_name)
|
|
1816
|
+
if match:
|
|
1817
|
+
return match.group(1)
|
|
1818
|
+
# If no specific pattern found, try to extract from the end of the path
|
|
1819
|
+
parts = ts_name.split("/")
|
|
1820
|
+
if len(parts) > 0:
|
|
1821
|
+
return parts[-1]
|
|
1822
|
+
return "unknown"
|
|
1823
|
+
|
|
1824
|
+
# Group time-series by command
|
|
1825
|
+
baseline_by_command = {}
|
|
1826
|
+
comparison_by_command = {}
|
|
1827
|
+
|
|
1828
|
+
for ts in baseline_p99_ts:
|
|
1829
|
+
cmd = extract_command_from_ts(ts)
|
|
1830
|
+
if cmd not in baseline_by_command:
|
|
1831
|
+
baseline_by_command[cmd] = []
|
|
1832
|
+
baseline_by_command[cmd].append(ts)
|
|
1833
|
+
|
|
1834
|
+
for ts in comparison_p99_ts:
|
|
1835
|
+
cmd = extract_command_from_ts(ts)
|
|
1836
|
+
if cmd not in comparison_by_command:
|
|
1837
|
+
comparison_by_command[cmd] = []
|
|
1838
|
+
comparison_by_command[cmd].append(ts)
|
|
1839
|
+
|
|
1840
|
+
# Find common commands between baseline and comparison
|
|
1841
|
+
common_commands = set(baseline_by_command.keys()) & set(
|
|
1842
|
+
comparison_by_command.keys()
|
|
1843
|
+
)
|
|
1844
|
+
|
|
1845
|
+
if not common_commands:
|
|
1846
|
+
logging.warning(
|
|
1847
|
+
f"No common commands found for p99 variance analysis in {test_name}"
|
|
1848
|
+
)
|
|
1849
|
+
return None, False
|
|
1850
|
+
|
|
1851
|
+
variance_notes = []
|
|
1852
|
+
p99_notes = []
|
|
1853
|
+
high_confidence_indicators = 0
|
|
1854
|
+
total_indicators = 0
|
|
1855
|
+
|
|
1856
|
+
# Analyze variance and p99 for each command
|
|
1857
|
+
for command in sorted(common_commands):
|
|
1858
|
+
total_indicators += 1
|
|
1859
|
+
logging.info(f"Analyzing p99 variance for command: {command}")
|
|
1860
|
+
|
|
1861
|
+
baseline_ts_list = baseline_by_command[command]
|
|
1862
|
+
comparison_ts_list = comparison_by_command[command]
|
|
1863
|
+
|
|
1864
|
+
# If multiple time-series for the same command, try to get the best one
|
|
1865
|
+
if len(baseline_ts_list) > 1:
|
|
1866
|
+
baseline_ts_list = get_only_Totals(baseline_ts_list)
|
|
1867
|
+
if len(comparison_ts_list) > 1:
|
|
1868
|
+
comparison_ts_list = get_only_Totals(comparison_ts_list)
|
|
1869
|
+
|
|
1870
|
+
if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
|
|
1871
|
+
logging.warning(
|
|
1872
|
+
f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
|
|
1873
|
+
)
|
|
1874
|
+
continue
|
|
1875
|
+
|
|
1876
|
+
# Get p99 latency data for this command
|
|
1877
|
+
baseline_p99_data = []
|
|
1878
|
+
comparison_p99_data = []
|
|
1879
|
+
|
|
1880
|
+
for ts_name in baseline_ts_list:
|
|
1881
|
+
datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
|
|
1882
|
+
baseline_p99_data.extend(datapoints)
|
|
1883
|
+
|
|
1884
|
+
for ts_name in comparison_ts_list:
|
|
1885
|
+
datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
|
|
1886
|
+
comparison_p99_data.extend(datapoints)
|
|
1887
|
+
|
|
1888
|
+
if len(baseline_p99_data) < 3 or len(comparison_p99_data) < 3:
|
|
1889
|
+
logging.warning(
|
|
1890
|
+
f" Insufficient p99 data for {command}: baseline={len(baseline_p99_data)}, comparison={len(comparison_p99_data)} datapoints"
|
|
1891
|
+
)
|
|
1892
|
+
continue
|
|
1893
|
+
|
|
1894
|
+
# Extract values for variance calculation
|
|
1895
|
+
baseline_values = [dp[1] for dp in baseline_p99_data]
|
|
1896
|
+
comparison_values = [dp[1] for dp in comparison_p99_data]
|
|
1897
|
+
|
|
1898
|
+
# Calculate variance (coefficient of variation)
|
|
1899
|
+
baseline_mean = statistics.mean(baseline_values)
|
|
1900
|
+
baseline_stdev = (
|
|
1901
|
+
statistics.stdev(baseline_values) if len(baseline_values) > 1 else 0
|
|
1902
|
+
)
|
|
1903
|
+
baseline_cv = (
|
|
1904
|
+
(baseline_stdev / baseline_mean * 100)
|
|
1905
|
+
if baseline_mean > 0
|
|
1906
|
+
else float("inf")
|
|
1907
|
+
)
|
|
1908
|
+
|
|
1909
|
+
comparison_mean = statistics.mean(comparison_values)
|
|
1910
|
+
comparison_stdev = (
|
|
1911
|
+
statistics.stdev(comparison_values) if len(comparison_values) > 1 else 0
|
|
1912
|
+
)
|
|
1913
|
+
comparison_cv = (
|
|
1914
|
+
(comparison_stdev / comparison_mean * 100)
|
|
1915
|
+
if comparison_mean > 0
|
|
1916
|
+
else float("inf")
|
|
1917
|
+
)
|
|
1918
|
+
|
|
1919
|
+
# Calculate p99 change
|
|
1920
|
+
p99_change = (
|
|
1921
|
+
((comparison_mean - baseline_mean) / baseline_mean * 100)
|
|
1922
|
+
if baseline_mean > 0
|
|
1923
|
+
else 0
|
|
1924
|
+
)
|
|
1925
|
+
|
|
1926
|
+
# Assess confidence based on variance and p99 change
|
|
1927
|
+
if baseline_cv < 30: # Low variance in baseline (< 30% CV)
|
|
1928
|
+
if abs(p99_change) > 15: # Significant p99 change
|
|
1929
|
+
high_confidence_indicators += 1
|
|
1930
|
+
p99_notes.append(
|
|
1931
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline)"
|
|
1932
|
+
)
|
|
1933
|
+
else:
|
|
1934
|
+
p99_notes.append(
|
|
1935
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (stable baseline, minor change)"
|
|
1936
|
+
)
|
|
1937
|
+
elif baseline_cv < 50: # Moderate variance
|
|
1938
|
+
if abs(p99_change) > 25: # Need larger change for confidence
|
|
1939
|
+
high_confidence_indicators += 1
|
|
1940
|
+
p99_notes.append(
|
|
1941
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance)"
|
|
1942
|
+
)
|
|
1943
|
+
else:
|
|
1944
|
+
p99_notes.append(
|
|
1945
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (moderate baseline variance, uncertain)"
|
|
1946
|
+
)
|
|
1947
|
+
else: # High variance
|
|
1948
|
+
if abs(p99_change) > 40: # Need very large change for confidence
|
|
1949
|
+
high_confidence_indicators += 1
|
|
1950
|
+
p99_notes.append(
|
|
1951
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, large change)"
|
|
1952
|
+
)
|
|
1953
|
+
else:
|
|
1954
|
+
p99_notes.append(
|
|
1955
|
+
f"{command} p99 {'+' if p99_change > 0 else ''}{p99_change:.1f}% (high baseline variance, low confidence)"
|
|
1956
|
+
)
|
|
1957
|
+
|
|
1958
|
+
variance_notes.append(f"{command} baseline CV={baseline_cv:.1f}%")
|
|
1959
|
+
|
|
1960
|
+
if verbose:
|
|
1961
|
+
logging.info(
|
|
1962
|
+
f" Command {command}: baseline CV={baseline_cv:.1f}%, comparison CV={comparison_cv:.1f}%, p99 change={p99_change:.1f}%"
|
|
1963
|
+
)
|
|
1964
|
+
|
|
1965
|
+
# Determine overall confidence
|
|
1966
|
+
confidence_ratio = (
|
|
1967
|
+
high_confidence_indicators / total_indicators if total_indicators > 0 else 0
|
|
1968
|
+
)
|
|
1969
|
+
high_confidence = (
|
|
1970
|
+
confidence_ratio >= 0.5
|
|
1971
|
+
) # At least 50% of indicators show high confidence
|
|
1972
|
+
|
|
1973
|
+
# Create confidence note
|
|
1974
|
+
confidence_parts = []
|
|
1975
|
+
if variance_notes:
|
|
1976
|
+
confidence_parts.extend(variance_notes)
|
|
1977
|
+
if p99_notes:
|
|
1978
|
+
confidence_parts.extend(p99_notes)
|
|
1979
|
+
|
|
1980
|
+
confidence_note = "; ".join(confidence_parts) if confidence_parts else None
|
|
1981
|
+
|
|
1982
|
+
if confidence_note:
|
|
1983
|
+
confidence_level = "HIGH" if high_confidence else "LOW"
|
|
1984
|
+
cv_explanation = "CV=coefficient of variation (data stability: <30% stable, 30-50% moderate, >50% unstable)"
|
|
1985
|
+
confidence_note = (
|
|
1986
|
+
f"confidence={confidence_level} ({confidence_note}; {cv_explanation})"
|
|
1987
|
+
)
|
|
1988
|
+
|
|
1989
|
+
logging.info(
|
|
1990
|
+
f"Variance and p99 analysis completed for {test_name}: confidence={confidence_ratio:.2f}, high_confidence={high_confidence}"
|
|
1991
|
+
)
|
|
1992
|
+
return confidence_note, high_confidence
|
|
1993
|
+
|
|
1994
|
+
except Exception as e:
|
|
1995
|
+
logging.error(f"Error in variance and p99 analysis for {test_name}: {e}")
|
|
1996
|
+
return None, False
|
|
1997
|
+
|
|
1998
|
+
|
|
1999
|
+
def check_latency_for_unstable_throughput(
|
|
2000
|
+
rts,
|
|
2001
|
+
test_name,
|
|
2002
|
+
baseline_str,
|
|
2003
|
+
comparison_str,
|
|
2004
|
+
by_str_baseline,
|
|
2005
|
+
by_str_comparison,
|
|
2006
|
+
baseline_deployment_name,
|
|
2007
|
+
comparison_deployment_name,
|
|
2008
|
+
tf_triggering_env,
|
|
2009
|
+
from_ts_ms,
|
|
2010
|
+
to_ts_ms,
|
|
2011
|
+
last_n_baseline,
|
|
2012
|
+
last_n_comparison,
|
|
2013
|
+
first_n_baseline,
|
|
2014
|
+
first_n_comparison,
|
|
2015
|
+
running_platform,
|
|
2016
|
+
baseline_architecture,
|
|
2017
|
+
comparison_architecture,
|
|
2018
|
+
verbose,
|
|
2019
|
+
):
|
|
2020
|
+
"""
|
|
2021
|
+
Check latency (p50) for unstable throughput metrics to provide additional context.
|
|
2022
|
+
Returns a tuple: (note_string, confirms_regression_bool, regression_details_dict)
|
|
2023
|
+
"""
|
|
2024
|
+
logging.info(f"Starting latency check for unstable throughput test: {test_name}")
|
|
2025
|
+
try:
|
|
2026
|
+
# Build filters for p50 latency metric using both metric=p50 and metric-type=(latencystats)
|
|
2027
|
+
filters_baseline = [
|
|
2028
|
+
f"{by_str_baseline}={baseline_str}",
|
|
2029
|
+
"metric=p50",
|
|
2030
|
+
"metric-type=(latencystats)",
|
|
2031
|
+
f"test_name={test_name}",
|
|
2032
|
+
f"deployment_name={baseline_deployment_name}",
|
|
2033
|
+
f"triggering_env={tf_triggering_env}",
|
|
2034
|
+
]
|
|
2035
|
+
filters_comparison = [
|
|
2036
|
+
f"{by_str_comparison}={comparison_str}",
|
|
2037
|
+
"metric=p50",
|
|
2038
|
+
"metric-type=(latencystats)",
|
|
2039
|
+
f"test_name={test_name}",
|
|
2040
|
+
f"deployment_name={comparison_deployment_name}",
|
|
2041
|
+
f"triggering_env={tf_triggering_env}",
|
|
2042
|
+
]
|
|
2043
|
+
|
|
2044
|
+
# Add optional filters
|
|
2045
|
+
if running_platform is not None:
|
|
2046
|
+
filters_baseline.append(f"running_platform={running_platform}")
|
|
2047
|
+
filters_comparison.append(f"running_platform={running_platform}")
|
|
2048
|
+
if baseline_architecture != ARCH_X86:
|
|
2049
|
+
filters_baseline.append(f"arch={baseline_architecture}")
|
|
2050
|
+
if comparison_architecture != ARCH_X86:
|
|
2051
|
+
filters_comparison.append(f"arch={comparison_architecture}")
|
|
2052
|
+
|
|
2053
|
+
# Query for p50 latency time-series
|
|
2054
|
+
logging.info(f"Querying p50 latencystats time-series for {test_name}")
|
|
2055
|
+
logging.info(f"Baseline filters: {filters_baseline}")
|
|
2056
|
+
logging.info(f"Comparison filters: {filters_comparison}")
|
|
2057
|
+
|
|
2058
|
+
baseline_latency_ts = rts.ts().queryindex(filters_baseline)
|
|
2059
|
+
comparison_latency_ts = rts.ts().queryindex(filters_comparison)
|
|
2060
|
+
|
|
2061
|
+
logging.info(
|
|
2062
|
+
f"Found {len(baseline_latency_ts)} baseline p50 latency time-series"
|
|
2063
|
+
)
|
|
2064
|
+
logging.info(
|
|
2065
|
+
f"Found {len(comparison_latency_ts)} comparison p50 latency time-series"
|
|
2066
|
+
)
|
|
2067
|
+
|
|
2068
|
+
if verbose and baseline_latency_ts:
|
|
2069
|
+
logging.info(f"Baseline latency time-series: {baseline_latency_ts}")
|
|
2070
|
+
if verbose and comparison_latency_ts:
|
|
2071
|
+
logging.info(f"Comparison latency time-series: {comparison_latency_ts}")
|
|
2072
|
+
|
|
2073
|
+
# Filter out target time-series and unwanted commands
|
|
2074
|
+
def should_exclude_timeseries(ts_name):
|
|
2075
|
+
"""Check if time-series should be excluded based on command"""
|
|
2076
|
+
# Exclude target time-series
|
|
2077
|
+
if "target" in ts_name:
|
|
2078
|
+
return True
|
|
2079
|
+
|
|
2080
|
+
# Convert to lowercase for case-insensitive matching
|
|
2081
|
+
ts_name_lower = ts_name.lower()
|
|
2082
|
+
|
|
2083
|
+
# Exclude administrative commands (case-insensitive)
|
|
2084
|
+
excluded_commands = ["config", "info", "ping", "cluster", "resetstat"]
|
|
2085
|
+
return any(cmd in ts_name_lower for cmd in excluded_commands)
|
|
2086
|
+
|
|
2087
|
+
baseline_latency_ts_before = len(baseline_latency_ts)
|
|
2088
|
+
comparison_latency_ts_before = len(comparison_latency_ts)
|
|
2089
|
+
|
|
2090
|
+
# Apply filtering and log what gets excluded
|
|
2091
|
+
baseline_excluded = [
|
|
2092
|
+
ts for ts in baseline_latency_ts if should_exclude_timeseries(ts)
|
|
2093
|
+
]
|
|
2094
|
+
comparison_excluded = [
|
|
2095
|
+
ts for ts in comparison_latency_ts if should_exclude_timeseries(ts)
|
|
2096
|
+
]
|
|
2097
|
+
|
|
2098
|
+
baseline_latency_ts = [
|
|
2099
|
+
ts for ts in baseline_latency_ts if not should_exclude_timeseries(ts)
|
|
2100
|
+
]
|
|
2101
|
+
comparison_latency_ts = [
|
|
2102
|
+
ts for ts in comparison_latency_ts if not should_exclude_timeseries(ts)
|
|
2103
|
+
]
|
|
2104
|
+
|
|
2105
|
+
logging.info(
|
|
2106
|
+
f"After filtering: baseline {baseline_latency_ts_before} -> {len(baseline_latency_ts)}, "
|
|
2107
|
+
f"comparison {comparison_latency_ts_before} -> {len(comparison_latency_ts)}"
|
|
2108
|
+
)
|
|
2109
|
+
|
|
2110
|
+
if baseline_excluded:
|
|
2111
|
+
logging.info(
|
|
2112
|
+
f"Excluded {len(baseline_excluded)} baseline administrative command time-series"
|
|
2113
|
+
)
|
|
2114
|
+
if verbose:
|
|
2115
|
+
for ts in baseline_excluded:
|
|
2116
|
+
logging.info(f" Excluded baseline: {ts}")
|
|
2117
|
+
if comparison_excluded:
|
|
2118
|
+
logging.info(
|
|
2119
|
+
f"Excluded {len(comparison_excluded)} comparison administrative command time-series"
|
|
2120
|
+
)
|
|
2121
|
+
if verbose:
|
|
2122
|
+
for ts in comparison_excluded:
|
|
2123
|
+
logging.info(f" Excluded comparison: {ts}")
|
|
2124
|
+
|
|
2125
|
+
if len(baseline_latency_ts) == 0 or len(comparison_latency_ts) == 0:
|
|
2126
|
+
logging.warning(
|
|
2127
|
+
f"No p50 latency data found for {test_name} after filtering"
|
|
2128
|
+
)
|
|
2129
|
+
return None, False, None
|
|
2130
|
+
|
|
2131 +         # Extract command names from time-series to match baseline and comparison
2132 +         def extract_command_from_ts(ts_name):
2133 +             """Extract meaningful command name from time-series name"""
2134 +             import re
2135 +
2136 +             # Look for latencystats_latency_percentiles_usec_<COMMAND>_p50 pattern
2137 +             match = re.search(
2138 +                 r"latencystats_latency_percentiles_usec_([^_/]+)_p50", ts_name
2139 +             )
2140 +             if match:
2141 +                 return match.group(1)
2142 +
2143 +             # Look for command= pattern in the time-series name
2144 +             match = re.search(r"command=([^/]+)", ts_name)
2145 +             if match:
2146 +                 return match.group(1)
2147 +
2148 +             # If no specific pattern found, try to extract from the end of the path
2149 +             # e.g., .../Ops/sec/GET -> GET
2150 +             parts = ts_name.split("/")
2151 +             if len(parts) > 0:
2152 +                 return parts[-1]
2153 +             return "unknown"
2154 +
2155 +         # Group time-series by command
2156 +         baseline_by_command = {}
2157 +         comparison_by_command = {}
2158 +
2159 +         for ts in baseline_latency_ts:
2160 +             cmd = extract_command_from_ts(ts)
2161 +             if verbose:
2162 +                 logging.info(f"Baseline time-series '{ts}' -> command '{cmd}'")
2163 +             if cmd not in baseline_by_command:
2164 +                 baseline_by_command[cmd] = []
2165 +             baseline_by_command[cmd].append(ts)
2166 +
2167 +         for ts in comparison_latency_ts:
2168 +             cmd = extract_command_from_ts(ts)
2169 +             if verbose:
2170 +                 logging.info(f"Comparison time-series '{ts}' -> command '{cmd}'")
2171 +             if cmd not in comparison_by_command:
2172 +                 comparison_by_command[cmd] = []
2173 +             comparison_by_command[cmd].append(ts)
2174 +
2175 +         # Find common commands between baseline and comparison
2176 +         common_commands = set(baseline_by_command.keys()) & set(
2177 +             comparison_by_command.keys()
2178 +         )
2179 +
2180 +         logging.info(f"Baseline commands found: {sorted(baseline_by_command.keys())}")
2181 +         logging.info(
2182 +             f"Comparison commands found: {sorted(comparison_by_command.keys())}"
2183 +         )
2184 +         logging.info(
2185 +             f"Common commands for latency comparison: {sorted(common_commands)}"
2186 +         )
2187 +
2188 +         if not common_commands:
2189 +             logging.warning(
2190 +                 f"No common commands found for latency comparison in {test_name}"
2191 +             )
2192 +             return None, False, None
2193 +
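The helper above normalises heterogeneous time-series key names into plain command names so baseline and comparison series can be paired. A runnable sketch of the same extraction rules applied to hypothetical key names:

import re

def extract_command_from_ts(ts_name):
    # Same three fallbacks as the diff: latencystats pattern, command= pattern, last path segment.
    match = re.search(r"latencystats_latency_percentiles_usec_([^_/]+)_p50", ts_name)
    if match:
        return match.group(1)
    match = re.search(r"command=([^/]+)", ts_name)
    if match:
        return match.group(1)
    parts = ts_name.split("/")
    return parts[-1] if parts else "unknown"

print(extract_command_from_ts(".../latencystats_latency_percentiles_usec_GET_p50"))  # GET
print(extract_command_from_ts(".../command=SET/p50"))                                # SET
print(extract_command_from_ts(".../Ops/sec/INCR"))                                   # INCR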
2194 +         latency_notes = []
2195 +         significant_latency_increases = (
2196 +             0  # Track commands with significant latency increases
2197 +         )
2198 +         regression_details = {"test_name": test_name, "commands": []}
2199 +
2200 +         # Compare latency for each command individually
2201 +         for command in sorted(common_commands):
2202 +             logging.info(f"Analyzing latency for command: {command}")
2203 +             baseline_ts_list = baseline_by_command[command]
2204 +             comparison_ts_list = comparison_by_command[command]
2205 +
2206 +             logging.info(
2207 +                 f" Command {command}: {len(baseline_ts_list)} baseline, {len(comparison_ts_list)} comparison time-series"
2208 +             )
2209 +
2210 +             # If multiple time-series for the same command, try to get the best one
2211 +             if len(baseline_ts_list) > 1:
2212 +                 logging.info(
2213 +                     f" Multiple baseline time-series for {command}, filtering..."
2214 +                 )
2215 +                 baseline_ts_list = get_only_Totals(baseline_ts_list)
2216 +             if len(comparison_ts_list) > 1:
2217 +                 logging.info(
2218 +                     f" Multiple comparison time-series for {command}, filtering..."
2219 +                 )
2220 +                 comparison_ts_list = get_only_Totals(comparison_ts_list)
2221 +
2222 +             if len(baseline_ts_list) != 1 or len(comparison_ts_list) != 1:
2223 +                 logging.warning(
2224 +                     f" Skipping {command}: baseline={len(baseline_ts_list)}, comparison={len(comparison_ts_list)} time-series"
2225 +                 )
2226 +                 continue
2227 +
2228 +             # Get latency data for this command
2229 +             baseline_latency_data = []
2230 +             comparison_latency_data = []
2231 +
2232 +             for ts_name in baseline_ts_list:
2233 +                 datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
2234 +                 baseline_latency_data.extend(datapoints)
2235 +
2236 +             for ts_name in comparison_ts_list:
2237 +                 datapoints = rts.ts().revrange(ts_name, from_ts_ms, to_ts_ms)
2238 +                 comparison_latency_data.extend(datapoints)
2239 +
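The datapoints are read with redis-py's RedisTimeSeries interface (rts.ts().revrange(...)), newest first, over the requested window. A usage sketch with a hypothetical connection, key name, and window; only the revrange call itself is shown in the diff:

import redis

rts = redis.Redis(host="localhost", port=6379)  # hypothetical host/port
key = "ts:by.branch/main/latencystats_latency_percentiles_usec_GET_p50"  # hypothetical key
from_ts_ms, to_ts_ms = 0, "+"  # "+" means the newest sample in RedisTimeSeries range queries
datapoints = rts.ts().revrange(key, from_ts_ms, to_ts_ms)
# datapoints is a list of (timestamp_ms, value) pairs, newest first
p50_values = [value for _, value in datapoints]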
2240 +             if len(baseline_latency_data) == 0 or len(comparison_latency_data) == 0:
2241 +                 logging.warning(
2242 +                     f" No latency data for {command}: baseline={len(baseline_latency_data)}, comparison={len(comparison_latency_data)} datapoints"
2243 +                 )
2244 +                 continue
2245 +
2246 +             logging.info(
2247 +                 f" Command {command}: {len(baseline_latency_data)} baseline, {len(comparison_latency_data)} comparison datapoints"
2248 +             )
2249 +
2250 +             # Calculate latency statistics for this command
2251 +             baseline_latency_values = []
2252 +             comparison_latency_values = []
2253 +
2254 +             (_, baseline_latency_median, _) = get_v_pct_change_and_largest_var(
2255 +                 baseline_latency_data,
2256 +                 0,
2257 +                 0,
2258 +                 baseline_latency_values,
2259 +                 0,
2260 +                 last_n_baseline,
2261 +                 verbose,
2262 +                 first_n_baseline,
2263 +             )
2264 +
2265 +             (_, comparison_latency_median, _) = get_v_pct_change_and_largest_var(
2266 +                 comparison_latency_data,
2267 +                 0,
2268 +                 0,
2269 +                 comparison_latency_values,
2270 +                 0,
2271 +                 last_n_comparison,
2272 +                 verbose,
2273 +                 first_n_comparison,
2274 +             )
2275 +
2276 +             if baseline_latency_median == "N/A" or comparison_latency_median == "N/A":
2277 +                 logging.warning(
2278 +                     f" Could not calculate median for {command}: baseline={baseline_latency_median}, comparison={comparison_latency_median}"
2279 +                 )
2280 +                 continue
2281 +
2282 +             # Calculate variance (coefficient of variation) for both baseline and comparison
2283 +             baseline_latency_mean = (
2284 +                 statistics.mean(baseline_latency_values)
2285 +                 if baseline_latency_values
2286 +                 else 0
2287 +             )
2288 +             baseline_latency_stdev = (
2289 +                 statistics.stdev(baseline_latency_values)
2290 +                 if len(baseline_latency_values) > 1
2291 +                 else 0
2292 +             )
2293 +             baseline_latency_cv = (
2294 +                 (baseline_latency_stdev / baseline_latency_mean * 100)
2295 +                 if baseline_latency_mean > 0
2296 +                 else float("inf")
2297 +             )
2298 +
2299 +             comparison_latency_mean = (
2300 +                 statistics.mean(comparison_latency_values)
2301 +                 if comparison_latency_values
2302 +                 else 0
2303 +             )
2304 +             comparison_latency_stdev = (
2305 +                 statistics.stdev(comparison_latency_values)
2306 +                 if len(comparison_latency_values) > 1
2307 +                 else 0
2308 +             )
2309 +             comparison_latency_cv = (
2310 +                 (comparison_latency_stdev / comparison_latency_mean * 100)
2311 +                 if comparison_latency_mean > 0
2312 +                 else float("inf")
2313 +             )
2314 +
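The stability gate above is a per-command coefficient of variation (stdev / mean * 100) computed with the newly imported statistics module. A small sketch of the same computation on made-up p50 samples:

import statistics

samples = [105.0, 98.0, 110.0, 102.0]  # hypothetical p50 latency samples (usec)
mean = statistics.mean(samples)
stdev = statistics.stdev(samples) if len(samples) > 1 else 0
cv = (stdev / mean * 100) if mean > 0 else float("inf")
print(f"CV={cv:.1f}%")  # anything above 50% is treated as too noisy to trust in the check above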
2315 +             # Calculate latency change (for latency, lower is better)
2316 +             latency_change = (
2317 +                 float(comparison_latency_median) / float(baseline_latency_median) - 1
2318 +             ) * 100.0
2319 +
2320 +             logging.info(
2321 +                 f" Command {command}: baseline p50={baseline_latency_median:.2f} (CV={baseline_latency_cv:.1f}%), comparison p50={comparison_latency_median:.2f} (CV={comparison_latency_cv:.1f}%), change={latency_change:.1f}%"
2322 +             )
2323 +
2324 +             # Check if latency data is too unstable to be reliable
2325 +             latency_data_unstable = (
2326 +                 baseline_latency_cv > 50.0 or comparison_latency_cv > 50.0
2327 +             )
2328 +
2329 +             if latency_data_unstable:
2330 +                 # Mark as unstable latency data
2331 +                 unstable_reason = []
2332 +                 if baseline_latency_cv > 50.0:
2333 +                     unstable_reason.append(f"baseline CV={baseline_latency_cv:.1f}%")
2334 +                 if comparison_latency_cv > 50.0:
2335 +                     unstable_reason.append(
2336 +                         f"comparison CV={comparison_latency_cv:.1f}%"
2337 +                     )
2338 +
2339 +                 latency_notes.append(
2340 +                     f"{command} p50 UNSTABLE ({', '.join(unstable_reason)} - data too noisy for reliable analysis)"
2341 +                 )
2342 +                 logging.warning(
2343 +                     f" Command {command}: UNSTABLE latency data detected - {', '.join(unstable_reason)}"
2344 +                 )
2345 +             elif (
2346 +                 abs(latency_change) > 5.0
2347 +             ):  # Only report significant latency changes for stable data
2348 +                 direction = "increased" if latency_change > 0 else "decreased"
2349 +
2350 +                 # Adjust significance threshold based on baseline variance
2351 +                 if baseline_latency_cv < 30.0:
2352 +                     # Low variance - use standard threshold
2353 +                     significance_threshold = 10.0
2354 +                 elif baseline_latency_cv < 50.0:
2355 +                     # Moderate variance - require larger change
2356 +                     significance_threshold = 15.0
2357 +                 else:
2358 +                     # High variance - require much larger change
2359 +                     significance_threshold = 25.0
2360 +
2361 +                 latency_notes.append(
2362 +                     f"{command} p50 {direction} {abs(latency_change):.1f}% (baseline CV={baseline_latency_cv:.1f}%)"
2363 +                 )
2364 +                 logging.info(
2365 +                     f" Command {command}: SIGNIFICANT latency change detected ({direction} {abs(latency_change):.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
2366 +                 )
2367 +
2368 +                 # Track significant latency increases (potential regression confirmation)
2369 +                 if latency_change > significance_threshold:
2370 +                     significant_latency_increases += 1
2371 +                     regression_details["commands"].append(
2372 +                         {
2373 +                             "command": command,
2374 +                             "change_percent": latency_change,
2375 +                             "direction": direction,
2376 +                             "baseline_cv": baseline_latency_cv,
2377 +                             "comparison_cv": comparison_latency_cv,
2378 +                         }
2379 +                     )
2380 +                     logging.info(
2381 +                         f" Command {command}: CONFIRMS regression (change={latency_change:.1f}% > threshold={significance_threshold:.1f}%)"
2382 +                     )
2383 +                 else:
2384 +                     logging.info(
2385 +                         f" Command {command}: Change below significance threshold (change={latency_change:.1f}% <= threshold={significance_threshold:.1f}%)"
2386 +                     )
2387 +             elif verbose:
2388 +                 latency_notes.append(
2389 +                     f"{command} p50 stable (CV={baseline_latency_cv:.1f}%)"
2390 +                 )
2391 +                 logging.info(
2392 +                     f" Command {command}: latency stable (change={latency_change:.1f}%, baseline CV={baseline_latency_cv:.1f}%)"
2393 +                 )
2394 +
2395 +         # Determine if latency confirms regression
2396 +         confirms_regression = significant_latency_increases > 0
2397 +
2398 +         # Return combined latency notes
2399 +         if latency_notes:
2400 +             result = "; ".join(latency_notes)
2401 +             logging.info(f"Latency check completed for {test_name}: {result}")
2402 +             return (
2403 +                 result,
2404 +                 confirms_regression,
2405 +                 regression_details if confirms_regression else None,
2406 +             )
2407 +         else:
2408 +             result = "p50 latency stable" if common_commands else None
2409 +             logging.info(
2410 +                 f"Latency check completed for {test_name}: {result or 'no data'}"
2411 +             )
2412 +             return result, False, None
2413 +
2414 +     except Exception as e:
2415 +         logging.error(f"Error checking latency for {test_name}: {e}")
2416 +         return None, False, None
2417 +
2418 +
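Taken together, the per-command logic above reduces to: flag a command as unstable when either side's CV exceeds 50%, ignore changes of 5% or less, and only count an increase as confirming a regression when it exceeds a CV-dependent threshold (10%, 15%, or 25%). A condensed, illustrative restatement with a hypothetical helper name (not the function's actual signature):

def classify_latency(baseline_p50, comparison_p50, baseline_cv, comparison_cv):
    # Condensed restatement of the gate above; thresholds are copied from the diff.
    change = (comparison_p50 / baseline_p50 - 1) * 100.0
    if baseline_cv > 50.0 or comparison_cv > 50.0:
        return "unstable", change
    if abs(change) <= 5.0:
        return "stable", change
    threshold = 10.0 if baseline_cv < 30.0 else 15.0 if baseline_cv < 50.0 else 25.0
    if change > threshold:
        return "confirms regression", change
    return "noted, below confirmation threshold", change

print(classify_latency(100.0, 118.0, baseline_cv=12.0, comparison_cv=15.0))
# -> ('confirms regression', ~18% increase)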
994 2419   def get_only_Totals(baseline_timeseries):
995 2420       logging.warning("\t\tTime-series: {}".format(", ".join(baseline_timeseries)))
996 2421       logging.info("Checking if Totals will reduce timeseries.")
@@ -998,6 +2423,37 @@ def get_only_Totals(baseline_timeseries):
998 2423       for ts_name in baseline_timeseries:
999 2424           if "Totals" in ts_name:
1000 2425              new_base.append(ts_name)
2426 +
2427 +     # If no "Totals" time-series found, try to pick the best alternative
2428 +     if len(new_base) == 0:
2429 +         logging.warning(
2430 +             "No 'Totals' time-series found, trying to pick best alternative."
2431 +         )
2432 +         # Prefer time-series without quotes in metric names
2433 +         unquoted_series = [ts for ts in baseline_timeseries if "'" not in ts]
2434 +         if unquoted_series:
2435 +             new_base = unquoted_series
2436 +         else:
2437 +             # Fall back to original list
2438 +             new_base = baseline_timeseries
2439 +
2440 +     # If we still have multiple time-series after filtering for "Totals",
2441 +     # prefer the one without quotes in the metric name
2442 +     if len(new_base) > 1:
2443 +         logging.info("Multiple time-series found, preferring unquoted metric names.")
2444 +         unquoted_series = [ts for ts in new_base if "'" not in ts]
2445 +         if unquoted_series:
2446 +             new_base = unquoted_series
2447 +
2448 +         # If we still have multiple, take the first one
2449 +         if len(new_base) > 1:
2450 +             logging.warning(
2451 +                 "Still multiple time-series after filtering, taking the first one: {}".format(
2452 +                     new_base[0]
2453 +                 )
2454 +             )
2455 +             new_base = [new_base[0]]
2456 +
1001 2457      baseline_timeseries = new_base
1002 2458      return baseline_timeseries
1003 2459
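get_only_Totals now falls back gracefully when no "Totals" series exists: prefer "Totals", then unquoted metric names, then the first remaining entry. A condensed sketch of that preference order, using a hypothetical helper that returns a single name rather than a list:

def pick_preferred_series(series):
    # Condensed restatement of get_only_Totals' fallback chain (illustrative only).
    candidates = [ts for ts in series if "Totals" in ts]
    if not candidates:
        candidates = [ts for ts in series if "'" not in ts] or list(series)
    unquoted = [ts for ts in candidates if "'" not in ts]
    if unquoted:
        candidates = unquoted
    return candidates[0]

print(
    pick_preferred_series(
        ["ts:.../commandstats_cmdstat_get/'Ops/sec'", "ts:.../Totals/Ops/sec"]
    )
)
# -> ts:.../Totals/Ops/sec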
@@ -1064,11 +2520,38 @@ def add_line(
1064 2520      percentage_change,
1065 2521      table,
1066 2522      test_name,
2523 +     grafana_link_base=None,
2524 +     baseline_branch=None,
2525 +     baseline_version=None,
2526 +     comparison_branch=None,
2527 +     comparison_version=None,
2528 +     from_date=None,
2529 +     to_date=None,
1067 2530  ):
2531 +     grafana_link = None
2532 +     if grafana_link_base is not None:
2533 +         grafana_link = "{}?orgId=1".format(grafana_link_base)
2534 +         grafana_link += f"&var-test_case={test_name}"
2535 +
2536 +         if baseline_branch is not None:
2537 +             grafana_link += f"&var-branch={baseline_branch}"
2538 +         if baseline_version is not None:
2539 +             grafana_link += f"&var-version={baseline_version}"
2540 +         if comparison_branch is not None:
2541 +             grafana_link += f"&var-branch={comparison_branch}"
2542 +         if comparison_version is not None:
2543 +             grafana_link += f"&var-version={comparison_version}"
2544 +         grafana_link += "&from=now-30d&to=now"
2545 +
2546 +     # Create test name with optional Grafana link
2547 +     test_name_display = test_name
2548 +     if grafana_link is not None:
2549 +         test_name_display = f"[{test_name}]({grafana_link})"
2550 +
1068 2551      percentage_change_str = "{:.1f}% ".format(percentage_change)
1069 2552      table.append(
1070 2553          [
1071      -
2554 +             test_name_display,
1072 2555              baseline_v_str,
1073 2556              comparison_v_str,
1074 2557              percentage_change_str,
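add_line can now wrap the test name in a Grafana dashboard link assembled from the new keyword arguments. A sketch of the resulting URL shape, with a hypothetical dashboard base and variable values; only the query-string construction mirrors the code above:

grafana_link_base = "https://example.grafana.net/d/abc123/by-test"  # hypothetical UID/base
test_name = "memtier_benchmark-1Mkeys-string-get-1KiB"              # hypothetical test name
grafana_link = "{}?orgId=1".format(grafana_link_base)
grafana_link += f"&var-test_case={test_name}"
grafana_link += "&var-branch=unstable"   # branch filters, added only when provided
grafana_link += "&var-version=7.4.0"     # version filters, added only when provided
grafana_link += "&from=now-30d&to=now"
test_name_display = f"[{test_name}]({grafana_link})"  # markdown link used in the table row
print(test_name_display)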
@@ -1105,9 +2588,9 @@ def get_v_pct_change_and_largest_var(
1105 2588              comparison_values.append(tuple[1])
1106 2589
1107 2590          comparison_df = pd.DataFrame(comparison_values)
1108      -        comparison_median = float(comparison_df.median())
2591 +         comparison_median = float(comparison_df.median().iloc[0])
1109 2592          comparison_v = comparison_median
1110      -        comparison_std = float(comparison_df.std())
2593 +         comparison_std = float(comparison_df.std().iloc[0])
1111 2594          if verbose:
1112 2595              logging.info(
1113 2596                  "comparison_datapoints: {} value: {}; std-dev: {}; median: {}".format(
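The .iloc[0] change matters on recent pandas releases, where calling float() on the one-element Series returned by DataFrame.median() and DataFrame.std() is deprecated and later rejected. A minimal reproduction of the fixed pattern, with hypothetical sample values:

import pandas as pd

comparison_values = [110.0, 112.0, 108.0]  # hypothetical p50 samples
comparison_df = pd.DataFrame(comparison_values)
comparison_median = float(comparison_df.median().iloc[0])  # DataFrame.median() returns a Series
comparison_std = float(comparison_df.std().iloc[0])        # .iloc[0] extracts the scalar explicitly
print(comparison_median, comparison_std)  # 110.0 2.0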