testmind 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- testmind/analysis/flaky.py +96 -0
- testmind/analysis/models.py +62 -0
- testmind/analysis/predictor.py +101 -0
- testmind/analysis/regression.py +153 -0
- testmind/analysis/stability.py +99 -0
- testmind/cli/app.py +293 -0
- testmind/domain/models.py +60 -0
- testmind/parsers/base.py +9 -0
- testmind/parsers/html_parser.py +233 -0
- testmind/parsers/junit_parser.py +109 -0
- testmind/reports/formatters.py +162 -0
- testmind/reports/summary.py +109 -0
- testmind/storage/base.py +49 -0
- testmind/storage/sqlite_store.py +229 -0
- testmind/utils/tools.py +7 -0
- testmind-0.1.0.dist-info/METADATA +531 -0
- testmind-0.1.0.dist-info/RECORD +20 -0
- testmind-0.1.0.dist-info/WHEEL +4 -0
- testmind-0.1.0.dist-info/entry_points.txt +2 -0
- testmind-0.1.0.dist-info/licenses/LICENSE +21 -0

testmind/analysis/flaky.py
@@ -0,0 +1,96 @@
+"""
+Flaky test detection.
+
+A test is FLAKY when it produces mixed pass/fail results over recent runs
+without a clear directional trend — i.e. the failure rate is in the
+"uncertain zone" (between FLAKY_LOW and FLAKY_HIGH thresholds) AND
+consecutive results flip at a meaningful rate.
+
+Rules
+-----
+- Requires at least `min_runs` observations (default 5).
+- fail_rate ∈ (FLAKY_LOW, FLAKY_HIGH) → candidate
+- flip_rate > FLIP_THRESHOLD → confirmed flaky
+  where flip_rate = fraction of consecutive pairs whose outcome differs.
+"""
+
+from datetime import datetime
+
+from testmind.domain.models import TestResult, TestStatus
+from testmind.analysis.models import FlakyResult
+
+# Thresholds (all tunable via constructor kwargs)
+_FLAKY_LOW = 0.10  # below this → consistently passing
+_FLAKY_HIGH = 0.90  # above this → consistently failing
+_FLIP_THRESHOLD = 0.15
+_MIN_RUNS = 5
+
+
+def _is_failure(status: TestStatus) -> bool:
+    return status in (TestStatus.FAILED, TestStatus.ERROR)
+
+
+def _flip_rate(outcomes: list[bool]) -> float:
+    if len(outcomes) < 2:
+        return 0.0
+    flips = sum(a != b for a, b in zip(outcomes, outcomes[1:]))
+    return flips / (len(outcomes) - 1)
+
+
+class FlakyDetector:
+    def __init__(
+        self,
+        min_runs: int = _MIN_RUNS,
+        flaky_low: float = _FLAKY_LOW,
+        flaky_high: float = _FLAKY_HIGH,
+        flip_threshold: float = _FLIP_THRESHOLD,
+    ) -> None:
+        self.min_runs = min_runs
+        self.flaky_low = flaky_low
+        self.flaky_high = flaky_high
+        self.flip_threshold = flip_threshold
+
+    def analyze(
+        self,
+        test_name: str,
+        history: list[tuple[datetime, TestResult]],
+    ) -> FlakyResult:
+        """
+        Parameters
+        ----------
+        history : list of (timestamp, TestResult), any order — will be sorted.
+        """
+        if len(history) < self.min_runs:
+            return FlakyResult(
+                test_name=test_name,
+                is_flaky=False,
+                flip_rate=0.0,
+                pass_rate=0.0,
+                fail_rate=0.0,
+                run_count=len(history),
+                insufficient_data=True,
+            )
+
+        # Oldest → newest order (history may arrive newest-first from store)
+        ordered = sorted(history, key=lambda x: x[0])
+        outcomes = [_is_failure(r.status) for _, r in ordered]
+
+        total = len(outcomes)
+        failures = sum(outcomes)
+        fail_rate = failures / total
+        pass_rate = 1.0 - fail_rate
+        fr = _flip_rate(outcomes)
+
+        is_flaky = (
+            self.flaky_low < fail_rate < self.flaky_high
+            and fr > self.flip_threshold
+        )
+
+        return FlakyResult(
+            test_name=test_name,
+            is_flaky=is_flaky,
+            flip_rate=fr,
+            pass_rate=pass_rate,
+            fail_rate=fail_rate,
+            run_count=total,
+        )
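
For intuition, here is how the two gates combine on a toy outcome sequence. This sketch inlines the same arithmetic as `_flip_rate` and the `is_flaky` check above rather than constructing `TestResult` objects, since `testmind/domain/models.py` is not part of this diff:

    # Toy outcome sequence, True = failure, oldest -> newest.
    outcomes = [True, False, True, False, False, True, False, False, False, False]

    fail_rate = sum(outcomes) / len(outcomes)                    # 3/10 = 0.30
    flips = sum(a != b for a, b in zip(outcomes, outcomes[1:]))  # 5 flips
    flip_rate = flips / (len(outcomes) - 1)                      # 5/9 ≈ 0.56

    # 0.10 < 0.30 < 0.90 (uncertain zone) AND 0.56 > 0.15 -> flagged flaky.
    is_flaky = (0.10 < fail_rate < 0.90) and (flip_rate > 0.15)
    print(is_flaky)  # True

A steadily failing test (fail_rate near 1.0) or a one-off failure (fail_rate near 0.0) falls outside the uncertain zone and is never flagged, regardless of its flip rate.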

testmind/analysis/models.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+from enum import StrEnum
+
+
+class Trend(StrEnum):
+    IMPROVING = "improving"
+    DEGRADING = "degrading"
+    STABLE = "stable"
+
+
+@dataclass(frozen=True)
+class FlakyResult:
+    test_name: str
+    is_flaky: bool
+    flip_rate: float  # fraction of consecutive pairs with different outcomes
+    pass_rate: float
+    fail_rate: float
+    run_count: int
+    # True when there are too few runs to decide
+    insufficient_data: bool = False
+
+
+@dataclass(frozen=True)
+class RegressionResult:
+    test_name: str
+    is_regression: bool
+    # pass rate in the reference (older) window
+    reference_pass_rate: float
+    # fail rate in the recent window
+    recent_fail_rate: float
+    insufficient_data: bool = False
+
+
+@dataclass(frozen=True)
+class SpikeResult:
+    """Suite-level: was there a sudden failure-rate spike in the latest run?"""
+    is_spike: bool
+    current_fail_rate: float
+    baseline_mean: float
+    baseline_std: float
+    z_score: float
+    insufficient_data: bool = False
+
+
+@dataclass(frozen=True)
+class StabilityResult:
+    test_name: str
+    score: float  # 0–100
+    pass_rate: float
+    duration_consistency: float  # 0–1 (1 = perfectly consistent)
+    flip_rate: float
+    run_count: int
+    insufficient_data: bool = False
+
+
+@dataclass(frozen=True)
+class PredictionResult:
+    test_name: str
+    failure_probability: float  # 0–1
+    trend: Trend
+    confidence: float  # 0–1 (grows with run count)
+    insufficient_data: bool = False
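
All five result types are frozen dataclasses, so callers can treat them as immutable values. A minimal sketch of how a consumer might branch on one (field values here are illustrative, not real output):

    from testmind.analysis.models import FlakyResult

    r = FlakyResult(
        test_name="test_login",  # hypothetical test name
        is_flaky=True,
        flip_rate=0.56,
        pass_rate=0.70,
        fail_rate=0.30,
        run_count=10,
    )

    if r.insufficient_data:
        print(f"{r.test_name}: not enough history to judge")
    elif r.is_flaky:
        print(f"{r.test_name}: flaky (flip_rate={r.flip_rate:.2f})")

    # frozen=True: attempting r.is_flaky = False raises FrozenInstanceError.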

testmind/analysis/predictor.py
@@ -0,0 +1,101 @@
+"""
+Failure probability predictor.
+
+Approach
+--------
+1. Sort history oldest → newest.
+2. Compute a per-run binary outcome (1 = fail, 0 = pass).
+3. Fit a linear trend to those outcomes using OLS (no external deps).
+4. Predict the next value = mean of the last 3 outcomes + slope.
+5. Clamp prediction to [0, 1] and interpret trend.
+
+Confidence scales with the number of runs up to MAX_CONF_RUNS.
+
+Trend thresholds
+----------------
+slope > +SLOPE_THRESHOLD → DEGRADING
+slope < -SLOPE_THRESHOLD → IMPROVING
+otherwise → STABLE
+"""
+
+from datetime import datetime
+
+from testmind.domain.models import TestResult, TestStatus
+from testmind.analysis.models import PredictionResult, Trend
+
+_MIN_RUNS = 3
+_MAX_CONF_RUNS = 20
+_SLOPE_THRESHOLD = 0.05
+
+
+def _is_failure(status: TestStatus) -> bool:
+    return status in (TestStatus.FAILED, TestStatus.ERROR)
+
+
+def _ols_slope(ys: list[float]) -> float:
+    """Slope of OLS regression of ys against integer indices 0, 1, …, n-1."""
+    n = len(ys)
+    if n < 2:
+        return 0.0
+    xs = list(range(n))
+    mean_x = (n - 1) / 2.0  # exact for 0..n-1
+    mean_y = sum(ys) / n
+    num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
+    den = sum((x - mean_x) ** 2 for x in xs)
+    return num / den if den != 0.0 else 0.0
+
+
+class FailurePredictor:
+    def __init__(
+        self,
+        min_runs: int = _MIN_RUNS,
+        slope_threshold: float = _SLOPE_THRESHOLD,
+        max_conf_runs: int = _MAX_CONF_RUNS,
+    ) -> None:
+        self.min_runs = min_runs
+        self.slope_threshold = slope_threshold
+        self.max_conf_runs = max_conf_runs
+
+    def analyze(
+        self,
+        test_name: str,
+        history: list[tuple[datetime, TestResult]],
+    ) -> PredictionResult:
+        """
+        Parameters
+        ----------
+        history : list of (timestamp, TestResult), any order.
+        """
+        if len(history) < self.min_runs:
+            return PredictionResult(
+                test_name=test_name,
+                failure_probability=0.0,
+                trend=Trend.STABLE,
+                confidence=0.0,
+                insufficient_data=True,
+            )
+
+        ordered = sorted(history, key=lambda x: x[0])
+        outcomes = [1.0 if _is_failure(r.status) else 0.0 for _, r in ordered]
+
+        slope = _ols_slope(outcomes)
+
+        last_rate = sum(outcomes[-3:]) / min(3, len(outcomes))
+        raw_prediction = last_rate + slope
+        failure_prob = max(0.0, min(1.0, raw_prediction))
+
+        if slope > self.slope_threshold:
+            trend = Trend.DEGRADING
+        elif slope < -self.slope_threshold:
+            trend = Trend.IMPROVING
+        else:
+            trend = Trend.STABLE
+
+        confidence = min(len(outcomes) / self.max_conf_runs, 1.0)
+
+        return PredictionResult(
+            test_name=test_name,
+            failure_probability=round(failure_prob, 4),
+            trend=trend,
+            confidence=round(confidence, 4),
+        )
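
Worked numbers for a degrading sequence, using the module's own `_ols_slope` helper (history construction is omitted since `TestResult` is defined outside this diff):

    from testmind.analysis.predictor import _ols_slope

    outcomes = [0.0, 0.0, 0.0, 1.0, 1.0]  # two recent failures

    slope = _ols_slope(outcomes)
    # mean_x = 2.0, mean_y = 0.4
    # num = (-2)(-0.4) + (-1)(-0.4) + 0 + (1)(0.6) + (2)(0.6) = 3.0
    # den = 4 + 1 + 0 + 1 + 4 = 10  ->  slope = 0.3

    last_rate = sum(outcomes[-3:]) / 3  # (0 + 1 + 1) / 3 ≈ 0.667
    prediction = max(0.0, min(1.0, last_rate + slope))  # ≈ 0.967

    # slope 0.3 > 0.05 -> Trend.DEGRADING; confidence = 5 / 20 = 0.25

With only five runs, confidence stays low (0.25); it reaches 1.0 only once twenty runs are available.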

testmind/analysis/regression.py
@@ -0,0 +1,153 @@
+"""
+Regression and spike detection.
+
+Regression (test level)
+-----------------------
+A test is a REGRESSION when it was stable (mostly passing) in an older
+reference window but has started failing in a recent window.
+
+Rules
+-----
+- Requires at least `min_runs` total observations (default 6).
+- Reference window : all runs except the last `recent_window` (default 3).
+- Recent window    : the last `recent_window` runs.
+- Is regression when:
+      reference_pass_rate >= STABLE_THRESHOLD        (was stable)
+      AND recent_fail_rate >= RECENT_FAIL_THRESHOLD  (now failing)
+
+Spike (suite level)
+-------------------
+A SPIKE occurs when the most recent report's failure rate is significantly
+higher than the rolling baseline of previous reports.
+
+Rules
+-----
+- Requires at least `min_baseline` previous reports (default 3).
+- Baseline = fail_rates of all reports preceding the latest.
+- z_score = (current_fail_rate - baseline_mean) / baseline_std
+- Spike when z_score >= SPIKE_Z_THRESHOLD (default 2.0)
+  AND current_fail_rate > baseline_mean (one-tailed).
+"""
+
+import math
+from datetime import datetime
+
+from testmind.domain.models import TestReport, TestResult, TestStatus
+from testmind.analysis.models import RegressionResult, SpikeResult
+
+_STABLE_THRESHOLD = 0.90
+_RECENT_FAIL_THRESHOLD = 0.60
+_RECENT_WINDOW = 3
+_MIN_RUNS = 6
+_MIN_BASELINE = 3
+_SPIKE_Z = 2.0
+
+
+def _is_failure(status: TestStatus) -> bool:
+    return status in (TestStatus.FAILED, TestStatus.ERROR)
+
+
+class RegressionDetector:
+    def __init__(
+        self,
+        recent_window: int = _RECENT_WINDOW,
+        min_runs: int = _MIN_RUNS,
+        stable_threshold: float = _STABLE_THRESHOLD,
+        recent_fail_threshold: float = _RECENT_FAIL_THRESHOLD,
+    ) -> None:
+        self.recent_window = recent_window
+        self.min_runs = min_runs
+        self.stable_threshold = stable_threshold
+        self.recent_fail_threshold = recent_fail_threshold
+
+    def analyze(
+        self,
+        test_name: str,
+        history: list[tuple[datetime, TestResult]],
+    ) -> RegressionResult:
+        """
+        Parameters
+        ----------
+        history : list of (timestamp, TestResult), any order — will be sorted.
+        """
+        if len(history) < self.min_runs:
+            return RegressionResult(
+                test_name=test_name,
+                is_regression=False,
+                reference_pass_rate=0.0,
+                recent_fail_rate=0.0,
+                insufficient_data=True,
+            )
+
+        ordered = sorted(history, key=lambda x: x[0])
+        recent = ordered[-self.recent_window :]
+        reference = ordered[: -self.recent_window]
+
+        ref_failures = sum(_is_failure(r.status) for _, r in reference)
+        ref_pass_rate = 1.0 - ref_failures / len(reference)
+
+        rec_failures = sum(_is_failure(r.status) for _, r in recent)
+        rec_fail_rate = rec_failures / len(recent)
+
+        is_regression = (
+            ref_pass_rate >= self.stable_threshold
+            and rec_fail_rate >= self.recent_fail_threshold
+        )
+
+        return RegressionResult(
+            test_name=test_name,
+            is_regression=is_regression,
+            reference_pass_rate=ref_pass_rate,
+            recent_fail_rate=rec_fail_rate,
+        )
+
+
+class SpikeDetector:
+    def __init__(
+        self,
+        min_baseline: int = _MIN_BASELINE,
+        z_threshold: float = _SPIKE_Z,
+    ) -> None:
+        self.min_baseline = min_baseline
+        self.z_threshold = z_threshold
+
+    def analyze(self, reports: list[TestReport]) -> SpikeResult:
+        """
+        Parameters
+        ----------
+        reports : any order — sorted by timestamp; the latest is the current run.
+        """
+        if len(reports) < self.min_baseline + 1:
+            return SpikeResult(
+                is_spike=False,
+                current_fail_rate=0.0,
+                baseline_mean=0.0,
+                baseline_std=0.0,
+                z_score=0.0,
+                insufficient_data=True,
+            )
+
+        ordered = sorted(reports, key=lambda r: r.timestamp)
+        current = ordered[-1]
+        baseline = ordered[:-1]
+
+        current_fail_rate = current.fail_rate
+        baseline_rates = [r.fail_rate for r in baseline]
+        mean = sum(baseline_rates) / len(baseline_rates)
+        variance = sum((x - mean) ** 2 for x in baseline_rates) / len(baseline_rates)
+        std = math.sqrt(variance)
+
+        if std == 0.0:
+            z_score = 0.0 if current_fail_rate == mean else float("inf")
+        else:
+            z_score = (current_fail_rate - mean) / std
+
+        is_spike = z_score >= self.z_threshold and current_fail_rate > mean
+
+        return SpikeResult(
+            is_spike=is_spike,
+            current_fail_rate=current_fail_rate,
+            baseline_mean=mean,
+            baseline_std=std,
+            z_score=z_score,
+        )
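
With the defaults, RegressionDetector needs at least six runs and flags a test only when the reference window passed ≥ 90% of the time and at least 2 of the last 3 runs failed (2/3 ≈ 0.67 ≥ 0.60). For the spike math, here is a quick check; `Run` below is a hypothetical stand-in exposing only the two attributes `SpikeDetector.analyze` actually reads (`timestamp` and `fail_rate`), since the real `TestReport` lives in `testmind/domain/models.py`, outside this diff:

    from dataclasses import dataclass
    from datetime import datetime, timedelta

    from testmind.analysis.regression import SpikeDetector

    @dataclass
    class Run:  # hypothetical stand-in for TestReport
        timestamp: datetime
        fail_rate: float

    t0 = datetime(2024, 1, 1)
    reports = [
        Run(t0 + timedelta(days=i), rate)
        for i, rate in enumerate([0.05, 0.10, 0.15, 0.30])
    ]

    result = SpikeDetector().analyze(reports)
    # baseline = [0.05, 0.10, 0.15]: mean = 0.10, std = sqrt(0.005 / 3) ≈ 0.041
    # z = (0.30 - 0.10) / 0.041 ≈ 4.9 >= 2.0 and 0.30 > 0.10 -> spike
    print(result.is_spike, round(result.z_score, 1))  # True 4.9

Note the one-tailed guard: a run whose failure rate drops far below the baseline produces a large negative z-score and is never reported as a spike.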

testmind/analysis/stability.py
@@ -0,0 +1,99 @@
+"""
+Stability Index (0–100).
+
+Formula
+-------
+score = pass_rate_score + consistency_score + non_flakiness_score
+
+Where:
+    pass_rate_score     = pass_rate × 60
+    consistency_score   = duration_consistency × 20
+    non_flakiness_score = (1 - flip_rate) × 20
+
+duration_consistency = 1 - min(CV, 1)
+    CV (coefficient of variation) = std(durations) / mean(durations)
+    → 0 when wildly variable, 1 when perfectly consistent.
+    If all durations are zero or there is only one run, consistency = 1.
+
+flip_rate = fraction of consecutive pairs with different outcomes.
+"""
+
+import math
+from datetime import datetime
+
+from testmind.domain.models import TestResult, TestStatus
+from testmind.analysis.models import StabilityResult
+
+_MIN_RUNS = 3
+
+
+def _is_failure(status: TestStatus) -> bool:
+    return status in (TestStatus.FAILED, TestStatus.ERROR)
+
+
+def _flip_rate(outcomes: list[bool]) -> float:
+    if len(outcomes) < 2:
+        return 0.0
+    flips = sum(a != b for a, b in zip(outcomes, outcomes[1:]))
+    return flips / (len(outcomes) - 1)
+
+
+def _duration_consistency(durations: list[float]) -> float:
+    if len(durations) < 2:
+        return 1.0
+    mean = sum(durations) / len(durations)
+    if mean == 0.0:
+        return 1.0
+    variance = sum((d - mean) ** 2 for d in durations) / len(durations)
+    std = math.sqrt(variance)
+    cv = std / mean
+    return 1.0 - min(cv, 1.0)
+
+
+class StabilityAnalyzer:
+    def __init__(self, min_runs: int = _MIN_RUNS) -> None:
+        self.min_runs = min_runs
+
+    def analyze(
+        self,
+        test_name: str,
+        history: list[tuple[datetime, TestResult]],
+    ) -> StabilityResult:
+        """
+        Parameters
+        ----------
+        history : list of (timestamp, TestResult), any order.
+        """
+        if len(history) < self.min_runs:
+            return StabilityResult(
+                test_name=test_name,
+                score=0.0,
+                pass_rate=0.0,
+                duration_consistency=0.0,
+                flip_rate=0.0,
+                run_count=len(history),
+                insufficient_data=True,
+            )
+
+        ordered = sorted(history, key=lambda x: x[0])
+        outcomes = [_is_failure(r.status) for _, r in ordered]
+        durations = [r.duration for _, r in ordered]
+
+        total = len(outcomes)
+        failures = sum(outcomes)
+        fail_rate = failures / total
+        pass_rate = 1.0 - fail_rate
+
+        consistency = _duration_consistency(durations)
+        fr = _flip_rate(outcomes)
+
+        score = pass_rate * 60.0 + consistency * 20.0 + (1.0 - fr) * 20.0
+
+        return StabilityResult(
+            test_name=test_name,
+            score=round(score, 2),
+            pass_rate=pass_rate,
+            duration_consistency=consistency,
+            flip_rate=fr,
+            run_count=total,
+        )
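
Worked numbers for the score, using the module's helpers on a five-run history (again skipping `TestResult` construction, since its definition lives outside this diff):

    from testmind.analysis.stability import _duration_consistency, _flip_rate

    outcomes = [False, False, True, False, False]  # one mid-sequence failure
    durations = [1.0, 1.1, 0.9, 1.0, 1.0]          # seconds, fairly steady

    pass_rate = 1.0 - sum(outcomes) / len(outcomes)  # 0.8
    consistency = _duration_consistency(durations)   # CV ≈ 0.063 -> ≈ 0.937
    fr = _flip_rate(outcomes)                        # 2 flips / 4 pairs = 0.5

    score = pass_rate * 60.0 + consistency * 20.0 + (1.0 - fr) * 20.0
    print(round(score, 2))  # ≈ 76.74

The single flake costs this test roughly 22 points: 12 from the pass-rate term and 10 from the flip-rate term, even though its durations are nearly constant.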