testmind 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,96 @@
1
+ """
2
+ Flaky test detection.
3
+
4
+ A test is FLAKY when it produces mixed pass/fail results over recent runs
5
+ without a clear directional trend — i.e. the failure rate is in the
6
+ "uncertain zone" (between FLAKY_LOW and FLAKY_HIGH thresholds) AND
7
+ consecutive results flip at a meaningful rate.
8
+
9
+ Rules
10
+ -----
11
+ - Requires at least `min_runs` observations (default 5).
12
+ - fail_rate ∈ (FLAKY_LOW, FLAKY_HIGH) → candidate
13
+ - flip_rate > FLIP_THRESHOLD → confirmed flaky
14
+ where flip_rate = fraction of consecutive pairs whose outcome differs.
15
+ """
16
+
17
+ from datetime import datetime
18
+
19
+ from testmind.domain.models import TestResult, TestStatus
20
+ from testmind.analysis.models import FlakyResult
21
+
22
# Thresholds (all tunable via FlakyDetector constructor kwargs)
_FLAKY_LOW = 0.10  # below this → consistently passing
_FLAKY_HIGH = 0.90  # above this → consistently failing
_FLIP_THRESHOLD = 0.15  # flip rate above this (in the uncertain zone) → flaky
_MIN_RUNS = 5  # minimum observations required before deciding
27
+
28
+
29
def _is_failure(status: TestStatus) -> bool:
    """True for statuses that count as a failed run (FAILED or ERROR)."""
    return status == TestStatus.FAILED or status == TestStatus.ERROR
31
+
32
+
33
+ def _flip_rate(outcomes: list[bool]) -> float:
34
+ if len(outcomes) < 2:
35
+ return 0.0
36
+ flips = sum(a != b for a, b in zip(outcomes, outcomes[1:]))
37
+ return flips / (len(outcomes) - 1)
38
+
39
+
40
class FlakyDetector:
    """Classifies a test as flaky from its recent pass/fail history.

    A test is flaky when its failure rate sits strictly between
    ``flaky_low`` and ``flaky_high`` (the "uncertain zone") AND
    consecutive results flip more often than ``flip_threshold``.
    """

    def __init__(
        self,
        min_runs: int = _MIN_RUNS,
        flaky_low: float = _FLAKY_LOW,
        flaky_high: float = _FLAKY_HIGH,
        flip_threshold: float = _FLIP_THRESHOLD,
    ) -> None:
        self.min_runs = min_runs
        self.flaky_low = flaky_low
        self.flaky_high = flaky_high
        self.flip_threshold = flip_threshold

    def analyze(
        self,
        test_name: str,
        history: list[tuple[datetime, TestResult]],
    ) -> FlakyResult:
        """
        Parameters
        ----------
        history : list of (timestamp, TestResult), oldest → newest.
        """
        run_count = len(history)
        if run_count < self.min_runs:
            # Too few observations to decide either way.
            return FlakyResult(
                test_name=test_name,
                is_flaky=False,
                flip_rate=0.0,
                pass_rate=0.0,
                fail_rate=0.0,
                run_count=run_count,
                insufficient_data=True,
            )

        # Re-sort by timestamp: the store may hand us newest-first.
        by_time = sorted(history, key=lambda pair: pair[0])
        failure_flags = [_is_failure(result.status) for _, result in by_time]

        total = len(failure_flags)
        fail_rate = sum(failure_flags) / total
        flips = _flip_rate(failure_flags)

        in_uncertain_zone = self.flaky_low < fail_rate < self.flaky_high
        is_flaky = in_uncertain_zone and flips > self.flip_threshold

        return FlakyResult(
            test_name=test_name,
            is_flaky=is_flaky,
            flip_rate=flips,
            pass_rate=1.0 - fail_rate,
            fail_rate=fail_rate,
            run_count=total,
        )
@@ -0,0 +1,62 @@
1
+ from dataclasses import dataclass
2
+ from enum import StrEnum
3
+
4
+
5
class Trend(StrEnum):
    """Direction of a test's failure-rate trend over recent runs."""

    IMPROVING = "improving"
    DEGRADING = "degrading"
    STABLE = "stable"
9
+
10
+
11
@dataclass(frozen=True)
class FlakyResult:
    """Outcome of flaky-test analysis for a single test."""

    test_name: str
    is_flaky: bool
    flip_rate: float  # fraction of consecutive pairs with different outcomes
    pass_rate: float  # 0–1 over the analyzed runs
    fail_rate: float  # 0–1 over the analyzed runs
    run_count: int  # number of runs considered
    # True when there are too few runs to decide
    insufficient_data: bool = False
21
+
22
+
23
@dataclass(frozen=True)
class RegressionResult:
    """Outcome of test-level regression analysis (stable → now failing)."""

    test_name: str
    is_regression: bool
    # pass rate in the reference (older) window
    reference_pass_rate: float
    # fail rate in the recent window
    recent_fail_rate: float
    # True when there are too few runs to decide
    insufficient_data: bool = False
32
+
33
+
34
@dataclass(frozen=True)
class SpikeResult:
    """Suite-level: was there a sudden failure-rate spike in the latest run?"""

    is_spike: bool
    current_fail_rate: float  # fail rate of the most recent report
    baseline_mean: float  # mean fail rate of the preceding reports
    baseline_std: float  # population std of the baseline fail rates
    z_score: float  # (current - baseline_mean) / baseline_std
    # True when there are too few prior reports to form a baseline
    insufficient_data: bool = False
43
+
44
+
45
@dataclass(frozen=True)
class StabilityResult:
    """Composite stability score for a single test."""

    test_name: str
    score: float  # 0–100
    pass_rate: float  # 0–1 over the analyzed runs
    duration_consistency: float  # 0–1 (1 = perfectly consistent)
    flip_rate: float  # fraction of consecutive pairs with different outcomes
    run_count: int  # number of runs considered
    # True when there are too few runs to score
    insufficient_data: bool = False
54
+
55
+
56
@dataclass(frozen=True)
class PredictionResult:
    """Predicted next-run failure probability and trend for a test."""

    test_name: str
    failure_probability: float  # 0–1
    trend: Trend
    confidence: float  # 0–1 (grows with run count)
    # True when there are too few runs to predict
    insufficient_data: bool = False
@@ -0,0 +1,101 @@
1
+ """
2
+ Failure probability predictor.
3
+
4
+ Approach
5
+ --------
6
+ 1. Sort history oldest → newest.
7
+ 2. Compute a per-run binary outcome (1 = fail, 0 = pass).
8
+ 3. Fit a linear trend to those outcomes using OLS (no external deps).
9
+ 4. Predict the next value = mean(last 3 outcomes) + slope.
10
+ 5. Clamp prediction to [0, 1] and interpret trend.
11
+
12
+ Confidence scales with the number of runs up to MAX_CONF_RUNS.
13
+
14
+ Trend thresholds
15
+ ----------------
16
+ slope > +SLOPE_THRESHOLD → DEGRADING
17
+ slope < -SLOPE_THRESHOLD → IMPROVING
18
+ otherwise → STABLE
19
+ """
20
+
21
+ from datetime import datetime
22
+
23
+ from testmind.domain.models import TestResult, TestStatus
24
+ from testmind.analysis.models import PredictionResult, Trend
25
+
26
_MIN_RUNS = 3  # minimum history length before a prediction is made
_MAX_CONF_RUNS = 20  # confidence saturates at 1.0 at this many runs
_SLOPE_THRESHOLD = 0.05  # |slope| beyond this marks a directional trend
29
+
30
+
31
def _is_failure(status: TestStatus) -> bool:
    """True for statuses that count as a failed run (FAILED or ERROR)."""
    return status == TestStatus.FAILED or status == TestStatus.ERROR
33
+
34
+
35
+ def _ols_slope(ys: list[float]) -> float:
36
+ """Slope of OLS regression of ys against integer indices 0, 1, …, n-1."""
37
+ n = len(ys)
38
+ if n < 2:
39
+ return 0.0
40
+ xs = list(range(n))
41
+ mean_x = (n - 1) / 2.0 # exact for 0..n-1
42
+ mean_y = sum(ys) / n
43
+ num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
44
+ den = sum((x - mean_x) ** 2 for x in xs)
45
+ return num / den if den != 0.0 else 0.0
46
+
47
+
48
class FailurePredictor:
    """Predicts the probability that a test fails on its next run.

    Fits an OLS trend line to the binary pass/fail history and projects
    one step ahead from the mean of the most recent outcomes.
    """

    def __init__(
        self,
        min_runs: int = _MIN_RUNS,
        slope_threshold: float = _SLOPE_THRESHOLD,
        max_conf_runs: int = _MAX_CONF_RUNS,
        recent_window: int = 3,
    ) -> None:
        """
        Parameters
        ----------
        min_runs : minimum history length before a prediction is made.
        slope_threshold : |slope| beyond which the trend is directional.
        max_conf_runs : run count at which confidence saturates at 1.0.
        recent_window : number of most recent outcomes averaged as the
            baseline for the one-step-ahead projection. Previously a
            hard-coded 3; the default preserves that behavior.
        """
        self.min_runs = min_runs
        self.slope_threshold = slope_threshold
        self.max_conf_runs = max_conf_runs
        self.recent_window = recent_window

    def analyze(
        self,
        test_name: str,
        history: list[tuple[datetime, TestResult]],
    ) -> PredictionResult:
        """
        Parameters
        ----------
        history : list of (timestamp, TestResult), any order — sorted here.
        """
        if len(history) < self.min_runs:
            return PredictionResult(
                test_name=test_name,
                failure_probability=0.0,
                trend=Trend.STABLE,
                confidence=0.0,
                insufficient_data=True,
            )

        ordered = sorted(history, key=lambda x: x[0])
        outcomes = [1.0 if _is_failure(r.status) else 0.0 for _, r in ordered]

        slope = _ols_slope(outcomes)

        # Project one step ahead from the recent-window mean, then clamp
        # the prediction into the valid probability range [0, 1].
        window = self.recent_window
        last_rate = sum(outcomes[-window:]) / min(window, len(outcomes))
        raw_prediction = last_rate + slope
        failure_prob = max(0.0, min(1.0, raw_prediction))

        # Interpret the trend direction from the fitted slope.
        if slope > self.slope_threshold:
            trend = Trend.DEGRADING
        elif slope < -self.slope_threshold:
            trend = Trend.IMPROVING
        else:
            trend = Trend.STABLE

        # Confidence grows linearly with run count, capped at 1.0.
        confidence = min(len(outcomes) / self.max_conf_runs, 1.0)

        return PredictionResult(
            test_name=test_name,
            failure_probability=round(failure_prob, 4),
            trend=trend,
            confidence=round(confidence, 4),
        )
@@ -0,0 +1,153 @@
1
+ """
2
+ Regression and spike detection.
3
+
4
+ Regression (test level)
5
+ -----------------------
6
+ A test is a REGRESSION when it was stable (mostly passing) in an older
7
+ reference window but has started failing in a recent window.
8
+
9
+ Rules
10
+ -----
11
+ - Requires at least `min_runs` total observations (default 6).
12
+ - Reference window : all runs except the last `recent_window` (default 3).
13
+ - Recent window : the last `recent_window` runs.
14
+ - Is regression when:
15
+ reference_pass_rate >= STABLE_THRESHOLD (was stable)
16
+ AND recent_fail_rate >= RECENT_FAIL_THRESHOLD (now failing)
17
+
18
+ Spike (suite level)
19
+ -------------------
20
+ A SPIKE occurs when the most recent report's failure rate is significantly
21
+ higher than the rolling baseline of previous reports.
22
+
23
+ Rules
24
+ -----
25
+ - Requires at least `min_baseline` previous reports (default 3).
26
+ - Baseline = fail_rate of the N reports preceding the latest.
27
+ - z_score = (current_fail_rate - baseline_mean) / baseline_std
28
+ - Spike when z_score >= SPIKE_Z_THRESHOLD (default 2.0)
29
+ AND current_fail_rate > baseline_mean (one-tailed).
30
+ """
31
+
32
+ import math
33
+ from datetime import datetime
34
+
35
+ from testmind.domain.models import TestReport, TestResult, TestStatus
36
+ from testmind.analysis.models import RegressionResult, SpikeResult
37
+
38
+ _STABLE_THRESHOLD = 0.90
39
+ _RECENT_FAIL_THRESHOLD = 0.60
40
+ _RECENT_WINDOW = 3
41
+ _MIN_RUNS = 6
42
+ _MIN_BASELINE = 3
43
+ _SPIKE_Z = 2.0
44
+
45
+
46
def _is_failure(status: TestStatus) -> bool:
    """True for statuses that count as a failed run (FAILED or ERROR)."""
    return status == TestStatus.FAILED or status == TestStatus.ERROR
48
+
49
+
50
class RegressionDetector:
    """Flags tests that were stable historically but fail in recent runs."""

    def __init__(
        self,
        recent_window: int = _RECENT_WINDOW,
        min_runs: int = _MIN_RUNS,
        stable_threshold: float = _STABLE_THRESHOLD,
        recent_fail_threshold: float = _RECENT_FAIL_THRESHOLD,
    ) -> None:
        self.recent_window = recent_window
        self.min_runs = min_runs
        self.stable_threshold = stable_threshold
        self.recent_fail_threshold = recent_fail_threshold

    def analyze(
        self,
        test_name: str,
        history: list[tuple[datetime, TestResult]],
    ) -> RegressionResult:
        """
        Parameters
        ----------
        history : list of (timestamp, TestResult), any order — will be sorted.
        """
        # Require at least one run OUTSIDE the recent window as well:
        # otherwise the reference window is empty and the pass-rate
        # division below raises ZeroDivisionError when callers configure
        # min_runs <= recent_window.
        required = max(self.min_runs, self.recent_window + 1)
        if len(history) < required:
            return RegressionResult(
                test_name=test_name,
                is_regression=False,
                reference_pass_rate=0.0,
                recent_fail_rate=0.0,
                insufficient_data=True,
            )

        ordered = sorted(history, key=lambda x: x[0])
        recent = ordered[-self.recent_window :]
        reference = ordered[: -self.recent_window]

        ref_failures = sum(_is_failure(r.status) for _, r in reference)
        ref_pass_rate = 1.0 - ref_failures / len(reference)

        rec_failures = sum(_is_failure(r.status) for _, r in recent)
        rec_fail_rate = rec_failures / len(recent)

        # Regression = was stable before AND failing now.
        is_regression = (
            ref_pass_rate >= self.stable_threshold
            and rec_fail_rate >= self.recent_fail_threshold
        )

        return RegressionResult(
            test_name=test_name,
            is_regression=is_regression,
            reference_pass_rate=ref_pass_rate,
            recent_fail_rate=rec_fail_rate,
        )
103
+
104
+
105
class SpikeDetector:
    """Detects a sudden suite-level failure-rate spike in the latest report."""

    def __init__(
        self,
        min_baseline: int = _MIN_BASELINE,
        z_threshold: float = _SPIKE_Z,
    ) -> None:
        self.min_baseline = min_baseline
        self.z_threshold = z_threshold

    def analyze(self, reports: list[TestReport]) -> SpikeResult:
        """
        Parameters
        ----------
        reports : ordered oldest → newest; the last entry is the current run.
        """
        if len(reports) < self.min_baseline + 1:
            return SpikeResult(
                is_spike=False,
                current_fail_rate=0.0,
                baseline_mean=0.0,
                baseline_std=0.0,
                z_score=0.0,
                insufficient_data=True,
            )

        # Re-sort defensively; the latest report is the one under test.
        ordered = sorted(reports, key=lambda r: r.timestamp)
        current = ordered[-1]
        baseline = ordered[:-1]

        current_fail_rate = current.fail_rate
        baseline_rates = [r.fail_rate for r in baseline]
        mean = sum(baseline_rates) / len(baseline_rates)
        # Population variance of the baseline window.
        variance = sum((x - mean) ** 2 for x in baseline_rates) / len(baseline_rates)
        std = math.sqrt(variance)

        if std == 0.0:
            # Degenerate (perfectly flat) baseline: report a SIGNED infinity
            # so a rate below the baseline is not mislabeled as +inf.
            if current_fail_rate == mean:
                z_score = 0.0
            else:
                z_score = math.copysign(float("inf"), current_fail_rate - mean)
        else:
            z_score = (current_fail_rate - mean) / std

        # One-tailed: only a rate ABOVE the baseline mean can be a spike.
        is_spike = z_score >= self.z_threshold and current_fail_rate > mean

        return SpikeResult(
            is_spike=is_spike,
            current_fail_rate=current_fail_rate,
            baseline_mean=mean,
            baseline_std=std,
            z_score=z_score,
        )
@@ -0,0 +1,99 @@
1
+ """
2
+ Stability Index (0 – 100).
3
+
4
+ Formula
5
+ -------
6
+ score = pass_rate_score + consistency_score + non_flakiness_score
7
+
8
+ Where:
9
+ pass_rate_score = pass_rate × 60
10
+ consistency_score = duration_consistency × 20
11
+ non_flakiness_score = (1 - flip_rate) × 20
12
+
13
+ duration_consistency = 1 - min(CV, 1)
14
+ CV (coefficient of variation) = std(durations) / mean(durations)
15
+ → 0 when wildly variable, 1 when perfectly consistent.
16
+ If all durations are zero or there is only one run, consistency = 1.
17
+
18
+ flip_rate = fraction of consecutive pairs with different outcomes.
19
+ """
20
+
21
+ import math
22
+ from datetime import datetime
23
+
24
+ from testmind.domain.models import TestResult, TestStatus
25
+ from testmind.analysis.models import StabilityResult
26
+
27
_MIN_RUNS = 3  # minimum observations required to compute a stability score
28
+
29
+
30
def _is_failure(status: TestStatus) -> bool:
    """True for statuses that count as a failed run (FAILED or ERROR)."""
    return status == TestStatus.FAILED or status == TestStatus.ERROR
32
+
33
+
34
+ def _flip_rate(outcomes: list[bool]) -> float:
35
+ if len(outcomes) < 2:
36
+ return 0.0
37
+ flips = sum(a != b for a, b in zip(outcomes, outcomes[1:]))
38
+ return flips / (len(outcomes) - 1)
39
+
40
+
41
+ def _duration_consistency(durations: list[float]) -> float:
42
+ if len(durations) < 2:
43
+ return 1.0
44
+ mean = sum(durations) / len(durations)
45
+ if mean == 0.0:
46
+ return 1.0
47
+ variance = sum((d - mean) ** 2 for d in durations) / len(durations)
48
+ std = math.sqrt(variance)
49
+ cv = std / mean
50
+ return 1.0 - min(cv, 1.0)
51
+
52
+
53
class StabilityAnalyzer:
    """Computes a 0–100 stability score for a test.

    Weighted components: 60% pass rate, 20% duration consistency,
    20% non-flakiness (1 - flip rate).
    """

    def __init__(self, min_runs: int = _MIN_RUNS) -> None:
        self.min_runs = min_runs

    def analyze(
        self,
        test_name: str,
        history: list[tuple[datetime, TestResult]],
    ) -> StabilityResult:
        """
        Parameters
        ----------
        history : list of (timestamp, TestResult), any order.
        """
        run_count = len(history)
        if run_count < self.min_runs:
            # Not enough runs to produce a meaningful score.
            return StabilityResult(
                test_name=test_name,
                score=0.0,
                pass_rate=0.0,
                duration_consistency=0.0,
                flip_rate=0.0,
                run_count=run_count,
                insufficient_data=True,
            )

        chronological = sorted(history, key=lambda pair: pair[0])
        results = [res for _, res in chronological]
        failed_flags = [_is_failure(res.status) for res in results]
        timings = [res.duration for res in results]

        total = len(failed_flags)
        pass_rate = 1.0 - sum(failed_flags) / total

        consistency = _duration_consistency(timings)
        flips = _flip_rate(failed_flags)

        # Weighted blend: passing (60) + timing consistency (20)
        # + non-flakiness (20).
        score = pass_rate * 60.0 + consistency * 20.0 + (1.0 - flips) * 20.0

        return StabilityResult(
            test_name=test_name,
            score=round(score, 2),
            pass_rate=pass_rate,
            duration_consistency=consistency,
            flip_rate=flips,
            run_count=total,
        )
+ )