git-bayesect 1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 hauntsaninja
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: git_bayesect
3
+ Version: 1.0
4
+ Summary: Git bisection with Bayesian statistics
5
+ Author: Shantanu Jain
6
+ Author-email: hauntsaninja@gmail.com
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Software Development
14
+ Classifier: Topic :: Utilities
15
+ License-File: LICENSE
16
+ Requires-Dist: numpy
17
+ Requires-Dist: scipy
18
+ Project-URL: homepage, https://github.com/hauntsaninja/git_bayesect
19
+ Project-URL: repository, https://github.com/hauntsaninja/git_bayesect
20
+
21
+ # git bayesect
22
+
23
+ Bayesian git bisection!
24
+
25
+ Use this to detect changes in likelihoods of events, for instance, to isolate a commit where
26
+ a slightly flaky test became very flaky.
27
+
28
+ You don't need to know the likelihoods (although you can provide priors), just that something
29
+ has changed at some point in some direction.
30
+
31
+ ## Installation
32
+
33
+ ```
34
+ pip install git_bayesect
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ Start a Bayesian bisection:
40
+ ```
41
+ git bayesect start --old $COMMIT
42
+ ```
43
+
44
+ Record an observation on the current commit:
45
+ ```
46
+ git bayesect fail
47
+ ```
48
+
49
+ Or on a specific commit:
50
+ ```
51
+ git bayesect pass --commit $COMMIT
52
+ ```
53
+
54
+ Check the overall status of the bisection:
55
+ ```
56
+ git bayesect status
57
+ ```
58
+
59
+ Reset:
60
+ ```
61
+ git bayesect reset
62
+ ```
63
+
64
+ ## More usage
65
+
66
+ Set the prior for a given commit:
67
+ ```
68
+ git bayesect prior --commit $COMMIT --weight 10
69
+ ```
70
+
71
+ Set prior for all commits based on filenames:
72
+ ```
73
+ git bayesect priors_from_filenames --filenames-callback "return 10 if any('suspicious' in f for f in filenames) else 1"
74
+ ```
75
+
76
+ Set the beta priors:
77
+ ```
78
+ git bayesect beta_priors --alpha-new 0.9 --beta-new 0.1 --alpha-old 0.05 --beta-old 0.95
79
+ ```
80
+
81
+ Get a log of commands to let you reconstruct the state:
82
+ ```
83
+ git bayesect log
84
+ ```
85
+
86
+ Undo the last observation:
87
+ ```
88
+ git bayesect undo
89
+ ```
90
+
91
+ Run the bisection automatically using a command to make observations:
92
+ ```
93
+ git bayesect run $CMD
94
+ ```
95
+
96
+ Check out the best commit to test:
97
+ ```
98
+ git bayesect checkout
99
+ ```
100
+
101
+ ## How it works
102
+
103
+ TODO: talk about math
104
+
@@ -0,0 +1,83 @@
1
+ # git bayesect
2
+
3
+ Bayesian git bisection!
4
+
5
+ Use this to detect changes in likelihoods of events, for instance, to isolate a commit where
6
+ a slightly flaky test became very flaky.
7
+
8
+ You don't need to know the likelihoods (although you can provide priors), just that something
9
+ has changed at some point in some direction.
10
+
11
+ ## Installation
12
+
13
+ ```
14
+ pip install git_bayesect
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ Start a Bayesian bisection:
20
+ ```
21
+ git bayesect start --old $COMMIT
22
+ ```
23
+
24
+ Record an observation on the current commit:
25
+ ```
26
+ git bayesect fail
27
+ ```
28
+
29
+ Or on a specific commit:
30
+ ```
31
+ git bayesect pass --commit $COMMIT
32
+ ```
33
+
34
+ Check the overall status of the bisection:
35
+ ```
36
+ git bayesect status
37
+ ```
38
+
39
+ Reset:
40
+ ```
41
+ git bayesect reset
42
+ ```
43
+
44
+ ## More usage
45
+
46
+ Set the prior for a given commit:
47
+ ```
48
+ git bayesect prior --commit $COMMIT --weight 10
49
+ ```
50
+
51
+ Set prior for all commits based on filenames:
52
+ ```
53
+ git bayesect priors_from_filenames --filenames-callback "return 10 if any('suspicious' in f for f in filenames) else 1"
54
+ ```
55
+
56
+ Set the beta priors:
57
+ ```
58
+ git bayesect beta_priors --alpha-new 0.9 --beta-new 0.1 --alpha-old 0.05 --beta-old 0.95
59
+ ```
60
+
61
+ Get a log of commands to let you reconstruct the state:
62
+ ```
63
+ git bayesect log
64
+ ```
65
+
66
+ Undo the last observation:
67
+ ```
68
+ git bayesect undo
69
+ ```
70
+
71
+ Run the bisection automatically using a command to make observations:
72
+ ```
73
+ git bayesect run $CMD
74
+ ```
75
+
76
+ Check out the best commit to test:
77
+ ```
78
+ git bayesect checkout
79
+ ```
80
+
81
+ ## How it works
82
+
83
+ TODO: talk about math
@@ -0,0 +1,884 @@
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "scipy",
6
+ # ]
7
+ # ///
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import enum
13
+ import json
14
+ import subprocess
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import numpy as np
20
+
21
# Annotation shorthand: a NumPy array whose shape and dtype parameters are left as `Any`.
ndarray = np.ndarray[Any, Any]
22
+
23
+
24
+ # ==============================
25
+ # Core pure logic
26
+ # ==============================
27
+
28
+
29
class Bisector:
    """
    Bayesian search for the index at which the rate of an observation changed.

    There is some index B such that for all index:
        P(obs_yes | index <= B) = p_obs_new
        P(obs_yes | index >  B) = p_obs_old

    We'd like to find B (and we don't know p_obs_new and p_obs_old).

    Observations are kept as running totals: ``obs_total[i]`` and ``obs_yes[i]`` count
    every observation (respectively every "yes" observation) recorded at an index ``<= i``.
    """

    def __init__(
        self,
        prior_weights: list[float] | list[int] | ndarray,
        alpha_new: float = 0.9,
        beta_new: float = 0.1,
        alpha_old: float = 0.05,
        beta_old: float = 0.95,
    ) -> None:
        """
        Create a bisector over ``len(prior_weights)`` indices.

        prior_weights are unnormalised, non-negative weights for each index being B.
        (alpha_new, beta_new) and (alpha_old, beta_old) parametrise the Beta priors on
        p_obs_new and p_obs_old.

        Raises ValueError if prior_weights is empty, not one-dimensional, or contains a
        negative or NaN entry.
        """
        # Always take a private float64 copy: `record` zeroes entries in place when an
        # observation is skipped, and that must not leak into the caller's array.
        weights = np.array(prior_weights, dtype=np.float64)
        if weights.ndim != 1 or weights.size == 0:
            raise ValueError("prior_weights must be a non-empty one-dimensional sequence")
        # `not all(>= 0)` (rather than `any(< 0)`) also rejects NaN
        if not np.all(weights >= 0):
            raise ValueError("prior_weights must be >= 0")
        self.prior_weights = weights

        self.obs_yes = np.zeros_like(weights, dtype=np.int64)
        self.obs_total = np.zeros_like(weights, dtype=np.int64)

        # E.g. p_obs_new ~ Beta(0.9, 0.1), so E[p_obs_new] = 0.9
        self.alpha_new = alpha_new
        self.beta_new = beta_new

        # E.g. p_obs_old ~ Beta(0.05, 0.95), so E[p_obs_old] = 0.05
        self.alpha_old = alpha_old
        self.beta_old = beta_old

        # Cached posterior; reset to None whenever an observation is recorded
        self.post_weights: ndarray | None = None

    def _split_counts(self) -> tuple[tuple[ndarray, ndarray], tuple[ndarray, ndarray]]:
        """
        Return ((yes_left, total_left), (yes_right, total_right)).

        left: counts on or before each index; right: counts after each index.
        """
        total_left = self.obs_total
        total_right = self.obs_total[-1] - total_left
        yes_left = self.obs_yes
        yes_right = yes_left[-1] - yes_left
        return (yes_left, total_left), (yes_right, total_right)

    @staticmethod
    def _split_sum(old_terms: ndarray, new_terms: ndarray) -> ndarray:
        """Return r with r[b] = sum(old_terms[:b]) + sum(new_terms[b:])."""
        return (np.cumsum(old_terms) - old_terms) + np.cumsum(new_terms[::-1])[::-1]

    def _maybe_update_posteriors(self) -> None:
        """Recompute the posterior only if it has been invalidated."""
        if self.post_weights is None:
            self._update_posteriors()

    def _update_posteriors(self) -> None:
        """Compute P(index=B | data) for every index and store it in post_weights."""
        from scipy.special import loggamma, logsumexp

        (yes_left, total_left), (yes_right, total_right) = self._split_counts()
        no_left = total_left - yes_left
        no_right = total_right - yes_right

        # At this point, if we knew p_obs_new and p_obs_old, we could just apply Bayes' theorem
        # and things would be straightforward. But we don't, so we have to integrate over our
        # priors of what p_obs_new and p_obs_old might be.

        # P(data) = ∫ P(data | p) P(p) dp for left and right observations
        # Thanks to Beta distribution magic, we can compute this analytically
        def log_beta(a: Any, b: Any) -> Any:
            return loggamma(a) + loggamma(b) - loggamma(a + b)

        log_likelihood_left = log_beta(
            self.alpha_new + yes_left, self.beta_new + no_left
        ) - log_beta(self.alpha_new, self.beta_new)
        log_likelihood_right = log_beta(
            self.alpha_old + yes_right, self.beta_old + no_right
        ) - log_beta(self.alpha_old, self.beta_old)
        # This gives us:
        # log P(data | index=b) = log_likelihood_left[b] + log_likelihood_right[b]

        # np.where evaluates np.log on every entry, including the zero weights it then
        # discards, so silence the floating point warnings those entries would trigger.
        with np.errstate(divide="ignore", invalid="ignore"):
            log_prior = np.where(self.prior_weights > 0, np.log(self.prior_weights), -np.inf)
        # log_post[b] is now numerator of Bayes' theorem, so just normalise by sum(exp(log_post))
        log_post = log_prior + log_likelihood_left + log_likelihood_right
        self.post_weights = np.exp(log_post - logsumexp(log_post))

    def record(self, index: int, observation: bool | None) -> None:
        """
        Record an observation at index.

        True/False add a yes/no observation; None marks the index as skipped.
        Raises IndexError if index is out of range.
        """
        if not 0 <= index < len(self.prior_weights):
            raise IndexError(f"index {index} out of range for {len(self.prior_weights)} indices")
        self.post_weights = None
        if observation is None:
            # Similar to git bisect skip, let's just zero out the prior
            # Note we might want to lower the prior instead
            self.prior_weights[index] = 0
            return

        # Counts are cumulative, so an observation at index is visible from index onwards
        self.obs_total[index:] += 1
        if observation:
            self.obs_yes[index:] += 1

    def select(self) -> int:
        """Return the index which will most reduce entropy."""
        self._maybe_update_posteriors()
        assert self.post_weights is not None
        post = self.post_weights

        (yes_left, total_left), (yes_right, total_right) = self._split_counts()

        # posterior means of the two Bernoulli parameters at each b
        p_obs_new = (self.alpha_new + yes_left) / (self.alpha_new + self.beta_new + total_left)
        p_obs_old = (self.alpha_old + yes_right) / (self.alpha_old + self.beta_old + total_right)

        # p_obs_yes[b]
        # = P(obs_yes | select=b)
        # = \sum_{i=0}^{b-1} p_obs_old[i] * post[i] + \sum_{i=b}^{n-1} p_obs_new[i] * post[i]
        w_new_yes = post * p_obs_new
        w_old_yes = post * p_obs_old
        p_obs_yes = self._split_sum(w_old_yes, w_new_yes)

        w_new_no = post * (1.0 - p_obs_new)
        w_old_no = post * (1.0 - p_obs_old)
        p_obs_no = self._split_sum(w_old_no, w_new_no)

        assert np.allclose(p_obs_yes + p_obs_no, 1)

        def wlog(w: ndarray) -> ndarray:
            return np.where(w > 0.0, w * np.log2(w), 0.0)

        def entropy_given(p_obs: ndarray, w_old: ndarray, w_new: ndarray) -> ndarray:
            # To get entropy from unnormalised w_i, calculate S = \sum w_i
            # Then log S - (\sum w_i log w_i) / S
            sum_w_log_w = self._split_sum(wlog(w_old), wlog(w_new))
            return np.where(p_obs > 0, np.log2(p_obs) - sum_w_log_w / p_obs, 0.0)

        # np.where evaluates both branches, so zero weights would otherwise emit
        # "divide by zero" / "invalid value" warnings for values that are then discarded.
        with np.errstate(divide="ignore", invalid="ignore"):
            H_yes = entropy_given(p_obs_yes, w_old_yes, w_new_yes)
            H_no = entropy_given(p_obs_no, w_old_no, w_new_no)

        expected_H = H_yes * p_obs_yes + H_no * p_obs_no
        return int(np.argmin(expected_H))

    @property
    def distribution(self) -> ndarray:
        """Current posterior P(index=B | data)"""
        self._maybe_update_posteriors()
        assert self.post_weights is not None
        return self.post_weights

    @property
    def entropy(self) -> float:
        """Posterior entropy in bits"""
        self._maybe_update_posteriors()
        assert self.post_weights is not None
        probs = self.post_weights[self.post_weights > 0]
        return -float(np.sum(probs * np.log2(probs)))

    @property
    def empirical_p_obs(self) -> tuple[ndarray, ndarray]:
        """Return what we've observed for p_obs_new and p_obs_old if each commit is B."""
        (yes_left, total_left), (yes_right, total_right) = self._split_counts()

        # Use the following if you want to take the prior into account:
        # p_obs_new = (self.alpha_new + yes_left) / (self.alpha_new + self.beta_new + total_left)
        # p_obs_old = (self.alpha_old + yes_right) / (self.alpha_old + self.beta_old + total_right)

        # The 1e-10 floor makes the rate 0 (not a division error) where nothing was observed
        p_obs_new = yes_left / np.maximum(1e-10, total_left)
        p_obs_old = yes_right / np.maximum(1e-10, total_right)
        return p_obs_new, p_obs_old

    @property
    def empirical_counts(self) -> tuple[tuple[ndarray, ndarray], tuple[ndarray, ndarray]]:
        """Return ((yes_left, total_left), (yes_right, total_right)) for each candidate B."""
        return self._split_counts()

    @property
    def num_total_observations(self) -> int:
        """Number of yes/no observations recorded so far."""
        return int(self.obs_total[-1])

    @property
    def num_yes_observations(self) -> int:
        """Number of "yes" observations recorded so far."""
        return int(self.obs_yes[-1])

    def central_range(self, mass: float) -> tuple[int, int]:
        """
        Return the range of indices that contain the central mass of the posterior, inclusive.

        Raises ValueError if mass is not within [0, 1].
        """
        if not 0 <= mass <= 1:
            raise ValueError("mass must be between 0 and 1")
        self._maybe_update_posteriors()
        assert self.post_weights is not None
        cumsum = np.cumsum(self.post_weights)

        tail = (1 - mass) / 2
        left = int(np.searchsorted(cumsum, tail, side="left"))
        right = int(np.searchsorted(cumsum, 1 - tail, side="right"))
        # searchsorted may return len(cumsum) when 1 - tail is >= the final cumulative sum
        right = min(right, len(cumsum) - 1)

        return left, right
231
+
232
+
233
+ # ==============================
234
+ # State logic
235
+ # ==============================
236
+
237
+
238
class BayesectError(Exception):
    """Error raised by git bayesect for problems that should be reported to the user."""
240
+
241
+
242
class Result(enum.Enum):
    """Outcome recorded for a commit; the values are the strings stored in the state file."""

    FAIL = "fail"
    PASS = "pass"
    SKIP = "skip"
246
+
247
+
248
class BetaPriors:
    """Mutable holder for the four Beta prior parameters handed to the Bisector."""

    def __init__(
        self, alpha_new: float, beta_new: float, alpha_old: float, beta_old: float
    ) -> None:
        self.alpha_new, self.beta_new = alpha_new, beta_new
        self.alpha_old, self.beta_old = alpha_old, beta_old

    def as_dict(self) -> dict[str, float]:
        """Return the parameters keyed by name, suitable for `BetaPriors(**d)`."""
        return dict(
            alpha_new=self.alpha_new,
            beta_new=self.beta_new,
            alpha_old=self.alpha_old,
            beta_old=self.beta_old,
        )
264
+
265
+
266
# Name of the JSON state file; State reads and writes it inside the repository's git directory.
STATE_FILENAME = "BAYESECT_STATE"
# Written into every state file; a file carrying any other version is rejected on load.
STATE_VERSION = 2
268
+
269
+
270
class State:
    """
    A bisection session: the old/new endpoints, priors and the list of recorded results.

    Everything except `commit_indices` is persisted as JSON in the git directory;
    `commit_indices` is recomputed from git each time the state is loaded.
    """

    def __init__(
        self,
        old_sha: bytes,
        new_sha: bytes,
        beta_priors: BetaPriors,
        priors: dict[bytes, float],
        results: list[tuple[bytes, Result]],
        commit_indices: dict[bytes, int],
    ) -> None:
        self.old_sha = old_sha
        self.new_sha = new_sha
        self.beta_priors = beta_priors
        # Per-commit prior weight overrides, keyed by full SHA
        self.priors = priors
        # Observations in the order they were recorded
        self.results = results
        # Full SHA -> position in the first-parent history of new_sha (oldest is 0)
        self.commit_indices = commit_indices

    def dump(self, repo_path: Path) -> None:
        """Write the state as JSON to the state file in the repository's git directory."""
        state_dict = {
            "version": STATE_VERSION,
            "old_sha": self.old_sha.decode(),
            "new_sha": self.new_sha.decode(),
            "beta_priors": self.beta_priors.as_dict(),
            "priors": {k.decode(): v for k, v in self.priors.items()},
            "results": [(k.decode(), v.value) for k, v in self.results],
        }
        # Serialise before opening the file so a serialisation error cannot leave a
        # truncated state file behind.
        payload = json.dumps(state_dict)
        with open(git_dir(repo_path) / STATE_FILENAME, "w") as f:
            f.write(payload)

    @classmethod
    def from_git_state(cls, repo_path: Path) -> State:
        """
        Load the state file from the repository's git directory.

        Raises BayesectError if the file is missing, is not valid JSON, has a different
        version, or does not have the expected structure.
        """
        invalid_msg = "Invalid state file, run `git bayesect reset` to start afresh"

        try:
            with open(git_dir(repo_path) / STATE_FILENAME) as f:
                data = f.read()
        except FileNotFoundError:
            raise BayesectError("No state file found, run `git bayesect start` first") from None

        try:
            state_dict = json.loads(data)
        except json.JSONDecodeError:
            raise BayesectError(invalid_msg) from None

        if not isinstance(state_dict, dict):
            raise BayesectError(invalid_msg)

        if state_dict.get("version") != STATE_VERSION:
            raise BayesectError(
                f"State file version {state_dict.get('version')} does not match, "
                "run `git bayesect reset` to start afresh"
            )

        expected_keys = {"version", "old_sha", "new_sha", "beta_priors", "priors", "results"}
        # Explicit check rather than `assert`, which is skipped when Python runs with -O
        if set(state_dict) != expected_keys:
            raise BayesectError(invalid_msg)

        try:
            old_sha: bytes = state_dict["old_sha"].encode()
            new_sha: bytes = state_dict["new_sha"].encode()
            beta_priors: BetaPriors = BetaPriors(**state_dict["beta_priors"])
            priors: dict[bytes, float] = {
                k.encode(): float(v) for k, v in state_dict["priors"].items()
            }
            results: list[tuple[bytes, Result]] = [
                (k.encode(), Result(v)) for k, v in state_dict["results"]
            ]
        except (AttributeError, KeyError, TypeError, ValueError):
            # Right keys but wrong value shapes, e.g. an unknown result name
            raise BayesectError(invalid_msg) from None

        commit_indices = get_commit_indices(repo_path, new_sha.decode())

        return cls(
            old_sha=old_sha,
            new_sha=new_sha,
            beta_priors=beta_priors,
            priors=priors,
            results=results,
            commit_indices=commit_indices,
        )
350
+
351
+
352
+ # ==============================
353
+ # Git logic
354
+ # ==============================
355
+
356
+
357
def smolsha(commit: bytes) -> str:
    """Return the first ten characters of the decoded commit SHA, for display."""
    full = commit.decode()
    return full[:10]
359
+
360
+
361
def git_dir(path: Path) -> Path:
    """
    Return the absolute path of the git directory of the repository containing path.

    Raises subprocess.CalledProcessError if `git rev-parse --git-dir` fails.
    """
    reported = subprocess.check_output(["git", "rev-parse", "--git-dir"], cwd=path)
    # git may print the directory relative to the directory it ran in (e.g. ".git"), so
    # anchor it at `path` before making it absolute; an absolute result is left untouched
    # by the join.
    return Path(path, reported.strip().decode()).absolute()
364
+
365
+
366
def parse_commit(repo_path: Path, commit: str | bytes | None) -> bytes:
    """
    Resolve commit to a full hexadecimal commit SHA.

    bytes are taken to already be a full SHA and are returned unchanged; a str is
    resolved with `git rev-parse`; None means HEAD.

    Raises BayesectError if the revision cannot be resolved to a single commit.
    """
    # 40 hex digits for SHA-1 repositories, 64 for SHA-256 repositories
    sha_lengths = (40, 64)

    if isinstance(commit, bytes):
        if len(commit) not in sha_lengths:
            raise BayesectError(f"Not a full commit SHA: {commit!r}")
        return commit

    revision = "HEAD" if commit is None else commit

    try:
        # `^{commit}` peels annotated tags to the commit they point at, so the result
        # can be looked up among the SHAs printed by `git rev-list`.
        # `--verify --quiet` makes git fail silently (rather than echo the argument)
        # when the revision does not name exactly one object.
        resolved = subprocess.check_output(
            ["git", "rev-parse", "--verify", "--quiet", f"{revision}^{{commit}}"],
            cwd=repo_path,
        ).strip()
    except subprocess.CalledProcessError:
        raise BayesectError(f"Could not resolve {revision!r} to a commit") from None

    if len(resolved) not in sha_lengths:
        raise BayesectError(f"Could not resolve {revision!r} to a commit")
    return resolved
377
+
378
+
379
def get_commit_indices(repo_path: Path, head: str | bytes) -> dict[bytes, int]:
    """Map each commit SHA in the first-parent history of head to its position, oldest first."""
    revision = head.decode() if isinstance(head, bytes) else head

    # Oldest commit has index 0
    # TODO: think about non-linear history
    # --first-parent: When finding commits to include, follow only the first parent commit
    # upon seeing a merge commit.
    listing = subprocess.check_output(
        ["git", "rev-list", "--reverse", "--first-parent", revision], cwd=repo_path
    )

    indices: dict[bytes, int] = {}
    for position, sha in enumerate(listing.splitlines()):
        indices[sha.strip()] = position
    return indices
391
+
392
+
393
def get_current_commit(repo_path: Path) -> bytes:
    """Return what `git rev-parse HEAD` prints in repo_path, stripped of whitespace."""
    head = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=repo_path)
    return head.strip()
395
+
396
+
397
def get_commit_files_mapping(repo_path: Path, commits: list[bytes]) -> dict[bytes, list[str]]:
    """
    Return the file names `git diff-tree` reports as changed for each of commits.

    The SHAs are fed to one `git diff-tree --stdin` process, one per line. Its output is
    split on double NUL bytes; each piece must hold exactly one newline separating the
    commit SHA from the NUL-separated file names, otherwise the unpacking below raises
    ValueError.
    """
    diff_tree_cmd = [
        "git",
        "diff-tree",
        "--stdin",
        "-r",
        "--root",
        "--name-only",
        "--no-renames",
        "-z",
        "--pretty=format:%H%x00",
    ]
    raw = subprocess.check_output(diff_tree_cmd, cwd=repo_path, input=b"\n".join(commits))

    mapping: dict[bytes, list[str]] = {}
    for chunk in raw.split(b"\x00\x00"):
        header, names = chunk.split(b"\n")
        sha = header.rstrip(b"\x00")
        mapping[sha] = [name.decode() for name in names.rstrip(b"\x00").split(b"\x00")]
    return mapping
421
+
422
+
423
+ # ==============================
424
+ # CLI logic
425
+ # ==============================
426
+
427
+
428
def get_bisector(state: State) -> Bisector:
    """
    Build a Bisector from state and replay its priors and recorded results.

    The bisector covers the commits from state.new_sha (relative index 0) back to
    state.old_sha (the last index). Priors and results for commits outside that range
    are ignored, as are SKIP results.

    Raises BayesectError if either endpoint is missing from state.commit_indices or the
    old commit comes after the new one.
    """
    try:
        old_index = state.commit_indices[state.old_sha]
        new_index = state.commit_indices[state.new_sha]
    except KeyError as e:
        raise BayesectError(
            f"Commit {smolsha(e.args[0])} is not in the first-parent history of "
            f"{smolsha(state.new_sha)}, run `git bayesect reset` and start again"
        ) from None
    if new_index < old_index:
        raise BayesectError(
            f"Old commit {smolsha(state.old_sha)} comes after new commit "
            f"{smolsha(state.new_sha)}, run `git bayesect reset` and start again"
        )

    span = new_index - old_index

    def relative_index_of(commit_sha: bytes) -> int | None:
        """Return the bisector index for commit_sha, or None if it is outside the range."""
        commit_index = state.commit_indices.get(commit_sha, -1)
        if commit_index < old_index:
            return None
        # Our bisector is set up so that index 0 is the newest commit
        relative_index = new_index - commit_index
        assert 0 <= relative_index <= span
        return relative_index

    prior = np.ones(span + 1)
    for commit_sha, weight in state.priors.items():
        relative_index = relative_index_of(commit_sha)
        if relative_index is not None:
            prior[relative_index] = weight

    bisector = Bisector(
        prior,
        alpha_new=state.beta_priors.alpha_new,
        beta_new=state.beta_priors.beta_new,
        alpha_old=state.beta_priors.alpha_old,
        beta_old=state.beta_priors.beta_old,
    )

    for commit_sha, result in state.results:
        if result not in {Result.FAIL, Result.PASS}:
            # TODO: handle SKIP maybe by adjusting the prior
            continue

        relative_index = relative_index_of(commit_sha)
        if relative_index is None:
            continue

        # The observation we record is "the commit failed"
        bisector.record(relative_index, result == Result.FAIL)

    return bisector
468
+
469
+
470
def print_status(repo_path: Path, state: State, bisector: Bisector) -> None:
    """
    Print a summary of the bisection between rules of 80 "=" characters.

    When the most likely commit has posterior probability of at least 0.95 the
    bisection is reported as converged and `git show` is printed for that commit.
    Otherwise the central 90% range of the posterior is reported, plus the most
    likely commit if its probability is high enough.
    """
    newest = state.commit_indices[state.new_sha]
    oldest = state.commit_indices[state.old_sha]

    posterior = bisector.distribution
    rate_new, rate_old = bisector.empirical_p_obs

    # Posterior-weighted averages of the empirical rates
    mean_rate_new = (rate_new * posterior).sum()
    mean_rate_old = (rate_old * posterior).sum()

    # TODO: maybe tie break argmax with most central?
    best = int(np.argmax(posterior))
    best_prob = posterior[best]
    best_rate_new = rate_new[best]
    best_rate_old = rate_old[best]

    range_start, range_end = bisector.central_range(0.9)
    range_size = range_end - range_start + 1

    sha_at = {index: sha for sha, index in state.commit_indices.items()}

    def short(relative_index: int) -> str:
        # Relative index 0 is the newest commit
        return smolsha(sha_at[newest - relative_index])

    best_commit = short(best)
    range_start_commit = short(range_start)
    range_end_commit = short(range_end)

    rule = "=" * 80

    if best_prob >= 0.95:
        pieces = [
            f"Bisection converged to {best_commit} ({best_prob:.1%}) ",
            f"after {bisector.num_total_observations} observations\n",
            f"Subsequent failure rate is {best_rate_new:.1%}, ",
            f"prior failure rate is {best_rate_old:.1%}",
        ]
        print(rule)
        print("".join(pieces).rstrip())
        print(rule)

        shown = subprocess.check_output(
            ["git", "show", "--color", "--no-patch", "--stat", best_commit],
            cwd=repo_path,
        )
        print(shown.decode())
        print(rule)
        return

    pieces = [
        f"Bisection narrowed to `{range_end_commit}^...{range_start_commit}` ",
        f"({range_size} commits) with 90% confidence ",
        f"after {bisector.num_total_observations} observations\n",
        f"New failure rate estimate: {mean_rate_new:.1%}, old failure rate estimate: {mean_rate_old:.1%}\n\n",
    ]
    if best_prob >= max(0.1, 2 / (newest - oldest + 1)):
        pieces.append(f"Most likely commit: {best_commit} ({best_prob:.1%})\n")
        pieces.append(f"Subsequent failure rate is {best_rate_new:.1%}, ")
        pieces.append(f"prior failure rate is {best_rate_old:.1%}\n")

    print(rule)
    print("".join(pieces).rstrip())
    print(rule)
530
+
531
+
532
def select_and_checkout(repo_path: Path, state: State, bisector: Bisector) -> bytes:
    """Check out the commit the bisector selects to test next and return its SHA."""
    newest_index = state.commit_indices[state.new_sha]

    # The bisector counts from the newest commit, commit_indices from the oldest
    target_index = newest_index - bisector.select()
    sha_by_index = {index: sha for sha, index in state.commit_indices.items()}
    commit_sha = sha_by_index[target_index]

    print(f"Checking out next commit to test: {smolsha(commit_sha)}")
    checkout_cmd = ["git", "checkout", commit_sha.decode()]
    subprocess.run(checkout_cmd, cwd=repo_path, check=True, capture_output=True)
    return commit_sha
544
+
545
+
546
def cli_start(old: str, new: str | None) -> None:
    """
    Start a bisection between old and new (HEAD when new is None).

    Writes a fresh state file with the default beta priors, prints the status and checks
    out the first commit to test.

    Raises BayesectError if old is not in the first-parent history of new.
    """
    repo_path = Path.cwd()
    new_sha = parse_commit(repo_path, new)
    old_sha = parse_commit(repo_path, old)
    commit_indices = get_commit_indices(repo_path, new_sha)

    # Validate before anything is written: a state file whose old commit cannot be
    # located would make every later command fail until `git bayesect reset`.
    if old_sha not in commit_indices:
        raise BayesectError(
            f"Old commit {smolsha(old_sha)} is not in the first-parent history of "
            f"new commit {smolsha(new_sha)}"
        )

    state = State(
        old_sha=old_sha,
        new_sha=new_sha,
        beta_priors=BetaPriors(alpha_new=0.9, beta_new=0.1, alpha_old=0.05, beta_old=0.95),
        priors={},
        results=[],
        commit_indices=commit_indices,
    )
    state.dump(repo_path)

    bisector = get_bisector(state)
    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
565
+
566
+
567
def cli_reset() -> None:
    """Delete the state file of the current repository; do nothing if there is none."""
    state_file = git_dir(Path.cwd()) / STATE_FILENAME
    state_file.unlink(missing_ok=True)
570
+
571
+
572
def cli_fail(commit: str | bytes | None) -> None:
    """Record a failure for commit (HEAD when None), then report and check out the next commit."""
    repo_path = Path.cwd()
    sha = parse_commit(repo_path, commit)

    state = State.from_git_state(repo_path)
    observation = (sha, Result.FAIL)
    state.results.append(observation)
    state.dump(repo_path)

    bisector = get_bisector(state)
    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
583
+
584
+
585
def cli_pass(commit: str | bytes | None) -> None:
    """Record a pass for commit (HEAD when None), then report and check out the next commit."""
    repo_path = Path.cwd()
    sha = parse_commit(repo_path, commit)

    state = State.from_git_state(repo_path)
    observation = (sha, Result.PASS)
    state.results.append(observation)
    state.dump(repo_path)

    bisector = get_bisector(state)
    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
596
+
597
+
598
def cli_undo() -> None:
    """
    Drop the most recently recorded result, then report and check out the next commit.

    Raises BayesectError if no result has been recorded.
    """
    repo_path = Path.cwd()
    state = State.from_git_state(repo_path)

    if not state.results:
        raise BayesectError("No observation to undo")

    commit, result = state.results.pop()
    # Result values are exactly the subcommand names: fail, pass, skip
    print(f"Undid last observation: git bayesect {result.value} {smolsha(commit)}")
    state.dump(repo_path)

    bisector = get_bisector(state)
    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
618
+
619
+
620
def cli_run(cmd: list[str]) -> None:
    """
    Repeatedly check out the selected commit and run cmd there, recording each outcome.

    An exit status of 0 is recorded as a pass and any other exit status as a failure.
    The loop stops once some commit has posterior probability of at least 0.95.

    Raises BayesectError if cmd is empty.
    """
    repo_path = Path.cwd()

    if not cmd:
        raise BayesectError("No command to run")

    state = State.from_git_state(repo_path)
    bisector = get_bisector(state)

    old_index = state.commit_indices[state.old_sha]
    new_index = state.commit_indices[state.new_sha]
    assert new_index >= old_index

    try:
        while True:
            commit = select_and_checkout(repo_path, state, bisector)
            # check=False: a non-zero exit status is an observation, not an error
            proc = subprocess.run(cmd, cwd=repo_path, check=False)
            result = Result.PASS if proc.returncode == 0 else Result.FAIL

            # Update the bisector in place instead of rebuilding it from the state
            state.results.append((commit, result))
            relative_index = new_index - state.commit_indices[commit]
            assert 0 <= relative_index <= new_index - old_index
            bisector.record(relative_index, result == Result.FAIL)

            print_status(repo_path, state, bisector)
            if bisector.distribution.max() >= 0.95:
                break
    finally:
        # Runs on normal exit and on any exception, so results gathered so far are saved
        state.dump(repo_path)
649
+
650
+
651
def cli_prior(commit: str | bytes, weight: float) -> None:
    """
    Set the prior weight of commit and save the state.

    Raises BayesectError if weight is negative or not a finite number.
    """
    import math

    # Reject the weight before it is persisted: the Bisector refuses negative weights,
    # so a bad value in the state file would break every later command.
    if not (math.isfinite(weight) and weight >= 0):
        raise BayesectError(f"Prior weight must be a finite number >= 0, got {weight}")

    repo_path = Path.cwd()
    commit = parse_commit(repo_path, commit)

    state = State.from_git_state(repo_path)
    state.priors[commit] = weight
    state.dump(repo_path)
    print(f"Updated prior for {smolsha(commit)} to {weight}")
659
+
660
+
661
def cli_priors_from_filenames(filenames_callback: str) -> None:
    """
    Set per-commit priors by calling a user-supplied function body on each commit's files.

    filenames_callback is the body of a function taking `filenames` (the list of file
    names reported for a commit). A return value of None leaves that commit's prior
    untouched; any other value becomes its prior weight.
    """
    import textwrap

    repo_path = Path.cwd()
    state = State.from_git_state(repo_path)
    files_by_commit = get_commit_files_mapping(
        repo_path, commits=list(state.commit_indices.keys())
    )

    # NOTE: the callback text is compiled and run with `exec`, i.e. it is arbitrary code
    # executed with the privileges of whoever invoked the command.
    body = textwrap.indent(filenames_callback, " ")
    source = f"def _callback(filenames: list[str]) -> float:\n{body}"
    scope_globals: dict[str, Any] = {}
    scope_locals: dict[str, Any] = {}
    exec(source, scope_globals, scope_locals)
    weigh = scope_locals["_callback"]

    for sha, changed_files in files_by_commit.items():
        weight = weigh(changed_files)
        if weight is None:
            continue
        assert isinstance(weight, (int, float))
        state.priors[sha] = weight

    state.dump(repo_path)
    print(f"Updated priors for {len(state.priors)} commits")

    bisector = get_bisector(state)
    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
688
+
689
+
690
def cli_beta_priors(
    alpha_new: float | None, beta_new: float | None, alpha_old: float | None, beta_old: float | None
) -> None:
    """Overwrite the beta prior parameters that are not None, then report and check out."""
    repo_path = Path.cwd()
    state = State.from_git_state(repo_path)

    requested = {
        "alpha_new": alpha_new,
        "beta_new": beta_new,
        "alpha_old": alpha_old,
        "beta_old": beta_old,
    }
    for name, value in requested.items():
        # None means "leave this parameter as it is"
        if value is not None:
            setattr(state.beta_priors, name, value)

    state.dump(repo_path)
    print(f"Updated beta priors to {state.beta_priors.as_dict()}")

    bisector = get_bisector(state)
    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
710
+
711
+
712
def cli_checkout() -> None:
    """Print the current status and check out the commit selected for the next test."""
    repo_path = Path.cwd()

    state = State.from_git_state(repo_path)
    bisector = get_bisector(state)

    print_status(repo_path, state, bisector)
    select_and_checkout(repo_path, state, bisector)
719
+
720
+
721
def cli_status() -> None:
    """
    Print one line per commit from the new commit back to the old commit, then the summary.

    Each line shows the commit's posterior probability, the failures/observations recorded
    on that commit itself, and the empirical failure rates (with counts) on either side
    of it. Lines for the most probable commits are highlighted.
    """
    repo_path = Path.cwd()
    state = State.from_git_state(repo_path)

    bisector = get_bisector(state)
    new_index = state.commit_indices[state.new_sha]
    old_index = state.commit_indices[state.old_sha]

    dist = bisector.distribution
    dist_p_obs_new, dist_p_obs_old = bisector.empirical_p_obs
    (yes_new, total_new), (yes_old, total_old) = bisector.empirical_counts

    rows = []
    # Walk from the newest commit towards older ones, stopping at the old commit
    for commit, i in sorted(state.commit_indices.items(), key=lambda c: c[1], reverse=True):
        relative_index = new_index - i
        # yes_new/total_new are cumulative, so the counts for this commit alone are the
        # difference from the previous entry
        if relative_index == 0:
            observations = f"{yes_new[relative_index]}/{total_new[relative_index]}"
        else:
            observations = (
                f"{yes_new[relative_index] - yes_new[relative_index - 1]}/"
                f"{total_new[relative_index] - total_new[relative_index - 1]}"
            )
        rows.append(
            (
                smolsha(commit),
                f"{dist[relative_index]:.1%}",
                observations,
                f"{dist_p_obs_new[relative_index]:.1%}",
                f"({yes_new[relative_index]}/{total_new[relative_index]})",
                f"{dist_p_obs_old[relative_index]:.1%}",
                f"({yes_old[relative_index]}/{total_old[relative_index]})",
                # Non-empty (truthy) marks the row for highlighting
                "yes" if dist[relative_index] > max(0.1, 2 / (new_index - old_index + 1)) else "",
            )
        )
        if commit == state.old_sha:
            break

    # Widest entry of each column, used to left-align the columns
    widths = [max(len(row[i]) for row in rows) for i in range(len(rows[0]))]

    for (
        commit_str,
        likelihood,
        observations,
        p_obs_new,
        c_obs_new,
        p_obs_old,
        c_obs_old,
        should_highlight,
    ) in rows:
        if should_highlight:
            # ANSI escape sequence 103, undone by the reset sequence printed after the row
            print("\033[103m", end="")
        print(
            f"{commit_str:<{widths[0]}} "
            f"likelihood {likelihood:<{widths[1]}}, "
            f"observed {observations:<{widths[2]}} failures, "
            f"subsequent failure rate {p_obs_new:<{widths[3]}} "
            f"{c_obs_new:<{widths[4]}}, "
            f"prior failure rate {p_obs_old:<{widths[5]}} "
            f"{c_obs_old:<{widths[6]}}",
            end="",
        )
        if should_highlight:
            print("\033[0m")
        else:
            print()
    print_status(repo_path, state, bisector)
787
+
788
+
789
def cli_log() -> None:
    """Print the state as a sequence of ``git bayesect`` command lines.

    Emits, in order: a ``start`` line with the old and new short shas, a
    ``beta_priors`` line with the four beta prior values, one ``prior`` line
    per entry in ``state.priors``, a blank line, and one ``pass`` / ``fail`` /
    ``skip`` line per entry in ``state.results``.
    """
    state = State.from_git_state(Path.cwd())

    old_short = smolsha(state.old_sha)
    new_short = smolsha(state.new_sha)
    print(f"git bayesect start --old {old_short} --new {new_short}")

    beta = state.beta_priors
    print(
        f"git bayesect beta_priors "
        f"--alpha-new {beta.alpha_new} "
        f"--beta-new {beta.beta_new} "
        f"--alpha-old {beta.alpha_old} "
        f"--beta-old {beta.beta_old}"
    )

    for sha, weight in state.priors.items():
        print(f"git bayesect prior --commit {smolsha(sha)} --weight {weight}")
    print()

    for sha, outcome in state.results:
        if outcome == Result.PASS:
            verb = "pass"
        elif outcome == Result.FAIL:
            verb = "fail"
        elif outcome == Result.SKIP:
            # NOTE(review): parse_options in this file registers no "skip"
            # subcommand, so this line may not be replayable -- confirm.
            verb = "skip"
        else:
            # Any other result value produces no output.
            continue
        print(f"git bayesect {verb} --commit {smolsha(sha)}")
813
+
814
+
815
def parse_options(argv: list[str]) -> argparse.Namespace:
    """Parse the command line into a namespace holding the chosen handler.

    A subcommand is required. Each subcommand stores its handler function
    under the ``command`` default; every other attribute of the returned
    namespace is an argument of that subcommand.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        The parsed namespace, including the ``command`` attribute.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(required=True)

    start = commands.add_parser("start")
    start.add_argument("--old", help="Old commit hash", required=True)
    start.add_argument("--new", help="New commit hash", default=None)
    start.set_defaults(command=cli_start)

    # "fail" and "pass" have the same shape: one alias and an optional commit.
    for name, alias, handler in (
        ("fail", "failure", cli_fail),
        ("pass", "success", cli_pass),
    ):
        outcome = commands.add_parser(name, aliases=[alias])
        outcome.add_argument("--commit", default=None)
        outcome.set_defaults(command=handler)

    for name, handler in (("undo", cli_undo), ("reset", cli_reset)):
        commands.add_parser(name).set_defaults(command=handler)

    prior = commands.add_parser("prior")
    prior.set_defaults(command=cli_prior)
    prior.add_argument("--commit", required=True)
    prior.add_argument("--weight", type=float, required=True)

    from_filenames = commands.add_parser(
        "priors_from_filenames", aliases=["priors-from-filenames"]
    )
    from_filenames.set_defaults(command=cli_priors_from_filenames)
    from_filenames.add_argument(
        "--filenames-callback",
        help="Python code returning a float given filenames",
        required=True,
    )

    beta = commands.add_parser("beta_priors", aliases=["beta-priors"])
    beta.set_defaults(command=cli_beta_priors)
    for flag in ("--alpha-new", "--beta-new", "--alpha-old", "--beta-old"):
        beta.add_argument(flag, type=float)

    for name, handler in (
        ("checkout", cli_checkout),
        ("status", cli_status),
        ("log", cli_log),
    ):
        commands.add_parser(name).set_defaults(command=handler)

    run = commands.add_parser("run")
    # Everything after "run" is collected verbatim into ``cmd``.
    run.add_argument("cmd", nargs=argparse.REMAINDER)
    run.set_defaults(command=cli_run)

    return parser.parse_args(argv)
871
+
872
+
873
def main() -> None:
    """Parse ``sys.argv`` and call the selected subcommand handler.

    The handler stored under ``command`` is removed from the parsed options
    and called with the remaining options as keyword arguments. A
    ``BayesectError`` is reported on stderr as ``ERROR: ...`` and the process
    exits with status 1.
    """
    options = vars(parse_options(sys.argv[1:]))
    handler = options.pop("command")
    try:
        handler(**options)
    except BayesectError as err:
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)
881
+
882
+
883
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "git_bayesect"
3
+ version = "1.0"
4
+ authors = [{name = "Shantanu Jain"}, {email = "hauntsaninja@gmail.com"}]
5
+ description = "Git bisection with Bayesian statistics"
6
+ readme = "README.md"
7
+ license = {file = "LICENSE"}
8
+ classifiers = [
9
+ "Intended Audience :: Developers",
10
+ "License :: OSI Approved :: MIT License",
11
+ "Operating System :: OS Independent",
12
+ "Programming Language :: Python :: 3",
13
+ "Topic :: Software Development",
14
+ "Topic :: Utilities",
15
+ ]
16
+ requires-python = ">=3.10"
17
+ dependencies = [
18
+ "numpy",
19
+ "scipy",
20
+ ]
21
+
22
+ [project.scripts]
23
+ git-bayesect = "git_bayesect:main"
24
+ git_bayesect = "git_bayesect:main"
25
+
26
+ [project.urls]
27
+ homepage = "https://github.com/hauntsaninja/git_bayesect"
28
+ repository = "https://github.com/hauntsaninja/git_bayesect"
29
+
30
+ [tool.flit.module]
31
+ name = "git_bayesect"
32
+
33
+ [build-system]
34
+ requires = ["flit_core>=3.4"]
35
+ build-backend = "flit_core.buildapi"
36
+
37
+ [tool.mypy]
38
+ strict = true
39
+ allow_untyped_calls = true