@xcraftmind/mastermind 0.24.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/bin/mastermind.js +4 -0
- package/package.json +9 -8
- package/share/agents/mastermind-auditor.md +205 -0
- package/share/agents/mastermind-critic.md +222 -0
- package/share/agents/mastermind-prompt-refiner.md +70 -0
- package/share/agents/mastermind-release.md +442 -0
- package/share/agents/mastermind-researcher.md +167 -0
- package/share/agents/mastermind-task-executor.md +86 -0
- package/share/commands/api-shape-explorer.md +107 -0
- package/share/skills/doc-stub-sync/SKILL.md +187 -0
- package/share/skills/doc-stub-sync/references/error-handling.md +79 -0
- package/share/skills/doc-stub-sync/references/url-patterns.md +83 -0
- package/share/skills/doc-stub-sync/scripts/doc_update.py +285 -0
- package/share/skills/doc-stub-sync/scripts/requirements.txt +2 -0
- package/share/skills/flaky-finder/SKILL.md +75 -0
- package/share/skills/mastermind-incident-response/SKILL.md +157 -0
- package/share/skills/mastermind-incident-response/references/investigation-playbook.md +173 -0
- package/share/skills/mastermind-incident-response/references/postmortem-template.md +184 -0
- package/share/skills/mastermind-incident-response/references/triage-checklist.md +117 -0
- package/share/skills/mastermind-prompt-refiner/SKILL.md +157 -0
- package/share/skills/mastermind-prompt-refiner/references/refining-checklist.md +89 -0
- package/share/skills/mastermind-prompt-refiner/references/techniques.md +143 -0
- package/share/skills/mastermind-task-executor/SKILL.md +154 -0
- package/share/skills/mastermind-task-planning/SKILL.md +337 -0
- package/share/skills/mastermind-task-planning/references/spec-template.md +286 -0
- package/share/skills/pr-review/SKILL.md +89 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Sync local documentation stubs with their current online versions.
|
|
4
|
+
|
|
5
|
+
See ../SKILL.md for the workflow this script implements.
|
|
6
|
+
Run with --help for CLI usage.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import dataclasses
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
import tempfile
|
|
19
|
+
import time
|
|
20
|
+
from collections import defaultdict
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Optional
|
|
23
|
+
from urllib.parse import urlparse
|
|
24
|
+
|
|
25
|
+
import requests
|
|
26
|
+
from bs4 import BeautifulSoup
|
|
27
|
+
|
|
28
|
+
# Regex that locates the stub marker line; capture group 1 is the upstream URL.
DEFAULT_STUB_PATTERN = r"Fetch live documentation:\s*(https?://\S+)"
# Default for the --timeout CLI option (per-request timeout, in seconds).
DEFAULT_TIMEOUT_SECONDS = 15
# Default for the --rate-limit CLI option (minimum seconds between two
# requests to the same host).
DEFAULT_RATE_LIMIT_SECONDS = 1.0
# Sent as the User-Agent header on every outgoing request.
USER_AGENT = "mastermind-doc-stub-sync/0.1 (+https://github.com/aglumova/mastermind)"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclasses.dataclass
class StubFile:
    """A local markdown file paired with the upstream URL found inside it."""

    # Location of the markdown file on disk.
    path: Path
    # URL captured by group 1 of the stub pattern.
    url: str
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclasses.dataclass
class UpdateResult:
    """Outcome of processing a single stub file."""

    # Stub file path, stringified so the result is JSON-serializable.
    path: str
    # Upstream URL the stub points at.
    url: str
    # One of: "updated" | "current" | "unreachable" | "error"
    status: str
    # Hash of the local body (stub line removed) before the update.
    old_hash: Optional[str] = None
    # Hash of the freshly built body (stub line removed).
    new_hash: Optional[str] = None
    # Size of the new content minus size of the old file, in UTF-8 bytes.
    delta_bytes: int = 0
    # Human-readable failure description for "unreachable" / "error".
    reason: Optional[str] = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def find_stubs(root: Path, pattern: re.Pattern) -> list[StubFile]:
    """Collect every ``*.md`` entry under *root* whose text matches *pattern*.

    Args:
        root: Directory that is searched recursively.
        pattern: Compiled regex; capture group 1 must hold the upstream URL.

    Returns:
        One ``StubFile`` per matching file (first match in the file wins),
        ordered by path so repeated runs process files in the same order.
    """
    stubs: list[StubFile] = []
    # rglob order depends on the filesystem; sort for reproducible reports.
    for path in sorted(root.rglob("*.md")):
        try:
            text = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            # Unreadable entries (non-UTF-8 data, missing permissions, a
            # directory or dangling symlink named *.md, ...) are skipped so
            # one bad entry cannot abort the whole scan.
            continue
        match = pattern.search(text)
        if match:
            stubs.append(StubFile(path=path, url=match.group(1).strip()))
    return stubs
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def normalize_text(text: str) -> str:
    """Canonicalize whitespace so equivalent documents hash identically.

    Trailing whitespace is removed from every line, each run of blank lines
    is reduced to one blank line, leading/trailing whitespace of the whole
    text is trimmed, and exactly one final newline is appended.
    """
    stripped = [raw.rstrip() for raw in text.splitlines()]
    # Keep a blank line only when it opens a run of blank lines, i.e. it is
    # the very first line or the line before it has content.
    collapsed = [
        current
        for index, current in enumerate(stripped)
        if current or index == 0 or stripped[index - 1]
    ]
    return "\n".join(collapsed).strip() + "\n"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def content_hash(text: str) -> str:
    """Return the SHA-256 hex digest of *text* after whitespace normalization."""
    canonical = normalize_text(text)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def local_body(text: str, pattern: re.Pattern) -> str:
    """Return *text* with every *pattern* match deleted and outer whitespace trimmed.

    Removing the stub marker line from both sides lets the local file and
    the freshly built content be compared on their bodies alone.
    """
    without_marker = re.sub(pattern, "", text)
    return without_marker.strip()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def extract_main(html: str) -> tuple[str, str]:
    """Return ``(title, main_text)`` extracted from the upstream HTML.

    The text comes from the first of ``<main>``, ``<article>``, ``<body>``
    that exists (falling back to the whole document), after script, style,
    noscript, iframe, nav and footer elements have been removed.

    The title is the ``<title>`` text up to the first ``|``; without a
    ``<title>`` it is the last path segment of the canonical link's href;
    without either it is the empty string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements so their text does not end up in the body.
    for unwanted in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
        unwanted.decompose()

    # Pick the most specific content container that is present.
    container = soup
    for tag_name in ("main", "article", "body"):
        candidate = soup.find(tag_name)
        if candidate:
            container = candidate
            break
    text = container.get_text(separator="\n")

    title_tag = soup.find("title")
    if title_tag:
        title = title_tag.get_text().split("|")[0].strip()
    else:
        canonical = soup.find("link", rel="canonical")
        if canonical:
            href = canonical.get("href", "")
            title = urlparse(href).path.rstrip("/").rsplit("/", 1)[-1]
        else:
            title = ""
    return title, text
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def fetch(url: str, timeout: float) -> requests.Response:
    """Issue one GET request for *url*, identifying as ``USER_AGENT``.

    Exceptions raised by ``requests.get`` propagate to the caller.
    """
    request_headers = {"User-Agent": USER_AGENT}
    return requests.get(url, timeout=timeout, headers=request_headers)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def fetch_with_retry(
    url: str,
    timeout: float,
    retries: int = 1,
    backoff_seconds: float = 2.0,
) -> requests.Response:
    """Fetch *url*, retrying after a pause when the request itself fails.

    Args:
        url: Address to fetch.
        timeout: Per-attempt timeout, passed through to ``fetch``.
        retries: Extra attempts after the first failure. The default of 1
            keeps the original "try, wait, try once more" behavior; values
            below zero are treated as zero.
        backoff_seconds: Pause before each retry (default 2 seconds).

    Returns:
        The response of the first attempt that did not raise. HTTP error
        statuses are not retried here; the response is returned as-is.

    Raises:
        requests.exceptions.RequestException: when the final attempt fails.
    """
    retries_left = max(retries, 0)
    while True:
        try:
            return fetch(url, timeout)
        except requests.exceptions.RequestException:
            if retries_left <= 0:
                raise
            retries_left -= 1
            time.sleep(backoff_seconds)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def build_updated_content(title: str, body_text: str, url: str) -> str:
    """Assemble the markdown written back to a stub file.

    Layout: an optional ``# title`` heading, the trimmed body, a horizontal
    rule, and finally the stub marker line carrying *url*.
    """
    heading = ""
    if title:
        heading = f"# {title}\n\n"
    trimmed_body = body_text.strip()
    return f"{heading}{trimmed_body}\n\n---\n\nFetch live documentation: {url}\n"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def write_atomic(path: Path, content: str) -> None:
    """Replace *path* with *content* without exposing a half-written file.

    The content is written to a temporary file in the same directory and
    then moved over *path* with ``os.replace``.

    Args:
        path: Destination file; it may or may not exist yet.
        content: Text to write, encoded as UTF-8.

    Raises:
        OSError: if the temporary file cannot be created, written or moved.
            The temporary file is removed before the error propagates.
    """
    tmp_fd, tmp_path = tempfile.mkstemp(dir=path.parent, prefix=".tmp-", suffix=".md")
    try:
        with os.fdopen(tmp_fd, "w", encoding="utf-8") as f:
            f.write(content)
        # mkstemp creates the file readable by the owner only; copy the
        # existing target's permission bits so updating a document does not
        # silently make it unreadable for other users.
        try:
            os.chmod(tmp_path, path.stat().st_mode & 0o7777)
        except OSError:
            # Best effort: the target does not exist yet, or the platform
            # rejects chmod. Keep the temp file's default mode.
            pass
        os.replace(tmp_path, path)
    except BaseException:
        # Also covers KeyboardInterrupt, so an interrupted run does not
        # leave ".tmp-*.md" files behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def process_stub(
    stub: StubFile,
    pattern: re.Pattern,
    timeout: float,
    dry_run: bool,
) -> UpdateResult:
    """Fetch the upstream page for *stub* and refresh the local file if needed.

    Args:
        stub: Local file and the URL it points at.
        pattern: Stub-marker regex; its matches are removed from both sides
            before the bodies are compared.
        timeout: Per-request timeout in seconds.
        dry_run: When true, changes are detected and reported but the file
            is left untouched.

    Returns:
        An ``UpdateResult`` whose status is
        ``"unreachable"`` (request failed or HTTP status >= 400),
        ``"error"`` (local file could not be read or written),
        ``"current"`` (bodies hash identically), or
        ``"updated"`` (bodies differ; written unless *dry_run*).
    """
    # Fields shared by every result for this stub.
    ident = {"path": str(stub.path), "url": stub.url}

    try:
        response = fetch_with_retry(stub.url, timeout)
    except requests.exceptions.RequestException as exc:
        return UpdateResult(**ident, status="unreachable", reason=f"request failed: {exc}")

    if response.status_code >= 400:
        return UpdateResult(
            **ident,
            status="unreachable",
            reason=f"{response.status_code} {response.reason}",
        )

    title, upstream_text = extract_main(response.text)
    new_content = build_updated_content(title, upstream_text, stub.url)

    try:
        old_text = stub.path.read_text(encoding="utf-8")
    except Exception as exc:
        # Per-stub boundary: report the failure instead of aborting the run.
        return UpdateResult(**ident, status="error", reason=str(exc))

    # Compare bodies with the stub marker removed; only these two hashes are
    # reported, so no separate hash of the full new document is computed.
    old_hash = content_hash(local_body(old_text, pattern))
    cmp_hash = content_hash(local_body(new_content, pattern))
    hashes = {"old_hash": old_hash, "new_hash": cmp_hash}

    if old_hash == cmp_hash:
        return UpdateResult(**ident, status="current", **hashes)

    delta = len(new_content.encode("utf-8")) - len(old_text.encode("utf-8"))
    if not dry_run:
        try:
            write_atomic(stub.path, new_content)
        except Exception as exc:
            # Per-stub boundary, as above.
            return UpdateResult(**ident, status="error", reason=str(exc))
    return UpdateResult(**ident, status="updated", **hashes, delta_bytes=delta)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def run(
    root: Path,
    stub_pattern: str,
    rate_limit_seconds: float,
    timeout_seconds: float,
    dry_run: bool,
) -> dict:
    """Scan *root* for stub files, process each one and build the report.

    Args:
        root: Directory searched recursively for markdown stubs.
        stub_pattern: Regex source with one capture group for the URL.
        rate_limit_seconds: Minimum spacing between two requests to the same
            host.
        timeout_seconds: Per-request timeout.
        dry_run: Detect changes without writing files.

    Returns:
        A JSON-serializable dict with a ``"summary"`` of counters and the
        per-file results under ``"updated"``, ``"unreachable"`` and
        ``"errors"``.

    Raises:
        re.error: if *stub_pattern* is not a valid regular expression.
    """
    pattern = re.compile(stub_pattern)
    stubs = find_stubs(root, pattern)
    if not stubs:
        return {
            "summary": {
                "scanned": 0,
                "updated": 0,
                "skipped_current": 0,
                "unreachable": 0,
                "errors": 0,
                "duration_seconds": 0,
                # Same keys as the non-empty summary below.
                "dry_run": dry_run,
            },
            "updated": [],
            "unreachable": [],
            "errors": [],
        }

    started = time.monotonic()
    last_request_per_host: dict[str, float] = {}
    results: list[UpdateResult] = []

    for stub in stubs:
        host = urlparse(stub.url).netloc
        previous = last_request_per_host.get(host)
        # Only throttle hosts that were actually contacted before; the
        # monotonic clock has no meaningful zero to compare against.
        if previous is not None:
            wait = (previous + rate_limit_seconds) - time.monotonic()
            if wait > 0:
                time.sleep(wait)
        last_request_per_host[host] = time.monotonic()
        results.append(process_stub(stub, pattern, timeout_seconds, dry_run))

    # Group once by status instead of re-filtering the list per report key.
    by_status: dict[str, list[dict]] = defaultdict(list)
    for result in results:
        by_status[result.status].append(dataclasses.asdict(result))

    summary = {
        "scanned": len(results),
        "updated": len(by_status.get("updated", [])),
        "skipped_current": len(by_status.get("current", [])),
        "unreachable": len(by_status.get("unreachable", [])),
        "errors": len(by_status.get("error", [])),
        "duration_seconds": round(time.monotonic() - started, 1),
        "dry_run": dry_run,
    }
    return {
        "summary": summary,
        "updated": by_status.get("updated", []),
        "unreachable": by_status.get("unreachable", []),
        "errors": by_status.get("error", []),
    }
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def cli(argv: list[str]) -> int:
    """Parse *argv*, run the sync and print the JSON report.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        Process exit code: 0 when no file ended in status ``"error"``,
        1 when at least one did, 2 for invalid arguments (target is not a
        directory, or ``--stub-pattern`` is not a usable regex).
    """
    parser = argparse.ArgumentParser(description="Sync local doc stubs with online sources.")
    parser.add_argument("target_dir", type=Path, help="Root directory containing stub markdown files.")
    parser.add_argument("--stub-pattern", default=DEFAULT_STUB_PATTERN,
                        help="Regex with one capture group for the URL. Default: %(default)r")
    parser.add_argument("--rate-limit", type=float, default=DEFAULT_RATE_LIMIT_SECONDS,
                        help="Minimum seconds between requests to the same host. Default: %(default)s")
    parser.add_argument("--timeout", type=float, default=DEFAULT_TIMEOUT_SECONDS,
                        help="Per-request timeout in seconds. Default: %(default)s")
    parser.add_argument("--dry-run", action="store_true",
                        help="Detect changes but do not write files. Report what would change.")
    parser.add_argument("--report-file", type=Path,
                        help="If set, write the JSON report to this path.")
    args = parser.parse_args(argv)

    if not args.target_dir.is_dir():
        print(f"error: target_dir is not a directory: {args.target_dir}", file=sys.stderr)
        return 2

    # Validate the pattern up front: a malformed regex, or one without a
    # capture group, would otherwise surface as a traceback mid-scan.
    try:
        compiled = re.compile(args.stub_pattern)
    except re.error as exc:
        print(f"error: invalid --stub-pattern: {exc}", file=sys.stderr)
        return 2
    if compiled.groups < 1:
        print("error: --stub-pattern must contain a capture group for the URL", file=sys.stderr)
        return 2

    report = run(
        root=args.target_dir,
        stub_pattern=args.stub_pattern,
        rate_limit_seconds=args.rate_limit,
        timeout_seconds=args.timeout,
        dry_run=args.dry_run,
    )

    output_json = json.dumps(report, indent=2)
    print(output_json)
    if args.report_file:
        args.report_file.write_text(output_json + "\n", encoding="utf-8")

    # Unreachable URLs do not affect the exit code; only "error" results do.
    return 0 if report["summary"]["errors"] == 0 else 1
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
if __name__ == "__main__":
    # Script entry point: the value returned by cli() becomes the exit status.
    sys.exit(cli(sys.argv[1:]))
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: flaky-finder
|
|
3
|
+
description: Identify flaky tests by running the suite repeatedly and bisecting failures across runs. Use when the user says "find flaky tests", "this test is flaky", "tests pass locally but fail in CI", or sees intermittent test failures.
|
|
4
|
+
metadata:
|
|
5
|
+
version: 0.1.0
|
|
6
|
+
authors:
|
|
7
|
+
- mastermind
|
|
8
|
+
tags:
|
|
9
|
+
- testing
|
|
10
|
+
model: sonnet
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Flaky Test Finder
|
|
14
|
+
|
|
15
|
+
Finds tests that pass and fail non-deterministically. Runs the suite N times, records which tests changed outcome between runs, and ranks them by flake rate.
|
|
16
|
+
|
|
17
|
+
## When to use
|
|
18
|
+
|
|
19
|
+
- User reports "tests pass locally but fail in CI"
|
|
20
|
+
- A test failed once and the user wants to confirm if it's flaky before retrying
|
|
21
|
+
- User explicitly asks for a flake audit before a release
|
|
22
|
+
- Do NOT use for finding *broken* tests — those fail consistently. Use a regular test run for that.
|
|
23
|
+
|
|
24
|
+
## Prerequisites
|
|
25
|
+
|
|
26
|
+
- A working `<test-command>` for the project (`pytest`, `go test ./...`, `npm test`, etc.)
|
|
27
|
+
- Time — flake hunting is inherently slow (10-50 runs of the full suite)
|
|
28
|
+
|
|
29
|
+
## Steps
|
|
30
|
+
|
|
31
|
+
1. **Confirm the test command.** Read the project's CI config or `package.json` / `Makefile`. If unclear, ask.
|
|
32
|
+
2. **Establish a baseline.** Run the suite once. If it fails, the failures aren't flakes — they're broken. Stop and report.
|
|
33
|
+
3. **Decide N.** Default to 20 runs. For long suites (>5min), drop to 10. For fast suites (<30s), go to 50.
|
|
34
|
+
4. **Run N times, recording each test's pass/fail per run.** Use the project's machine-readable output if available (`pytest --junitxml`, `go test -json`, `jest --json`).
|
|
35
|
+
5. **Compute flake rate per test.** A test that passed 18/20 times and failed 2/20 has a flake rate of 10%.
|
|
36
|
+
6. **Rank by flake rate descending.** Anything between 1% and 99% is suspicious; 0% and 100% are deterministic.
|
|
37
|
+
7. **For the top 3 flakiest, read the test code.** Look for: shared state, time-based assertions, network calls, ordering assumptions, race conditions.
|
|
38
|
+
8. **Report findings.**
|
|
39
|
+
|
|
40
|
+
## Outputs
|
|
41
|
+
|
|
42
|
+
```markdown
|
|
43
|
+
## Flake report — N=<N> runs of <test-command>
|
|
44
|
+
|
|
45
|
+
### Flaky tests (sorted by flake rate)
|
|
46
|
+
| Test | Flake rate | Likely cause |
|
|
47
|
+
|---|---|---|
|
|
48
|
+
| `tests/limiter_test.go::TestConcurrentBucket` | 35% (7/20 failed) | Race on shared counter: runs under `t.Parallel()` with no mutex/atomic guarding the counter |
|
|
49
|
+
| `tests/api_test.py::test_response_time` | 15% (3/20 failed) | Time-based assertion `< 100ms` — fails under load |
|
|
50
|
+
|
|
51
|
+
### Deterministic failures
|
|
52
|
+
- `tests/foo_test.py::test_bar` — failed all 20 runs. Not a flake; this test is broken.
|
|
53
|
+
|
|
54
|
+
### Deterministic passes
|
|
55
|
+
- <count> tests passed all <N> runs.
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Examples
|
|
59
|
+
|
|
60
|
+
**Input:** "Our CI is flaky, can you find the culprit?"
|
|
61
|
+
|
|
62
|
+
**Output (abbreviated):**
|
|
63
|
+
```markdown
|
|
64
|
+
## Flake report — N=20 runs of `pytest tests/`
|
|
65
|
+
|
|
66
|
+
### Flaky tests
|
|
67
|
+
| Test | Flake rate | Likely cause |
|
|
68
|
+
|---|---|---|
|
|
69
|
+
| `test_websocket_reconnect` | 25% | Race between `await ws.connect()` and the heartbeat loop |
|
|
70
|
+
| `test_cache_eviction` | 5% | Wall-clock assertion `time.time() - start < 1.0` |
|
|
71
|
+
|
|
72
|
+
### Deterministic
|
|
73
|
+
- 312 passed all 20 runs
|
|
74
|
+
- 0 failed all 20 runs
|
|
75
|
+
```
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mastermind-incident-response
|
|
3
|
+
description: Parallel workflow for production incidents — triage, stop the bleeding, investigate root cause via mmcg + git + .mastermind/tasks/ history, write a blameless postmortem, feed lessons back into CONTEXT.md and (if applicable) into the main workflow's spec template or critic dimensions. Use when the user says "incident", "outage", "rollback", "что-то сломалось в проде", or pastes paging alerts / error logs.
|
|
4
|
+
metadata:
|
|
5
|
+
version: 0.1.0
|
|
6
|
+
authors:
|
|
7
|
+
- mastermind
|
|
8
|
+
tags:
|
|
9
|
+
- workflow
|
|
10
|
+
- incident-response
|
|
11
|
+
- postmortem
|
|
12
|
+
- operations
|
|
13
|
+
model: opus
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
# Mastermind — Incident Response
|
|
17
|
+
|
|
18
|
+
A **parallel workflow** for handling production breakage. Different from the main 13-step planning workflow ([`mastermind-task-planning`](../mastermind-task-planning/SKILL.md)) which builds new things — this one **stops bleeding**, finds root cause, and turns lessons into systemic improvements.
|
|
19
|
+
|
|
20
|
+
## When to Activate
|
|
21
|
+
|
|
22
|
+
User says or pastes:
|
|
23
|
+
- "incident", "outage", "production is down", "что-то сломалось в проде", "rollback"
|
|
24
|
+
- Paging alerts (Datadog, PagerDuty, Sentry)
|
|
25
|
+
- Error logs with stack traces
|
|
26
|
+
- "users are reporting…"
|
|
27
|
+
- "deploy broke something"
|
|
28
|
+
|
|
29
|
+
## What this is NOT
|
|
30
|
+
|
|
31
|
+
- **Not** the bug-triage flow for development-time bugs — those go through the regular planning workflow
|
|
32
|
+
- **Not** a feature-request channel
|
|
33
|
+
- **Not** a debugging session for the user's local environment
|
|
34
|
+
- **Not** a substitute for paging an actual on-call engineer for sev0/sev1 incidents — the workflow assists, doesn't replace the human
|
|
35
|
+
|
|
36
|
+
## Different Priorities Than Planning
|
|
37
|
+
|
|
38
|
+
| Planning workflow | Incident response |
|
|
39
|
+
|---|---|
|
|
40
|
+
| Optimize for quality | Optimize for **time** |
|
|
41
|
+
| 7-dim critic before doing anything | Bias toward **rollback first**, understand later |
|
|
42
|
+
| Mandatory specs, alternatives, tests | Hot-fix is OK if rollback impossible |
|
|
43
|
+
| "Did we design this right?" | "What's the fastest way to stop the bleeding?" |
|
|
44
|
+
| Blameless review post-fact | Blameless reasoning **during** |
|
|
45
|
+
|
|
46
|
+
You are in **operations mode**. Speed of bleed-stop > completeness of fix > root-cause depth > paperwork. Reverse that order during postmortem.
|
|
47
|
+
|
|
48
|
+
## Phases
|
|
49
|
+
|
|
50
|
+
### Phase 1 — Triage (target: first 5 minutes)
|
|
51
|
+
|
|
52
|
+
Ask the user (or extract from pasted alert):
|
|
53
|
+
|
|
54
|
+
1. **Symptom** — what users / monitoring see (one sentence, observable)
|
|
55
|
+
2. **Scope** — how many users / how much traffic / which surfaces
|
|
56
|
+
3. **Severity** — pick a number:
|
|
57
|
+
- **sev0** — total outage, paging fire
|
|
58
|
+
- **sev1** — major degradation, immediate action needed
|
|
59
|
+
- **sev2** — partial degradation, action within hours
|
|
60
|
+
- **sev3** — minor / cosmetic, action within days
|
|
61
|
+
4. **Timeline** — when did this start? (correlate with deploys / changes)
|
|
62
|
+
5. **What's been tried already**
|
|
63
|
+
|
|
64
|
+
While asking, parallel-research with `mastermind-researcher` subagent:
|
|
65
|
+
```
|
|
66
|
+
git log --since='2 hours ago' --oneline → what changed recently
|
|
67
|
+
git log -10 --oneline → most recent commits
|
|
68
|
+
ls -lt .mastermind/tasks/ | head -10 → most recent specs
|
|
69
|
+
mmcg_status → index health
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
See [`references/triage-checklist.md`](references/triage-checklist.md) for the full first-response checklist.
|
|
73
|
+
|
|
74
|
+
### Phase 2 — Stop the bleeding (target: next 10 minutes after triage)
|
|
75
|
+
|
|
76
|
+
**Order of preference:**
|
|
77
|
+
1. **Rollback** to last known good — if you can identify it, do it
|
|
78
|
+
2. **Disable the feature** — if feature-flagged, flip the flag off
|
|
79
|
+
3. **Hot patch** — only if 1 and 2 not possible; this carries risk
|
|
80
|
+
4. **Escalate** — if stuck > 10 min on Phase 2, page additional help / wake on-call
|
|
81
|
+
|
|
82
|
+
For each option, tell the user what you're about to propose. **Do not execute destructive ops** (`git push --force`, deploys) without explicit user confirmation per turn — they're operating the controls, you're advising.
|
|
83
|
+
|
|
84
|
+
Mitigation tactics by failure type:
|
|
85
|
+
- **Recent deploy broke things** → revert the deploy
|
|
86
|
+
- **Recent config change broke things** → revert config
|
|
87
|
+
- **Data corruption** → freeze writes, restore from backup, investigate cause separately
|
|
88
|
+
- **External dependency degraded** → enable degraded-mode fallback if present; otherwise wait + monitor
|
|
89
|
+
- **Resource exhaustion (memory, disk, connections)** → kill / restart / scale; investigate cause separately
|
|
90
|
+
|
|
91
|
+
### Phase 3 — Investigate (after symptoms stop)
|
|
92
|
+
|
|
93
|
+
With pressure off, find **root cause** — not just the symptom. Five-whys discipline.
|
|
94
|
+
|
|
95
|
+
**Investigation playbook** — see [`references/investigation-playbook.md`](references/investigation-playbook.md) for the full set of mmcg + git + log patterns. Quick summary:
|
|
96
|
+
|
|
97
|
+
- **What changed recently?** `git log --since='<time of incident start - 1h>' -- <suspected paths>`
|
|
98
|
+
- **What's the blast radius of the change?** `mmcg query impact <symbol> --depth 3`
|
|
99
|
+
- **Were the relevant specs in `.mastermind/tasks/` going to catch this?** Read their Tests Plan + Observability Plan sections
|
|
100
|
+
- **Did observability fire?** If yes, why didn't it page sooner? If no, why wasn't it instrumented?
|
|
101
|
+
- **Is this a recurrence?** Grep `CONTEXT.md` for the symptom — known gotcha?
|
|
102
|
+
|
|
103
|
+
If a fix is needed, **don't write it inline in this incident flow** — open a `.mastermind/tasks/<NNN>-<short-name>.md` spec via the main workflow. The fix goes through the normal critic/auditor gates. Incident response identifies the need; planner designs the response.
|
|
104
|
+
|
|
105
|
+
### Phase 4 — Postmortem (within 24h of resolution)
|
|
106
|
+
|
|
107
|
+
Use [`references/postmortem-template.md`](references/postmortem-template.md). Sections:
|
|
108
|
+
|
|
109
|
+
- **Summary** (1-2 sentences — what happened, impact, resolution)
|
|
110
|
+
- **Timeline** (UTC, minute-resolution where relevant)
|
|
111
|
+
- **What went wrong** (root cause, contributing factors)
|
|
112
|
+
- **What went well** (yes, name what worked — psychological safety + reinforces good patterns)
|
|
113
|
+
- **Why detection took N minutes** (separate from why-it-happened — detection is its own failure mode)
|
|
114
|
+
- **Why mitigation took N minutes** (rollback fast? unclear who could act? missing runbook?)
|
|
115
|
+
- **Action items** (specific, owned, dated — each becomes a `.mastermind/tasks/` spec or a CONTEXT.md update)
|
|
116
|
+
|
|
117
|
+
**Blameless framing** — write about systems, not people:
|
|
118
|
+
- ❌ "Engineer X deployed without testing"
|
|
119
|
+
- ✓ "The deploy pipeline allowed merging with failing tests because the test job was marked non-blocking three weeks ago"
|
|
120
|
+
|
|
121
|
+
If a person made a judgment call that turned out wrong, frame it as: "given the information available at the time, the action was reasonable; the lesson is that information X needs to be more accessible / surfaced earlier."
|
|
122
|
+
|
|
123
|
+
### Phase 5 — Feed forward
|
|
124
|
+
|
|
125
|
+
Three destinations:
|
|
126
|
+
|
|
127
|
+
**A. Project `CONTEXT.md`** (immediate):
|
|
128
|
+
- **Known gotchas** entry for the failure pattern — concrete + scenario + reference to postmortem path
|
|
129
|
+
- **Don't-touch list** entry if a code area has subtle constraints now known
|
|
130
|
+
- **Decision log** entry if the postmortem changed an architectural decision
|
|
131
|
+
|
|
132
|
+
**B. Action items as new `.mastermind/tasks/` specs** (within days):
|
|
133
|
+
- Each action item becomes a spec
|
|
134
|
+
- Specs go through normal workflow (planner → critic → executor → auditor)
|
|
135
|
+
- Link back to postmortem in spec's Notes section
|
|
136
|
+
|
|
137
|
+
**C. Workflow improvements** (if applicable):
|
|
138
|
+
- Did the spec for the offending change include an Observability Plan? If no, that's evidence the planner skill should make it more mandatory.
|
|
139
|
+
- Did the critic's 7 dimensions miss this category of issue? Propose an 8th dimension or sharpening an existing one.
|
|
140
|
+
- Did the auditor pass when it shouldn't have? Add a new check.
|
|
141
|
+
|
|
142
|
+
Workflow improvements go into the mastermind repo itself as a meta-improvement spec. **The workflow learns from its own failures.**
|
|
143
|
+
|
|
144
|
+
## Roles & subagents
|
|
145
|
+
|
|
146
|
+
Most of incident response is run by the planner (in this mode), with these spawns:
|
|
147
|
+
|
|
148
|
+
- **`mastermind-researcher`** — for git/mmcg fact-gathering during Phase 1 and Phase 3
|
|
149
|
+
- **`mastermind-critic`** — for the postmortem's "what went wrong" section if there's a design question (e.g., "was this design fundamentally flawed?"). Optional.
|
|
150
|
+
- **`mastermind-auditor`** — NOT used in incident response (it's a post-flight checker, doesn't apply here)
|
|
151
|
+
- **`mastermind-task-planning`** (in main mode) — for any follow-up specs that come out of the postmortem
|
|
152
|
+
|
|
153
|
+
## References
|
|
154
|
+
|
|
155
|
+
- [`references/triage-checklist.md`](references/triage-checklist.md) — first 5 minutes
|
|
156
|
+
- [`references/investigation-playbook.md`](references/investigation-playbook.md) — mmcg + git + .mastermind/tasks/ patterns for finding root cause
|
|
157
|
+
- [`references/postmortem-template.md`](references/postmortem-template.md) — blameless postmortem fill-in
|