@stupify/cli 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.review/CORPUS.md +28 -57
- package/.review/CORPUS.template.md +73 -0
- package/README.md +79 -23
- package/package.json +5 -3
- package/packs/antirez.md +10 -0
- package/packs/anton-kropp.md +10 -0
- package/packs/dhh.md +10 -0
- package/packs/dtolnay.md +10 -0
- package/packs/jarred-sumner.md +9 -0
- package/packs/mitchell-hashimoto.md +10 -0
- package/packs/rich-harris.md +10 -0
- package/packs/simon-willison.md +10 -0
- package/packs/sindre-sorhus.md +10 -0
- package/packs/tanner-linsley.md +10 -0
- package/packs/zod.md +10 -0
- package/src/cli.ts +293 -25
- package/src/prime-install.test.ts +109 -0
- package/src/prime.ts +50 -0
- package/src/review-sweep.test.ts +101 -0
- package/src/review-sweep.ts +59 -25
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
// Proof of the cache invariant: the review prompt's PREFIX (instructions + spec + rubric + corpus index) is
|
|
2
|
+
// byte-identical for every PR in a repo, and ONLY the tail (diff target, marker, memory) changes. That stable
|
|
3
|
+
// prefix is what the provider caches across diff threads — if a per-PR token ever leaked into it, the cache
|
|
4
|
+
// would thrash and this test would go red. We render against the repo's own real .review/ (no mocks).
|
|
5
|
+
import { expect, test } from 'bun:test'
|
|
6
|
+
import { join } from 'node:path'
|
|
7
|
+
import { type Config, type Pr, reviewPrompt, stablePrefix } from './review-sweep'
|
|
8
|
+
|
|
9
|
+
const REVIEW_DIR = join(import.meta.dir, '..', '.review') // the real spec/rubric/corpus shipped in this repo
|
|
10
|
+
const THIS_PR = '===== THIS PR' // the boundary between the cached prefix and the per-PR tail
|
|
11
|
+
|
|
12
|
+
// Fixture factory: builds a new, fully-populated Config object on every call.
// Both review dirs point at REVIEW_DIR, so prompts render against the .review/ files shipped in this repo.
const cfg = (): Config => {
  const fixture: Config = {
    repoDir: '/tmp/x',
    remote: 'https://github.com/acme/widgets.git',
    slug: 'acme/widgets',
    defaultBranch: 'main',
    reviewDir: REVIEW_DIR,
    homeReviewDir: REVIEW_DIR,
    scope: 'label',
    reviewLabel: 'codex-review',
    diffLineCap: 800,
    dryRun: false,
    maxPrs: 15,
    stateDir: '/tmp/x/state',
    codexEffort: 'high',
    codexProvider: '',
    codexModel: '',
  }
  return fixture
}
|
|
29
|
+
|
|
30
|
+
// Fixture factory: a non-draft PR by a human author, already tagged with the review label.
// Only the PR number and head SHA vary between fixtures.
const pr = (number: number, sha: string): Pr => {
  const author = { login: 'someone', is_bot: false }
  const labels = [{ name: 'codex-review' }]
  return { number, headRefOid: sha, isDraft: false, author, labels }
}
|
|
37
|
+
|
|
38
|
+
// Hex-encoded SHA-256 digest of `s`, computed with Bun's built-in CryptoHasher.
const sha256 = (s: string) => {
  const hasher = new Bun.CryptoHasher('sha256')
  hasher.update(s)
  return hasher.digest('hex')
}
|
|
39
|
+
// The cached part of a rendered prompt: everything before the THIS_PR boundary marker.
// Throws when the marker is absent. Otherwise `indexOf` returns -1 and `slice(0, -1)` would silently hand
// back the whole prompt minus its last character (per-PR tail included), hiding the real problem.
const prefixOf = (prompt: string) => {
  const boundary = prompt.indexOf(THIS_PR)
  if (boundary === -1) {
    throw new Error(`prompt has no "${THIS_PR}" boundary — cannot split the cached prefix from the per-PR tail`)
  }
  return prompt.slice(0, boundary)
}
|
|
40
|
+
|
|
41
|
+
// Three different PRs: different numbers, different head SHAs, and (crucially) one mid-thread with memory —
|
|
42
|
+
// the hardest case, since "continuing a review" must STILL not perturb the prefix.
|
|
43
|
+
const prompts = [
|
|
44
|
+
reviewPrompt(cfg(), pr(1, 'a'.repeat(40)), ''),
|
|
45
|
+
reviewPrompt(cfg(), pr(42, 'b'.repeat(40)), ''),
|
|
46
|
+
reviewPrompt(cfg(), pr(987, 'c'.repeat(40)), 'PRIOR-THREAD: a past review and the author reply'),
|
|
47
|
+
]
|
|
48
|
+
const prefixes = prompts.map(prefixOf)
|
|
49
|
+
|
|
50
|
+
// Core invariant: every prefix hashes to the same digest. The direct string comparisons repeat the check
// so a failure shows a readable diff rather than just a hash-count mismatch.
test('the cached prefix is byte-identical across every PR (incl. mid-thread)', () => {
  const [first, second, third] = prefixes
  const distinctHashes = new Set<string>()
  for (const prefix of prefixes) distinctHashes.add(sha256(prefix))
  expect(distinctHashes.size).toBe(1) // exactly one prefix hash across all PRs
  expect(first).toBe(second)
  expect(first).toBe(third)
})
|
|
56
|
+
|
|
57
|
+
// The sliced prefix must match what stablePrefix() renders on its own (ignoring trailing whitespace, since
// the slice keeps the blank lines before the boundary), and it must include the rubric and corpus sections.
test('the prefix equals stablePrefix(cfg) and carries the real taste, not generic weights', () => {
  const rendered = prefixes[0]
  expect(rendered?.trimEnd()).toBe(stablePrefix(cfg()).trimEnd())
  for (const heading of ['===== RUBRIC', '===== CORPUS']) {
    expect(rendered).toContain(heading)
  }
})
|
|
62
|
+
|
|
63
|
+
// Negative check: nothing that varies per PR may appear before the boundary.
// Covers the head SHA of EVERY fixture PR (a, b and c) — including the mid-thread PR, whose SHA would
// otherwise go unchecked — plus the diff command and the prior-thread memory.
test('NO per-PR token leaks into the cached prefix', () => {
  const headShas = ['a', 'b', 'c'].map((ch) => ch.repeat(40))
  for (const prefix of prefixes) {
    expect(prefix).not.toContain('gh pr diff') // the diff command lives in the tail
    for (const sha of headShas) {
      expect(prefix).not.toContain(sha) // no head SHA / marker
    }
    expect(prefix).not.toContain('PRIOR-THREAD') // memory lives in the tail
  }
})
|
|
71
|
+
|
|
72
|
+
// Positive check: the per-PR content does show up, in the right prompt. Each prompt carries its own
// `gh pr diff` command, and only the mid-thread prompt carries the prior-thread memory.
test('only the tail changes — per-PR content is present and correct there', () => {
  const [first, second, third] = prompts
  expect(first).not.toBe(second) // whole prompts differ...
  const prNumbers = [1, 42, 987]
  prompts.forEach((prompt, i) => {
    expect(prompt).toContain(`gh pr diff ${prNumbers[i]} --repo acme/widgets`)
  })
  expect(third).toContain('PRIOR-THREAD') // memory threaded into the tail
})
|
|
79
|
+
|
|
80
|
+
// Size check: the prefix must be comfortably past the ~1024-token floor named in the test title.
// The token estimate uses the character count (~4 chars/token); the printed size is the real UTF-8 byte
// count, because String.length counts UTF-16 code units and the prefix contains non-ASCII text.
test('the prefix is large enough to be cache-eligible (well past the ~1024-token floor)', () => {
  const prefix = prefixes[0] ?? ''
  const chars = prefix.length
  const bytes = new TextEncoder().encode(prefix).length
  const approxTokens = Math.round(chars / 4) // ~4 chars/token, the standard rough estimate
  expect(approxTokens).toBeGreaterThan(1024)

  // Receipt: print the proof so a human sees it, plus the per-100-PR cost model the prefix-cache buys.
  const reads = 100
  const naive = reads // full-price prefix on every run
  const cached = 1 + (reads - 1) * 0.1 // full once, then ~10% cache-read on the rest
  console.log(
    [
      '',
      ' ── cache invariant proof ─────────────────────────────',
      ` prefix sha256 (all PRs): ${sha256(prefix)}`,
      ` prefix size: ${bytes} bytes (~${approxTokens} tokens)`,
      ` prefix identical across: ${prefixes.length} distinct PRs (incl. one mid-thread)`,
      ` prefix cost over ${reads} PRs: naive ${naive.toFixed(1)}× vs cached ${cached.toFixed(1)}× → ${Math.round((1 - cached / naive) * 100)}% off the prefix`,
      ' ──────────────────────────────────────────────────────',
      '',
    ].join('\n'),
  )
})
|
package/src/review-sweep.ts
CHANGED
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
* The engine the `stupify` CLI deploys to ~/.stupify and runs on a cron (or `stupify run`); config.env sits
|
|
5
5
|
* next to it.
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
7
|
+
* Reviews every PR by default (SCOPE=auto): every non-draft, non-bot PR under DIFF_LINE_CAP, no label needed.
|
|
8
|
+
* REVIEW_LABEL is just a force-include override for an oversized diff. Want manual control instead? SCOPE=label
|
|
9
|
+
* flips it to opt-in: only PRs you tag REVIEW_LABEL are reviewed, so spend tracks exactly what you tag.
|
|
9
10
|
* The "taste" — REVIEW-PROMPT.md, RUBRIC.md, CORPUS.md — lives in the TARGET repo under REVIEW_DIR (default
|
|
10
11
|
* `.review/`), so it's version-controlled with the code it judges and edited via a normal PR.
|
|
11
12
|
* Idempotent: skips a PR already reviewed — or already reported as failed — at its current head SHA, via a
|
|
@@ -23,12 +24,13 @@ import { fileURLToPath } from 'node:url'
|
|
|
23
24
|
|
|
24
25
|
const KIT_DIR = dirname(fileURLToPath(import.meta.url))
|
|
25
26
|
|
|
26
|
-
interface Config {
|
|
27
|
+
export interface Config {
|
|
27
28
|
repoDir: string // dedicated checkout we hard-reset — never a working checkout you care about
|
|
28
29
|
remote: string
|
|
29
30
|
slug: string
|
|
30
31
|
defaultBranch: string
|
|
31
|
-
reviewDir: string //
|
|
32
|
+
reviewDir: string // resolved review dir holding REVIEW-PROMPT.md / RUBRIC.md / CORPUS.md — the repo's .review/ if it has one, else homeReviewDir (set in main)
|
|
33
|
+
homeReviewDir: string // fallback taste the CLI assembled under STUPIFY_HOME/.review (packs or bring-your-own)
|
|
32
34
|
scope: 'label' | 'auto'
|
|
33
35
|
reviewLabel: string
|
|
34
36
|
diffLineCap: number
|
|
@@ -75,16 +77,17 @@ function loadConfig(): Config {
|
|
|
75
77
|
log('config: REPO_SLUG is required (owner/repo) — aborting. Run `stupify` to set up.')
|
|
76
78
|
process.exit(1)
|
|
77
79
|
}
|
|
78
|
-
const scopeRaw = pick('SCOPE', '
|
|
79
|
-
if (scopeRaw !== 'label' && scopeRaw !== 'auto') log(`config: SCOPE='${scopeRaw}' is not 'label' or 'auto' — using
|
|
80
|
+
const scopeRaw = pick('SCOPE', 'auto').trim().toLowerCase()
|
|
81
|
+
if (scopeRaw !== 'label' && scopeRaw !== 'auto') log(`config: SCOPE='${scopeRaw}' is not 'label' or 'auto' — using auto`)
|
|
80
82
|
|
|
81
83
|
return {
|
|
82
84
|
repoDir: join(stupifyHome, 'repo'), // HARD-PINNED under STUPIFY_HOME: refreshRepo runs `git reset --hard` here
|
|
83
85
|
remote: pick('REMOTE', `https://github.com/${slug}.git`),
|
|
84
86
|
slug,
|
|
85
87
|
defaultBranch: pick('DEFAULT_BRANCH', 'main'),
|
|
86
|
-
reviewDir: pick('REVIEW_DIR', '.review'),
|
|
87
|
-
|
|
88
|
+
reviewDir: pick('REVIEW_DIR', '.review'), // relative name here; main() resolves it to an absolute path (repo's or home's)
|
|
89
|
+
homeReviewDir: join(stupifyHome, '.review'),
|
|
90
|
+
scope: scopeRaw === 'label' ? 'label' : 'auto', // auto is the default; only the explicit string 'label' opts into per-PR tagging
|
|
88
91
|
reviewLabel: pick('REVIEW_LABEL', 'codex-review'),
|
|
89
92
|
diffLineCap: int('DIFF_LINE_CAP', 800, 1),
|
|
90
93
|
dryRun: bool('DRY_RUN', false, true), // unset = live (cron's normal mode); garbage = preview (never post on a typo)
|
|
@@ -166,11 +169,11 @@ function logFail(message: string): false {
|
|
|
166
169
|
return false
|
|
167
170
|
}
|
|
168
171
|
|
|
169
|
-
interface Pr {
|
|
172
|
+
export interface Pr { // one pull request as the sweep sees it; exported so tests can build fixtures
|
|
170
173
|
number: number // PR number, interpolated into the `gh pr diff` / `gh pr comment` commands
|
|
171
174
|
headRefOid: string // head commit SHA of the PR
|
|
172
175
|
isDraft: boolean // drafts are never in scope (inScope returns false for them)
|
|
173
|
-
author: { login: string } | null
|
|
176
|
+
author: { login: string; is_bot: boolean } | null // is_bot flags GitHub App bots (app/dependabot) the [bot] suffix misses
|
|
174
177
|
labels: { name: string }[] // labels on the PR; presumably what hasReviewLabel matches against — confirm there
|
|
175
178
|
}
|
|
176
179
|
|
|
@@ -214,8 +217,10 @@ function isLabel(raw: unknown): raw is { name: string } {
|
|
|
214
217
|
return typeof raw === 'object' && raw !== null && 'name' in raw && typeof raw.name === 'string'
|
|
215
218
|
}
|
|
216
219
|
|
|
217
|
-
function isAuthor(raw: unknown): raw is { login: string } | null {
|
|
218
|
-
|
|
220
|
+
/**
 * Type guard for a PR's `author` field.
 * Accepts `null`, or an object carrying a string `login` and a boolean `is_bot`; rejects everything else.
 */
function isAuthor(raw: unknown): raw is { login: string; is_bot: boolean } | null {
  if (typeof raw !== 'object') return false // primitives and undefined are never an author
  if (raw === null) return true // a null author is allowed
  if (!('login' in raw) || typeof raw.login !== 'string') return false
  return 'is_bot' in raw && typeof raw.is_bot === 'boolean'
}
|
|
220
225
|
|
|
221
226
|
function hasReviewLabel(pr: Pr, cfg: Config): boolean {
|
|
@@ -224,7 +229,9 @@ function hasReviewLabel(pr: Pr, cfg: Config): boolean {
|
|
|
224
229
|
|
|
225
230
|
/**
 * Decides whether a PR should be reviewed in this sweep.
 * Drafts and bot-authored PRs are excluded in either scope. In 'label' scope the PR must also pass
 * hasReviewLabel; in 'auto' scope every remaining PR is in.
 */
function inScope(pr: Pr, cfg: Config): boolean {
  // gh's is_bot catches GitHub App bots (login `app/dependabot`) that the `[bot]` suffix misses;
  // the suffix check stays as a fallback.
  const login = pr.author?.login ?? ''
  const byBot = pr.author?.is_bot === true || login.endsWith('[bot]')
  if (pr.isDraft || byBot) return false
  return cfg.scope === 'label' ? hasReviewLabel(pr, cfg) : true // auto: any non-draft, non-bot PR
}
|
|
@@ -290,24 +297,47 @@ function markersFor(pr: Pr): { mark: string; failMark: string } {
|
|
|
290
297
|
}
|
|
291
298
|
}
|
|
292
299
|
|
|
293
|
-
|
|
300
|
+
/**
 * Renders the fixed head of every review prompt: the standing instructions followed by the review spec,
 * the rubric and the corpus INDEX, each read from `cfg.reviewDir`, trimmed, and inlined verbatim.
 *
 * The result depends only on those three files, so it is the same for every PR in a repo. That is what lets
 * the provider cache it as a stable prompt prefix: full price once, cache-read rates afterwards. (Files the
 * model `Read` mid-loop would arrive as tool results after steps that vary per run, and would not cache.)
 * Only the corpus index is inlined; its exemplars stay commit-pinned links the model opens on demand.
 *
 * Keep ALL per-PR tokens (diff target, marker, memory) out of this function — they belong in the tail.
 */
export function stablePrefix(cfg: Config): string {
  const section = (file: string): string => readFileSync(join(cfg.reviewDir, file), 'utf8').trim()
  const lines = [
    'You are a code reviewer running in an automated sweep (you have gh + git; no token needed). DO NOT modify any code.',
    'Everything down to the "THIS PR" line is your fixed spec and taste — identical for every PR, so treat it as standing reference.',
    '',
    '===== REVIEW SPEC (format + rules) =====',
    section('REVIEW-PROMPT.md'),
    '',
    '===== RUBRIC (what counts as slop) =====',
    section('RUBRIC.md'),
    '',
    '===== CORPUS (good-code reference; the links are commit-pinned — open one ONLY when a finding needs to cite it) =====',
    section('CORPUS.md'),
  ]
  return lines.join('\n')
}
|
|
320
|
+
|
|
321
|
+
/**
 * Builds the full prompt for reviewing one PR: the stable prefix first, then the per-PR tail.
 * The tail holds the only per-PR tokens: the diff command, the output file and its end marker, the
 * comment command, and (when `priorThread` is non-empty) the earlier review conversation as memory.
 */
export function reviewPrompt(cfg: Config, pr: Pr, priorThread: string): string {
  const endMarker = markersFor(pr).mark
  const reviewFile = `/tmp/review-${pr.number}.md`

  // Memory is appended only when this PR already has a review thread.
  let memory = ''
  if (priorThread) {
    memory = `\n\n## Prior reviews on this PR (your memory)
This is the existing review conversation — your past reviews and the author's replies. You are CONTINUING it,
not starting fresh. Apply the spec's "Prior reviews on this PR" rules: don't re-raise resolved or
reasoned-declined items, report only what's genuinely new, and converge (post the one-line "no new issues"
and stop) if nothing new remains.

${priorThread}`
  }

  // Everything that varies per run lives here, after the boundary heading.
  const tail = `===== THIS PR (the only part that changes per run) =====
Review ONE pull request, per the spec and rubric above:
1. Get the diff: gh pr diff ${pr.number} --repo ${cfg.slug}
2. Review it — catch bugs / type-lies / dead-code / footguns AND reinvents-primitive / slop, each citing the corpus primitive it should reuse; sort worst-first.
3. Write the review to ${reviewFile}, formatted EXACTLY per the spec's 'Comment format' section (it owns the format — opener, finding blocks, attribution). END the file with exactly this line: ${endMarker}
4. Post it: gh pr comment ${pr.number} --repo ${cfg.slug} --body-file ${reviewFile}
Keep it terse; no preamble.${memory}`

  return `${stablePrefix(cfg)}\n\n${tail}`
}
|
|
@@ -423,12 +453,16 @@ function main(): void {
|
|
|
423
453
|
})
|
|
424
454
|
|
|
425
455
|
if (!refreshRepo(cfg)) process.exit(1)
|
|
456
|
+
// Resolve the taste: the target repo's own .review/ wins (a repo can override); otherwise fall back to the
|
|
457
|
+
// home taste the CLI assembled from packs (~/.stupify/.review). Either way cfg.reviewDir becomes ABSOLUTE.
|
|
458
|
+
const repoReview = join(cfg.repoDir, cfg.reviewDir)
|
|
459
|
+
cfg.reviewDir = existsSync(join(repoReview, 'CORPUS.md')) ? repoReview : cfg.homeReviewDir
|
|
426
460
|
const haveMachinery =
|
|
427
|
-
existsSync(join(cfg.
|
|
428
|
-
existsSync(join(cfg.
|
|
429
|
-
existsSync(join(cfg.
|
|
461
|
+
existsSync(join(cfg.reviewDir, 'CORPUS.md')) &&
|
|
462
|
+
existsSync(join(cfg.reviewDir, 'REVIEW-PROMPT.md')) &&
|
|
463
|
+
existsSync(join(cfg.reviewDir, 'RUBRIC.md'))
|
|
430
464
|
if (!haveMachinery) {
|
|
431
|
-
log(`no review machinery
|
|
465
|
+
log(`no review machinery at ${cfg.reviewDir}/ (need REVIEW-PROMPT.md + RUBRIC.md + CORPUS.md) — no-op. Run \`stupify setup\` to assemble taste, or add a .review/ to ${cfg.slug}.`)
|
|
432
466
|
return
|
|
433
467
|
}
|
|
434
468
|
|
|
@@ -489,4 +523,4 @@ function main(): void {
|
|
|
489
523
|
log(`sweep done — scope=${cfg.scope} reviewed=${reviewed} tokens~${tokens}`)
|
|
490
524
|
}
|
|
491
525
|
|
|
492
|
-
main()
|
|
526
|
+
if (import.meta.main) main() // run only when invoked directly (cron / `stupify run`); stays importable for tests
|