@stupify/cli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ // Proof of the cache invariant: the review prompt's PREFIX (instructions + spec + rubric + corpus index) is
2
+ // byte-identical for every PR in a repo, and ONLY the tail (diff target, marker, memory) changes. That stable
3
+ // prefix is what the provider caches across diff threads — if a per-PR token ever leaked into it, the cache
4
+ // would thrash and this test would go red. We render against the repo's own real .review/ (no mocks).
5
+ import { expect, test } from 'bun:test'
6
+ import { join } from 'node:path'
7
+ import { type Config, type Pr, reviewPrompt, stablePrefix } from './review-sweep'
8
+
9
+ const REVIEW_DIR = join(import.meta.dir, '..', '.review') // the real spec/rubric/corpus shipped in this repo
10
+ const THIS_PR = '===== THIS PR' // the boundary between the cached prefix and the per-PR tail
11
+
12
+ const cfg = (): Config => ({
13
+ repoDir: '/tmp/x',
14
+ remote: 'https://github.com/acme/widgets.git',
15
+ slug: 'acme/widgets',
16
+ defaultBranch: 'main',
17
+ reviewDir: REVIEW_DIR,
18
+ homeReviewDir: REVIEW_DIR,
19
+ scope: 'label',
20
+ reviewLabel: 'codex-review',
21
+ diffLineCap: 800,
22
+ dryRun: false,
23
+ maxPrs: 15,
24
+ stateDir: '/tmp/x/state',
25
+ codexEffort: 'high',
26
+ codexProvider: '',
27
+ codexModel: '',
28
+ })
29
+
30
+ const pr = (number: number, sha: string): Pr => ({
31
+ number,
32
+ headRefOid: sha,
33
+ isDraft: false,
34
+ author: { login: 'someone', is_bot: false },
35
+ labels: [{ name: 'codex-review' }],
36
+ })
37
+
38
+ const sha256 = (s: string) => new Bun.CryptoHasher('sha256').update(s).digest('hex')
39
+ const prefixOf = (prompt: string) => { const cut = prompt.indexOf(THIS_PR); if (cut < 0) throw new Error(`prompt has no '${THIS_PR}' boundary — cannot split the cached prefix from the per-PR tail`); return prompt.slice(0, cut) } // everything before the boundary; fail loudly, since slice(0, -1) would silently return almost the whole prompt
40
+
41
+ // Three different PRs: different numbers, different head SHAs, and (crucially) one mid-thread with memory —
42
+ // the hardest case, since "continuing a review" must STILL not perturb the prefix.
43
+ const prompts = [
44
+ reviewPrompt(cfg(), pr(1, 'a'.repeat(40)), ''),
45
+ reviewPrompt(cfg(), pr(42, 'b'.repeat(40)), ''),
46
+ reviewPrompt(cfg(), pr(987, 'c'.repeat(40)), 'PRIOR-THREAD: a past review and the author reply'),
47
+ ]
48
+ const prefixes = prompts.map(prefixOf)
49
+
50
+ test('the cached prefix is byte-identical across every PR (incl. mid-thread)', () => {
51
+ const hashes = new Set(prefixes.map(sha256))
52
+ expect(hashes.size).toBe(1) // one and only one prefix hash, no matter the PR
53
+ expect(prefixes[0]).toBe(prefixes[1])
54
+ expect(prefixes[0]).toBe(prefixes[2])
55
+ })
56
+
57
+ test('the prefix equals stablePrefix(cfg) and carries the real taste, not generic weights', () => {
58
+ expect(prefixes[0]?.trimEnd()).toBe(stablePrefix(cfg()).trimEnd())
59
+ expect(prefixes[0]).toContain('===== RUBRIC')
60
+ expect(prefixes[0]).toContain('===== CORPUS')
61
+ })
62
+
63
+ test('NO per-PR token leaks into the cached prefix', () => {
64
+ for (const prefix of prefixes) {
65
+ expect(prefix).not.toContain('gh pr diff') // the diff command lives in the tail
66
+ // No head SHA / marker from ANY fixture PR: 'a', 'b' and the mid-thread 'c' (each repeated 40×, as in `prompts`).
67
+ for (const ch of ['a', 'b', 'c']) expect(prefix).not.toContain(ch.repeat(40))
68
+ expect(prefix).not.toContain('PRIOR-THREAD') // memory lives in the tail
69
+ }
70
+ })
71
+
72
+ test('only the tail changes — per-PR content is present and correct there', () => {
73
+ expect(prompts[0]).not.toBe(prompts[1]) // whole prompts differ...
74
+ expect(prompts[0]).toContain('gh pr diff 1 --repo acme/widgets')
75
+ expect(prompts[1]).toContain('gh pr diff 42 --repo acme/widgets')
76
+ expect(prompts[2]).toContain('gh pr diff 987 --repo acme/widgets')
77
+ expect(prompts[2]).toContain('PRIOR-THREAD') // memory threaded into the tail
78
+ })
79
+
80
+ test('the prefix is large enough to be cache-eligible (well past the ~1024-token floor)', () => {
81
+ const bytes = prefixes[0]?.length ?? 0
82
+ const approxTokens = Math.round(bytes / 4) // ~4 chars/token, the standard rough estimate
83
+ expect(approxTokens).toBeGreaterThan(1024)
84
+
85
+ // Receipt: print the proof so a human sees it, plus the per-100-PR cost model the prefix-cache buys.
86
+ const reads = 100
87
+ const naive = reads // full-price prefix on every run
88
+ const cached = 1 + (reads - 1) * 0.1 // full once, then ~10% cache-read on the rest
89
+ console.log(
90
+ [
91
+ '',
92
+ ' ── cache invariant proof ─────────────────────────────',
93
+ ` prefix sha256 (all PRs): ${sha256(prefixes[0] ?? '')}`,
94
+ ` prefix size: ${bytes} bytes (~${approxTokens} tokens)`,
95
+ ` prefix identical across: ${prefixes.length} distinct PRs (incl. one mid-thread)`,
96
+ ` prefix cost over ${reads} PRs: naive ${naive.toFixed(1)}× vs cached ${cached.toFixed(1)}× → ${Math.round((1 - cached / naive) * 100)}% off the prefix`,
97
+ ' ──────────────────────────────────────────────────────',
98
+ '',
99
+ ].join('\n'),
100
+ )
101
+ })
@@ -4,8 +4,9 @@
4
4
  * The engine the `stupify` CLI deploys to ~/.stupify and runs on a cron (or `stupify run`); config.env sits
5
5
  * next to it.
6
6
  *
7
- * OPT-IN by default (SCOPE=label): only PRs tagged REVIEW_LABEL are reviewed, so spend tracks exactly what
8
- * you tag. SCOPE=auto reviews all non-draft, non-bot PRs under DIFF_LINE_CAP.
7
+ * Reviews every PR by default (SCOPE=auto): every non-draft, non-bot PR under DIFF_LINE_CAP, no label needed.
8
+ * REVIEW_LABEL is just a force-include override for an oversized diff. Want manual control instead? SCOPE=label
9
+ * flips it to opt-in: only PRs you tag REVIEW_LABEL are reviewed, so spend tracks exactly what you tag.
9
10
  * The "taste" — REVIEW-PROMPT.md, RUBRIC.md, CORPUS.md — lives in the TARGET repo under REVIEW_DIR (default
10
11
  * `.review/`), so it's version-controlled with the code it judges and edited via a normal PR.
11
12
  * Idempotent: skips a PR already reviewed — or already reported as failed — at its current head SHA, via a
@@ -23,12 +24,13 @@ import { fileURLToPath } from 'node:url'
23
24
 
24
25
  const KIT_DIR = dirname(fileURLToPath(import.meta.url))
25
26
 
26
- interface Config {
27
+ export interface Config {
27
28
  repoDir: string // dedicated checkout we hard-reset — never a working checkout you care about
28
29
  remote: string
29
30
  slug: string
30
31
  defaultBranch: string
31
- reviewDir: string // dir IN the target repo holding REVIEW-PROMPT.md / RUBRIC.md / CORPUS.md
32
+ reviewDir: string // resolved review dir holding REVIEW-PROMPT.md / RUBRIC.md / CORPUS.md — the repo's .review/ if it has one, else homeReviewDir (set in main)
33
+ homeReviewDir: string // fallback taste the CLI assembled under STUPIFY_HOME/.review (packs or bring-your-own)
32
34
  scope: 'label' | 'auto'
33
35
  reviewLabel: string
34
36
  diffLineCap: number
@@ -75,16 +77,17 @@ function loadConfig(): Config {
75
77
  log('config: REPO_SLUG is required (owner/repo) — aborting. Run `stupify` to set up.')
76
78
  process.exit(1)
77
79
  }
78
- const scopeRaw = pick('SCOPE', 'label').trim().toLowerCase()
79
- if (scopeRaw !== 'label' && scopeRaw !== 'auto') log(`config: SCOPE='${scopeRaw}' is not 'label' or 'auto' — using label`)
80
+ const scopeRaw = pick('SCOPE', 'auto').trim().toLowerCase()
81
+ if (scopeRaw !== 'label' && scopeRaw !== 'auto') log(`config: SCOPE='${scopeRaw}' is not 'label' or 'auto' — using auto`)
80
82
 
81
83
  return {
82
84
  repoDir: join(stupifyHome, 'repo'), // HARD-PINNED under STUPIFY_HOME: refreshRepo runs `git reset --hard` here
83
85
  remote: pick('REMOTE', `https://github.com/${slug}.git`),
84
86
  slug,
85
87
  defaultBranch: pick('DEFAULT_BRANCH', 'main'),
86
- reviewDir: pick('REVIEW_DIR', '.review'),
87
- scope: scopeRaw === 'auto' ? 'auto' : 'label',
88
+ reviewDir: pick('REVIEW_DIR', '.review'), // relative name here; main() resolves it to an absolute path (repo's or home's)
89
+ homeReviewDir: join(stupifyHome, '.review'),
90
+ scope: scopeRaw === 'label' ? 'label' : 'auto', // auto is the default; only the explicit string 'label' opts into per-PR tagging
88
91
  reviewLabel: pick('REVIEW_LABEL', 'codex-review'),
89
92
  diffLineCap: int('DIFF_LINE_CAP', 800, 1),
90
93
  dryRun: bool('DRY_RUN', false, true), // unset = live (cron's normal mode); garbage = preview (never post on a typo)
@@ -166,11 +169,11 @@ function logFail(message: string): false {
166
169
  return false
167
170
  }
168
171
 
169
- interface Pr {
172
+ export interface Pr {
170
173
  number: number
171
174
  headRefOid: string
172
175
  isDraft: boolean
173
- author: { login: string } | null
176
+ author: { login: string; is_bot: boolean } | null // is_bot flags GitHub App bots (app/dependabot) the [bot] suffix misses
174
177
  labels: { name: string }[]
175
178
  }
176
179
 
@@ -214,8 +217,10 @@ function isLabel(raw: unknown): raw is { name: string } {
214
217
  return typeof raw === 'object' && raw !== null && 'name' in raw && typeof raw.name === 'string'
215
218
  }
216
219
 
217
- function isAuthor(raw: unknown): raw is { login: string } | null {
218
- return raw === null || (typeof raw === 'object' && 'login' in raw && typeof raw.login === 'string')
220
+ function isAuthor(raw: unknown): raw is { login: string; is_bot: boolean } | null {
221
+ if (raw === null) return true
222
+ if (typeof raw !== 'object') return false
223
+ return 'login' in raw && typeof raw.login === 'string' && 'is_bot' in raw && typeof raw.is_bot === 'boolean'
219
224
  }
220
225
 
221
226
  function hasReviewLabel(pr: Pr, cfg: Config): boolean {
@@ -224,7 +229,9 @@ function hasReviewLabel(pr: Pr, cfg: Config): boolean {
224
229
 
225
230
  function inScope(pr: Pr, cfg: Config): boolean {
226
231
  if (pr.isDraft) return false
227
- if ((pr.author?.login ?? '').endsWith('[bot]')) return false // never review bot PRs, in EITHER scope
232
+ // Never review bot PRs, in EITHER scope. gh's is_bot catches GitHub App bots (login `app/dependabot`) that
233
+ // the `[bot]` suffix misses; keep the suffix check as a belt-and-suspenders fallback.
234
+ if (pr.author?.is_bot === true || (pr.author?.login ?? '').endsWith('[bot]')) return false
228
235
  if (cfg.scope === 'label') return hasReviewLabel(pr, cfg)
229
236
  return true // auto: any non-draft, non-bot PR
230
237
  }
@@ -290,24 +297,47 @@ function markersFor(pr: Pr): { mark: string; failMark: string } {
290
297
  }
291
298
  }
292
299
 
293
- function reviewPrompt(cfg: Config, pr: Pr, priorThread: string): string {
300
+ /** The taste prefix: instructions + the spec, rubric, and corpus INDEX, inlined verbatim. This is byte-identical
301
+ * for every PR in a repo, so it forms a stable prompt PREFIX the provider caches across diff threads — you pay
302
+ * full price for it once, then cache-read rates on every later PR. (If codex `Read` these files mid-loop instead,
303
+ * they'd arrive as tool results after model-chosen steps that vary per run, and wouldn't cache.) We inline the
304
+ * corpus INDEX only — its exemplars stay commit-pinned links the model opens on demand, so a review never pays to
305
+ * read the whole corpus. Keep ALL per-PR tokens (diff target, marker, memory) OUT of here — they go in the tail. */
306
+ export function stablePrefix(cfg: Config): string {
307
+ const read = (f: string) => readFileSync(join(cfg.reviewDir, f), 'utf8').trim()
308
+ return `You are a code reviewer running in an automated sweep (you have gh + git; no token needed). DO NOT modify any code.
309
+ Everything down to the "THIS PR" line is your fixed spec and taste — identical for every PR, so treat it as standing reference.
310
+
311
+ ===== REVIEW SPEC (format + rules) =====
312
+ ${read('REVIEW-PROMPT.md')}
313
+
314
+ ===== RUBRIC (what counts as slop) =====
315
+ ${read('RUBRIC.md')}
316
+
317
+ ===== CORPUS (good-code reference; the links are commit-pinned — open one ONLY when a finding needs to cite it) =====
318
+ ${read('CORPUS.md')}`
319
+ }
320
+
321
+ export function reviewPrompt(cfg: Config, pr: Pr, priorThread: string): string {
294
322
  const { mark } = markersFor(pr)
295
323
  const outPath = `/tmp/review-${pr.number}.md`
296
- const dir = cfg.reviewDir
297
324
  const memory = priorThread
298
325
  ? `\n\n## Prior reviews on this PR (your memory)
299
326
  This is the existing review conversation — your past reviews and the author's replies. You are CONTINUING it,
300
- not starting fresh. Apply ${dir}/REVIEW-PROMPT.md's "Prior reviews on this PR" rules: don't re-raise resolved or
327
+ not starting fresh. Apply the spec's "Prior reviews on this PR" rules: don't re-raise resolved or
301
328
  reasoned-declined items, report only what's genuinely new, and converge (post the one-line "no new issues"
302
329
  and stop) if nothing new remains.
303
330
 
304
331
  ${priorThread}`
305
332
  : ''
306
- return `You are a code reviewer running in an automated sweep (you have gh + git; no token needed). DO NOT modify any code.
307
- Read ${dir}/REVIEW-PROMPT.md and ${dir}/RUBRIC.md (the spec + rubric) and ${dir}/CORPUS.md (the curated good-code reference; open the live files it points at as needed). Then:
333
+ // Stable prefix first (cached across PRs); then the ONLY per-PR tokens: diff target, output marker, memory.
334
+ return `${stablePrefix(cfg)}
335
+
336
+ ===== THIS PR (the only part that changes per run) =====
337
+ Review ONE pull request, per the spec and rubric above:
308
338
  1. Get the diff: gh pr diff ${pr.number} --repo ${cfg.slug}
309
- 2. Review it per the spec — catch bugs / type-lies / dead-code / footguns AND reinvents-primitive / slop, each citing the corpus primitive it should reuse; sort worst-first.
310
- 3. Write the review to ${outPath}, formatted EXACTLY per ${dir}/REVIEW-PROMPT.md's 'Comment format' section (it owns the format — opener, finding blocks, attribution). END the file with exactly this line: ${mark}
339
+ 2. Review it — catch bugs / type-lies / dead-code / footguns AND reinvents-primitive / slop, each citing the corpus primitive it should reuse; sort worst-first.
340
+ 3. Write the review to ${outPath}, formatted EXACTLY per the spec's 'Comment format' section (it owns the format — opener, finding blocks, attribution). END the file with exactly this line: ${mark}
311
341
  4. Post it: gh pr comment ${pr.number} --repo ${cfg.slug} --body-file ${outPath}
312
342
  Keep it terse; no preamble.${memory}`
313
343
  }
@@ -423,12 +453,16 @@ function main(): void {
423
453
  })
424
454
 
425
455
  if (!refreshRepo(cfg)) process.exit(1)
456
+ // Resolve the taste: the target repo's own .review/ wins (a repo can override); otherwise fall back to the
457
+ // home taste the CLI assembled from packs (~/.stupify/.review). Either way cfg.reviewDir becomes ABSOLUTE.
458
+ const repoReview = join(cfg.repoDir, cfg.reviewDir)
459
+ cfg.reviewDir = existsSync(join(repoReview, 'CORPUS.md')) ? repoReview : cfg.homeReviewDir
426
460
  const haveMachinery =
427
- existsSync(join(cfg.repoDir, cfg.reviewDir, 'CORPUS.md')) &&
428
- existsSync(join(cfg.repoDir, cfg.reviewDir, 'REVIEW-PROMPT.md')) &&
429
- existsSync(join(cfg.repoDir, cfg.reviewDir, 'RUBRIC.md'))
461
+ existsSync(join(cfg.reviewDir, 'CORPUS.md')) &&
462
+ existsSync(join(cfg.reviewDir, 'REVIEW-PROMPT.md')) &&
463
+ existsSync(join(cfg.reviewDir, 'RUBRIC.md'))
430
464
  if (!haveMachinery) {
431
- log(`no review machinery in ${cfg.slug}:${cfg.reviewDir}/ (need REVIEW-PROMPT.md + RUBRIC.md + CORPUS.md) — no-op. Copy the templates from the stupify repo.`)
465
+ log(`no review machinery at ${cfg.reviewDir}/ (need REVIEW-PROMPT.md + RUBRIC.md + CORPUS.md) — no-op. Run \`stupify setup\` to assemble taste, or add a .review/ to ${cfg.slug}.`)
432
466
  return
433
467
  }
434
468
 
@@ -489,4 +523,4 @@ function main(): void {
489
523
  log(`sweep done — scope=${cfg.scope} reviewed=${reviewed} tokens~${tokens}`)
490
524
  }
491
525
 
492
- main()
526
+ if (import.meta.main) main() // run only when invoked directly (cron / `stupify run`); stays importable for tests