npm - @stupify/cli - Versions diffs - 0.1.0 → 0.2.0 - Mend

@stupify/cli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/.review/CORPUS.md +28 -57
package/.review/CORPUS.template.md +73 -0
package/README.md +79 -23
package/package.json +5 -3
package/packs/antirez.md +10 -0
package/packs/anton-kropp.md +10 -0
package/packs/dhh.md +10 -0
package/packs/dtolnay.md +10 -0
package/packs/jarred-sumner.md +9 -0
package/packs/mitchell-hashimoto.md +10 -0
package/packs/rich-harris.md +10 -0
package/packs/simon-willison.md +10 -0
package/packs/sindre-sorhus.md +10 -0
package/packs/tanner-linsley.md +10 -0
package/packs/zod.md +10 -0
package/src/cli.ts +293 -25
package/src/prime-install.test.ts +109 -0
package/src/prime.ts +50 -0
package/src/review-sweep.test.ts +101 -0
package/src/review-sweep.ts +59 -25

package/src/review-sweep.test.ts ADDED

@@ -0,0 +1,101 @@
+// Proof of the cache invariant: the review prompt's PREFIX (instructions + spec + rubric + corpus index) is
+// byte-identical for every PR in a repo, and ONLY the tail (diff target, marker, memory) changes. That stable
+// prefix is what the provider caches across diff threads — if a per-PR token ever leaked into it, the cache
+// would thrash and this test would go red. We render against the repo's own real .review/ (no mocks).
+import { expect, test } from 'bun:test'
+import { join } from 'node:path'
+import { type Config, type Pr, reviewPrompt, stablePrefix } from './review-sweep'
+const REVIEW_DIR = join(import.meta.dir, '..', '.review') // the real spec/rubric/corpus shipped in this repo
+const THIS_PR = '===== THIS PR' // the boundary between the cached prefix and the per-PR tail
+const cfg = (): Config => ({
+  repoDir: '/tmp/x',
+  remote: 'https://github.com/acme/widgets.git',
+  slug: 'acme/widgets',
+  defaultBranch: 'main',
+  reviewDir: REVIEW_DIR,
+  homeReviewDir: REVIEW_DIR,
+  scope: 'label',
+  reviewLabel: 'codex-review',
+  diffLineCap: 800,
+  dryRun: false,
+  maxPrs: 15,
+  stateDir: '/tmp/x/state',
+  codexEffort: 'high',
+  codexProvider: '',
+  codexModel: '',
+})
+const pr = (number: number, sha: string): Pr => ({
+  number,
+  headRefOid: sha,
+  isDraft: false,
+  author: { login: 'someone', is_bot: false },
+  labels: [{ name: 'codex-review' }],
+})
+const sha256 = (s: string) => new Bun.CryptoHasher('sha256').update(s).digest('hex')
+const prefixOf = (prompt: string) => prompt.slice(0, prompt.indexOf(THIS_PR))
+// Three different PRs: different numbers, different head SHAs, and (crucially) one mid-thread with memory —
+// the hardest case, since "continuing a review" must STILL not perturb the prefix.
+const prompts = [
+  reviewPrompt(cfg(), pr(1, 'a'.repeat(40)), ''),
+  reviewPrompt(cfg(), pr(42, 'b'.repeat(40)), ''),
+  reviewPrompt(cfg(), pr(987, 'c'.repeat(40)), 'PRIOR-THREAD: a past review and the author reply'),
+]
+const prefixes = prompts.map(prefixOf)
+test('the cached prefix is byte-identical across every PR (incl. mid-thread)', () => {
+  const hashes = new Set(prefixes.map(sha256))
+  expect(hashes.size).toBe(1) // one and only one prefix hash, no matter the PR
+  expect(prefixes[0]).toBe(prefixes[1])
+  expect(prefixes[0]).toBe(prefixes[2])
+})
+test('the prefix equals stablePrefix(cfg) and carries the real taste, not generic weights', () => {
+  expect(prefixes[0]?.trimEnd()).toBe(stablePrefix(cfg()).trimEnd())
+  expect(prefixes[0]).toContain('===== RUBRIC')
+  expect(prefixes[0]).toContain('===== CORPUS')
+})
+test('NO per-PR token leaks into the cached prefix', () => {
+  for (const prefix of prefixes) {
+    expect(prefix).not.toContain('gh pr diff') // the diff command lives in the tail
+    expect(prefix).not.toContain('a'.repeat(40)) // no head SHA / marker
+    expect(prefix).not.toContain('b'.repeat(40))
+    expect(prefix).not.toContain('PRIOR-THREAD') // memory lives in the tail
+  }
+})
+test('only the tail changes — per-PR content is present and correct there', () => {
+  expect(prompts[0]).not.toBe(prompts[1]) // whole prompts differ...
+  expect(prompts[0]).toContain('gh pr diff 1 --repo acme/widgets')
+  expect(prompts[1]).toContain('gh pr diff 42 --repo acme/widgets')
+  expect(prompts[2]).toContain('gh pr diff 987 --repo acme/widgets')
+  expect(prompts[2]).toContain('PRIOR-THREAD') // memory threaded into the tail
+})
+test('the prefix is large enough to be cache-eligible (well past the ~1024-token floor)', () => {
+  const bytes = prefixes[0]?.length ?? 0
+  const approxTokens = Math.round(bytes / 4) // ~4 chars/token, the standard rough estimate
+  expect(approxTokens).toBeGreaterThan(1024)
+  // Receipt: print the proof so a human sees it, plus the per-100-PR cost model the prefix-cache buys.
+  const reads = 100
+  const naive = reads // full-price prefix on every run
+  const cached = 1 + (reads - 1) * 0.1 // full once, then ~10% cache-read on the rest
+  console.log(
+    [
+      '',
+      '  ── cache invariant proof ─────────────────────────────',
+      `  prefix sha256 (all PRs):  ${sha256(prefixes[0] ?? '')}`,
+      `  prefix size:              ${bytes} bytes  (~${approxTokens} tokens)`,
+      `  prefix identical across:  ${prefixes.length} distinct PRs (incl. one mid-thread)`,
+      `  prefix cost over ${reads} PRs:   naive ${naive.toFixed(1)}× vs cached ${cached.toFixed(1)}× → ${Math.round((1 - cached / naive) * 100)}% off the prefix`,
+      '  ──────────────────────────────────────────────────────',
+      '',
+    ].join('\n'),
+  )
+})

package/src/review-sweep.ts CHANGED

@@ -4,8 +4,9 @@
  * The engine the `stupify` CLI deploys to ~/.stupify and runs on a cron (or `stupify run`); config.env sits
  * next to it.
  *
- * OPT-IN by default (SCOPE=label): only PRs tagged REVIEW_LABEL are reviewed, so spend tracks exactly what
- * you tag. SCOPE=auto reviews all non-draft, non-bot PRs under DIFF_LINE_CAP.
+ * Reviews every PR by default (SCOPE=auto): every non-draft, non-bot PR under DIFF_LINE_CAP, no label needed.
+ * REVIEW_LABEL is just a force-include override for an oversized diff. Want manual control instead? SCOPE=label
+ * flips it to opt-in: only PRs you tag REVIEW_LABEL are reviewed, so spend tracks exactly what you tag.
  * The "taste" — REVIEW-PROMPT.md, RUBRIC.md, CORPUS.md — lives in the TARGET repo under REVIEW_DIR (default
  * `.review/`), so it's version-controlled with the code it judges and edited via a normal PR.
  * Idempotent: skips a PR already reviewed — or already reported as failed — at its current head SHA, via a
@@ -23,12 +24,13 @@ import { fileURLToPath } from 'node:url'
 const KIT_DIR = dirname(fileURLToPath(import.meta.url))
-interface Config {
+export interface Config {
   repoDir: string // dedicated checkout we hard-reset — never a working checkout you care about
   remote: string
   slug: string
   defaultBranch: string
-  reviewDir: string // dir IN the target repo holding REVIEW-PROMPT.md / RUBRIC.md / CORPUS.md
+  reviewDir: string // resolved review dir holding REVIEW-PROMPT.md / RUBRIC.md / CORPUS.md — the repo's .review/ if it has one, else homeReviewDir (set in main)
+  homeReviewDir: string // fallback taste the CLI assembled under STUPIFY_HOME/.review (packs or bring-your-own)
   scope: 'label' | 'auto'
   reviewLabel: string
   diffLineCap: number
@@ -75,16 +77,17 @@ function loadConfig(): Config {
     log('config: REPO_SLUG is required (owner/repo) — aborting. Run `stupify` to set up.')
     process.exit(1)
   }
-  const scopeRaw = pick('SCOPE', 'label').trim().toLowerCase()
-  if (scopeRaw !== 'label' && scopeRaw !== 'auto') log(`config: SCOPE='${scopeRaw}' is not 'label' or 'auto' — using label`)
+  const scopeRaw = pick('SCOPE', 'auto').trim().toLowerCase()
+  if (scopeRaw !== 'label' && scopeRaw !== 'auto') log(`config: SCOPE='${scopeRaw}' is not 'label' or 'auto' — using auto`)
   return {
     repoDir: join(stupifyHome, 'repo'), // HARD-PINNED under STUPIFY_HOME: refreshRepo runs `git reset --hard` here
     remote: pick('REMOTE', `https://github.com/${slug}.git`),
     slug,
     defaultBranch: pick('DEFAULT_BRANCH', 'main'),
-    reviewDir: pick('REVIEW_DIR', '.review'),
-    scope: scopeRaw === 'auto' ? 'auto' : 'label',
+    reviewDir: pick('REVIEW_DIR', '.review'), // relative name here; main() resolves it to an absolute path (repo's or home's)
+    homeReviewDir: join(stupifyHome, '.review'),
+    scope: scopeRaw === 'label' ? 'label' : 'auto', // auto is the default; only the explicit string 'label' opts into per-PR tagging
     reviewLabel: pick('REVIEW_LABEL', 'codex-review'),
     diffLineCap: int('DIFF_LINE_CAP', 800, 1),
     dryRun: bool('DRY_RUN', false, true), // unset = live (cron's normal mode); garbage = preview (never post on a typo)
@@ -166,11 +169,11 @@ function logFail(message: string): false {
   return false
 }
-interface Pr {
+export interface Pr {
   number: number
   headRefOid: string
   isDraft: boolean
-  author: { login: string } | null
+  author: { login: string; is_bot: boolean } | null // is_bot flags GitHub App bots (app/dependabot) the [bot] suffix misses
   labels: { name: string }[]
 }
@@ -214,8 +217,10 @@ function isLabel(raw: unknown): raw is { name: string } {
   return typeof raw === 'object' && raw !== null && 'name' in raw && typeof raw.name === 'string'
 }
-function isAuthor(raw: unknown): raw is { login: string } | null {
-  return raw === null || (typeof raw === 'object' && 'login' in raw && typeof raw.login === 'string')
+function isAuthor(raw: unknown): raw is { login: string; is_bot: boolean } | null {
+  if (raw === null) return true
+  if (typeof raw !== 'object') return false
+  return 'login' in raw && typeof raw.login === 'string' && 'is_bot' in raw && typeof raw.is_bot === 'boolean'
 }
 function hasReviewLabel(pr: Pr, cfg: Config): boolean {
@@ -224,7 +229,9 @@ function hasReviewLabel(pr: Pr, cfg: Config): boolean {
 function inScope(pr: Pr, cfg: Config): boolean {
   if (pr.isDraft) return false
-  if ((pr.author?.login ?? '').endsWith('[bot]')) return false // never review bot PRs, in EITHER scope
+  // Never review bot PRs, in EITHER scope. gh's is_bot catches GitHub App bots (login `app/dependabot`) that
+  // the `[bot]` suffix misses; keep the suffix check as a belt-and-suspenders fallback.
+  if (pr.author?.is_bot === true || (pr.author?.login ?? '').endsWith('[bot]')) return false
   if (cfg.scope === 'label') return hasReviewLabel(pr, cfg)
   return true // auto: any non-draft, non-bot PR
 }
@@ -290,24 +297,47 @@ function markersFor(pr: Pr): { mark: string; failMark: string } {
   }
 }
-function reviewPrompt(cfg: Config, pr: Pr, priorThread: string): string {
+/** The taste prefix: instructions + the spec, rubric, and corpus INDEX, inlined verbatim. This is byte-identical
+ *  for every PR in a repo, so it forms a stable prompt PREFIX the provider caches across diff threads — you pay
+ *  full price for it once, then cache-read rates on every later PR. (If codex `Read` these files mid-loop instead,
+ *  they'd arrive as tool results after model-chosen steps that vary per run, and wouldn't cache.) We inline the
+ *  corpus INDEX only — its exemplars stay commit-pinned links the model opens on demand, so a review never pays to
+ *  read the whole corpus. Keep ALL per-PR tokens (diff target, marker, memory) OUT of here — they go in the tail. */
+export function stablePrefix(cfg: Config): string {
+  const read = (f: string) => readFileSync(join(cfg.reviewDir, f), 'utf8').trim()
+  return `You are a code reviewer running in an automated sweep (you have gh + git; no token needed). DO NOT modify any code.
+Everything down to the "THIS PR" line is your fixed spec and taste — identical for every PR, so treat it as standing reference.
+===== REVIEW SPEC (format + rules) =====
+${read('REVIEW-PROMPT.md')}
+===== RUBRIC (what counts as slop) =====
+${read('RUBRIC.md')}
+===== CORPUS (good-code reference; the links are commit-pinned — open one ONLY when a finding needs to cite it) =====
+${read('CORPUS.md')}`
+}
+export function reviewPrompt(cfg: Config, pr: Pr, priorThread: string): string {
   const { mark } = markersFor(pr)
   const outPath = `/tmp/review-${pr.number}.md`
-  const dir = cfg.reviewDir
   const memory = priorThread
     ? `\n\n## Prior reviews on this PR (your memory)
 This is the existing review conversation — your past reviews and the author's replies. You are CONTINUING it,
-not starting fresh. Apply ${dir}/REVIEW-PROMPT.md's "Prior reviews on this PR" rules: don't re-raise resolved or
+not starting fresh. Apply the spec's "Prior reviews on this PR" rules: don't re-raise resolved or
 reasoned-declined items, report only what's genuinely new, and converge (post the one-line "no new issues"
 and stop) if nothing new remains.
 ${priorThread}`
     : ''
-  return `You are a code reviewer running in an automated sweep (you have gh + git; no token needed). DO NOT modify any code.
-Read ${dir}/REVIEW-PROMPT.md and ${dir}/RUBRIC.md (the spec + rubric) and ${dir}/CORPUS.md (the curated good-code reference; open the live files it points at as needed). Then:
+  // Stable prefix first (cached across PRs); then the ONLY per-PR tokens — diff target, output marker, memory.
+  return `${stablePrefix(cfg)}
+===== THIS PR (the only part that changes per run) =====
+Review ONE pull request, per the spec and rubric above:
 1. Get the diff:  gh pr diff ${pr.number} --repo ${cfg.slug}
-2. Review it per the spec — catch bugs / type-lies / dead-code / footguns AND reinvents-primitive / slop, each citing the corpus primitive it should reuse; sort worst-first.
-3. Write the review to ${outPath}, formatted EXACTLY per ${dir}/REVIEW-PROMPT.md's 'Comment format' section (it owns the format — opener, finding blocks, attribution). END the file with exactly this line: ${mark}
+2. Review it — catch bugs / type-lies / dead-code / footguns AND reinvents-primitive / slop, each citing the corpus primitive it should reuse; sort worst-first.
+3. Write the review to ${outPath}, formatted EXACTLY per the spec's 'Comment format' section (it owns the format — opener, finding blocks, attribution). END the file with exactly this line: ${mark}
 4. Post it:  gh pr comment ${pr.number} --repo ${cfg.slug} --body-file ${outPath}
 Keep it terse; no preamble.${memory}`
 }
@@ -423,12 +453,16 @@ function main(): void {
   })
   if (!refreshRepo(cfg)) process.exit(1)
+  // Resolve the taste: the target repo's own .review/ wins (a repo can override); otherwise fall back to the
+  // home taste the CLI assembled from packs (~/.stupify/.review). Either way cfg.reviewDir becomes ABSOLUTE.
+  const repoReview = join(cfg.repoDir, cfg.reviewDir)
+  cfg.reviewDir = existsSync(join(repoReview, 'CORPUS.md')) ? repoReview : cfg.homeReviewDir
   const haveMachinery =
-    existsSync(join(cfg.repoDir, cfg.reviewDir, 'CORPUS.md')) &&
-    existsSync(join(cfg.repoDir, cfg.reviewDir, 'REVIEW-PROMPT.md')) &&
-    existsSync(join(cfg.repoDir, cfg.reviewDir, 'RUBRIC.md'))
+    existsSync(join(cfg.reviewDir, 'CORPUS.md')) &&
+    existsSync(join(cfg.reviewDir, 'REVIEW-PROMPT.md')) &&
+    existsSync(join(cfg.reviewDir, 'RUBRIC.md'))
   if (!haveMachinery) {
-    log(`no review machinery in ${cfg.slug}:${cfg.reviewDir}/ (need REVIEW-PROMPT.md + RUBRIC.md + CORPUS.md) — no-op. Copy the templates from the stupify repo.`)
+    log(`no review machinery at ${cfg.reviewDir}/ (need REVIEW-PROMPT.md + RUBRIC.md + CORPUS.md) — no-op. Run \`stupify setup\` to assemble taste, or add a .review/ to ${cfg.slug}.`)
     return
   }
@@ -489,4 +523,4 @@ function main(): void {
   log(`sweep done — scope=${cfg.scope} reviewed=${reviewed} tokens~${tokens}`)
 }
-main()
+if (import.meta.main) main() // run only when invoked directly (cron / `stupify run`); stays importable for tests