npm - @inceptionstack/pi-hard-no - Versions diffs - 1.0.0 - Mend

@inceptionstack/pi-hard-no 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/judge-skip-chain.ts ADDED Viewed

@@ -0,0 +1,113 @@
+/**
+ * judge-skip-chain.ts — loop safeguard for consecutive judge-skip outcomes.
+ *
+ * CONTEXT: when the orchestrator's judge gate classifies a turn as read-only
+ * it emits a `{ type: "skipped", reason: "judge_read_only" }` outcome. The
+ * extension surfaces that in chat with `triggerTurn: true` so the agent keeps
+ * working (e.g. "ran `git status`, now ready to push"). Without a cap, an
+ * unlucky agent + judge combo could loop forever:
+ *
+ *   agent reads → judge skips → triggerTurn → agent reads → judge skips → …
+ *
+ * SHAPE: a small state machine owned by `index.ts`. Each `judge_read_only`
+ * outcome increments the counter; any other outcome resets it. Once the
+ * counter exceeds `maxChain`, we still post the skip message to chat (the
+ * user paid for the judge call — show them it ran) but set `triggerTurn=false`
+ * so the agent halts and waits for input.
+ *
+ * The message text is split into a pure `formatJudgeSkipMessage` helper so
+ * we can unit-test the copy without instantiating the tracker.
+ *
+ * TESTING: pure TS, no SDK imports, no I/O. Drop-in replaceable in `index.ts`.
+ */
+/**
+ * Default cap for `triggerTurn: true` judge-skip replies. Chosen small on
+ * purpose — three chained "read-only" turns is already a strong signal the
+ * agent is stuck exploring and the user should step in. Overridable via the
+ * `JudgeSkipChain` constructor for tests and future tuning; a zero or
+ * negative override is rejected there and replaced by this value.
+ */
+export const DEFAULT_MAX_JUDGE_SKIP_CHAIN = 3;
+/** Payload returned from `JudgeSkipChain.handleJudgeSkip`. */
+export interface JudgeSkipMessage {
+  /** Markdown message body to post to chat. */
+  content: string;
+  /** Whether to request another agent turn. `false` once the cap is exceeded. */
+  triggerTurn: boolean;
+  /** Consecutive-skip count after this invocation. Useful for logging/tests. */
+  count: number;
+  /**
+   * True whenever the count is above the cap — on the call that first crosses
+   * it and on every later skip until `reset()`. Always the inverse of
+   * `triggerTurn`; when true, `content` includes the "chain reached" warning.
+   */
+  capReached: boolean;
+}
+/**
+ * Build the markdown body shown in chat when the judge skips a review.
+ *
+ * Deterministic and side-effect free, so the copy can be unit-tested without
+ * a `JudgeSkipChain` instance.
+ *
+ * @param count         consecutive-skip total, already including this skip
+ * @param maxChain      the configured cap; currently not used by the text
+ * @param judgeModel    "provider/model-id"; only the part after the last "/"
+ *                      is displayed (the whole string when that part is empty)
+ * @param shouldTrigger true when another agent turn will be requested; false
+ *                      inserts the loop warning between headline and footer
+ * @returns the message body, with sections separated by blank lines
+ */
+export function formatJudgeSkipMessage(
+  count: number,
+  maxChain: number,
+  judgeModel: string,
+  shouldTrigger: boolean,
+): string {
+  const tail = judgeModel.slice(judgeModel.lastIndexOf("/") + 1);
+  const sections = [
+    `⚖️ **Review skipped by judge** — all bash commands this turn classified as read-only (no file mutation). Skipping the main review.`,
+  ];
+  if (!shouldTrigger) {
+    sections.push(
+      `⚠️ Chain of ${count} consecutive judge-skips reached — not triggering another turn to avoid a loop. Reply to me or \`/review-judge-toggle\` off if you want to proceed.`,
+    );
+  }
+  sections.push(`_Model: \`${tail || judgeModel}\` — toggle with \`/review-judge-toggle\`_`);
+  return sections.join("\n\n");
+}
+/**
+ * Counter for back-to-back `judge_read_only` skips within an extension session.
+ *
+ * Feed every such outcome to `handleJudgeSkip(model)`. Call `reset()` for any
+ * other outcome (completed / error / cancelled / max_loops / non-judge skip
+ * reasons) and whenever a session starts or ends.
+ */
+export class JudgeSkipChain {
+  private consecutiveSkips = 0;
+  readonly maxChain: number;
+  /**
+   * @param maxChain number of consecutive skips that may still trigger a turn.
+   *   Values that are not greater than zero are treated as misconfiguration
+   *   and replaced by the default: such a cap would suppress `triggerTurn` on
+   *   the very first skip, defeating the "allow some agent progress" intent.
+   */
+  constructor(maxChain: number = DEFAULT_MAX_JUDGE_SKIP_CHAIN) {
+    if (maxChain > 0) {
+      this.maxChain = maxChain;
+    } else {
+      this.maxChain = DEFAULT_MAX_JUDGE_SKIP_CHAIN;
+    }
+  }
+  /**
+   * Register one judge-skip and build the chat payload for it.
+   * The only state touched is the internal counter.
+   *
+   * @param judgeModel "provider/model-id" string forwarded to the formatter
+   */
+  handleJudgeSkip(judgeModel: string): JudgeSkipMessage {
+    const count = ++this.consecutiveSkips;
+    const triggerTurn = count <= this.maxChain;
+    const content = formatJudgeSkipMessage(count, this.maxChain, judgeModel, triggerTurn);
+    return { content, triggerTurn, count, capReached: !triggerTurn };
+  }
+  /** Zero the consecutive-skip counter. Invoke on every non-judge-skip outcome. */
+  reset(): void {
+    this.consecutiveSkips = 0;
+  }
+  /** Number of consecutive skips recorded so far; for logging/diagnostics. */
+  getCount(): number {
+    return this.consecutiveSkips;
+  }
+}

package/judge.ts ADDED Viewed

@@ -0,0 +1,213 @@
+/**
+ * judge.ts — LLM-backed bash-command classifier (the "judge").
+ *
+ * ROLE: narrow duplicate-review suppressor. The orchestrator calls this ONLY
+ * for bash commands that the deterministic classifier in `changes.ts` flagged
+ * as potentially file-modifying but aren't definitively so (e.g. commands
+ * containing unknown shell builtins like `echo` that the static allowlist
+ * doesn't cover). The judge returns one of:
+ *
+ *   - inspection_vcs_noop: reads/reports state only, no mutation
+ *   - modifying:           changes files / git / deps / env
+ *   - unsure:              ambiguous, truncated, or unknown
+ *
+ * FAIL-OPEN: any failure (timeout, parse error, transport, missing model,
+ * missing API key) maps to `unsure`. Callers treat `unsure` and `modifying`
+ * identically (both → run the main review), so the judge can only ever
+ * suppress a review when it's confidently sure the turn was read-only.
+ *
+ * DESIGN: runner is injected so tests can mock without spinning up real
+ * pi sessions, mirroring the pattern used by `reviewer.ts` + `orchestrator.ts`.
+ */
+import {
+  AuthStorage,
+  ModelRegistry,
+  SessionManager,
+  createAgentSession,
+  type AgentSessionEvent,
+} from "@mariozechner/pi-coding-agent";
+import { log } from "./logger";
+/** The three output classes the judge can return. */
+export const JUDGE_CLASSES = ["inspection_vcs_noop", "modifying", "unsure"] as const;
+/** Union of the string literals listed in `JUDGE_CLASSES`. */
+export type BashClassification = (typeof JUDGE_CLASSES)[number];
+/** Per-call settings handed to a `JudgeRunner`. */
+export interface JudgeOptions {
+  /** Caller-owned cancellation signal; the default runner aborts its session and rejects when it fires. */
+  signal: AbortSignal;
+  /** Working directory passed to the session the default runner creates. */
+  cwd: string;
+  /** Model to invoke, as "provider/model-id". Defaults handled by callers; keep explicit here for testability. */
+  model: string;
+  /** Max wall-clock for the classifier call. Defaults to 10s. */
+  timeoutMs?: number;
+}
+/**
+ * Low-level judge runner contract: given a single bash command, resolve with
+ * the raw model text. The result carries no timeout flag — a runner reports
+ * timeout, abort or any other failure by rejecting, and `classifyBashCommand`
+ * turns that rejection into `unsure`. Separated so tests can mock without
+ * going through createAgentSession.
+ */
+export type JudgeRunner = (command: string, opts: JudgeOptions) => Promise<{ text: string }>;
+/**
+ * Classifier instructions sent to the judge model. The command under test is
+ * appended directly after the trailing "Command to classify:" line.
+ * Same prompt text we validated in `eval/run-eval.mjs` (prompt v1).
+ */
+const PROMPT = `You classify ONE bash command into exactly one of three classes for an automated code review system.
+CLASSES:
+- inspection_vcs_noop: reads/reports state only, no file/git/dep/process/network/env mutation
+- modifying: may change files, git index/commits/branches/remotes, deps, artifacts, processes, services, permissions, caches, or env
+- unsure: ambiguous, truncated, unknown executable/script, or not confidently classifiable
+TAXONOMY (authoritative):
+- ls, pwd, cat, head, tail, wc, rg, grep, find (no -delete/-exec), sed -n, test, echo, printf, true, false → inspection_vcs_noop (only if not redirecting output)
+- git status/diff/log/show/rev-parse/branch --show-current → inspection_vcs_noop
+- git add/commit/push/pull/merge/rebase/reset/checkout/switch/stash/clean/tag → modifying
+- touch/cp/mv/rm/mkdir/rmdir/chmod/chown, redirections >, >>, tee, truncate → modifying
+- npm/pnpm/yarn/pip/cargo install, make, cargo build, npm run format, codegen scripts → modifying
+- kill/pkill/systemctl, docker run, docker compose up → modifying
+- sed -i, perl -pi → modifying (in-place edit)
+- ./script.sh or npm run <unknown> → unsure unless clearly read-only
+- truncated command (e.g. "git commi") → unsure
+- Compound commands with &&, ;, ||, pipes, subshells: ANY modifying part → modifying; ANY unknown/truncated → unsure; otherwise the class of the safest-subset.
+OUTPUT: return ONLY this JSON, no prose, no markdown fences:
+{"classification":"inspection_vcs_noop"|"modifying"|"unsure"}
+Command to classify:
+`;
+/**
+ * Parse the judge's raw response into a classification.
+ *
+ * Steps, in order:
+ *   1. Trim, and if a ``` / ```json fence is present use only its contents.
+ *   2. Try `JSON.parse`; accept the `classification` field when it is exactly
+ *      one of `JUDGE_CLASSES`.
+ *   3. Otherwise scan the text for class names. A class is returned only when
+ *      it is the ONLY class mentioned.
+ *
+ * Step 3 is deliberately strict: callers suppress the main review on
+ * `inspection_vcs_noop`, so prose such as "not inspection_vcs_noop, this is
+ * modifying" must not resolve to whichever name happens to appear first.
+ * Anything ambiguous — several different classes, none at all, or a
+ * non-string input — yields `unsure`.
+ *
+ * @param raw text returned by the judge model
+ * @returns the recognised class, or `unsure` on any ambiguity; never throws
+ */
+export function parseJudgeResponse(raw: string): BashClassification {
+  // Exported helper: tolerate untyped callers instead of throwing on `.trim()`.
+  if (typeof raw !== "string") return "unsure";
+  let s = raw.trim();
+  const fenced = s.match(/```(?:json)?\s*([\s\S]*?)```/);
+  if (fenced) s = fenced[1].trim();
+  try {
+    const obj: unknown = JSON.parse(s);
+    const c =
+      typeof obj === "object" && obj !== null
+        ? (obj as { classification?: unknown }).classification
+        : undefined;
+    if (typeof c === "string" && (JUDGE_CLASSES as readonly string[]).includes(c)) {
+      return c as BashClassification;
+    }
+  } catch {
+    /* fall through to regex fallback */
+  }
+  const alt = JUDGE_CLASSES.join("|");
+  // Collect every DISTINCT class name mentioned; repeats of one name are fine.
+  const mentioned = new Set<string>();
+  for (const m of s.matchAll(new RegExp(`\\b(${alt})\\b`, "g"))) mentioned.add(m[1]);
+  if (mentioned.size !== 1) return "unsure";
+  const [only] = mentioned;
+  return only as BashClassification;
+}
+/**
+ * Classify one bash command through the supplied runner.
+ *
+ * The returned promise never rejects: an empty or non-string command, a
+ * runner failure, or a parse failure all come back as `unsure`, which keeps
+ * the caller's skip logic on the safe side. Runner/parse failures are logged.
+ *
+ * @param runner  judge implementation (injected so tests can mock it)
+ * @param command the bash command to classify
+ * @param opts    forwarded to the runner unchanged
+ */
+export async function classifyBashCommand(
+  runner: JudgeRunner,
+  command: string,
+  opts: JudgeOptions,
+): Promise<BashClassification> {
+  const usable = typeof command === "string" && command.length > 0;
+  if (!usable) return "unsure";
+  try {
+    const reply = await runner(command, opts);
+    return parseJudgeResponse(reply.text);
+  } catch (err: any) {
+    const reason = err?.message ?? err;
+    log(`judge: classify failed (${reason}) → unsure`);
+    return "unsure";
+  }
+}
+/**
+ * Production judge runner: spawns a fresh in-memory pi session, sends the
+ * classifier prompt, captures the assistant response, and cleans up.
+ *
+ * Mirrors the session lifecycle from `reviewer.ts` but without any tools
+ * (the judge is pure text-in-text-out — no file reading, no exploration).
+ *
+ * Resolves with the streamed text of the most recent assistant message once
+ * the prompt completes. Rejects when the model spec is malformed or unknown,
+ * when `opts.signal` aborts ("aborted"), when `opts.timeoutMs` (default 10s)
+ * elapses ("judge timeout"), or when the prompt itself fails. The session
+ * subscription and the session are always released in `finally`.
+ *
+ * @param command bash command appended to the classifier prompt
+ * @param opts    model, cwd, abort signal and optional timeout
+ */
+export const defaultJudgeRunner: JudgeRunner = async (command, opts) => {
+  const timeoutMs = opts.timeoutMs ?? 10_000;
+  const authStorage = AuthStorage.create();
+  const modelRegistry = ModelRegistry.create(authStorage);
+  // Split on the FIRST "/" only. `split("/", 2)` does not keep the remainder:
+  // "openrouter/anthropic/claude" would become modelId "anthropic", the
+  // lookup would fail, and the judge would silently never classify anything.
+  const slash = opts.model.indexOf("/");
+  const provider = slash === -1 ? "" : opts.model.slice(0, slash);
+  const modelId = slash === -1 ? "" : opts.model.slice(slash + 1);
+  if (!provider || !modelId) throw new Error(`bad judge model id: ${opts.model}`);
+  const model = modelRegistry.find(provider, modelId);
+  if (!model) throw new Error(`judge model not found: ${opts.model}`);
+  const { session } = await createAgentSession({
+    cwd: opts.cwd,
+    sessionManager: SessionManager.inMemory(),
+    authStorage,
+    modelRegistry,
+    tools: [],
+  });
+  let text = "";
+  let unsub = () => {};
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  try {
+    await session.setModel(model);
+    session.setThinkingLevel("off");
+    unsub = session.subscribe((ev: AgentSessionEvent) => {
+      // A new assistant message restarts the buffer, so only the last one is returned.
+      if (ev.type === "message_start" && (ev.message as any)?.role === "assistant") text = "";
+      if (ev.type === "message_update" && ev.assistantMessageEvent.type === "text_delta") {
+        text += ev.assistantMessageEvent.delta;
+      }
+    });
+    // Race: signal-abort | timeout | prompt-resolves. `settled` lets only the
+    // first finisher decide the outcome.
+    await new Promise<void>((resolve, reject) => {
+      let settled = false;
+      // Stop the session first, then reject with `reason`. A failing abort()
+      // is swallowed on purpose: without the catch, the promise returned by
+      // `.finally()` would reject with nobody listening (unhandled rejection).
+      const abortThenReject = (reason: string) => {
+        session
+          .abort()
+          .catch(() => undefined)
+          .finally(() => reject(new Error(reason)));
+      };
+      const abortH = () => {
+        if (settled) return;
+        settled = true;
+        if (timer) clearTimeout(timer);
+        abortThenReject("aborted");
+      };
+      // Already cancelled before we started: no timer or listener is needed.
+      if (opts.signal.aborted) return abortH();
+      opts.signal.addEventListener("abort", abortH, { once: true });
+      timer = setTimeout(() => {
+        if (settled) return;
+        settled = true;
+        opts.signal.removeEventListener("abort", abortH);
+        abortThenReject("judge timeout");
+      }, timeoutMs);
+      session.prompt(PROMPT + command).then(
+        () => {
+          if (settled) return;
+          settled = true;
+          if (timer) clearTimeout(timer);
+          opts.signal.removeEventListener("abort", abortH);
+          resolve();
+        },
+        (err) => {
+          if (settled) return;
+          settled = true;
+          if (timer) clearTimeout(timer);
+          opts.signal.removeEventListener("abort", abortH);
+          reject(err);
+        },
+      );
+    });
+    return { text };
+  } finally {
+    // Cleanup is best effort; a failure here must not mask the real outcome.
+    try {
+      unsub();
+    } catch {
+      /* ignore */
+    }
+    try {
+      session.dispose();
+    } catch {
+      /* ignore */
+    }
+  }
+};

package/logger.ts ADDED Viewed

@@ -0,0 +1,175 @@
+/**
+ * logger.ts — File logger for pi-hard-no
+ *
+ * Two outputs under ~/.pi/.hardno/:
+ *   review.log       — free-text timestamped lines (rotates at 1MB)
+ *   reviews/*.json   — one structured JSON file per completed review
+ *
+ * Uses sync writes to guarantee output even in complex async flows.
+ */
+import {
+  appendFileSync,
+  existsSync,
+  mkdirSync,
+  readdirSync,
+  renameSync,
+  rmSync,
+  statSync,
+  writeFileSync,
+} from "node:fs";
+import { join } from "node:path";
+import { homedir } from "node:os";
+/** Root of everything this logger writes: ~/.pi/.hardno */
+const LOG_DIR = join(homedir(), ".pi", ".hardno");
+/** Free-text log that `log()` appends to. */
+const LOG_FILE = join(LOG_DIR, "review.log");
+/** Where `maybeRotate()` moves `review.log` once it exceeds `MAX_LOG_SIZE`. */
+const LOG_OLD = join(LOG_DIR, "review.log.old");
+/** Directory that receives one JSON file per `logReview()` call. */
+const REVIEWS_DIR = join(LOG_DIR, "reviews");
+const MAX_LOG_SIZE = 1_000_000; // 1MB
+/** Set once `ensureDirs()` has created both directories without error. */
+let initialized = false;
+/**
+ * Create the log and reviews directories on first use.
+ * Best effort: on failure nothing is thrown and the next call tries again.
+ */
+function ensureDirs() {
+  if (!initialized) {
+    try {
+      for (const dir of [LOG_DIR, REVIEWS_DIR]) {
+        mkdirSync(dir, { recursive: true });
+      }
+      initialized = true;
+    } catch {
+      // best effort
+    }
+  }
+}
+/**
+ * Move `review.log` to `review.log.old` once it has grown past MAX_LOG_SIZE.
+ * A missing log file or a failed rename is silently ignored.
+ */
+function maybeRotate() {
+  let size: number;
+  try {
+    size = statSync(LOG_FILE).size;
+  } catch {
+    // file doesn't exist yet
+    return;
+  }
+  if (size <= MAX_LOG_SIZE) return;
+  try {
+    renameSync(LOG_FILE, LOG_OLD);
+  } catch {
+    /* ok */
+  }
+}
+/** Current time as an ISO-8601 string, used as the log line prefix. */
+function ts(): string {
+  const now = new Date();
+  return now.toISOString();
+}
+/**
+ * Render any value as a single log-friendly string. Never throws and always
+ * returns a string.
+ *
+ * - strings pass through unchanged
+ * - `Error` instances become "Name: message" (`JSON.stringify` would print `{}`)
+ * - everything else goes through `JSON.stringify`; when that yields
+ *   `undefined` (for `undefined`, functions, symbols) or throws (cycles,
+ *   BigInt), the value is converted with `String()` instead
+ * - if even `String()` throws (e.g. an object without `toString`), the
+ *   `Object.prototype.toString` tag is used
+ *
+ * @param a value to render
+ */
+function safeStringify(a: unknown): string {
+  if (typeof a === "string") return a;
+  if (a instanceof Error) return `${a.name}: ${a.message}`;
+  try {
+    const json = JSON.stringify(a);
+    // JSON.stringify returns undefined, not a string, for unserializable roots.
+    if (json !== undefined) return json;
+  } catch {
+    /* fall through to String() */
+  }
+  try {
+    return String(a);
+  } catch {
+    return Object.prototype.toString.call(a);
+  }
+}
+export { safeStringify };
+/**
+ * Append one timestamped line to `review.log`.
+ * Arguments are stringified individually and joined with single spaces.
+ * Write failures are ignored.
+ */
+export function log(...args: any[]) {
+  ensureDirs();
+  const stamp = ts();
+  const body = args.map((arg) => safeStringify(arg)).join(" ");
+  try {
+    appendFileSync(LOG_FILE, "[" + stamp + "] " + body + "\n");
+  } catch {
+    // best effort
+  }
+}
+/**
+ * Log and also rotate if needed (call once per review cycle).
+ * The rotation check runs before the write, so when the old file was over the
+ * size limit and the rename succeeded, this line starts a fresh `review.log`.
+ */
+export function logRotate(...args: any[]) {
+  maybeRotate();
+  log(...args);
+}
+// ── Structured review history ──────────────────────
+/** Shape of the entries stored in `ReviewLogEntry.toolCalls`. */
+export interface ReviewToolCall {
+  name: string;
+  args?: any;
+  timestamp: string;
+}
+/** Structured record that `logReview` serializes to one `reviews/*.json` file. */
+export interface ReviewLogEntry {
+  /** Also used as the filename prefix, with every ":" and "." replaced by "-". */
+  timestamp: string;
+  /** Unique id for this review cycle (e.g. "r-a3f71c08"). Matches the prefix used in review.log lines. */
+  reviewId?: string;
+  durationMs: number;
+  model: string;
+  thinkingLevel: string;
+  /** Selects the `lgtm` / `issues` segment of the filename. */
+  isLgtm: boolean;
+  promptLength: number;
+  rawText: string;
+  cleanedText: string;
+  filesReviewed: string[];
+  toolCalls: ReviewToolCall[];
+  label?: string;
+}
+/**
+ * Persist one review as pretty-printed JSON in the reviews directory.
+ *
+ * Filename: <timestamp>_<lgtm|issues>[_<reviewId>].json, where the timestamp
+ * has every ":" and "." replaced by "-". The reviewId segment is present only
+ * when the entry carries one, so a review cycle can be correlated across
+ * review.log and reviews/*.json.
+ *
+ * @returns the full path written, or null when serialization or the write failed
+ */
+export function logReview(entry: ReviewLogEntry): string | null {
+  ensureDirs();
+  const nameParts = [entry.timestamp.replace(/[:.]/g, "-"), entry.isLgtm ? "lgtm" : "issues"];
+  if (entry.reviewId) nameParts.push(entry.reviewId);
+  const target = join(REVIEWS_DIR, `${nameParts.join("_")}.json`);
+  try {
+    writeFileSync(target, JSON.stringify(entry, null, 2));
+  } catch {
+    return null;
+  }
+  return target;
+}
+/**
+ * Delete the history files pi-hard-no owns: `review.log`, the rotated
+ * `review.log.old`, and every `reviews/*.json` record. User configuration
+ * (settings.json, review-rules.md, etc.) is never touched.
+ *
+ * Best effort throughout: a file that cannot be removed is skipped and not
+ * counted, and a missing reviews directory simply yields zero.
+ *
+ * @returns how many log files and how many review records were removed
+ */
+export function cleanLogs(): { logsRemoved: number; reviewsRemoved: number } {
+  // True when the removal call completed without throwing.
+  const removeQuietly = (target: string): boolean => {
+    try {
+      rmSync(target, { force: true });
+      return true;
+    } catch {
+      /* permissions etc.; best-effort */
+      return false;
+    }
+  };
+  // rmSync({ force: true }) stays silent for missing files, so existence is
+  // checked first; otherwise absent logs would be reported as removed.
+  const logsRemoved = [LOG_FILE, LOG_OLD].filter(
+    (file) => existsSync(file) && removeQuietly(file),
+  ).length;
+  let entries: string[] = [];
+  try {
+    entries = readdirSync(REVIEWS_DIR);
+  } catch {
+    /* reviews dir might not exist yet */
+  }
+  const reviewsRemoved = entries.filter(
+    (name) => name.endsWith(".json") && removeQuietly(join(REVIEWS_DIR, name)),
+  ).length;
+  return { logsRemoved, reviewsRemoved };
+}
+export { LOG_FILE, LOG_DIR, REVIEWS_DIR };

package/message-sender.ts ADDED Viewed

@@ -0,0 +1,83 @@
+import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
+import { log } from "./logger";
+import type { ReviewResult } from "./reviewer";
+/**
+ * Render the review-id footer appended to code-review messages.
+ *
+ * Yields "" for a missing or empty id, so callers can interpolate the result
+ * unconditionally. Keep this the single place that knows the footer markup —
+ * other call sites (e.g. the architect message in index.ts) should reuse it
+ * instead of inlining the format.
+ *
+ * @param reviewId id of the review cycle, if any
+ * @returns two newlines followed by an italic `review-id` line, or ""
+ */
+export function formatReviewIdFooter(reviewId: string | undefined): string {
+  return reviewId ? "\n\n_review-id: `" + reviewId + "`_" : "";
+}
+/**
+ * Render file paths as a flat list: sorted, one per line, each indented by
+ * two spaces. The input array is not modified; an empty input gives "".
+ */
+function formatFileTree(files: string[]): string {
+  const lines: string[] = [];
+  for (const path of files.slice().sort()) {
+    lines.push("  " + path);
+  }
+  return lines.join("\n");
+}
+/**
+ * Post the outcome of a review to chat: a ✅ message for LGTM, a 🔍 message
+ * listing the findings otherwise. Both are delivered as a follow-up and
+ * request another agent turn unless `opts.triggerTurn` says otherwise.
+ *
+ * @param pi     extension API used to send the message
+ * @param result reviewer output (verdict, text, duration, tool calls)
+ * @param label  optional tag shown in parentheses after the title
+ * @param opts   `showLoopCount` replaces the label in the issues header;
+ *               `reviewedFiles` adds a fenced file list; `reviewId` adds the
+ *               review-id footer; `triggerTurn` defaults to true
+ */
+export function sendReviewResult(
+  pi: ExtensionAPI,
+  result: ReviewResult,
+  label: string,
+  opts?: {
+    showLoopCount?: string;
+    reviewedFiles?: string[];
+    triggerTurn?: boolean;
+    /** Optional unique id for this review cycle, appended as a footer line for log correlation. */
+    reviewId?: string;
+  },
+): void {
+  const files = opts?.reviewedFiles;
+  // LGTM over an explicitly empty file list has nothing to report, so stay
+  // silent. Issues are always shown: tool-call-only reviews can find bugs.
+  if (result.isLgtm && files && files.length === 0) {
+    log(`reviewer: skipping LGTM message — zero reviewed files`);
+    return;
+  }
+  const duration = `${(result.durationMs / 1000).toFixed(1)}s`;
+  const fileList =
+    files && files.length > 0
+      ? `\n\n**Reviewed files:**\n\`\`\`\n${formatFileTree(files)}\n\`\`\``
+      : "";
+  // The id footer sits under the file list (or under the header when there is
+  // none) so logs in ~/.pi/.hardno can be matched to the chat message.
+  const idFooter = formatReviewIdFooter(opts?.reviewId);
+  const labelTag = label ? ` (${label})` : "";
+  let content: string;
+  if (result.isLgtm) {
+    log(`reviewer: LGTM (${duration}, tools=${result.toolCalls.length})`);
+    content = `✅ **Automated Code Review**${labelTag} — ${duration}\n\nReview found no issues. Looks good!${fileList}${idFooter}\n\nIf you were waiting to push until after reviews were done — all reviews are done, no issues found. Safe to push.`;
+  } else {
+    log(`reviewer: issues found (${duration}, tools=${result.toolCalls.length})`);
+    // A loop counter, when supplied, takes the label's place in the header.
+    const headerTag = opts?.showLoopCount ? ` (${opts.showLoopCount})` : labelTag;
+    content = `🔍 **Automated Code Review**${headerTag} — ${duration}\n\nA separate reviewer examined your recent changes and found potential issues:\n\n${result.text}${fileList}${idFooter}\n\nPlease review these findings. If any are valid, fix them. If they're false positives, briefly explain why and move on.\n\n⚠️ **Do NOT push to remote yet.** Fix any issues first. Do NOT push after fixing either — a new review cycle will check your fixes automatically.`;
+  }
+  pi.sendMessage(
+    { customType: "code-review", content, display: true },
+    { triggerTurn: opts?.triggerTurn ?? true, deliverAs: "followUp" },
+  );
+}