npm - gentle-pi - Versions diffs - 0.5.0 → 0.6.1 - Mend

gentle-pi 0.5.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/assets/agents/jd-fix-agent.md +16 -0
package/assets/agents/jd-judge-a.md +16 -0
package/assets/agents/jd-judge-b.md +16 -0
package/assets/agents/review-readability.md +24 -0
package/assets/agents/review-reliability.md +25 -0
package/assets/agents/review-resilience.md +24 -0
package/assets/agents/review-risk.md +24 -0
package/assets/chains/4r-review.chain.md +39 -0
package/assets/orchestrator.md +16 -0
package/extensions/gentle-ai.ts +168 -15
package/lib/review-triggers.ts +414 -0
package/package.json +1 -1
package/tests/gentle-ai.test.ts +49 -0
package/tests/review-gate.test.ts +102 -0
package/tests/review-triggers.test.ts +382 -0

package/assets/agents/jd-fix-agent.md ADDED Viewed

@@ -0,0 +1,16 @@
+---
+name: jd-fix-agent
+description: Judgment Day surgical fix agent for confirmed findings. Can edit code and run focused tests.
+tools: read, grep, glob, edit, write, bash
+---
+You are the Judgment Day fix agent for Gentle AI.
+Apply surgical fixes for confirmed Judgment Day findings only. Preserve the original design intent, keep the patch focused, and avoid unrelated refactors.
+Rules:
+- Edit only the files needed to resolve confirmed findings.
+- Add or update focused tests when the fix changes behavior.
+- Run the relevant tests when practical and report exact results.
+- Clearly list what was fixed, what was verified, and any remaining risks.

package/assets/agents/jd-judge-a.md ADDED Viewed

@@ -0,0 +1,16 @@
+---
+name: jd-judge-a
+description: Judgment Day blind adversarial reviewer A. Read-only; reports findings and does not fix code.
+tools: read, grep, glob, bash
+---
+You are Judgment Day judge A for Gentle AI.
+Run an independent, blind adversarial review of the assigned change. Focus on correctness, regressions, missing tests, unsafe behavior, and mismatches with the user's request.
+Rules:
+- Stay read-only. Do not edit files or apply fixes.
+- Do not coordinate with judge B before producing your review.
+- Report concrete findings with file paths, evidence, severity, and suggested verification.
+- If you find no confirmed issues, say so clearly.

package/assets/agents/jd-judge-b.md ADDED Viewed

@@ -0,0 +1,16 @@
+---
+name: jd-judge-b
+description: Judgment Day blind adversarial reviewer B. Read-only; independently reports findings and does not fix code.
+tools: read, grep, glob, bash
+---
+You are Judgment Day judge B for Gentle AI.
+Run an independent, blind adversarial review of the assigned change. Challenge assumptions from a different angle than judge A, with special attention to edge cases, test gaps, integration risks, and user-visible regressions.
+Rules:
+- Stay read-only. Do not edit files or apply fixes.
+- Work independently from judge A and do not rely on judge A's conclusions.
+- Report concrete findings with file paths, evidence, severity, and suggested verification.
+- If you find no confirmed issues, say so clearly.

package/assets/agents/review-readability.md ADDED Viewed

@@ -0,0 +1,24 @@
+---
+name: review-readability
+description: R2 Readability reviewer — naming, complexity, intention, maintainability, review size, and context clarity.
+tools: read, grep, glob, bash
+---
+You are **R2 Readability**, a read-only reviewer. Find clarity problems; do not fix them.
+Rule sources: ai-course-2 slides `05-code-smells.md`, `06-safe-refactoring.md`, `07-advanced-refactoring.md`, `08-tech-debt.md`, `22-docs-as-code.md`, `25-executive-summary.md`.
+## Review rules
+- Flag magic numbers that should be named constants or business-rule objects.
+- Flag long parameter lists that should be parameter objects.
+- Flag duplicated logic across components/hooks/modules.
+- Flag dead code: commented-out blocks, unused imports, unreachable branches, never-called functions.
+- Flag naming that hides intent or needs comment-heavy explanation.
+- Flag PR/context explanation that is too vague to review safely; require concrete intent and impact.
+- Require evidence for "too complex" claims: cite exact function, branch, or repeated pattern.
+- Do not flag a small helper or inline constant that is clear, local, and self-explanatory.
+## Output contract
+Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`

package/assets/agents/review-reliability.md ADDED Viewed

@@ -0,0 +1,25 @@
+---
+name: review-reliability
+description: R3 Reliability reviewer — behavior-first tests, coverage value, edge cases, determinism, contracts, and regressions.
+tools: read, grep, glob, bash
+---
+You are **R3 Reliability**, a read-only reviewer. Find test and behavior risks; do not fix them.
+Rule sources: ai-course-2 slides `01-testing-setup.md`, `02-tdd-implementation.md`, `03-integration-testing.md`, `04-e2e-testing.md`, `10-strategic-coverage.md`, `11-playwright-visibility.md`, `12-quality-gates-husky.md`, `23-apis-components.md`.
+## Review rules
+- Block behavior changes without tests that assert externally visible contract.
+- Flag tests that are implementation-centric instead of user/behavior-centric.
+- Flag missing edge cases: boundaries, invalid inputs, empty states, retries, failure paths.
+- Block when CI can pass with `test.only`; require `forbidOnly` or equivalent in CI configs.
+- Flag misallocated test coverage: too much E2E where cheaper deterministic unit/integration tests should cover behavior.
+- Require evidence of determinism: same input -> same output; external dependencies mocked or controlled.
+- Flag weak selectors in UI tests; prefer semantic/user-visible queries.
+- Do not flag tests that intentionally rely on the framework's built-in async waiting and trace visibility instead of custom polling or logging.
+- Require evidence that new APIs/components have example usage or documented contract.
+## Output contract
+Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`

package/assets/agents/review-resilience.md ADDED Viewed

@@ -0,0 +1,24 @@
+---
+name: review-resilience
+description: R4 Resilience reviewer — fallbacks, retry/backoff, graceful degradation, observability, load, rollback, and SLO risks.
+tools: read, grep, glob, bash
+---
+You are **R4 Resilience**, a read-only reviewer. Find operational failure risks; do not fix them.
+Rule sources: ai-course-2 slides `09-essential-metrics.md`, `13-observability-strategy.md`, `14-sentry-implementation.md`, `15-sentry-errors.md`, `16-sentry-performance.md`, `17-sentry-alertas.md`, `29-performance-percibida.md`.
+## Review rules
+- Flag failures with no fallback, retry, or graceful-degradation path.
+- Block when production error-rate or build/test thresholds are ignored. Use these thresholds as anchors: test success rate below 95%; build success rate below 95%; production error rate above 1% (investigate), above 2% (emergency), above 5% (all hands).
+- Flag releases that can regress without alerting/observability hooks.
+- Require evidence for rollback/fix-forward readiness: a concrete recovery path must exist.
+- Flag performance regressions that exceed user-visible budgets or lack measurement.
+- Block when there is no production visibility for error/performance issues expected in the wild.
+- Do not flag explicitly low-impact expected issues already isolated by alert grouping or silence rules.
+- Require evidence of SLO/latency/load impact, not generic "might be slow" claims.
+## Output contract
+Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`

package/assets/agents/review-risk.md ADDED Viewed

@@ -0,0 +1,24 @@
+---
+name: review-risk
+description: R1 Risk reviewer — security, privilege boundaries, data exposure, dependency risks, and merge-blocking vulnerabilities.
+tools: read, grep, glob, bash
+---
+You are **R1 Risk**, a read-only reviewer. Find security risks; do not fix them.
+Rule sources: ai-course-2 slides `18-env-secrets.md`, `19-web-security.md`, `20-auth-tokens.md`, `21-owasp-top10.md`.
+## Review rules
+- Flag when secrets, tokens, API keys, JWT secrets, or DB URLs are hardcoded in code or committed examples.
+- Block when authz is enforced only in the frontend; require backend verification on every request.
+- Flag when user input reaches HTML/DOM sinks without escaping/sanitization.
+- Block when SQL/NoSQL/command strings are built by concatenation instead of parameterization.
+- Flag when cookies storing auth state miss `httpOnly`, `secure`, or `sameSite` protections.
+- Require evidence that security-sensitive changes are covered by backend checks, not UI disabled states.
+- Do not flag when React default escaping is used and no raw HTML sink exists.
+- Require evidence for dependency/security findings: cite scan failure or vulnerable package, not just "looks risky".
+## Output contract
+Report findings only. Each finding must include `severity: BLOCKER | CRITICAL | WARNING | SUGGESTION`, affected files, evidence, and why it matters. If clean, say exactly: `No findings.`

package/assets/chains/4r-review.chain.md ADDED Viewed

@@ -0,0 +1,39 @@
+---
+name: 4r-review
+description: Pre-PR 4R review fan-out — runs all four review lenses (risk, readability, reliability, resilience) in sequence and writes individual reports.
+---
+## review-risk
+output: review-risk-report.md
+outputMode: file-only
+progress: true
+Run R1 Risk review on the current diff. Report security, privilege boundary, data exposure, dependency, and merge-blocking vulnerability findings. If clean, say exactly: `No findings.`
+## review-readability
+reads: review-risk-report.md
+output: review-readability-report.md
+outputMode: file-only
+progress: true
+Run R2 Readability review on the current diff. Report naming, complexity, intention, maintainability, review size, and context clarity findings. If clean, say exactly: `No findings.`
+## review-reliability
+reads: review-risk-report.md+review-readability-report.md
+output: review-reliability-report.md
+outputMode: file-only
+progress: true
+Run R3 Reliability review on the current diff. Report behavior-first test coverage, edge case, determinism, contract, and regression findings. If clean, say exactly: `No findings.`
+## review-resilience
+reads: review-risk-report.md+review-readability-report.md+review-reliability-report.md
+output: review-resilience-report.md
+outputMode: file-only
+progress: true
+Run R4 Resilience review on the current diff. Report fallback, retry/backoff, graceful degradation, observability, load, rollback, and SLO risk findings. If clean, say exactly: `No findings.`

package/assets/orchestrator.md CHANGED Viewed

@@ -384,3 +384,19 @@ Automatic mode does not override reviewer burnout protection.
 - Ask before destructive git operations, publishing, or irreversible file changes.
 - Keep writes single-threaded unless isolated worktrees are explicitly approved.
 - Preserve human control: user decisions beat agent momentum.
+## 4R Review Triggers
+The extension (`extensions/gentle-ai.ts`) gates `bash` tool calls that look like git/gh workflow events. Gate semantics:
+- **pre-commit** (`git commit`): advisory only. The extension notifies the user to consider running `review-readability` but does NOT block. No orchestrator action needed.
+- **pre-push** (`git push`): advisory only. Same as pre-commit — notify, do not block.
+- **pre-pr** (`gh pr create`): **strong gate**. The extension blocks when either of the following conditions holds:
+  - Changed paths match hot globs: `**/auth/**`, `**/update/**`, `**/security/**`, `**/payments/**`
+  - Diff exceeds 400 changed lines (added + deleted)
+  - Outcome (not a condition): when the gate blocks, the reason names all four agents to run first.
+- **post-sdd-phase** (design, apply): **strong gate** for `judgment-day`. Handled separately by SDD phase orchestration, not this diff-based hook.
+When the extension blocks a `gh pr create` command, the orchestrator must launch the `4r-review` chain (or run the four agents individually) and wait for their reports before the user retries the PR command.
+Prohibition: do NOT configure the full 4R fan-out on `pre-commit` or `pre-push` with `always: true`. Everyday events must use a single advisory lens to keep development-loop cost low (spec G token-budget rule). The `validateTriggerRuleSet` function in `lib/review-triggers.ts` enforces this at config load time.

package/extensions/gentle-ai.ts CHANGED Viewed

@@ -1,3 +1,4 @@
+import { execFileSync } from "node:child_process";
 import {
 	existsSync,
 	mkdirSync,
@@ -38,6 +39,12 @@ import {
 	sddStatusSeverity,
 	type SddPhase,
 } from "../lib/sdd-status.ts";
+import {
+	evaluateEvent,
+	matchPathGlobs,
+	type ChangedDiff,
+	type TriggerEvent,
+} from "../lib/review-triggers.ts";
 const PACKAGE_ROOT = dirname(dirname(fileURLToPath(import.meta.url)));
 const ASSETS_DIR = join(PACKAGE_ROOT, "assets");
@@ -431,7 +438,18 @@ const SDD_AGENT_NAMES = [
 ] as const;
 const SDD_AGENT_NAME_SET = new Set<string>(SDD_AGENT_NAMES);
-type SddAgentName = (typeof SDD_AGENT_NAMES)[number];
+const JUDGMENT_DAY_AGENT_NAMES = [
+	"jd-judge-a",
+	"jd-judge-b",
+	"jd-fix-agent",
+] as const;
+const CORE_MODEL_AGENT_NAMES = [
+	...SDD_AGENT_NAMES,
+	...JUDGMENT_DAY_AGENT_NAMES,
+] as const;
+const CORE_MODEL_AGENT_NAME_SET = new Set<string>(CORE_MODEL_AGENT_NAMES);
 type ThinkingLevel = "off" | "minimal" | "low" | "medium" | "high" | "xhigh";
 interface AgentRoutingEntry {
 	model?: string;
@@ -991,14 +1009,7 @@ function listDiscoverableAgents(cwd: string): AgentEntry[] {
 	];
 	const byName = new Map<string, AgentEntry>();
 	for (const agent of agents) byName.set(agent.name, agent);
-	const discovered = Array.from(byName.values());
-	const sddFirst = SDD_AGENT_NAMES.map((name) =>
-		discovered.find((agent) => agent.name === name),
-	).filter((agent): agent is AgentEntry => agent !== undefined);
-	const rest = discovered
-		.filter((agent) => !SDD_AGENT_NAMES.includes(agent.name as SddAgentName))
-		.sort((left, right) => left.name.localeCompare(right.name));
-	return [...sddFirst, ...rest];
+	return orderDiscoverableAgents(Array.from(byName.values()));
 }
 async function listDiscoverableAgentsAsync(cwd: string): Promise<AgentEntry[]> {
@@ -1023,14 +1034,17 @@ async function listDiscoverableAgentsAsync(cwd: string): Promise<AgentEntry[]> {
 	}
 	const byName = new Map<string, AgentEntry>();
 	for (const agent of agents) byName.set(agent.name, agent);
-	const discovered = Array.from(byName.values());
-	const sddFirst = SDD_AGENT_NAMES.map((name) =>
-		discovered.find((agent) => agent.name === name),
+	return orderDiscoverableAgents(Array.from(byName.values()));
+}
+function orderDiscoverableAgents(agents: AgentEntry[]): AgentEntry[] {
+	const coreFirst = CORE_MODEL_AGENT_NAMES.map((name) =>
+		agents.find((agent) => agent.name === name),
 	).filter((agent): agent is AgentEntry => agent !== undefined);
-	const rest = discovered
-		.filter((agent) => !SDD_AGENT_NAMES.includes(agent.name as SddAgentName))
+	const rest = agents
+		.filter((agent) => !CORE_MODEL_AGENT_NAME_SET.has(agent.name))
 		.sort((left, right) => left.name.localeCompare(right.name));
-	return [...sddFirst, ...rest];
+	return [...coreFirst, ...rest];
 }
 function projectSettingsPath(cwd: string): string {
@@ -1760,13 +1774,150 @@ async function handlePersonaCommand(ctx: ExtensionContext): Promise<void> {
 	);
 }
+// ---------------------------------------------------------------------------
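+// Review gate helpers — classifyReviewEvent and parseNumstat are pure and exposed via __testing for unit tests; computeDiffForEvent and applyReviewGate run git / touch the UI.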
+// ---------------------------------------------------------------------------
+/**
+ * Classifies a bash command string as a TriggerEvent for the review gate,
+ * or returns null if the command is not a recognized git/gh workflow trigger.
+ *
+ * Only the start of the trimmed command is inspected, so a trigger that is
+ * not the first token (for example after `cd repo &&`) is not classified.
+ *
+ * For `git`, any number of global options may sit between `git` and the
+ * subcommand: `-C <path>`, `-c <name>=<value>`, `--work-tree=<path>` and
+ * `--git-dir=<path>`. Anything after the subcommand is ignored.
+ *
+ * @param command Raw command string passed to the bash tool.
+ * @returns "pre-pr", "pre-commit", "pre-push", or null when nothing matches.
+ */
+export function classifyReviewEvent(command: string): TriggerEvent | null {
+	const trimmed = command.trim();
+	// gh pr create → pre-pr
+	if (/^gh\s+pr\s+create\b/.test(trimmed)) return "pre-pr";
+	// git commit → pre-commit. The option group repeats (`*`) so several global
+	// options in a row, e.g. `git -C repo --git-dir=.git commit`, still match.
+	if (/^git(?:\s+(?:-[Cc]\s+\S+|--work-tree=\S+|--git-dir=\S+))*\s+commit\b/.test(trimmed))
+		return "pre-commit";
+	// git push → pre-push (same global-option tolerance as commit)
+	if (/^git(?:\s+(?:-[Cc]\s+\S+|--work-tree=\S+|--git-dir=\S+))*\s+push\b/.test(trimmed))
+		return "pre-push";
+	return null;
+}
+/**
+ * Expands git's rename notation into the concrete paths it stands for.
+ *
+ * `git diff --numstat` prints a renamed file as `prefix{old => new}suffix`
+ * (or `old => new` when the two paths share no prefix/suffix). Returning both
+ * the old and the new path lets path-glob checks see a file that was moved
+ * into or out of a directory. Paths without rename notation are returned as-is.
+ */
+function expandNumstatPath(filePath: string): string[] {
+	const braced = /^(.*)\{(.*) => (.*)\}(.*)$/.exec(filePath);
+	if (braced) {
+		const [, prefix = "", from = "", to = "", suffix = ""] = braced;
+		// An empty side (e.g. `src/{ => sub}/x.ts`) leaves a doubled or leading
+		// slash after substitution; normalize it away.
+		return [`${prefix}${from}${suffix}`, `${prefix}${to}${suffix}`].map((candidate) =>
+			candidate.replace(/\/{2,}/g, "/").replace(/^\//, ""),
+		);
+	}
+	const arrow = filePath.indexOf(" => ");
+	if (arrow !== -1) return [filePath.slice(0, arrow), filePath.slice(arrow + 4)];
+	return [filePath];
+}
+/**
+ * Parses the output of `git diff --numstat` into a ChangedDiff.
+ *
+ * Each non-empty row has the form `<added>\t<deleted>\t<path>`. Rows with
+ * fewer than three tab-separated fields are skipped. Binary files show
+ * `-  -  path`; their contribution to changedLines is 0. Renamed files
+ * contribute both their old and new path to changedPaths.
+ *
+ * @param output Raw stdout of `git diff --numstat`.
+ * @returns The changed paths in row order and the total added + deleted lines.
+ */
+export function parseNumstat(output: string): ChangedDiff {
+	const changedPaths: string[] = [];
+	let changedLines = 0;
+	for (const line of output.split("\n")) {
+		const trimmed = line.trim();
+		if (!trimmed) continue;
+		// Format: "<added>\t<deleted>\t<path>"; a path may itself contain tabs.
+		const [added, deleted, ...pathParts] = trimmed.split("\t");
+		if (added === undefined || deleted === undefined || pathParts.length === 0) continue;
+		const filePath = pathParts.join("\t");
+		if (!filePath) continue;
+		changedPaths.push(...expandNumstatPath(filePath));
+		// Binary rows have "-" in both columns; treat as 0. Unparseable counts
+		// are ignored rather than poisoning the total with NaN.
+		for (const count of [added, deleted]) {
+			const parsed = count === "-" ? 0 : parseInt(count, 10);
+			if (!Number.isNaN(parsed)) changedLines += parsed;
+		}
+	}
+	return { changedPaths, changedLines };
+}
+/**
+ * Builds the ChangedDiff that the review gate evaluates for a trigger event.
+ *
+ * - "pre-commit": numstat of the staged changes (`git diff --cached`).
+ * - any other event (pre-push / pre-pr): numstat of `<merge-base>...HEAD`,
+ *   where the merge base is looked up against `origin/HEAD`, `origin/main`
+ *   and `main`, in that order. If none resolves, the staged diff is used.
+ *
+ * Fails open: any git error (including the timeout) yields null so the
+ * user's own git command is never broken by the gate.
+ *
+ * @param event Trigger event being evaluated.
+ * @param cwd Directory in which git is executed.
+ * @returns The parsed diff, or null when it could not be computed.
+ */
+function computeDiffForEvent(event: TriggerEvent, cwd: string): ChangedDiff | null {
+	const execOptions = {
+		cwd,
+		encoding: "utf8" as const,
+		stdio: ["pipe", "pipe", "pipe"] as const,
+		// Synchronous git calls are capped so a slow or very large repository
+		// cannot stall the extension process; the throw is caught below.
+		timeout: 2000,
+	};
+	const runGit = (args: string[]): string => execFileSync("git", args, execOptions);
+	const stagedNumstatArgs = ["diff", "--cached", "--numstat"];
+	try {
+		if (event === "pre-commit") return parseNumstat(runGit(stagedNumstatArgs));
+		// pre-push or pre-pr: locate the first merge base that resolves.
+		let mergeBase = "";
+		for (const candidate of ["origin/HEAD", "origin/main", "main"]) {
+			try {
+				mergeBase = runGit(["merge-base", "HEAD", candidate]).trim();
+			} catch {
+				// ref missing or unrelated — move on to the next candidate
+				continue;
+			}
+			if (mergeBase) break;
+		}
+		// Without a merge base, fall back to the staged diff.
+		const diffArgs = mergeBase
+			? ["diff", "--numstat", `${mergeBase}...HEAD`]
+			: stagedNumstatArgs;
+		return parseNumstat(runGit(diffArgs));
+	} catch {
+		return null;
+	}
+}
+/**
+ * Review gate entry point for a bash command; runs before the existing
+ * confirmCommand flow.
+ *
+ * The command is classified, a diff is computed for the resulting event and
+ * the event is evaluated against it. If any of those steps yields nothing,
+ * the gate stays out of the way.
+ *
+ * @param command Bash command about to be executed.
+ * @param ctx Extension context supplying `cwd`, `hasUI` and `ui.notify`.
+ * @returns A `{ block: true, reason }` result when the evaluation is not
+ *   advisory; otherwise undefined (after an info notification when the
+ *   evaluation is advisory and a UI is available).
+ */
+async function applyReviewGate(
+	command: string,
+	ctx: ExtensionContext,
+): Promise<ToolCallEventResult | undefined> {
+	const event = classifyReviewEvent(command);
+	if (!event) return undefined;
+	const diff = computeDiffForEvent(event, ctx.cwd);
+	const result = diff ? evaluateEvent(event, diff) : undefined;
+	if (!result) return undefined;
+	if (result.mode !== "advisory") {
+		// strong mode — block
+		return {
+			block: true,
+			reason:
+				`Gentle AI 4R review gate: run ${result.run.join(", ")} before this command. ` +
+				result.reason,
+		};
+	}
+	// advisory mode — surface a hint when a UI exists, never block
+	if (ctx.hasUI) {
+		const commitOrPush = event === "pre-push" ? "this push" : "this commit";
+		ctx.ui.notify(
+			`Review suggestion: consider running agent "${result.run.join(", ")}" before ${commitOrPush}. ${result.reason}`,
+			"info",
+		);
+	}
+	return undefined;
+}
 /** @internal */
 export const __testing = {
 	listAgentsFromDir,
 	listAgentsFromDirAsync,
+	listDiscoverableAgents,
+	orderDiscoverableAgents,
 	classifyGuardedCommand,
 	loadRuntimeGuardrailsConfig,
 	buildGentlePrompt,
+	classifyReviewEvent,
+	parseNumstat,
 };
 export default function gentleAi(pi: ExtensionAPI): void {
@@ -1850,6 +2001,8 @@ export default function gentleAi(pi: ExtensionAPI): void {
 		if (event.toolName !== "bash") return undefined;
 		if (!isRecord(event.input) || typeof event.input.command !== "string")
 			return undefined;
+		const reviewGateResult = await applyReviewGate(event.input.command, ctx);
+		if (reviewGateResult) return reviewGateResult;
 		return confirmCommand(event.input.command, ctx);
 	});